Code Example #1
def map_str_to_digit(data,
                     features,
                     no_map_features,
                     only_map_features=" ",
                     label=" "):
    no_map_features_index = get_known_features_index(features, no_map_features)
    features_map_info = dict()

    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)

    only_map_features_index = range(len(features))
    if not only_map_features == " ":
        only_map_features_index = get_known_features_index(
            features, only_map_features)
    for fea_pos in range(1, len(features)):
        if not fea_pos in no_map_features_index and fea_pos in only_map_features_index:
            map_info = OrderedDict()
            #feature_map_info = OrderedDict()
            fea_val_cla = feature_value_class(data, fea_pos, label,
                                              fixed_str_features_index)
            # if this feature holds string values, map each distinct string to an integer code
            if fea_val_cla["str_feature"]:

                data, map_info = map_str_feature_to_value(
                    data, fea_pos, fea_val_cla)
                features_map_info[features[fea_pos]] = map_info
                #features_map_info[].append([feature_map_info])

    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
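
All of the examples on this page come from the same feature-engineering module and assume module-level imports (numpy as np, OrderedDict from collections, reduce from functools, stats from scipy, plus csv and os) as well as a few project helpers that are not shown. As a rough orientation, get_known_features_index appears to translate feature names into column positions; a minimal sketch under that assumption, not the project's actual implementation:

import numpy as np
from collections import OrderedDict
from functools import reduce
from scipy import stats

def get_known_features_index(features, known_features):
    # return the column position of each known feature name; the real helper
    # may additionally skip names that are no longer present in `features`
    feature_list = list(features)
    return [feature_list.index(fea) for fea in known_features]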
Code Example #2
def new_UserInfo_18(data, features):
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "UserInfo_18_bined"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # the age column is stored as a string; take the scalar value and rely on
        # lexicographic comparison, which is adequate for two-digit age strings
        user_age = data[user, fea_indexs[0]]
        if user_age < "22":
            feature_data[user, 0] = 0
        elif user_age < "30":
            feature_data[user, 0] = 1
        elif user_age < "40":
            feature_data[user, 0] = 2
        elif user_age < "50":
            feature_data[user, 0] = 3
        else:
            feature_data[user, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)

    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = solved_features)
    print(deleted)
    return new_data, new_features
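
These new_* builders all share the signature (data, features) -> (new_data, new_features), so they compose naturally. A hypothetical driver, assuming data is the raw 2-D string matrix and features its header row:

# hypothetical chaining of the builders shown on this page
data, features = new_UserInfo_18(data, features)
data, features = new_UserInfo_19_20(data, features)
data, features = new_WI_19(data, features)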
Code Example #3
def new_UserInfo_differ(data,
                        features,
                        key_features,
                        feature_name,
                        deleted_all=True):

    fea_indexs = get_known_features_index(features, key_features)

    new_add_feature = np.array([feature_name])

    feature_data = np.zeros((len(data), 1))
    new_features = np.concatenate((features, new_add_feature))

    for user in range(data.shape[0]):
        if not len(set(list(data[user, fea_indexs]))) == 1:
            feature_data[user, 0] = 1

    new_data = np.concatenate((data, feature_data), axis=1)

    delete_feas = key_features[1:]
    if deleted_all:
        delete_feas = key_features

    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = delete_feas)
    print(deleted)
    return new_data, new_features
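
Almost every builder finishes by calling delete_features to drop the raw columns it has just replaced. The helper is not part of this listing; judging from the call sites it accepts a delete_feas_list keyword and returns the reduced data, the reduced feature array, and the names it removed. A hypothetical sketch:

def delete_features(data, features, delete_feas_list):
    # drop the named columns from both the data matrix and the feature array;
    # sketch only, the project's version may validate or log differently
    delete_index = get_known_features_index(features, delete_feas_list)
    new_data = np.delete(data, delete_index, axis=1)
    new_features = np.delete(np.asarray(features), delete_index)
    return new_data, new_features, list(delete_feas_list)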
Code Example #4
def thirdParty_one_period_info(data, features, label, period_number):

	based_feature_name = "ThirdParty_Info_Period"
	solved_features = list()

	for i in range(1, 17):
		solved_feature = based_feature_name + period_number[0] + "_" + str(i) 

		solved_features.append(solved_feature)

	indexs = get_known_features_index(features, solved_features)
	one_period_info = OrderedDict()
	one_period_info["missing count"] = 0
	one_period_info["missing contain positive count"] = 0
	one_period_info["missing indexs"] = list()
	one_period_info["missing indexs label"] = list()

	for user in range(data.shape[0]):
		sat_data = list(data[user, indexs])
		if -1 in sat_data:
			one_period_info["missing count"] += 1
			if label[user] == 1:
				one_period_info["missing contain positive count"] += 1
			one_period_info["missing indexs"].append(user)
			one_period_info["missing indexs label"].append(label[user])

	return one_period_info
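
period_number is passed as a one-element sequence (the code reads period_number[0]), and the comparison against -1 implies the matrix has already been digitised with -1 as the missing marker. A hypothetical call, reusing the digited_data and label names from elsewhere on this page:

# count rows with at least one missing Period1 column
info = thirdParty_one_period_info(digited_data, features, label, period_number=["1"])
print(info["missing count"], info["missing contain positive count"])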
Code Example #5
def new_UserInfo_18(data, features):
	solved_features = ["UserInfo_18"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "UserInfo_18_bined"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	feature_data = np.zeros((len(data), 1))
	for user in range(data.shape[0]):
		# the age column is stored as a string; take the scalar value and rely on
		# lexicographic comparison, which is adequate for two-digit age strings
		user_age = data[user, fea_indexs[0]]
		if user_age < "22":
			feature_data[user, 0] = 0
		elif user_age < "30":
			feature_data[user, 0] = 1
		elif user_age < "40":
			feature_data[user, 0] = 2
		elif user_age < "50":
			feature_data[user, 0] = 3
		else:
			feature_data[user, 0] = 4

	new_data = np.concatenate((data, feature_data), axis = 1)

	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = solved_features)
	print(deleted)
	return new_data, new_features
Code Example #6
File: features_reduce.py  Project: Heipiao/DataPigs
def correlation_between_properties(data, features):
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)

	title = list()
	title.append("features1")
	title.append("features2")
	title.append("calculate_method") 
	title.append("cor")
	title.append("pval")
	save_result(title, "pearsonr_spearmanr_results.csv")
	save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
	for fea_pos in range(len(features)):
		for fea_pos_add in range(fea_pos + 1, len(features)):
			info_result = list()
			info_result.append(features[fea_pos])
			info_result.append(features[fea_pos_add])
			a1 = data[:, fea_pos]
			a2 = data[:, fea_pos_add]
			# neither column is a string-valued feature
			if fea_pos not in indexs and fea_pos_add not in indexs:
				info_result.append("pearsonr")
				cor, pval = stats.pearsonr(a1, a2)
			else: # at least one of the columns is a string-valued feature
				info_result.append("spearmanr")
				cor, pval = stats.spearmanr(a1, a2)
			cor = round(cor, 3)
			info_result.append(cor)
			info_result.append(pval)
			if abs(cor) >= 0.2:
				save_result(info_result, "pearsonr_spearmanr_results.csv", style = "a+")
			if abs(cor) >= 0.86:
				save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
												style = "a+")
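
save_result and load_result are the project's thin CSV helpers (used above for the correlation tables and elsewhere for str_features.csv); their definitions are not part of this listing, and load_result is evidently richer than a plain row reader, since map_str_to_digit_with_experience gets a nested dictionary back from it. A minimal sketch of the writer side only, with the result directory and defaults being assumptions:

import csv
import os

def save_result(row, file_name, style="w", dir_name="resultData"):
    # write (style="w") or append (style="a+") one row to a CSV file; sketch only
    path = os.path.join(os.getcwd(), dir_name, file_name)
    with open(path, style, newline="") as csv_file:
        csv.writer(csv_file).writerow(row)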
Code Example #7
def new_EI_5_6_7_8(data, features):
	key_features = ["Education_Info5", "Education_Info6", "Education_Info7", "Education_Info8"]
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "combine_EI_5_6_7_8"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules ####
	map_to_zero = [["1", "AQ", "-1", "T"], ["1", "AQ", "-1", "80"], ["1", "U", "-1", "-1"], 
					["1", "AQ", "-1", "-1"], ["1", "B", "-1", "-1"], ["1", "A", "-1", "-1"],
					["1", "AM", "-1", "80"], ["1", "A", "-1", "F"], ["1", "B", "-1", "AE"], 
					["1", "U", "-1", "AE"], ["1", "AQ", "-1", "V"], ["1", "AM", "-1", "V"]]

	map_to_one = [["1", "A", "-1", "T"], ["1", "AQ", "-1", "F"], ["1", "AM", "-1", "-1"], 
					["1", "AM", "-1", "-1"], ["1", "AM", "-1", "F"], ["1", "AM", "-1", "T"]]
	map_to_two = [["0", "E", "E", "E"]]

	feature_data = np.ones((len(data), 1))
	for user in range(data.shape[0]):
		EI_5_6_7_8 = list(data[user, fea_indexs])
		if EI_5_6_7_8 in map_to_zero:
			feature_data[user, 0] = 0
		elif EI_5_6_7_8 in map_to_one:
			feature_data[user, 0] = 1
		elif EI_5_6_7_8 in map_to_two:
			feature_data[user, 0] = 2
		else:
			feature_data[user, 0] = 3
	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = key_features[1:])
	print(deleted)
	return new_data, new_features 
Code Example #8
def new_EI_8(data, features):
	solved_features = ["Education_Info8"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "Education_Info8_info_(cat)"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	feature_data = np.zeros((len(data), 1))

	for user in range(data.shape[0]):
		if data[user, fea_indexs[0]] == "V" or data[user, fea_indexs[0]] == "AE":
			feature_data[user, 0] = 0
		elif data[user, fea_indexs[0]] == "80":
			feature_data[user, 0] = 1
		elif data[user, fea_indexs[0]] == "F":
			feature_data[user, 0] = 2
		elif data[user, fea_indexs[0]] == "T":
			feature_data[user, 0] = 3
		elif data[user, fea_indexs[0]] == "-1":
			feature_data[user, 0] = 4
		elif data[user, fea_indexs[0]] == "E":
			feature_data[user, 0] = 5
		else:
			print("error in Education_Info8")

	
	new_data = np.concatenate((data, feature_data), axis = 1)

	print("Education_Info8 solved")
	return new_data, new_features
Code Example #9
def new_WI_19(data, features):
	solved_features = ["WeblogInfo_19"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "WeblogInfo_19_info_(cat)"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	feature_data = np.zeros((len(data), 1))

	for user in range(data.shape[0]):
		if data[user, fea_indexs[0]] == "H":
			feature_data[user, 0] = 0
		elif data[user, fea_indexs[0]] == "G":
			feature_data[user, 0] = 1
		elif data[user, fea_indexs[0]] == "J":
			feature_data[user, 0] = 2
		elif data[user, fea_indexs[0]] == "E":
			feature_data[user, 0] = 3
		elif data[user, fea_indexs[0]] == "F":
			feature_data[user, 0] = 4
		elif data[user, fea_indexs[0]] == "D":
			feature_data[user, 0] = 5
		else:
			feature_data[user, 0] = 6

	
	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
								delete_feas_list = solved_features)

	print("WeblogInfo_19 solved")
	print(deleted)
	return new_data, new_features
Code Example #10
def new_WI_19(data, features):
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "WeblogInfo_19_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    feature_data = np.zeros((len(data), 1))

    for user in range(data.shape[0]):
        if data[user, fea_indexs[0]] == "H":
            feature_data[user, 0] = 0
        elif data[user, fea_indexs[0]] == "G":
            feature_data[user, 0] = 1
        elif data[user, fea_indexs[0]] == "J":
            feature_data[user, 0] = 2
        elif data[user, fea_indexs[0]] == "E":
            feature_data[user, 0] = 3
        elif data[user, fea_indexs[0]] == "F":
            feature_data[user, 0] = 4
        elif data[user, fea_indexs[0]] == "D":
            feature_data[user, 0] = 5
        else:
            feature_data[user, 0] = 6

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
           delete_feas_list = solved_features)

    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
Code Example #11
def new_EI_1_2_3_4(data, features):
	key_features = ["Education_Info1", "Education_Info2", "Education_Info3", "Education_Info4"]
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "combine_EI_1_2_3_4"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules ####
	map_to_zero = [["1", "AQ", "毕业", "T"], ["1", "A", "毕业", "V"], ["1", "AN", "结业", "T"], 
					["1", "AM", "结业", "T"], ["1", "B", "毕业", "AE"], ["1", "A", "结业", "T"],
					["1", "A", "毕业", "AR"]]
	map_to_one = [["1", "U", "毕业", "AE"], ["1", "AM", "毕业", "AR"], ["1", "AM", "毕业", "V"], 
					["1", "AQ", "毕业", "F"], ["1", "A", "毕业", "F"], ["1", "AN", "毕业", "T"],
					["1", "AQ", "毕业", "V"]]
	map_to_two = [["1", "AM", "毕业", "T"], ["1", "A", "毕业", "T"], ["1", "AM", "毕业", "F"]]
	map_to_three = [["0", "E", "E", "E"]]

	feature_data = np.ones((len(data), 1))
	for user in range(data.shape[0]):
		EI_1_2_3_4 = list(data[user, fea_indexs])
		if EI_1_2_3_4 in map_to_zero:
			feature_data[user, 0] = 0
		elif EI_1_2_3_4 in map_to_one:
			feature_data[user, 0] = 1
		elif EI_1_2_3_4 in map_to_two:
			feature_data[user, 0] = 2
		elif EI_1_2_3_4 in map_to_three:
			feature_data[user, 0] = 3
		else:
			print("error!!!!")
	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = key_features[1:])
	print(deleted)
	return new_data, new_features
Code Example #12
def new_EI_8(data, features):
    solved_features = ["Education_Info8"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "Education_Info8_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    feature_data = np.zeros((len(data), 1))

    for user in range(data.shape[0]):
        if data[user, fea_indexs[0]] == "V" or data[user,
                                                    fea_indexs[0]] == "AE":
            feature_data[user, 0] = 0
        elif data[user, fea_indexs[0]] == "80":
            feature_data[user, 0] = 1
        elif data[user, fea_indexs[0]] == "F":
            feature_data[user, 0] = 2
        elif data[user, fea_indexs[0]] == "T":
            feature_data[user, 0] = 3
        elif data[user, fea_indexs[0]] == "-1":
            feature_data[user, 0] = 4
        elif data[user, fea_indexs[0]] == "E":
            feature_data[user, 0] = 5
        else:
            print("error in Education_Info8")

    new_data = np.concatenate((data, feature_data), axis=1)

    print("Education_Info8 solved")
    return new_data, new_features
Code Example #13
def new_WI_20_by_present(data, features):
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    none_finded_combine = OrderedDict()
    feature_data = np.zeros((len(data), 1))
    map_to_zero = [
        'F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10', 'F11', 'O',
        'C13', 'I6', 'C16', 'I7', 'I10'
    ]
    map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
    map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
    map_to_three = ['I3', 'U', 'C21', 'I4']
    map_to_four = ['I5']
    map_to_five = ['-1']
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        if fea_value in map_to_zero:
            feature_data[user, 0] = 0
        elif fea_value in map_to_one:
            feature_data[user, 0] = 1
        elif fea_value in map_to_two:
            feature_data[user, 0] = 2
        elif fea_value in map_to_three:
            feature_data[user, 0] = 3
        elif fea_value in map_to_four:
            feature_data[user, 0] = 4
        elif fea_value in map_to_five:
            feature_data[user, 0] = 5
        else:
            # print("error")
            # print(fea_value)
            if fea_value not in none_finded_combine.keys():
                none_finded_combine[fea_value] = list()
            none_finded_combine[fea_value].append(user)

    for fea_value, users in none_finded_combine.items():
        # fea_value is the whole string value; keep the "-1" bucket and do not
        # let the frequency buckets below overwrite it
        if fea_value == "-1":
            feature_data[users, 0] = 5
        elif len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
            delete_feas_list = solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
Code Example #14
def new_WI_20_by_present(data, features):
	solved_features = ["WeblogInfo_20"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "WeblogInfo_20_present_info_(cat)"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))
	none_finded_combine = OrderedDict()
	feature_data = np.zeros((len(data), 1))
	map_to_zero = ['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10', 'F11', 'O', 'C13', 'I6', 'C16', 'I7', 'I10']
	map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
	map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
	map_to_three = ['I3', 'U', 'C21', 'I4']
	map_to_four = ['I5']
	map_to_five = ['-1']
	for user in range(data.shape[0]):
		fea_value = data[user, fea_indexs[0]]
		if fea_value in map_to_zero:
			feature_data[user, 0] = 0
		elif fea_value in map_to_one:
			feature_data[user, 0] = 1
		elif fea_value in map_to_two:
			feature_data[user, 0] = 2
		elif fea_value in map_to_three:
			feature_data[user, 0] = 3
		elif fea_value in map_to_four:
			feature_data[user, 0] = 4
		elif fea_value in map_to_five:
			feature_data[user, 0] = 5
		else:
			# print("error")
			# print(fea_value)
			if fea_value not in none_finded_combine.keys():
				none_finded_combine[fea_value] = list()
			none_finded_combine[fea_value].append(user)

	for fea_value, users in none_finded_combine.items():
		# fea_value is the whole string value; keep the "-1" bucket and do not
		# let the frequency buckets below overwrite it
		if fea_value == "-1":
			feature_data[users, 0] = 5
		elif len(users) < 20:
			feature_data[users, 0] = 0
		elif len(users) < 100:
			feature_data[users, 0] = 1
		elif len(users) < 1000:
			feature_data[users, 0] = 2
		elif len(users) < 5000:
			feature_data[users, 0] = 3
		else:
			feature_data[users, 0] = 4

	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
 								delete_feas_list = solved_features)
	print("WeblogInfo_20 solved present")
	print(deleted)
	return new_data, new_features
Code Example #15
def extract_data_by_features(data, features, needed_features):
    needed_features_index = get_known_features_index(features, needed_features)
    new_data = np.ones((data.shape[0], len(needed_features)), dtype=np.int64)
    print(len(needed_features_index))
    print(new_data.shape)
    for i in range(len(needed_features)):
        new_data[:, i] = data[:, needed_features_index[i]]

    new_features = needed_features
    return new_data, new_features
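
extract_data_by_features simply projects the matrix onto the requested columns; because the buffer is created with dtype=np.int64, it only works once the columns hold numeric values. A hypothetical call on an already-digitised matrix:

# keep two third-party columns (names as constructed elsewhere on this page)
wanted = ["ThirdParty_Info_Period1_1", "ThirdParty_Info_Period1_2"]
sub_data, sub_features = extract_data_by_features(digited_data, features, wanted)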
Code Example #16
File: features_reduce.py  Project: CSJLOVEJX/DataPigs
def extract_data_by_features(data, features, needed_features):
	needed_features_index = get_known_features_index(features, needed_features)
	new_data = np.ones((data.shape[0], len(needed_features)), dtype=np.int64)
	print(len(needed_features_index))
	print(new_data.shape)
	for i in range(len(needed_features)):
		new_data[:, i] = data[:, needed_features_index[i]]

	new_features = needed_features
	return new_data, new_features
Code Example #17
def map_str_to_digit_with_experience(data, features, digited_special_str_features, \
         contain_special_features):
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME, \
           dir_name = "resultData/features_map")

    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(
        features, fixed_str_features)

    digited_special_str_features_index = get_known_features_index(features, \
               digited_special_str_features)
    contain_special_features_index = get_known_features_index(features, \
               contain_special_features)
    remember = list()
    for fea_pos in range(1, len(features)):
        # string-valued column that is not yet digitised and is among the ones we want to map
        if fea_pos in fixed_str_features_index and \
         fea_pos not in digited_special_str_features_index and \
         fea_pos in contain_special_features_index:
            # ListingInfo dates may be stored with the day first instead of the year; reverse them
            if features[fea_pos] == "ListingInfo" and int(
                    data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue
                    try:
                        data[i, fea_pos] = map_experience[features[fea_pos]][
                            data[i, fea_pos]]
                    except:
                        # no entry in the saved map for this value; print the
                        # offending raw value (repeating the failed lookup here
                        # would raise again inside the handler) and mark the row
                        if i < 50:
                            print(features[fea_pos])
                            print(map_experience[features[fea_pos]])
                            print(data[i, fea_pos])
                        remember.append(i)  # this row holds an unmapped value
    #print(remember)
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
Code Example #18
def compare_features_info2(data, features, key_features):
	fea_indexs = get_known_features_index(features, key_features)
	compare_result = OrderedDict()

	for user in range(data.shape[0]):
		# user_id = data[user, 0]
		combine_data = reduce(lambda x,y: str(x) + '_' + str(y), list(data[user, fea_indexs]))
		if combine_data not in compare_result.keys():
			compare_result[combine_data] = 0
		compare_result[combine_data] += 1
	return compare_result
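
The reduce call concatenates the selected values into one underscore-separated key (on Python 3 this needs functools.reduce, assumed imported above); a plain join is equivalent:

# equivalent to the reduce(...) expression above
combine_data = "_".join(str(v) for v in data[user, fea_indexs])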
Code Example #19
def one_features_info2(data, features, key_feature):
	fea_index = get_known_features_index(features, key_feature)

	user_value_info = dict()
	for user in range(data.shape[0]):
		value = list(data[user, fea_index])[0]
		if not value in user_value_info.keys():
			user_value_info[value] = 0
		user_value_info[value] += 1

	return user_value_info
Code Example #20
def one_features_info(data, features, label, key_feature):
	fea_index = get_known_features_index(features, key_feature)

	user_value_info = dict()
	for user in range(data.shape[0]):
		value = list(data[user, fea_index])[0]
		if not value in user_value_info.keys():
			user_value_info[value] = [0, 0]
		user_value_info[value][0] += 1
		if label[user] == 1:
			user_value_info[value][1] += 1
	return user_value_info
Code Example #21
def fill_all_missing(data, features, label = None):
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)
	# start from range(1, ...) because the first column is the user id, not a real feature
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label, indexs)
		# fea_val_cla[-1] appears to be the entry for the missing-value marker -1
		if fea_val_cla[-1]._present_num != 0:
			if fea_pos == 5:
				print(fea_val_cla)
			data = fill_the_missing(data, fea_pos, fea_val_cla, label)
	#write_to_deleted_features_area(np.array(deleted_feas))
	return data, features
Code Example #22
File: features_reduce.py  Project: CSJLOVEJX/DataPigs
def according_coefficient_variation_delete(data, features):
	waiting_to_delete = np.array(load_result("complex_value_features.csv"))
	waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size,))
	#print(waiting_to_delete)
	indexs = get_known_features_index(features, waiting_to_delete)
	coefficient_variation_info = OrderedDict()
	for fea_pos in indexs:
		try:
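			# scipy's coefficient of variation: std of the column divided by its mean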
			coefficient_variation_fea = stats.variation(data[:, fea_pos])
			coefficient_variation_info[features[fea_pos]] = coefficient_variation_fea
		except:
			pass
	return coefficient_variation_info
Code Example #23
def map_str_to_digit_with_experience(data, features, digited_special_str_features, \
									contain_special_features):
	map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME, \
								dir_name = "resultData/features_map")

	print(map_experience)
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	fixed_str_features_index = get_known_features_index(features, fixed_str_features)

	digited_special_str_features_index = get_known_features_index(features, \
												digited_special_str_features)
	contain_special_features_index = get_known_features_index(features, \
												contain_special_features)
	remember = list()
	for fea_pos in range(1, len(features)):
		# string-valued column that is not yet digitised and is among the ones we want to map
		if fea_pos in fixed_str_features_index and \
			fea_pos not in digited_special_str_features_index and \
			fea_pos in contain_special_features_index:
			# ListingInfo dates may be stored with the day first instead of the year; reverse them
			if features[fea_pos] == "ListingInfo" and int(data[0, fea_pos].split("/")[0]) < 1000:
				data = reverse_date(data, fea_pos)
			if features[fea_pos] in map_experience.keys():
				for i in range(len(data)):
					if data[i, fea_pos] == "-1":
						continue
					try:
						data[i, fea_pos] = map_experience[features[fea_pos]][data[i, fea_pos]]
					except:
						# no entry in the saved map for this value; print the
						# offending raw value (repeating the failed lookup here
						# would raise again inside the handler) and mark the row
						if i < 50:
							print(features[fea_pos])
							print(map_experience[features[fea_pos]])
							print(data[i, fea_pos])
						remember.append(i) # this row holds an unmapped value
	#print(remember)			
	data = np.delete(data, remember, 0)
	digited_data = convert_to_numerical(data, features)
	return digited_data
Code Example #24
def according_coefficient_variation_delete(data, features):
    waiting_to_delete = np.array(load_result("complex_value_features.csv"))
    waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size, ))
    #print(waiting_to_delete)
    indexs = get_known_features_index(features, waiting_to_delete)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
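            # scipy's coefficient of variation: std of the column divided by its mean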
            coefficient_variation_fea = stats.variation(data[:, fea_pos])
            coefficient_variation_info[
                features[fea_pos]] = coefficient_variation_fea
        except:
            pass
    return coefficient_variation_info
Code Example #25
def missed_instances_info(data, features, label, key_feature = None):
	missed_infos = OrderedDict()
	users_miss_info = OrderedDict()


	if key_feature is None:
		indexs = range(len(features))
		key_data = data
		key_feature = features
	else:
		indexs = get_known_features_index(features, key_feature)
		key_data = data[:, indexs]
		print(key_data.shape)
		print(key_data.shape[0])
		print(key_data.shape[1])

	missed_infos["missed_instances_sum"] = 0
	missed_infos["positive_sum"] = 0
	missed_infos["negitive_sum"] = 0

	for i in range(key_data.shape[0]):
		user_miss_count = 0
		missed_features_index = list()
		missed_features = list()
		flag = 0
		user_idx = data[i, 0]
		users_miss_info[user_idx] = OrderedDict()

		for j in range(key_data.shape[1]):
			if key_data[i, j] == -1:
				flag = 1
				user_miss_count += 1
				missed_features_index.append(indexs[j])
				missed_features.append(key_feature[j])
		# exist miss in this line
		if flag == 1:
			missed_infos["missed_instances_sum"] += 1
			if label[i] == 1:
				missed_infos["positive_sum"] += 1
			else:
				missed_infos["negitive_sum"] += 1

		# if this user has missing values, record the per-user statistics
		if user_miss_count:
			users_miss_info[user_idx]["missed_count"] = user_miss_count
			users_miss_info[user_idx]["miss_features_indexs"] = missed_features_index
			users_miss_info[user_idx]["missed_features"] = missed_features
			users_miss_info[user_idx]["label"] = label[i]

	return missed_infos, users_miss_info
Code Example #26
def map_str_to_digit(data, features, no_map_features, only_map_features = " ", label = " "):
	no_map_features_index = get_known_features_index(features, no_map_features)
	features_map_info = dict()

	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	fixed_str_features_index = get_known_features_index(features, fixed_str_features)

	only_map_features_index = range(len(features))
	if not only_map_features == " ":
		only_map_features_index = get_known_features_index(features, only_map_features)
	for fea_pos in range(1, len(features)):
		if not fea_pos in no_map_features_index and fea_pos in only_map_features_index:
			map_info = OrderedDict()
			#feature_map_info = OrderedDict()
			fea_val_cla = feature_value_class(data, fea_pos, label, fixed_str_features_index)
			# if this feature holds string values, map each distinct string to an integer code
			if fea_val_cla["str_feature"]:

				data, map_info = map_str_feature_to_value(data, fea_pos, fea_val_cla)
				features_map_info[features[fea_pos]] = map_info
				#features_map_info[].append([feature_map_info])

	digited_data = convert_to_numerical(data, features)
	return digited_data, features_map_info
Code Example #27
def sta_thirdParty_info(data, features, type_number, label = None):

	based_feature_name = "ThirdParty_Info_Period"
	solved_features = list()
	for i in range(1, 7):
		solved_feature = based_feature_name + str(i) + "_" + type_number[0]

		solved_features.append(solved_feature)

	indexs = get_known_features_index(features, solved_features)

	# print(solved_features)
	# print(indexs)

	users_sta_name = ["average", "Standard deviation", "coefficient of variation", "max", "min"]

	sta_name = "ms_type" + type_number[0]
	#print("sta_name: ", sta_name)
	users_stability = OrderedDict()

	for user in range(data.shape[0]):
		users_stability[user] = OrderedDict()
		if label is not None:
			users_stability[user]["label"] = label[user]
		calculate_data = data[user, indexs]
		users_stability[user]["value"] = list(calculate_data)

		users_stability[user][sta_name] = dict()
		fea_average = round(float(np.mean(calculate_data)), 3)
		fea_std = round(float(np.std(calculate_data)), 3)
		if fea_average == 0:
			fea_cv = 0
		else:
			fea_cv = round(float(fea_std / fea_average), 3)
		max_v = np.amax(calculate_data)
		min_v = np.amin(calculate_data)

		users_stability[user][sta_name]["average"] = fea_average
		users_stability[user][sta_name]["Standard deviation"] = fea_std
		users_stability[user][sta_name]["cv"] = fea_cv
		users_stability[user][sta_name]["max"] = max_v
		users_stability[user][sta_name]["min"] = min_v


	return users_stability
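
The returned dictionary is keyed by row number, with each user's summary stored under "ms_type<N>". A hypothetical call for ThirdParty type 1 (columns Period1_1 through Period6_1), assuming digitised data and a 1-D label vector:

stability = sta_thirdParty_info(digited_data, features, type_number=["1"], label=label)
print(stability[0]["ms_type1"]["cv"])  # coefficient of variation for the first row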
Code Example #28
def new_UserInfo_19_20(data, features):
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "UserInfo_19_20_wrong_province_city"
    new_add_feature = np.array([feature_name])

    feature_data = np.zeros((len(data), 1))
    new_features = np.concatenate((features, new_add_feature))

    for user in range(data.shape[0]):
        if str(-1) in list(data[user, fea_indexs]):
            feature_data[user, 0] = 1

    new_data = np.concatenate((data, feature_data), axis=1)

    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = solved_features)
    print(deleted)
    return new_data, new_features
Code Example #29
def new_UserInfo_19_20(data, features):
	solved_features = ["UserInfo_19", "UserInfo_20"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "UserInfo_19_20_wrong_province_city"
	new_add_feature = np.array([feature_name])

	feature_data = np.zeros((len(data), 1))
	new_features = np.concatenate((features, new_add_feature))

	for user in range(data.shape[0]):
		if str(-1) in list(data[user, fea_indexs]):
			feature_data[user, 0] = 1

	new_data = np.concatenate((data, feature_data), axis = 1)

	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = solved_features)
	print(deleted)
	return new_data, new_features
Code Example #30
def new_EI_1_2_3_4(data, features):
    key_features = [
        "Education_Info1", "Education_Info2", "Education_Info3",
        "Education_Info4"
    ]
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "combine_EI_1_2_3_4"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    ##### map rules ####
    map_to_zero = [["1", "AQ", "毕业", "T"], ["1", "A", "毕业", "V"],
                   ["1", "AN", "结业", "T"], ["1", "AM", "结业", "T"],
                   ["1", "B", "毕业", "AE"], ["1", "A", "结业", "T"],
                   ["1", "A", "毕业", "AR"]]
    map_to_one = [["1", "U", "毕业", "AE"], ["1", "AM", "毕业", "AR"],
                  ["1", "AM", "毕业", "V"], ["1", "AQ", "毕业", "F"],
                  ["1", "A", "毕业", "F"], ["1", "AN", "毕业", "T"],
                  ["1", "AQ", "毕业", "V"]]
    map_to_two = [["1", "AM", "毕业", "T"], ["1", "A", "毕业", "T"],
                  ["1", "AM", "毕业", "F"]]
    map_to_three = [["0", "E", "E", "E"]]

    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_1_2_3_4 = list(data[user, fea_indexs])
        if EI_1_2_3_4 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_1_2_3_4 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_1_2_3_4 in map_to_two:
            feature_data[user, 0] = 2
        elif EI_1_2_3_4 in map_to_three:
            feature_data[user, 0] = 3
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = key_features[1:])
    print(deleted)
    return new_data, new_features
Code Example #31
def compare_features_info(data, features, label, key_features):
	fea_indexs = get_known_features_index(features, key_features)
	compare_result = OrderedDict()
	compare_result["num_differ"] = 0
	compare_result["different_combine_info"] = dict()
	compare_result["num_differ_positive"] = 0
	compare_result["num_same"] = 0
	compare_result["same_combine_info"] = dict()
	compare_result["num_same_positive"] = 0
	compare_result["num_same_miss"] = 0
	for user in range(data.shape[0]):
		# user_id = data[user, 0]
		combine_data = reduce(lambda x,y: str(x) + '_' + str(y), list(data[user, fea_indexs]))
		province = list(data[user, fea_indexs])[0]
		pro_info = list(data[user, fea_indexs])[1]
		if not len(set(list(data[user, fea_indexs]))) == 1:
			compare_result["num_differ"] += 1
			if not combine_data in compare_result["different_combine_info"].keys():
				compare_result["different_combine_info"][combine_data] = [0, 0]

			compare_result["different_combine_info"][combine_data][0] += 1

			if label[user] == 1:
				compare_result["num_differ_positive"] += 1
				compare_result["different_combine_info"][combine_data][1] += 1
		else:
			compare_result["num_same"] += 1

			if not combine_data in compare_result["same_combine_info"].keys():
				compare_result["same_combine_info"][combine_data] = [0, 0]

			compare_result["same_combine_info"][combine_data][0] += 1

			if label[user] == 1:
				compare_result["num_same_positive"] += 1
				compare_result["same_combine_info"][combine_data][1] += 1

			if str(-1) == list(data[user, fea_indexs])[0]:
				compare_result["num_same_miss"] += 1
	return compare_result
Code Example #32
def new_UserInfo_11_12_13(data, features):
	solved_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "UserInfo_11_12_13_info"
	new_add_feature = np.array([feature_name])

	feature_data = np.zeros((len(data), 1))
	new_features = np.concatenate((features, new_add_feature))

	for user in range(data.shape[0]):
		combine_data = list(map(int, list(data[user, fea_indexs])))
		add = reduce(lambda x, y: x * 2 + y, combine_data)
		if add > 0:
			feature_data[user, 0] = add
		else:
			feature_data[user, 0] = 0

	new_data = np.concatenate((data, feature_data), axis = 1)

	print("extract from UserInfo 11 12 13")
	return new_data, new_features
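
The reduce step reads the three 0/1 flags of UserInfo_11, 12 and 13 as one binary number, so every combination gets its own code:

# reduce(lambda x, y: x * 2 + y, [1, 0, 1]) == (1*2 + 0)*2 + 1 == 5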
Code Example #33
def new_UserInfo_11_12_13(data, features):
    solved_features = ["UserInfo_11", "UserInfo_12", "UserInfo_13"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "UserInfo_11_12_13_info"
    new_add_feature = np.array([feature_name])

    feature_data = np.zeros((len(data), 1))
    new_features = np.concatenate((features, new_add_feature))

    for user in range(data.shape[0]):
        combine_data = list(map(int, list(data[user, fea_indexs])))
        add = reduce(lambda x, y: x * 2 + y, combine_data)
        if add > 0:
            feature_data[user, 0] = add
        else:
            feature_data[user, 0] = 0

    new_data = np.concatenate((data, feature_data), axis=1)

    print("extract from UserInfo 11 12 13")
    return new_data, new_features
Code Example #34
def correlation_between_properties(data, features):
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)

    title = list()
    title.append("features1")
    title.append("features2")
    title.append("calculate_method")
    title.append("cor")
    title.append("pval")
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = list()
            info_result.append(features[fea_pos])
            info_result.append(features[fea_pos_add])
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            # neither column is a string-valued feature
            if fea_pos not in indexs and fea_pos_add not in indexs:
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:  # at least one of the columns is a string-valued feature
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result,
                            "pearsonr_spearmanr_results.csv",
                            style="a+")
            if abs(cor) >= 0.86:
                save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", \
                        style = "a+")
Code Example #35
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all = True):
	
	fea_indexs = get_known_features_index(features, key_features)

	
	new_add_feature = np.array([feature_name])

	feature_data = np.zeros((len(data), 1))
	new_features = np.concatenate((features, new_add_feature))

	for user in range(data.shape[0]):
		if not len(set(list(data[user, fea_indexs]))) == 1:
			feature_data[user, 0] = 1

	new_data = np.concatenate((data, feature_data), axis = 1)

	delete_feas = key_features[1:]
	if deleted_all:
		delete_feas = key_features

	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = delete_feas)
	print(deleted)
	return new_data, new_features
Code Example #36
def save_features_info(data,
                       features,
                       label,
                       file_name,
                       dir_name="resultData"):

    file_path = os.path.join(os.getcwd(), dir_name, file_name)
    first_line = np.array(['features_name', 'str_feature', \
         'num_values', \
         'average|most_presentS',
         'postitive(average|most_present)', \
         'negitive(average|most_present)', \
         'num_positive', \
         'num_negitive', 'feature_value_info'])

    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)

    with open(file_path, "w", newline='') as csv_file:
        spamwriter = csv.writer(csv_file)
        spamwriter.writerow(first_line)

    from solve_data import feature_value_class, FeatureInData

    from collections import OrderedDict

    for fea_pos in range(1, len(features)):
        feature_info = list()
        feature_info.append(features[fea_pos])

        fea_val_cla = feature_value_class(data,
                                          fea_pos,
                                          label,
                                          fixed_str_features_index=indexs)
        feature_info.append(fea_val_cla["str_feature"])
        feature_info.append(fea_val_cla["num_of_value"])
        if fea_val_cla["str_feature"]:
            feature_info.append(fea_val_cla["most_presentS"])
            try:
                l = label[0, 0]
                feature_info.append(fea_val_cla["most_presentS_positive"])
                feature_info.append(fea_val_cla["most_presentS_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        else:
            feature_info.append(fea_val_cla["average"])
            try:
                l = label[0, 0]
                feature_info.append(fea_val_cla["average_positive"])
                feature_info.append(fea_val_cla["average_negitive"])
            except:
                feature_info.append("None")
                feature_info.append("None")
        try:
            l = label[0, 0]
            feature_info.append(fea_val_cla["num_positive"])
            feature_info.append(fea_val_cla["num_negitive"])
        except:
            feature_info.append("None")
            feature_info.append("None")
        for k, v in fea_val_cla.items():
            if isinstance(v, FeatureInData):
                value_info = OrderedDict()
                value_info["value"] = k
                value_info["present_num"] = v._present_num
                try:
                    l = label[0, 0]
                    value_info[
                        "respond_positive_num"] = v._respond_positive_num
                    value_info[
                        "respond_negitive_num"] = v._respond_negitive_num
                except:
                    pass
                feature_info.append(value_info)
        with open(file_path, "a+", newline='') as csv_file:
            spamwriter = csv.writer(csv_file)
            spamwriter.writerow(feature_info)
Code Example #37
def new_UserInfo_22_23_combine2(data, features):
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    ##### map rules ####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'], ['未婚', 'AD'],
                   ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'], ['-1', 'R'],
                   ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'], ['离婚', 'P'],
                   ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'], ['再婚', 'O'],
                   ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'], ['再婚', 'H'],
                   ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'], ['离婚', 'M'],
                   ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'], ['-1', 'AH'],
                   ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'], ['离婚', 'O'],
                   ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'], ['-1', 'AK']]

    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'], ['-1', '大学本科(简称“大学'],
                  ['已婚', '专科毕业']]

    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'], ['未婚', 'P'],
                  ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'], ['未婚', 'M'],
                  ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'], ['已婚', 'H'],
                  ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    none_finded_combine = OrderedDict()
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_22_23 = list(data[user, fea_indexs])
        if EI_22_23 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_22_23 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_22_23 in map_to_two:
            feature_data[user, 0] = 2
        elif EI_22_23 in map_to_three:
            feature_data[user, 0] = 3
        elif EI_22_23 in map_to_four:
            feature_data[user, 0] = 4
        elif EI_22_23 in map_to_five:
            feature_data[user, 0] = 5
        else:
            EI_22_23_str = reduce(lambda x, y: x + "_" + y, EI_22_23)
            if EI_22_23_str not in none_finded_combine.keys():
                none_finded_combine[EI_22_23_str] = list()
            none_finded_combine[EI_22_23_str].append(user)

    for EI_combine, users in none_finded_combine.items():
        EI_combine = EI_combine.split("_")
        # keep the "-1"/"-1" bucket and do not let the frequency buckets
        # below overwrite it
        if EI_combine[0] == "-1" and EI_combine[1] == "-1":
            feature_data[users, 0] = 5
        elif len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = key_features)
    print(deleted)
    return new_data, new_features
Code Example #38
def save_features_info(data, features, label, file_name, dir_name = "resultData"):

	file_path = os.path.join(os.getcwd(), dir_name, file_name)
	first_line = np.array(['features_name', 'str_feature', \
						'num_values', \
						'average|most_presentS',
						'postitive(average|most_present)', \
						'negitive(average|most_present)', \
						'num_positive', \
						'num_negitive', 'feature_value_info'])

	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)

	with open(file_path, "w", newline='') as csv_file:
		spamwriter = csv.writer(csv_file)
		spamwriter.writerow(first_line)

	from solve_data import feature_value_class, FeatureInData

	from collections import OrderedDict

	for fea_pos in range(1, len(features)):
		feature_info = list()
		feature_info.append(features[fea_pos])

		fea_val_cla = feature_value_class(data, fea_pos, label, fixed_str_features_index = indexs)
		feature_info.append(fea_val_cla["str_feature"])
		feature_info.append(fea_val_cla["num_of_value"])
		if fea_val_cla["str_feature"]:
			feature_info.append(fea_val_cla["most_presentS"])
			try:
				l = label[0, 0]
				feature_info.append(fea_val_cla["most_presentS_positive"])
				feature_info.append(fea_val_cla["most_presentS_negitive"])
			except:
				feature_info.append("None")
				feature_info.append("None")
		else:
			feature_info.append(fea_val_cla["average"])
			try:
				l = label[0, 0]
				feature_info.append(fea_val_cla["average_positive"])
				feature_info.append(fea_val_cla["average_negitive"])
			except:
				feature_info.append("None")
				feature_info.append("None")
		try:
			l = label[0, 0]
			feature_info.append(fea_val_cla["num_positive"])
			feature_info.append(fea_val_cla["num_negitive"])
		except:
			feature_info.append("None")
			feature_info.append("None")
		for k, v in fea_val_cla.items():
			if isinstance(v, FeatureInData):
				value_info = OrderedDict()
				value_info["value"] = k
				value_info["present_num"] = v._present_num
				try:
					l = label[0, 0]
					value_info["respond_positive_num"] = v._respond_positive_num
					value_info["respond_negitive_num"] = v._respond_negitive_num
				except:
					pass
				feature_info.append(value_info)
		with open(file_path, "a+", newline='') as csv_file:
			spamwriter = csv.writer(csv_file)
			spamwriter.writerow(feature_info)
Code Example #39
def new_UserInfo_22_23_combine2(data, features):
	key_features = ["UserInfo_22", "UserInfo_23"]
	print("combine2")
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "UserInfo_combine2_by_present_22_23"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules ####
	map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'], ['未婚', 'AJ'], 
					['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'], ['未婚', 'AP'], ['再婚', 'G'], 
					['未婚', 'R'], ['已婚', 'AL'], ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'], 
					['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'], ['-1', 'R'], 
					['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'], ['离婚', 'P'], ['已婚', 'Z'], 
					['初婚', 'G'], ['-1', 'K'], ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], 
					['已婚', '-1'], ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'], 
					['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'], ['-1', 'AH'], 
					['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'], ['离婚', 'O'], ['已婚', 'AC'], 
					['-1', 'H'], ['未婚', 'AC'], ['-1', 'AK']]

	map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'], ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]

	map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'], ['已婚', '大学本科(简称“大学'], 
					['已婚', 'M'], ['-1', 'Y'], ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], 
					['未婚', 'AI'], ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'], 
					['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
	map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'], ['未婚', 'AB'], ['未婚', 'G']]
	map_to_four = [['D', 'D']]
	map_to_five = [['-1', '-1']]
	none_finded_combine = OrderedDict()
	feature_data = np.ones((len(data), 1))
	for user in range(data.shape[0]):
		EI_22_23 = list(data[user, fea_indexs])
		if EI_22_23 in map_to_zero:
			feature_data[user, 0] = 0
		elif EI_22_23 in map_to_one:
			feature_data[user, 0] = 1
		elif EI_22_23 in map_to_two:
			feature_data[user, 0] = 2
		elif EI_22_23 in map_to_three:
			feature_data[user, 0] = 3
		elif EI_22_23 in map_to_four:
			feature_data[user, 0] = 4
		elif EI_22_23 in map_to_five:
			feature_data[user, 0] = 5
		else:
			EI_22_23_str = reduce(lambda x, y: x + "_" + y, EI_22_23)
			if EI_22_23_str not in none_finded_combine.keys():
				none_finded_combine[EI_22_23_str] = list()
			none_finded_combine[EI_22_23_str].append(user)

	for EI_combine, users in none_finded_combine.items():
		EI_combine = EI_combine.split("_")
		# keep the "-1"/"-1" bucket and do not let the frequency buckets
		# below overwrite it
		if EI_combine[0] == "-1" and EI_combine[1] == "-1":
			feature_data[users, 0] = 5
		elif len(users) < 10:
			feature_data[users, 0] = 0
		elif len(users) < 20:
			feature_data[users, 0] = 1
		elif len(users) < 100:
			feature_data[users, 0] = 2
		elif len(users) < 1000:
			feature_data[users, 0] = 3
		else:
			feature_data[users, 0] = 4

	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = key_features)
	print(deleted)
	return new_data, new_features