コード例 #1
0
def new_WI_19(data, features):
    """Replace the categorical WeblogInfo_19 column with an integer-coded copy.

    Known letter codes map to 0-5; any other value falls into category 6.
    The raw WeblogInfo_19 column is deleted after the coded column is appended.
    """
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "WeblogInfo_19_info_(cat)"
    new_features = np.concatenate((features, np.array([feature_name])))

    # letter code -> category; anything unseen becomes 6
    code_map = {"H": 0, "G": 1, "J": 2, "E": 3, "F": 4, "D": 5}

    feature_data = np.zeros((len(data), 1))
    source_col = fea_indexs[0]
    for row in range(data.shape[0]):
        feature_data[row, 0] = code_map.get(data[row, source_col], 6)

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)

    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
コード例 #2
0
def new_EI_5_6_7_8(data, features):
	"""Combine Education_Info5-8 into one categorical feature (0-3).

	Each row's 4-tuple of values is looked up in the hand-tuned rule
	tables below; unmatched combinations fall into category 3.
	Education_Info6/7/8 are deleted afterwards (Education_Info5 is kept).
	"""
	key_features = ["Education_Info5", "Education_Info6", "Education_Info7", "Education_Info8"]
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "combine_EI_5_6_7_8"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules ####
	map_to_zero = [["1", "AQ", "-1", "T"], ["1", "AQ", "-1", "80"], ["1", "U", "-1", "-1"],
					["1", "AQ", "-1", "-1"], ["1", "B", "-1", "-1"], ["1", "A", "-1", "-1"],
					["1", "AM", "-1", "80"], ["1", "A", "-1", "F"], ["1", "B", "-1", "AE"],
					["1", "U", "-1", "AE"], ["1", "AQ", "-1", "V"], ["1", "AM", "-1", "V"]]

	# BUG FIX: the original listed ["1", "AM", "-1", "-1"] twice; the
	# duplicate was redundant for the membership test and has been removed.
	map_to_one = [["1", "A", "-1", "T"], ["1", "AQ", "-1", "F"], ["1", "AM", "-1", "-1"],
					["1", "AM", "-1", "F"], ["1", "AM", "-1", "T"]]
	map_to_two = [["0", "E", "E", "E"]]

	feature_data = np.ones((len(data), 1))
	for user in range(data.shape[0]):
		EI_5_6_7_8 = list(data[user, fea_indexs])
		if EI_5_6_7_8 in map_to_zero:
			feature_data[user, 0] = 0
		elif EI_5_6_7_8 in map_to_one:
			feature_data[user, 0] = 1
		elif EI_5_6_7_8 in map_to_two:
			feature_data[user, 0] = 2
		else:
			feature_data[user, 0] = 3
	new_data = np.concatenate((data, feature_data), axis=1)
	# Keep Education_Info5 as an independent feature; drop the rest.
	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list=key_features[1:])
	print(deleted)
	return new_data, new_features
コード例 #3
0
def new_UserInfo_18(data, features):
	"""Bin the UserInfo_18 age column into 5 ordinal buckets (0-4).

	Bucket edges: <22, <30, <40, <50, >=50.  The raw column is removed
	after the binned feature has been appended.
	"""
	solved_features = ["UserInfo_18"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "UserInfo_18_bined"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	feature_data = np.zeros((len(data), 1))
	for user in range(data.shape[0]):
		# BUG FIX: the original compared strings lexicographically
		# ('"9" < "22"' is False), which mis-bins one- and three-digit
		# ages.  Compare numerically instead.
		try:
			user_age = float(data[user, fea_indexs[0]])
		except ValueError:
			user_age = -1.0  # non-numeric / missing marker -> lowest bin
		if user_age < 22:
			feature_data[user, 0] = 0
		elif user_age < 30:
			feature_data[user, 0] = 1
		elif user_age < 40:
			feature_data[user, 0] = 2
		elif user_age < 50:
			feature_data[user, 0] = 3
		else:
			feature_data[user, 0] = 4

	new_data = np.concatenate((data, feature_data), axis=1)

	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list=solved_features)
	print(deleted)
	return new_data, new_features
コード例 #4
0
def new_UserInfo_18(data, features):
    """Bin the UserInfo_18 age column into 5 ordinal buckets (0-4).

    Bucket edges: <22, <30, <40, <50, >=50.  The raw column is removed
    after the binned feature has been appended.
    """
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "UserInfo_18_bined"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # BUG FIX: the original compared strings lexicographically
        # ('"9" < "22"' is False), which mis-bins one- and three-digit
        # ages.  Compare numerically instead.
        try:
            user_age = float(data[user, fea_indexs[0]])
        except ValueError:
            user_age = -1.0  # non-numeric / missing marker -> lowest bin
        if user_age < 22:
            feature_data[user, 0] = 0
        elif user_age < 30:
            feature_data[user, 0] = 1
        elif user_age < 40:
            feature_data[user, 0] = 2
        elif user_age < 50:
            feature_data[user, 0] = 3
        else:
            feature_data[user, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)

    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
コード例 #5
0
def replace_miss(data, features, label = "", for_train = True):
	"""Fill missing values feature-by-feature; when training, also drop
	features with too many missing entries and save the results to disk.

	Parameters:
		data: instance matrix; column 0 is the instance id.
		features: feature names aligned with the columns of ``data``.
		label: target labels, forwarded to the per-feature statistics helper.
		for_train: when True, flagged features are deleted and saved.

	Returns:
		(new_data, new_features) after filling / deletion.
	"""
	delete_fea_index = []  # column positions flagged for deletion (mutated in place by the helper)
	missing_num = []       # missing counts for those columns (mutated in place by the helper)
	new_data = data.copy()
	new_features = features.copy()

	if for_train:
		SAVE_DIR = "resultData"
	else:
		SAVE_DIR = "resultData/test/"
	# Start at 1: column 0 is the instance id and needs no filling.
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label)
		# NOTE(review): assumes fea_val_cla[-1]._present_num counts missing
		# occurrences for the column — confirm against feature_value_class.
		if not fea_val_cla[-1]._present_num == 0:
			new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
												delete_fea_index, missing_num)
	if for_train:
		new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
															delete_fea_pos = delete_fea_index)

		# NOTE(review): unlike the call below, this save_result does not pass
		# dir_name — confirm the default output directory is intended.
		save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)

		save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
					np.array(deleted_feas), dir_name = SAVE_DIR)


	return new_data, new_features
コード例 #6
0
def pipeline_for_features_solved(for_train=True, saved_area="resultData"):
    """End-to-end feature-processing pipeline for the PPD data set.

    Loads the raw train/test master CSV, fills missing values, removes
    the features recorded as deleted during training, then runs the
    UserInfo, WeblogInfo, ThirdParty and log/update feature packages in
    order.  Each package both transforms the matrix and (internally)
    saves intermediate results under ``saved_area``.

    Parameters:
        for_train: selects the training master file when True, the test
            master file otherwise.
        saved_area: directory name forwarded to the per-package savers.

    Returns:
        (data, features) after all packages have run.
    """
    if for_train:
        print("**************** Train ************************")
        data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
    else:
        print("**************** Test ************************")
        data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
    data, features, label = load_data_for_solve(data_file_name, for_train)
    data, features = replace_miss(data, features, label, for_train)

    # Apply the training-time deletions so train/test layouts match.
    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)
    print(deleted)

    data, features = solve_user_info_package(data,
                                             features,
                                             saved_dir=saved_area)

    data, features = solve_weblog_info_package(data,
                                               features,
                                               saved_dir=saved_area)

    data, features = solve_thirdparty_info_package(data,
                                                   features,
                                                   saved_dir=saved_area)

    data, features = extract_log_update_package(data, features, for_train)

    return data, features
コード例 #7
0
def new_WI_19(data, features):
	"""Replace the categorical WeblogInfo_19 column with an integer-coded copy.

	Known letter codes map to 0-5; any other value falls into category 6.
	The raw WeblogInfo_19 column is deleted after the coded column is appended.
	"""
	solved_features = ["WeblogInfo_19"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "WeblogInfo_19_info_(cat)"
	new_features = np.concatenate((features, np.array([feature_name])))

	# letter code -> category; anything unseen becomes 6
	code_map = {"H": 0, "G": 1, "J": 2, "E": 3, "F": 4, "D": 5}

	feature_data = np.zeros((len(data), 1))
	source_col = fea_indexs[0]
	for row in range(data.shape[0]):
		feature_data[row, 0] = code_map.get(data[row, source_col], 6)

	new_data = np.concatenate((data, feature_data), axis=1)
	new_data, new_features, deleted = delete_features(
		new_data, new_features, delete_feas_list=solved_features)

	print("WeblogInfo_19 solved")
	print(deleted)
	return new_data, new_features
コード例 #8
0
def replace_miss(data, features, label = "", for_train = True, is_round_two = False):
	"""Fill missing values feature-by-feature; in round-one training, also
	drop features whose missing count exceeds 2/3 of the rows and save the
	deletion record.

	Parameters:
		data: instance matrix; column 0 is the instance id.
		features: feature names aligned with the columns of ``data``.
		label: target labels, forwarded to the per-feature statistics helper.
		for_train: training data when True, test data otherwise.
		is_round_two: selects the round-two result directories and skips
			the training-time deletion step.

	Returns:
		(new_data, new_features) after filling / deletion.
	"""
	delete_fea_index = []  # column positions flagged for deletion (mutated in place by the helper)
	missing_num = []       # missing counts for those columns (mutated in place by the helper)
	new_data = data.copy()
	new_features = features.copy()

	if for_train:
		if is_round_two:
			SAVE_DIR = "resultData_two"
		else:
			SAVE_DIR = "resultData"
	else:
		if is_round_two:
			SAVE_DIR = "resultData_two/test/"
		else:
			SAVE_DIR = "resultData/test/"
	# A feature is "too missing" when more than 2/3 of its rows are missing.
	threshold = int(data.shape[0] * 2 / 3)
	print("threshold: ", threshold)
	# Start at 1: column 0 is the instance id and needs no filling.
	for fea_pos in range(1, len(features)):
		fea_val_cla = feature_value_class(data, fea_pos, label)
		# NOTE(review): assumes fea_val_cla[-1]._present_num counts missing
		# occurrences for the column — confirm against feature_value_class.
		if not fea_val_cla[-1]._present_num == 0:
			new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
												delete_fea_index, missing_num, threshold)
	# Deletion happens only for round-one training; round two reuses the
	# round-one deletion record instead.
	if for_train and not is_round_two:
		new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
															delete_fea_pos = delete_fea_index)
		print("delete while training: ", deleted_feas)
		#save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)

		save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
					np.array(deleted_feas), dir_name = SAVE_DIR)


	return new_data, new_features
コード例 #9
0
def view_each_features(data, features):
    """Save a scatter plot of every feature's values across instances.

    One PNG per feature is written to view_data_area/csj/; string-style
    features get a "(strN)" prefix in the file name so they can be told
    apart from numeric ones.
    """
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)

    x = range(len(data))

    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            # string-style feature: mark it in the file name
            file_path = "view_data_area/csj/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            # BUG FIX: the original emitted an unbalanced ")" here
            # (e.g. "12)Foo.png"); mirror the "(N)" form of the branch above.
            file_path = "view_data_area/csj/" + "(" + str(
                fea_pos) + ")" + feature_name + ".png"
        y = data[:, fea_pos]
        plt.scatter(x, y)

        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(feature_name + " value " + "distributed " + "in instances")
        plt.ylim(-2)  # fix only the lower y-limit; top stays autoscaled

        plt.savefig(file_path)
        plt.close()
コード例 #10
0
File: view_data.py  Project: CSJLOVEJX/DataPigs
def view_each_features(data, features):
	"""Save a scatter plot of every feature's values across instances.

	One PNG per feature is written to view_data_area/csj/; string-style
	features get a "(strN)" prefix in the file name so they can be told
	apart from numeric ones.
	"""
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	str_style_features = np.array(load_result("str_features.csv")[0])
	str_features_index = get_known_features_index(features, str_style_features)

	x = range(len(data))

	for fea_pos in range(len(features)):
		feature_name = features[fea_pos]
		if fea_pos in str_features_index:
			# string-style feature: mark it in the file name
			file_path = "view_data_area/csj/" + "(str" + str(fea_pos) + ")" + feature_name +  ".png"
		else:
			# BUG FIX: the original emitted an unbalanced ")" here
			# (e.g. "12)Foo.png"); mirror the "(N)" form of the branch above.
			file_path = "view_data_area/csj/" + "(" + str(fea_pos) + ")" + feature_name +  ".png"
		y = data[:, fea_pos]
		plt.scatter(x, y)

		plt.xlabel("instances(30000)")
		plt.ylabel("value")
		plt.title(feature_name + " value " + "distributed " + "in instances")
		plt.ylim(-2)  # fix only the lower y-limit; top stays autoscaled

		plt.savefig(file_path)
		plt.close()
コード例 #11
0
def new_EI_1_2_3_4(data, features):
	"""Combine Education_Info1-4 into one categorical feature (0-3).

	Each row's 4-tuple is looked up in the hand-tuned rule tables; a tuple
	matching no table keeps the initial value 1 and an error is printed.
	Education_Info2/3/4 are deleted afterwards (Education_Info1 is kept).
	"""
	key_features = ["Education_Info1", "Education_Info2", "Education_Info3", "Education_Info4"]
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "combine_EI_1_2_3_4"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules ####
	map_to_zero = [["1", "AQ", "毕业", "T"], ["1", "A", "毕业", "V"], ["1", "AN", "结业", "T"],
					["1", "AM", "结业", "T"], ["1", "B", "毕业", "AE"], ["1", "A", "结业", "T"],
					["1", "A", "毕业", "AR"]]
	map_to_one = [["1", "U", "毕业", "AE"], ["1", "AM", "毕业", "AR"], ["1", "AM", "毕业", "V"],
					["1", "AQ", "毕业", "F"], ["1", "A", "毕业", "F"], ["1", "AN", "毕业", "T"],
					["1", "AQ", "毕业", "V"]]
	map_to_two = [["1", "AM", "毕业", "T"], ["1", "A", "毕业", "T"], ["1", "AM", "毕业", "F"]]
	map_to_three = [["0", "E", "E", "E"]]

	# (rule table, category) pairs checked in priority order
	rule_tables = ((map_to_zero, 0), (map_to_one, 1), (map_to_two, 2), (map_to_three, 3))

	feature_data = np.ones((len(data), 1))
	for row in range(data.shape[0]):
		combo = list(data[row, fea_indexs])
		for table, category in rule_tables:
			if combo in table:
				feature_data[row, 0] = category
				break
		else:
			print("error!!!!")
	new_data = np.concatenate((data, feature_data), axis=1)
	new_data, new_features, deleted = delete_features(
		new_data, new_features, delete_feas_list=key_features[1:])
	print(deleted)
	return new_data, new_features
コード例 #12
0
def new_UserInfo_differ(data,
                        features,
                        key_features,
                        feature_name,
                        deleted_all=True):
    """Append a 0/1 feature flagging rows whose key_features values differ.

    The new column is 1 when a row's values across ``key_features`` are
    not all identical.  Depending on ``deleted_all``, either every key
    feature or all but the first is removed afterwards.
    """
    fea_indexs = get_known_features_index(features, key_features)

    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))

    for row in range(data.shape[0]):
        distinct_values = set(list(data[row, fea_indexs]))
        if len(distinct_values) != 1:
            feature_data[row, 0] = 1

    new_data = np.concatenate((data, feature_data), axis=1)

    delete_feas = key_features if deleted_all else key_features[1:]

    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=delete_feas)
    print(deleted)
    return new_data, new_features
コード例 #13
0
def new_WI_20_by_present(data, features):
    """Bucket WeblogInfo_20 into 6 categories by hand-picked value groups,
    falling back to frequency-based bucketing for unseen values.

    The raw WeblogInfo_20 column is deleted after the new categorical
    column has been appended.
    """
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    # value -> list of row indices, for values not covered by the maps below
    none_finded_combine = OrderedDict()
    feature_data = np.zeros((len(data), 1))
    map_to_zero = [
        'F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10', 'F11', 'O',
        'C13', 'I6', 'C16', 'I7', 'I10'
    ]
    map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
    map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
    map_to_three = ['I3', 'U', 'C21', 'I4']
    map_to_four = ['I5']
    map_to_five = ['-1']  # missing marker
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        if fea_value in map_to_zero:
            feature_data[user, 0] = 0
        elif fea_value in map_to_one:
            feature_data[user, 0] = 1
        elif fea_value in map_to_two:
            feature_data[user, 0] = 2
        elif fea_value in map_to_three:
            feature_data[user, 0] = 3
        elif fea_value in map_to_four:
            feature_data[user, 0] = 4
        elif fea_value in map_to_five:
            feature_data[user, 0] = 5
        else:
            # collect unmapped values; they are bucketed by frequency below
            if fea_value not in none_finded_combine.keys():
                none_finded_combine[fea_value] = list()
            none_finded_combine[fea_value].append(user)

    for fea_value, users in none_finded_combine.items():
        # BUG FIX: the original tested fea_value[0] == "-1" (first
        # character against a two-character string — always False) and
        # then unconditionally fell through to the frequency bins.
        # Compare the whole value and make the chain exclusive.
        if fea_value == "-1":
            feature_data[users, 0] = 5
        elif len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
            delete_feas_list=solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
コード例 #14
0
def extract_log_update_package(data, features, for_train = True):
	"""Drop id/date columns, digitize the matrix, then append the
	log-derived and update-derived features."""
	data, features, _removed = delete_features(
		data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)

	data, features = add_features_from_log(data, features, for_train)
	data, features = add_features_from_update(data, features, for_train)

	return data, features
コード例 #15
0
def extract_log_update_package(data, features, for_train=True):
    """Drop id/date columns, digitize the matrix, then append the
    log-derived and update-derived features."""
    data, features, _removed = delete_features(data,
                                               features,
                                               delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)

    data, features = add_features_from_log(data, features, for_train)
    data, features = add_features_from_update(data, features, for_train)

    return data, features
コード例 #16
0
def new_WI_20_by_present(data, features):
	"""Bucket WeblogInfo_20 into 6 categories by hand-picked value groups,
	falling back to frequency-based bucketing for unseen values.

	The raw WeblogInfo_20 column is deleted after the new categorical
	column has been appended.
	"""
	solved_features = ["WeblogInfo_20"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "WeblogInfo_20_present_info_(cat)"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))
	# value -> list of row indices, for values not covered by the maps below
	none_finded_combine = OrderedDict()
	feature_data = np.zeros((len(data), 1))
	map_to_zero = ['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10', 'F11', 'O', 'C13', 'I6', 'C16', 'I7', 'I10']
	map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
	map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
	map_to_three = ['I3', 'U', 'C21', 'I4']
	map_to_four = ['I5']
	map_to_five = ['-1']  # missing marker
	for user in range(data.shape[0]):
		fea_value = data[user, fea_indexs[0]]
		if fea_value in map_to_zero:
			feature_data[user, 0] = 0
		elif fea_value in map_to_one:
			feature_data[user, 0] = 1
		elif fea_value in map_to_two:
			feature_data[user, 0] = 2
		elif fea_value in map_to_three:
			feature_data[user, 0] = 3
		elif fea_value in map_to_four:
			feature_data[user, 0] = 4
		elif fea_value in map_to_five:
			feature_data[user, 0] = 5
		else:
			# collect unmapped values; they are bucketed by frequency below
			if fea_value not in none_finded_combine.keys():
				none_finded_combine[fea_value] = list()
			none_finded_combine[fea_value].append(user)

	for fea_value, users in none_finded_combine.items():
		# BUG FIX: the original tested fea_value[0] == "-1" (first
		# character against a two-character string — always False) and
		# then unconditionally fell through to the frequency bins.
		# Compare the whole value and make the chain exclusive.
		if fea_value == "-1":
			feature_data[users, 0] = 5
		elif len(users) < 20:
			feature_data[users, 0] = 0
		elif len(users) < 100:
			feature_data[users, 0] = 1
		elif len(users) < 1000:
			feature_data[users, 0] = 2
		elif len(users) < 5000:
			feature_data[users, 0] = 3
		else:
			feature_data[users, 0] = 4

	new_data = np.concatenate((data, feature_data), axis=1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
 								delete_feas_list=solved_features)
	print("WeblogInfo_20 solved present")
	print(deleted)
	return new_data, new_features
コード例 #17
0
def remove_thirdparty6(data, features):
	"""Remove the ThirdParty_Info_Period6 columns (as located by
	find_featuers_index) and report what was deleted."""
	target = "ThirdParty_Info_Period6"
	period6_positions = find_featuers_index(target, features)

	new_data, new_features, deleted = delete_features(
		data, features, delete_fea_pos=period6_positions)

	print("ThirdParty_Info_Period6 all removed")
	print(deleted)
	return new_data, new_features
コード例 #18
0
def remove_thirdparty6(data, features):
    """Remove the ThirdParty_Info_Period6 columns (as located by
    find_featuers_index) and report what was deleted."""
    target = "ThirdParty_Info_Period6"
    period6_positions = find_featuers_index(target, features)

    new_data, new_features, deleted = delete_features(
        data, features, delete_fea_pos=period6_positions)

    print("ThirdParty_Info_Period6 all removed")
    print(deleted)
    return new_data, new_features
コード例 #19
0
File: main.py  Project: CSJLOVEJX/DataPigs
def pipeline_for_features_solved(for_train = True, is_round_two = False):
	"""End-to-end feature-processing pipeline with round-one/round-two support.

	Selects the master CSV and the result directory from the two flags,
	fills missing values, (for test or round-two data) re-applies the
	round-one training deletions, then runs the UserInfo, WeblogInfo,
	ThirdParty and log/update feature packages in order and saves the
	final matrix.

	Parameters:
		for_train: training data when True, test data otherwise.
		is_round_two: selects the round-two files/directories.

	Returns:
		(data, features) on success; 0 for the unsupported
		round-one-test combination.
	"""
	# Round-one test data is explicitly unsupported by this pipeline.
	if not for_train and not is_round_two:
		print("I f**k your mother, cao ni ma de !! SB!!!!!")
		return 0
	if for_train:
		print("**************** Train ************************")
		if is_round_two:
			print("******* Round Two *********")
			data_file_name = "Kesci_Master_9w_gbk_3_2.csv"
			saved_area = "resultData_two"
		else:
			data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
			saved_area = "resultData"
	else:
		print("**************** Test ************************")
		if is_round_two:
			print("******* Round Two *********")
			data_file_name = "Kesci_Master_9w_gbk_1_test_set.csv"
			saved_area = "resultData_two/test"
		else:
			data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
			saved_area = "resultData/test"

	data, features, label = load_data_for_solve(data_file_name, for_train, is_round_two)
	data, features = replace_miss(data, features, label, for_train, is_round_two)

	# Re-apply the deletions recorded during round-one training so the
	# layouts match.
	if not for_train or is_round_two:
		print("all deleted: ")
		# NOTE(review): is_round_two is hard-coded to False here — the
		# round-one deletion record is always loaded; confirm intended.
		deleted_features_in_train = load_all_deleted_features_during_train(is_round_two = False, deleted_features_file_label = "deleted_")
		#print(deleted_features_in_train)
		data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)
		print(deleted)

	data, features = solve_user_info_package(data, features, saved_dir = saved_area)
	#save_result(data, "after_solve_user_info.csv", features, dir_name = saved_area)
	data, features = solve_weblog_info_package(data, features, saved_dir = saved_area)
	# The correlation-based weblog deletion is computed only in round-one
	# training; other runs reuse its saved record.
	if for_train and not is_round_two:
		data, features = deleted_web_log_features(data, features, saved_dir = saved_area)

	data, features = solve_thirdparty_info_package(data, features, saved_dir = saved_area)


	data, features = extract_log_update_package(data, features, for_train, is_round_two)

	save_result(data, "data_after_features_processed.csv", features, dir_name = saved_area)

	print("****** all finished *********")
	print("size: (data, features)")
	print(data.shape)

	return data, features
コード例 #20
0
def solve_thirdparty_info_package(data, features, saved_dir = "resultData/"):
	"""Run the ThirdParty_Info feature steps in sequence and save the result."""
	data, features, _removed = delete_features(
		data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)

	# ThirdParty transforms, applied in their original order
	data, features = sta_start_missing_period(data, features)
	data, features = remove_thirdparty6(data, features)
	data, features = fill_thirdParty_miss(data, features)
	data, features = third_party_stable(data, features)
	data, features = third_party_level(data, features)

	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name=saved_dir)
	return data, features
コード例 #21
0
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    """Run the ThirdParty_Info feature steps in sequence and save the result."""
    data, features, _removed = delete_features(data,
                                               features,
                                               delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)

    # ThirdParty transforms, applied in their original order
    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)
    data, features = fill_thirdParty_miss(data, features)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)

    save_result(data, "data_after_thirdparty_solved.csv", features, dir_name=saved_dir)
    return data, features
コード例 #22
0
def new_UserInfo_24_resident_level(data, features):
	"""Append a digit-coded UserInfo_24 residence-detail level, then drop
	the raw UserInfo_24 column."""
	resident_detail_level = ["UserInfo_24"]
	digited_residence_data = digit_resident_features(data, features, resident_detail_level)

	new_features = np.concatenate((features, np.array(["UserInfo_24_resident_detail_level"])))

	# pull the digitized column out as an (n, 1) matrix
	source_col = np.where(features == resident_detail_level)[0][0]
	feature_data = digited_residence_data[:, source_col]
	feature_data = feature_data.reshape((feature_data.size, 1))

	new_data = np.concatenate((data, feature_data), axis=1)
	new_data, new_features, deleted = delete_features(
		new_data, new_features, delete_feas_list=resident_detail_level)

	print("UserInfo_24_resident_detail_level" + " solved")
	print(deleted)
	return new_data, new_features
コード例 #23
0
def new_UserInfo_19_20(data, features):
	"""Append a 0/1 flag for rows whose UserInfo_19/20 pair contains the
	"-1" missing marker, then drop both raw columns."""
	solved_features = ["UserInfo_19", "UserInfo_20"]
	fea_indexs = get_known_features_index(features, solved_features)

	feature_name = "UserInfo_19_20_wrong_province_city"
	new_features = np.concatenate((features, np.array([feature_name])))
	feature_data = np.zeros((len(data), 1))

	missing_marker = str(-1)
	for row in range(data.shape[0]):
		if missing_marker in list(data[row, fea_indexs]):
			feature_data[row, 0] = 1

	new_data = np.concatenate((data, feature_data), axis=1)

	new_data, new_features, deleted = delete_features(
		new_data, new_features, delete_feas_list=solved_features)
	print(deleted)
	return new_data, new_features
コード例 #24
0
def new_UserInfo_19_20(data, features):
    """Append a 0/1 flag for rows whose UserInfo_19/20 pair contains the
    "-1" missing marker, then drop both raw columns."""
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)

    feature_name = "UserInfo_19_20_wrong_province_city"
    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))

    missing_marker = str(-1)
    for row in range(data.shape[0]):
        if missing_marker in list(data[row, fea_indexs]):
            feature_data[row, 0] = 1

    new_data = np.concatenate((data, feature_data), axis=1)

    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
コード例 #25
0
def new_EI_1_2_3_4(data, features):
    """Combine Education_Info1-4 into one categorical feature (0-3).

    Each row's 4-tuple is looked up in the hand-tuned rule tables; a
    tuple matching no table keeps the initial value 1 and an error is
    printed.  Education_Info2/3/4 are deleted afterwards.
    """
    key_features = [
        "Education_Info1", "Education_Info2", "Education_Info3",
        "Education_Info4"
    ]
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "combine_EI_1_2_3_4"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    ##### map rules ####
    map_to_zero = [["1", "AQ", "毕业", "T"], ["1", "A", "毕业", "V"],
                   ["1", "AN", "结业", "T"], ["1", "AM", "结业", "T"],
                   ["1", "B", "毕业", "AE"], ["1", "A", "结业", "T"],
                   ["1", "A", "毕业", "AR"]]
    map_to_one = [["1", "U", "毕业", "AE"], ["1", "AM", "毕业", "AR"],
                  ["1", "AM", "毕业", "V"], ["1", "AQ", "毕业", "F"],
                  ["1", "A", "毕业", "F"], ["1", "AN", "毕业", "T"],
                  ["1", "AQ", "毕业", "V"]]
    map_to_two = [["1", "AM", "毕业", "T"], ["1", "A", "毕业", "T"],
                  ["1", "AM", "毕业", "F"]]
    map_to_three = [["0", "E", "E", "E"]]

    # (rule table, category) pairs checked in priority order
    rule_tables = ((map_to_zero, 0), (map_to_one, 1), (map_to_two, 2),
                   (map_to_three, 3))

    feature_data = np.ones((len(data), 1))
    for row in range(data.shape[0]):
        combo = list(data[row, fea_indexs])
        for table, category in rule_tables:
            if combo in table:
                feature_data[row, 0] = category
                break
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
コード例 #26
0
def new_UserInfo_24_resident_level(data, features):
    """Append a digit-coded UserInfo_24 residence-detail level, then drop
    the raw UserInfo_24 column."""
    resident_detail_level = ["UserInfo_24"]
    digited_residence_data = digit_resident_features(data, features,
                                                     resident_detail_level)

    new_features = np.concatenate(
        (features, np.array(["UserInfo_24_resident_detail_level"])))

    # pull the digitized column out as an (n, 1) matrix
    source_col = np.where(features == resident_detail_level)[0][0]
    feature_data = digited_residence_data[:, source_col]
    feature_data = feature_data.reshape((feature_data.size, 1))

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(
        new_data, new_features, delete_feas_list=resident_detail_level)

    print("UserInfo_24_resident_detail_level" + " solved")
    print(deleted)
    return new_data, new_features
コード例 #27
0
def deleted_web_log_features(data, features, saved_dir = "result"):
	"""Choose WeblogInfo features to delete (strongly-correlated ones plus a
	hand-picked list), record both choices to CSV, and delete them.

	NOTE(review): saved_dir is accepted but never used — both save_result
	calls rely on their default directory; confirm this is intended.

	Returns:
		(data, features) with the selected WeblogInfo columns removed.
	"""
	from create_new_features import find_featuers_index
	features_name = "WeblogInfo"
	fea_indexs = find_featuers_index(features_name, features)
	# print(fea_indexs)
	weblog_data = data[:, fea_indexs]
	weblog_features = features[fea_indexs]

	# Compute pairwise correlations, then derive the deletion list from them.
	correlation_between_properties(weblog_data, weblog_features)
	delete_result = according_properties_correlation_delete()
	#save_result(data, file_name, features, style, dir_name)
	save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv")

	# Hand-picked WeblogInfo columns to drop regardless of correlation.
	weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
						"WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
						"WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"]
	save_result(weblog_delete_needed, "deleted_useless_weblog.csv")

	delete_result.extend(weblog_delete_needed)
	
	data, features, deleted = delete_features(data, features, \
											delete_feas_list = delete_result)
	print("Train delete(weblog) : ", deleted)
	return data, features
コード例 #28
0
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all = True):
	"""Append a 0/1 feature flagging rows whose key_features values differ.

	The new column is 1 when a row's values across ``key_features`` are
	not all identical.  Depending on ``deleted_all``, either every key
	feature or all but the first is removed afterwards.
	"""
	fea_indexs = get_known_features_index(features, key_features)

	new_features = np.concatenate((features, np.array([feature_name])))
	feature_data = np.zeros((len(data), 1))

	for row in range(data.shape[0]):
		distinct_values = set(list(data[row, fea_indexs]))
		if len(distinct_values) != 1:
			feature_data[row, 0] = 1

	new_data = np.concatenate((data, feature_data), axis=1)

	delete_feas = key_features if deleted_all else key_features[1:]

	new_data, new_features, deleted = delete_features(
		new_data, new_features, delete_feas_list=delete_feas)
	print(deleted)
	return new_data, new_features
コード例 #29
0
# @Author  : chensijia ([email protected])
# @Version : 0.0.0
# @Style   : Python3.5
#
# @Description: 


from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit
from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train
from solve_data import delete_features
import numpy as np



if __name__ == '__main__':
	# Test-set preprocessing pipeline: replay on the test data the same
	# cleaning steps that were applied during training.
	data, features, label = load_data_for_solve("PPD_Master_GBK_2_Test_Set.csv", for_train = False)

	# Fill missing values in test mode.  NOTE(review): `label` is still
	# passed -- presumably ignored when for_train=False; confirm in
	# replace_miss.
	data, features = replace_miss(data, features, label, for_train = False)
	#save_result(data, "test/data_after_filling_missing_.csv", features)

	# Drop the same columns that were discarded during training for
	# excessive missing rates, keeping train/test schemas aligned.
	deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_features_with_too_many_missing")
	data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)
	save_result(data, "test_data_after_deleted_features.csv", features, dir_name = "resultData/test/")

	# Map string-valued features to numeric codes; use_experience=True
	# presumably reuses encodings learned during training -- confirm.
	data = strStyle_features_to_digit(data, features, for_train = False, use_experience = True)
	save_result(data, "data_after_digited.csv", features, dir_name= "resultData/test/")
	save_features_info(data, features, label, "info_after_digit_all_features.csv", \
						dir_name = "resultData/test/")


コード例 #30
0
    data, features = third_party_stable(data, features)

    data, features = third_party_level(data, features)
    save_result(data,
                "data_after_thirdparty_solved.csv",
                features,
                dir_name=saved_dir)
    return data, features


if __name__ == '__main__':

    # Third-party-info processing pipeline, run on the output of the
    # weblog-cleaning stage.
    contents = load_result("data_after_solved_weblog.csv")
    # First row holds the feature names; remaining rows are instances.
    features = np.array(contents[0])
    data = np.array(contents[1:])
    # Idx / ListingInfo are row identifiers, not predictive features.
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    data = convert_to_numerical(data, features)

    solve_thirdparty_info_package(data, features)

    # calculate_number = ["17"]
    # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
    # print(users_sta_name)
    # for i in range(10):
    # 	print(users_stability[i])
    # from create_new_features import find_featuers_index
    # features_name = "ThirdPart"
    # fea_indexs = find_featuers_index(features_name, features)
    # print(fea_indexs)
コード例 #31
0
# if a value in one features is bigger than 20000, besides
#	the positive in it is almost equal to the positive in the train data

if __name__ == '__main__':
    #################### used to calculate the correlation between properties #########
    contents = load_result("data_after_delete_no_discrimination_features.csv")
    # First row is the header; remaining rows are instances.
    features = np.array(contents[0])
    data = np.array(contents[1:])

    # Local imports (script-only dependencies).
    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features

    data = convert_to_numerical(data, features)

    # Identifier columns carry no signal for the correlation analysis.
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    # Compute pairwise feature correlations, then derive the list of
    # strongly-correlated features to drop and persist it for the
    # test-set pipeline.
    correlation_between_properties(data, features)

    delete_result = according_properties_correlation_delete()
    save_result(delete_result, "deleted_features_with_strong_correlation.csv")


    data, features, deleted_features = delete_features(data, features, \
                  delete_feas_list = delete_result)
    # print(deleted_features)
    save_result(data, "data_after_delete_strong_correlation_features.csv",
                features)
    print(data.shape)

    ############### used pca to delete #####################
コード例 #32
0
def view_each_features_label(data, features, label):
    """Save one diagnostic scatter plot per feature.

    Each plot shows feature value against instance index, positives drawn
    as red 'o' and negatives as green 'x'.  Features with few distinct
    values get per-value positive-rate annotations; the rest get a mean
    annotation plus the positive rate among values below the mean.

    Side effects:
      * writes one .png per feature under
        view_data_area/after_all/with_label_under_mean/
      * appends feature names to "complex_value_features.csv" or
        "simple_discrete_value_features(nonestrfeatures).csv"

    Args:
        data: 2-D array, instances x features (numeric after digitising).
        features: 1-D array of feature names aligned with data's columns.
        label: array of 0/1 labels; reshaped to 1-D internally.
    """
    # Identifier columns carry no signal for these plots.
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    # Names of the originally string-typed features, saved by an earlier stage.
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)

    new_label = label.reshape((label.size, ))
    # NOTE(review): 'x' is never used below.
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # String-style features are tagged "(str" in the file name.
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(
                fea_pos) + ")" + feature_name + ".png"
        # Per-value statistics for this feature; dict mixes scalar entries
        # ("num_of_value", "num_positive") with FeatureInData objects keyed
        # by feature value.
        features_info = feature_value_class(data, fea_pos, label,
                                            str_features_index)
        # NOTE(review): threshold 30 here vs 40 in the plotting branch
        # below -- confirm the mismatch is intentional.
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]],
                        "complex_value_features.csv",
                        style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result(
                    [features[fea_pos]],
                    "simple_discrete_value_features(nonestrfeatures).csv",
                    style="a+")

        # Scatter positives and negatives at their row indices.
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 1
        ])
        negitive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 0
        ])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)

        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            # Discrete-ish feature: annotate each value with two ratios --
            # right side: positives at this value / all positives;
            # left side:  positives at this value / occurrences of the value.
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
               "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(
                        v._respond_positive_num /
                        features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

                    arrow_data = round(
                        v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                       xy=(arrow_start_position_x,arrow_start_position_y), \
                       xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                       arrowprops=dict(facecolor='blue', shrink=0.02))

        else:
            # Continuous-looking feature: annotate the mean and the share
            # of positives whose value falls at or below the mean.
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            # Coefficient of variation, shown in the title as "fluctuation".
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
             "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                 xy=(-4000,fea_average), \
                 xytext=(-4000,fea_average), \
                 arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            # NOTE(review): under_mean_num is accumulated but never used.
            under_mean_num = 0

            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(
                under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
              xy=(31000,fea_average), \
              xytext=(31000,fea_average), \
              arrowprops=dict(facecolor='blue', shrink=0.05))
            pos_rat = 0
            pos_rat_whole = 0
            # -1 is the missing-value marker; annotate its ratios separately.
            if -1 in features_info.keys():
                pos_rat = features_info[
                    -1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[
                    -1]._respond_positive_num / features_info["num_positive"]
                plt.annotate(round(pos_rat_whole, 4), \
                  xy=(31000,-1), \
                  xytext=(31000,-1))
                plt.annotate(round(pos_rat, 4), \
                  xy=(-4000,-1), \
                  xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
            #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
コード例 #33
0
	data, features = remove_thirdparty6(data, features)

	data, features = fill_thirdParty_miss(data, features)

	data, features = third_party_stable(data, features)

	data, features = third_party_level(data, features)
	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
	return data, features 

if __name__ == '__main__':

	# Third-party-info processing pipeline, run on the output of the
	# weblog-cleaning stage.
	contents = load_result("data_after_solved_weblog.csv")
	# First row holds the feature names; remaining rows are instances.
	features = np.array(contents[0])
	data = np.array(contents[1:])
	# Idx / ListingInfo are row identifiers, not predictive features.
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])

	data = convert_to_numerical(data, features)

	solve_thirdparty_info_package(data, features)

	# calculate_number = ["17"]
	# users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
	# print(users_sta_name)
	# for i in range(10):
	# 	print(users_stability[i])
	# from create_new_features import find_featuers_index
	# features_name = "ThirdPart"
	# fea_indexs = find_featuers_index(features_name, features)
	# print(fea_indexs)
コード例 #34
0
    weblog_features = features[fea_indexs]

    print(weblog_data.shape)
    print(weblog_features.shape)
    #save_result(weblog_data, "weblog_data_view.csv", weblog_features)

    # # label_lines = np.array(load_result("train_label_original.csv"))
    # # #print(label_lines.shape)
    # # from save_load_result import convert_to_float
    # # label = convert_to_float(label_lines)

    # # label = label.reshape((label.size, ))
    correlation_between_properties(weblog_data, weblog_features)
    delete_result = according_properties_correlation_delete()
    save_result(delete_result,
                "deleted_weblog_features_with_strong_correlation.csv")
    weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features, \
                 delete_feas_list = delete_result)
    save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv",
                weblog_features)

    weblog_delete_needed = [
        "WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
        "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
        "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"
    ]
    save_result(weblog_delete_needed, "deleted_useless_weblog.csv")

    new_data, new_features, deleted = delete_features(weblog_data, weblog_features, \
           delete_feas_list = weblog_delete_needed)
    save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)
コード例 #35
0
def new_UserInfo_22_23_combine2(data, features):
    """Replace UserInfo_22/UserInfo_23 with one ordinal combination feature.

    Each (UserInfo_22, UserInfo_23) value pair is mapped to a code 0-5 via
    the hand-made lookup lists below (presumably derived from training-data
    statistics -- confirm).  UserInfo_22 values look like marital-status
    strings; verify against the raw data dictionary.  Pairs absent from
    every list are bucketed by how often they occur in `data` (rarer pairs
    get lower codes).  The two source columns are then deleted.

    Returns:
        (new_data, new_features): data with the combined column appended
        and the two source columns removed, plus updated feature names.
    """
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))

    ##### map rules: hand-made (22, 23) pair -> code lookup tables ####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'], ['未婚', 'AD'],
                   ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'], ['-1', 'R'],
                   ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'], ['离婚', 'P'],
                   ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'], ['再婚', 'O'],
                   ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'], ['再婚', 'H'],
                   ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'], ['离婚', 'M'],
                   ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'], ['-1', 'AH'],
                   ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'], ['离婚', 'O'],
                   ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'], ['-1', 'AK']]

    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'], ['-1', '大学本科(简称“大学'],
                  ['已婚', '专科毕业']]

    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'], ['未婚', 'P'],
                  ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'], ['未婚', 'M'],
                  ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'], ['已婚', 'H'],
                  ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    # Pairs not covered by any rule above, keyed "v22_v23" -> row indices.
    # NOTE(review): the "_" join would be ambiguous if a value ever
    # contained "_" -- none of the listed values do.
    none_finded_combine = OrderedDict()
    # Default code is 1; overwritten below for every matched row.
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_22_23 = list(data[user, fea_indexs])
        if EI_22_23 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_22_23 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_22_23 in map_to_two:
            feature_data[user, 0] = 2
        elif EI_22_23 in map_to_three:
            feature_data[user, 0] = 3
        elif EI_22_23 in map_to_four:
            feature_data[user, 0] = 4
        elif EI_22_23 in map_to_five:
            feature_data[user, 0] = 5
        else:
            EI_22_23_str = reduce(lambda x, y: x + "_" + y, EI_22_23)
            if EI_22_23_str not in none_finded_combine.keys():
                none_finded_combine[EI_22_23_str] = list()
            none_finded_combine[EI_22_23_str].append(user)

    # Bucket the remaining (unlisted) pairs by their frequency in `data`.
    for EI_combine, users in none_finded_combine.items():
        EI_combine = EI_combine.split("_")
        # NOTE(review): this assignment is immediately overwritten by the
        # if-chain below (no elif), and the '-1','-1' pair is already
        # handled by map_to_five in the main loop, so this branch looks
        # ineffective -- confirm whether an elif chain was intended.
        if EI_combine[0] == "-1" and EI_combine[1] == "-1":
            feature_data[users, 0] = 5
        if len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
             delete_feas_list = key_features)
    print(deleted)
    return new_data, new_features
コード例 #36
0
	fea_indexs = find_featuers_index(features_name, features)
	print(fea_indexs)
	weblog_data = data[:, fea_indexs]
	weblog_features = features[fea_indexs]

	print(weblog_data.shape)
	print(weblog_features.shape)
	#save_result(weblog_data, "weblog_data_view.csv", weblog_features)

	# # label_lines = np.array(load_result("train_label_original.csv"))
	# # #print(label_lines.shape)
	# # from save_load_result import convert_to_float
	# # label = convert_to_float(label_lines)

	# # label = label.reshape((label.size, ))
	correlation_between_properties(weblog_data, weblog_features)
	delete_result = according_properties_correlation_delete()
	save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv")
	weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features, \
 													delete_feas_list = delete_result)
	save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv", weblog_features)


	weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13",
							"WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28",
							"WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"]
	save_result(weblog_delete_needed, "deleted_useless_weblog.csv")

	new_data, new_features, deleted = delete_features(weblog_data, weblog_features, \
								delete_feas_list = weblog_delete_needed)
	save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)
コード例 #37
0
ファイル: view_data.py プロジェクト: CSJLOVEJX/DataPigs
def view_each_features_label(data, features, label):
	"""Save one diagnostic scatter plot per feature.

	Each plot shows feature value against instance index, positives drawn
	as red 'o' and negatives as green 'x'.  Features with few distinct
	values get per-value positive-rate annotations; the rest get a mean
	annotation plus the positive rate among values below the mean.

	Side effects:
	  * writes one .png per feature under
	    view_data_area/after_all/with_label_under_mean/
	  * appends feature names to "complex_value_features.csv" or
	    "simple_discrete_value_features(nonestrfeatures).csv"

	Args:
	    data: 2-D array, instances x features (numeric after digitising).
	    features: 1-D array of feature names aligned with data's columns.
	    label: array of 0/1 labels; reshaped to 1-D internally.
	"""
	# Identifier columns carry no signal for these plots.
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	# Names of the originally string-typed features, saved by an earlier stage.
	str_style_features = np.array(load_result("str_features.csv")[0])
	str_features_index = get_known_features_index(features, str_style_features)

	new_label = label.reshape((label.size,))
	# NOTE(review): 'x' is never used below.
	x = range(len(data))
	for fea_pos in range(len(features)):
		feature_name = features[fea_pos]
		# String-style features are tagged "(str" in the file name.
		if fea_pos in str_features_index:
			file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(fea_pos) + ")" + feature_name +  ".png"
		else:
			file_path = "view_data_area/after_all/with_label_under_mean/" + str(fea_pos) + ")" + feature_name +  ".png"
		# Per-value statistics: dict mixing scalar entries ("num_of_value",
		# "num_positive") with FeatureInData objects keyed by feature value.
		features_info = feature_value_class(data, fea_pos, label, str_features_index)
		# NOTE(review): threshold 30 here vs 40 in the plotting branch
		# below -- confirm the mismatch is intentional.
		if features_info["num_of_value"] > 30:
			save_result([features[fea_pos]], "complex_value_features.csv", style = "a+")
		else:
			if fea_pos not in str_features_index:
				save_result([features[fea_pos]], "simple_discrete_value_features(nonestrfeatures).csv", style = "a+")


		# Scatter positives and negatives at their row indices.
		y_positive = data[new_label == 1, fea_pos]
		y_negitive = data[new_label == 0, fea_pos]
		positive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 1])
		negitive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 0])
		plt.scatter(positive_index, y_positive, marker = 'o', color = 'r', s = 10)
		plt.scatter(negitive_index, y_negitive, marker = 'x', color = 'g', s = 10)

		plt.xlabel("instances(30000)")
		plt.ylabel("value")
		if features_info["num_of_value"] < 40:
			# Discrete-ish feature: annotate each value with two ratios --
			# right side: positives at this value / all positives;
			# left side:  positives at this value / occurrences of the value.
			plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
						"\n the arrow --> Proportion of positive in that value & in positive")
			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					arrow_data = round(v._respond_positive_num / features_info["num_positive"] , 4)
					arrow_start_position_x = len(data) + 2000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

					arrow_data = round(v._respond_positive_num / v._present_num , 4)
					arrow_start_position_x = -4000
					arrow_start_position_y = int(k)
					arrow_end_postion_x = arrow_start_position_x
					arrow_end_postion_y = int(k)
					plt.annotate(arrow_data, \
								xy=(arrow_start_position_x,arrow_start_position_y), \
								xytext=(arrow_end_postion_x,arrow_end_postion_y), \
								arrowprops=dict(facecolor='blue', shrink=0.02))

		else:
			# Continuous-looking feature: annotate the mean and the share
			# of positives whose value falls at or below the mean.
			fea_average = round(np.mean(data[:, fea_pos]), 4)
			fea_std = np.std(data[:, fea_pos])
			# Coefficient of variation, shown in the title as "fluctuation".
			fea_oo = round(fea_std / fea_average, 4)
			max_v = np.amax(data[:, fea_pos])
			min_v = np.amin(data[:, fea_pos])
			plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
				"\n degree of fluctuation --> " + str(fea_oo))
			x1 = np.array(range(-5000, 35000))
			y_mean = fea_average * np.ones((x1.size))
			#plt.plot(x1, y_mean, color = 'k', linestyle = "--")
			plt.annotate(fea_average, \
								xy=(-4000,fea_average), \
								xytext=(-4000,fea_average), \
								arrowprops=dict(facecolor='blue', shrink=0.05))
			under_mean_positive = 0
			# NOTE(review): under_mean_num is accumulated but never used.
			under_mean_num = 0

			for k, v in features_info.items():
				if isinstance(v, FeatureInData):
					if k <= fea_average:
						under_mean_num += v._present_num
						under_mean_positive += v._respond_positive_num
			ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
			plt.annotate(ave_posi, \
					xy=(31000,fea_average), \
					xytext=(31000,fea_average), \
					arrowprops=dict(facecolor='blue', shrink=0.05))
			pos_rat = 0
			pos_rat_whole = 0
			# -1 is the missing-value marker; annotate its ratios separately.
			if -1 in features_info.keys():
				pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
				pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
				plt.annotate(round(pos_rat_whole, 4), \
						xy=(31000,-1), \
						xytext=(31000,-1))
				plt.annotate(round(pos_rat, 4), \
						xy=(-4000,-1), \
						xytext=(-4000,-1))
			plt.ylim(min_v - 10, fea_average * 2)
			#plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
		plt.savefig(file_path)
		plt.close()
コード例 #38
0
ファイル: features_reduce.py プロジェクト: CSJLOVEJX/DataPigs
# if a value in one features is bigger than 20000, besides
#	the positive in it is almost equal to the positive in the train data

if __name__ == '__main__':
	#################### used to calculate the correlation between properties #########
	contents = load_result("data_after_delete_no_discrimination_features.csv")
	# First row is the header; remaining rows are instances.
	features = np.array(contents[0])
	data = np.array(contents[1:])

	# Local imports (script-only dependencies).
	from map_features_to_digit import convert_to_numerical
	from solve_data import delete_features

	data = convert_to_numerical(data, features)

	# Identifier columns carry no signal for the correlation analysis.
	data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])

	# Compute pairwise feature correlations, then derive the list of
	# strongly-correlated features to drop and persist it for the
	# test-set pipeline.
	correlation_between_properties(data, features)

	delete_result = according_properties_correlation_delete()
	save_result(delete_result, "deleted_features_with_strong_correlation.csv")

	
	data, features, deleted_features = delete_features(data, features, \
	 													delete_feas_list = delete_result)
	# print(deleted_features)
	save_result(data, "data_after_delete_strong_correlation_features.csv", features)
	print(data.shape)

	############### used pca to delete #####################
コード例 #39
0
def new_UserInfo_22_23_combine2(data, features):
	"""Replace UserInfo_22/UserInfo_23 with one ordinal combination feature.

	Each (UserInfo_22, UserInfo_23) value pair is mapped to a code 0-5 via
	the hand-made lookup lists below (presumably derived from training-data
	statistics -- confirm).  UserInfo_22 values look like marital-status
	strings; verify against the raw data dictionary.  Pairs absent from
	every list are bucketed by how often they occur in `data` (rarer pairs
	get lower codes).  The two source columns are then deleted.

	Returns:
		(new_data, new_features): data with the combined column appended
		and the two source columns removed, plus updated feature names.
	"""
	key_features = ["UserInfo_22", "UserInfo_23"]
	print("combine2")
	fea_indexs = get_known_features_index(features, key_features)
	feature_name = "UserInfo_combine2_by_present_22_23"
	new_add_feature = np.array([feature_name])
	new_features = np.concatenate((features, new_add_feature))

	##### map rules: hand-made (22, 23) pair -> code lookup tables ####
	map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'], ['未婚', 'AJ'], 
					['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'], ['未婚', 'AP'], ['再婚', 'G'], 
					['未婚', 'R'], ['已婚', 'AL'], ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'], 
					['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'], ['-1', 'R'], 
					['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'], ['离婚', 'P'], ['已婚', 'Z'], 
					['初婚', 'G'], ['-1', 'K'], ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], 
					['已婚', '-1'], ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'], 
					['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'], ['-1', 'AH'], 
					['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'], ['离婚', 'O'], ['已婚', 'AC'], 
					['-1', 'H'], ['未婚', 'AC'], ['-1', 'AK']]

	map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'], ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]

	map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'], ['已婚', '大学本科(简称“大学'], 
					['已婚', 'M'], ['-1', 'Y'], ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], 
					['未婚', 'AI'], ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'], 
					['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
	map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'], ['未婚', 'AB'], ['未婚', 'G']]
	map_to_four = [['D', 'D']]
	map_to_five = [['-1', '-1']]
	# Pairs not covered by any rule above, keyed "v22_v23" -> row indices.
	# NOTE(review): the "_" join would be ambiguous if a value ever
	# contained "_" -- none of the listed values do.
	none_finded_combine = OrderedDict()
	# Default code is 1; overwritten below for every matched row.
	feature_data = np.ones((len(data), 1))
	for user in range(data.shape[0]):
		EI_22_23 = list(data[user, fea_indexs])
		if EI_22_23 in map_to_zero:
			feature_data[user, 0] = 0
		elif EI_22_23 in map_to_one:
			feature_data[user, 0] = 1
		elif EI_22_23 in map_to_two:
			feature_data[user, 0] = 2
		elif EI_22_23 in map_to_three:
			feature_data[user, 0] = 3
		elif EI_22_23 in map_to_four:
			feature_data[user, 0] = 4
		elif EI_22_23 in map_to_five:
			feature_data[user, 0] = 5
		else:
			EI_22_23_str = reduce(lambda x, y: x + "_" + y, EI_22_23)
			if EI_22_23_str not in none_finded_combine.keys():
				none_finded_combine[EI_22_23_str] = list()
			none_finded_combine[EI_22_23_str].append(user)

	# Bucket the remaining (unlisted) pairs by their frequency in `data`.
	for EI_combine, users in none_finded_combine.items():
		EI_combine = EI_combine.split("_")
		# NOTE(review): this assignment is immediately overwritten by the
		# if-chain below (no elif), and the '-1','-1' pair is already
		# handled by map_to_five in the main loop, so this branch looks
		# ineffective -- confirm whether an elif chain was intended.
		if EI_combine[0] == "-1" and EI_combine[1] == "-1":
			feature_data[users, 0] = 5
		if len(users) < 10:
			feature_data[users, 0] = 0
		elif len(users) < 20:
			feature_data[users, 0] = 1
		elif len(users) < 100:
			feature_data[users, 0] = 2
		elif len(users) < 1000:
			feature_data[users, 0] = 3
		else:
			feature_data[users, 0] = 4

	new_data = np.concatenate((data, feature_data), axis = 1)
	new_data, new_features, deleted = delete_features(new_data, new_features, \
										delete_feas_list = key_features)
	print(deleted)
	return new_data, new_features
コード例 #40
0
from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit
from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train
from solve_data import delete_features
import numpy as np

if __name__ == '__main__':
    # Test-set preprocessing pipeline: replay on the test data the same
    # cleaning steps that were applied during training.
    data, features, label = load_data_for_solve(
        "PPD_Master_GBK_2_Test_Set.csv", for_train=False)

    # Fill missing values in test mode.  NOTE(review): `label` is still
    # passed -- presumably ignored when for_train=False; confirm in
    # replace_miss.
    data, features = replace_miss(data, features, label, for_train=False)
    #save_result(data, "test/data_after_filling_missing_.csv", features)

    # Drop the same columns that were discarded during training for
    # excessive missing rates, keeping train/test schemas aligned.
    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_features_with_too_many_missing")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)
    save_result(data,
                "test_data_after_deleted_features.csv",
                features,
                dir_name="resultData/test/")

    # Map string-valued features to numeric codes; use_experience=True
    # presumably reuses encodings learned during training -- confirm.
    data = strStyle_features_to_digit(data,
                                      features,
                                      for_train=False,
                                      use_experience=True)
    save_result(data,
                "data_after_digited.csv",
                features,
                dir_name="resultData/test/")
    save_features_info(data, features, label, "info_after_digit_all_features.csv", \
         dir_name = "resultData/test/")