コード例 #1
0
def extract_log_update_package(data, features, for_train = True):
	"""Run the log/update feature-extraction steps over a dataset.

	"Idx" and "ListingInfo" are handed to ``delete_features`` for removal,
	the result goes through ``convert_to_numerical``, and then
	``add_features_from_log`` followed by ``add_features_from_update`` are
	applied.

	Args:
		data: dataset passed to the helper functions.
		features: feature names accompanying ``data``.
		for_train: forwarded unchanged to both ``add_features_from_*`` helpers.

	Returns:
		The ``(data, features)`` pair produced by the last step.
	"""
	dropped_names = ["Idx", "ListingInfo"]
	# The third value returned by delete_features is not needed here.
	data, features, _ = delete_features(data, features, delete_feas_list=dropped_names)
	data = convert_to_numerical(data, features)

	# Both augmentation helpers share one call signature; log features go first.
	for add_step in (add_features_from_log, add_features_from_update):
		data, features = add_step(data, features, for_train)

	return data, features
コード例 #2
0
def extract_log_update_package(data, features, for_train=True):
    """Build the log- and update-derived feature set for ``data``.

    Steps, in order: ``delete_features`` (asked to drop "Idx" and
    "ListingInfo"), ``convert_to_numerical``, ``add_features_from_log`` and
    ``add_features_from_update``.

    Args:
        data: dataset passed to the helper functions.
        features: feature names accompanying ``data``.
        for_train: flag forwarded to the two ``add_features_from_*`` helpers.

    Returns:
        Tuple ``(data, features)`` as returned by the final helper.
    """
    trimmed, names, _removed = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"]
    )
    numeric = convert_to_numerical(trimmed, names)

    with_log, log_names = add_features_from_log(numeric, names, for_train)
    with_update, update_names = add_features_from_update(
        with_log, log_names, for_train
    )
    return with_update, update_names
コード例 #3
0
def solve_thirdparty_info_package(data, features, saved_dir = "resultData/"):
	"""Run the third-party-info processing steps and save the result.

	After ``delete_features`` (asked to drop "Idx" and "ListingInfo") and
	``convert_to_numerical``, the data is passed through each processing
	helper in turn and finally handed to ``save_result`` under the file name
	"data_after_thirdparty_solved.csv".

	Args:
		data: dataset passed to the helper functions.
		features: feature names accompanying ``data``.
		saved_dir: forwarded to ``save_result`` as ``dir_name``.

	Returns:
		The ``(data, features)`` pair produced by the last processing step.
	"""
	data, features, _ = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
	data = convert_to_numerical(data, features)

	# Every step takes and returns (data, features); the order matters.
	processing_steps = (
		sta_start_missing_period,
		remove_thirdparty6,
		fill_thirdParty_miss,
		third_party_stable,
		third_party_level,
	)
	for step in processing_steps:
		data, features = step(data, features)

	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name=saved_dir)
	return data, features
コード例 #4
0
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    """Process the third-party-info features and persist the outcome.

    The dataset goes through ``delete_features`` (asked to drop "Idx" and
    "ListingInfo"), ``convert_to_numerical`` and then five processing
    helpers in a fixed order. The final table is written with
    ``save_result`` as "data_after_thirdparty_solved.csv".

    Args:
        data: dataset passed to the helper functions.
        features: feature names accompanying ``data``.
        saved_dir: directory argument given to ``save_result`` (``dir_name``).

    Returns:
        Tuple ``(data, features)`` as returned by ``third_party_level``.
    """
    output_name = "data_after_thirdparty_solved.csv"

    table, names, _removed = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"]
    )
    table = convert_to_numerical(table, names)

    # Each helper consumes the previous helper's (table, names) pair.
    table, names = sta_start_missing_period(table, names)
    table, names = remove_thirdparty6(table, names)
    table, names = fill_thirdParty_miss(table, names)
    table, names = third_party_stable(table, names)
    table, names = third_party_level(table, names)

    save_result(table, output_name, names, dir_name=saved_dir)
    return table, names
コード例 #5
0
ファイル: features_reduce.py プロジェクト: CSJLOVEJX/DataPigs
def correlation_between_properties(data, features):
	"""Compute pairwise feature correlations and record the notable ones.

	Every pair of columns in ``data`` is compared once. When neither column
	index is among the known string-feature indexes, ``stats.pearsonr`` is
	used; otherwise ``stats.spearmanr`` is used. Rows with
	``abs(cor) >= 0.2`` are appended to "pearsonr_spearmanr_results.csv" and
	rows with ``abs(cor) >= 0.9`` are additionally appended to
	"pearsonr_spearmanr_Strong_correlation.csv".

	A pair whose Pearson computation raises is reported on stdout and
	skipped, so no row is written with missing or stale values.

	Args:
		data: 2-D array indexed as ``data[:, column]``; it is passed through
			``convert_to_numerical`` before use.
		features: feature names, indexed in parallel with the columns of
			``data``.

	Returns:
		None. All results are written through ``save_result``.
	"""
	fixed_str_features = np.array(load_result("str_features.csv"))[0]
	indexs = get_known_features_index(features, fixed_str_features)

	data = convert_to_numerical(data, features)

	# Both output files start with the same header row.
	title = ["features1", "features2", "calculate_method", "cor", "pval"]
	save_result(title, "pearsonr_spearmanr_results.csv")
	save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")

	feature_count = len(features)
	for fea_pos in range(feature_count):
		# The left-hand column is the same for the whole inner loop.
		a1 = data[:, fea_pos]
		for fea_pos_add in range(fea_pos + 1, feature_count):
			a2 = data[:, fea_pos_add]
			if fea_pos not in indexs and fea_pos_add not in indexs:
				# Neither feature is a string-style feature.
				method = "pearsonr"
				try:
					cor, pval = stats.pearsonr(a1, a2)
				except Exception:
					print("error!!!!:", features[fea_pos])
					print("error!!!!:", features[fea_pos_add])
					# Without a fresh result, falling through would either
					# raise NameError or reuse the previous pair's values.
					continue
			else:
				# At least one of the features is a string-style feature.
				method = "spearmanr"
				cor, pval = stats.spearmanr(a1, a2)

			cor = round(cor, 3)
			info_result = [features[fea_pos], features[fea_pos_add], method, cor, pval]
			if abs(cor) >= 0.2:
				save_result(info_result, "pearsonr_spearmanr_results.csv", style="a+")
			if abs(cor) >= 0.9:
				save_result(info_result, "pearsonr_spearmanr_Strong_correlation.csv", style="a+")
コード例 #6
0
	data, features = fill_thirdParty_miss(data, features)

	data, features = third_party_stable(data, features)

	data, features = third_party_level(data, features)
	save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
	return data, features 

if __name__ == "__main__":
	# Script entry point: load the saved table, prepare it, then run the
	# third-party-info processing package on it.
	contents = load_result("data_after_solved_weblog.csv")
	# The first row is used as the feature names, the rest as the records.
	features, data = np.array(contents[0]), np.array(contents[1:])

	data, features, deleted = delete_features(
		data,
		features,
		delete_feas_list=["Idx", "ListingInfo"],
	)
	data = convert_to_numerical(data, features)

	solve_thirdparty_info_package(data, features)

	# calculate_number = ["17"]
	# users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
	# print(users_sta_name)
	# for i in range(10):
	# 	print(users_stability[i])
	# from create_new_features import find_featuers_index
	# features_name = "ThirdPart"
	# fea_indexs = find_featuers_index(features_name, features)
	# print(fea_indexs)

	# data = data[:, fea_indexs]
	# features = features[fea_indexs]
コード例 #7
0
    while iteral < 100:
        sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values, \
                bin_label, bins)

        new_bins = compare_and_combine(sta_result)
        if bins == new_bins:
            break
        bins = new_bins

    return sta_result


if __name__ == "__main__":
    # Script entry point: discretize a single feature and print the result.
    contents = load_result("after_Str_features_digited_data.csv")
    # The first row is used as the feature names, the rest as the records.
    features, data = np.array(contents[0]), np.array(contents[1:])

    label_lines = np.array(load_result("train_label_original.csv"))
    print(label_lines.shape)
    label = convert_to_float(label_lines)

    # Imported at this point, as in the original script flow.
    from map_features_to_digit import convert_to_numerical

    data = convert_to_numerical(data, features)

    # "ThirdParty_Info_Period4_1" was an alternative target tried earlier.
    target_feature = "WeblogInfo_12"
    index = np.where(features == target_feature)[0][0]

    fea_info = feature_value_class(data, index, label)
    result = discretization_feature(fea_info)
    print(result)
コード例 #8
0
ファイル: Module_xgboost.py プロジェクト: CSJLOVEJX/DataPigs
if __name__ == "__main__":
	# Script entry point: load the processed table and its labels, then
	# prepare both for model training.
	result_dir = "resultData_All"

	contents = load_result("all_data_after_features_processed.csv", dir_name=result_dir)
	# The first row is used as the feature names, the rest as the records.
	features, data = np.array(contents[0]), np.array(contents[1:])
	print("data: ", data.shape)

	label_lines = np.array(load_result("all_train_label_original.csv", dir_name=result_dir))
	print(label_lines.shape)

	from save_load_result import convert_to_int
	label = convert_to_int(label_lines)
	# Flatten the labels into a 1-D vector with label.size entries.
	label = label.reshape((label.size,))
	print("label: ", label.shape)

	data, features, deleted = delete_features(
		data,
		features,
		delete_feas_list=["Idx", "ListingInfo"],
	)
	data = convert_to_numerical(data, features)


	# s_d = StratifiedShuffleSplit(label, n_iter = 2, test_size = 0.3, train_size = 0.7)
	
	# i = 0
	# for train_index, test_index in s_d:
	# 	train_data, test_data = data[train_index], data[test_index]
	# 	train_label, test_label = label[train_index], label[test_index]

	# 	test_preds = module_xgboost_pre(train_data, train_label, test_data)
	# 	# print(test_preds)
	# 	# print(type(test_preds))
	# 	fig_name = "module_xgb_ROC_7" + "_" + str(i) + ".png"
	# 	print(fig_name)
	# 	i += 1