def apply_networkx(node_count, test_nodes):
    """Build an undirected graph from the sampled positive pairs and
    re-calculate graph-based features for the train and test sets."""
    columns = [
        0,   # 0 source
        1,   # 1 sink
        4,   # 2 source outdegree
        6,   # 3 sink indegree
        10,  # 4 weighted_in_share_friends
        11,  # 5 weighted_out_share_friends
        16,  # 6 flag
    ]
    original_columns_titles = [HEADER[i] for i in columns]
    sample_data = ut.read_as_list(
        ut.DATA_DIR + "sample.with_feature.{}.csv".format(node_count))[1:]
    test_data = ut.read_as_list(
        ut.DATA_DIR + "test-public_with_features.{}.csv".format(node_count))[1:]
    column_filter = lambda x: [x[i] for i in columns]
    sample_data = list(map(column_filter, sample_data))
    test_data = list(map(column_filter, test_data))

    # note: nx.Graph() is undirected despite the variable name
    DG = nx.Graph()
    DG.add_nodes_from(test_nodes)
    # edge weight = 1 / sink indegree (filtered column 3)
    trans_func = lambda x: [x[0], x[1], 1 / int(x[3])]
    weighted_edges = list()
    negative_nodes_set = set()
    for pair in sample_data:
        if pair[-1] == '1':
            weighted_edges.append(trans_func(pair))
        else:
            # keep the nodes of negative pairs so they still exist in the graph
            negative_nodes_set.add(pair[0])
            negative_nodes_set.add(pair[1])
    # list(map(trans_func, sample_data))
    DG.add_weighted_edges_from(weighted_edges, weight='weight')
    DG.add_nodes_from(negative_nodes_set)

    ut.log("calculating networkx features for train data...")
    feature_calculate(
        DG, sample_data, original_columns_titles,
        ut.DATA_DIR + "sample.with_feature.{}.networx.csv".format(node_count))
    ut.log("calculating networkx features for test data...")
    feature_calculate(
        DG, test_data, original_columns_titles,
        ut.DATA_DIR + "test-public_with_features.{}.networx.csv".format(node_count))
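# Illustrative sketch only: the project's `feature_calculate` helper is defined
# elsewhere and this is NOT its implementation. The hypothetical function below
# just shows the kind of per-pair scores networkx can derive from the undirected
# graph built in apply_networkx (function name and feature choice are assumptions).
def _example_networkx_pair_features(G, source, sink):
    import networkx as nx  # local import keeps the sketch self-contained
    if not (G.has_node(source) and G.has_node(sink)):
        return {"jaccard": 0.0, "preferential_attachment": 0, "shortest_path": -1}
    # Jaccard coefficient of the two neighbourhoods
    _, _, jaccard = next(nx.jaccard_coefficient(G, [(source, sink)]))
    # preferential attachment score (product of the two degrees)
    _, _, pref = next(nx.preferential_attachment(G, [(source, sink)]))
    try:
        hops = nx.shortest_path_length(G, source=source, target=sink)
    except nx.NetworkXNoPath:
        hops = -1  # disconnected pair
    return {"jaccard": jaccard, "preferential_attachment": pref,
            "shortest_path": hops}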
def make_out_of_package(positive_count, source_dict, source_dict_keys_set,
                        sink_dict, sink_dict_keys_set):
    """Sample `positive_count` real edges straight from the original training
    file ("out of package" pairs) and compute their features."""
    ut.log("generate out of package pair features...")
    # positive_count = len(sample_source_pair_list)
    out_of_package_count = 0
    out_of_package_count_target = positive_count
    original_train_pair = ut.read_as_list(ut.ORIGINAL_TRAIN_FILE, "\t")
    source_count = len(original_train_pair)
    calc = pre_calculate.calculator(source_dict, source_dict_keys_set,
                                    sink_dict, sink_dict_keys_set)
    out_of_package_pairs = list()
    while out_of_package_count < out_of_package_count_target:
        row_idx = random.randint(0, source_count - 1)
        row = original_train_pair[row_idx]
        if len(row) < 3:
            continue
        # pick a real (source, sink) edge from the original training data
        sink_idx = random.randint(1, len(row) - 1)
        source = row[0]
        sink = row[sink_idx]
        this_pair = calc._calculate_pair_features(source, sink, True)
        # pad the remaining feature columns with constant placeholder values
        this_pair.extend([99999, 0, 0, 0])
        out_of_package_pairs.append(this_pair)
        out_of_package_count += 1
        if out_of_package_count % 1000 == 0:
            ut.log("{}/{}".format(out_of_package_count,
                                  out_of_package_count_target))
    return out_of_package_pairs
def train_model(model, train_file, cross_valid_n=10):
    """Train `model` with k-fold cross validation on the feature file and log
    the held-out score of every fold; returns the model from the last fit."""
    ut.log("begin to train model...")
    sample_with_feature = ut.read_as_list(train_file)
    kf = KFold(n_splits=cross_valid_n, shuffle=True)
    X_list = list()
    y_list = list()
    for line in sample_with_feature[1:]:  # skip the header row
        features = list(map(converToFloatIfPossible, line[2:-1]))
        X_list.append(features)
        y_list.append(line[-1])
    X = np.array(X_list)
    y = np.array(y_list)
    test_result = list()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        test_result.append(score)
        ut.log("Test #{} => Score: {}".format(len(test_result), score))
    return model
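# Example usage (a sketch; the estimator and its parameters are assumptions,
# while the file name matches the one used in the script section below):
#
#   from sklearn.ensemble import RandomForestClassifier
#   clf = train_model(RandomForestClassifier(n_estimators=100),
#                     ut.DATA_DIR + "sample.with_feature.10000.networx.csv",
#                     cross_valid_n=10)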
def predict(model, test_file, predice_save_to=ut.DATA_DIR + "predict.csv"):
    """Predict the positive-class probability for every row of `test_file`,
    write an (Id, Prediction) CSV and pickle the model next to it."""
    ut.log("begin to predict from file {}...".format(test_file))
    test_with_feature = ut.read_as_list(test_file)
    X_list = list()
    for line in test_with_feature[1:]:  # skip the header row
        features = list(map(float, line[2:-1]))
        X_list.append(features)
    X = np.array(X_list)
    result = model.predict_proba(X)

    # locate the predict_proba column that corresponds to the positive class
    positive_class_idx = None
    classes = model.classes_
    ut.log("classes: {}".format(classes))
    for i in range(len(classes)):
        if classes[i] == '1':
            positive_class_idx = i
            break
    ut.log("#{} is the wanted column".format(positive_class_idx + 1))

    predict_list = list()
    predict_list.append(["Id", "Prediction"])
    for i in range(len(result)):
        prob = result[i][positive_class_idx]
        # floor very small probabilities at 0.01
        prob = prob if prob > 0.001 else 0.01
        predict_list.append([i + 1, prob])
    ut.log("saving predict result to file {}...".format(predice_save_to))
    ut.write_list_csv(predice_save_to, predict_list)
    ut.log("saving model to file {}...".format(predice_save_to + ".model.pkl"))
    joblib.dump(model, predice_save_to + ".model.pkl")
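# Example usage (a sketch; file names mirror the ones used in the script
# section below, and the estimator is an assumption):
#
#   model = train_model(RandomForestClassifier(n_estimators=100),
#                       ut.DATA_DIR + "sample.with_feature.10000.networx.csv")
#   predict(model,
#           ut.DATA_DIR + "test-public_with_features.10000.networx.csv",
#           ut.DATA_DIR + "kn_randomforestclassifier.2.csv")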
def make_sample(
        max_node_count: int,
        sink_node_count: int,
        max_outdegree: int,
        sink_dict_file,
        source_dict_file,
        test_file=ut.ORIGINAL_TEST_FILE,
        sample_pair_file_with_features=ut.SAMPLE_PAIR_WITH_FEATURE_FILE,
        test_pair_file_with_features=ut.TEST_FILE_WITH_FEATURES,
        use_networkx=False):
    """Sample positive, negative and out-of-package node pairs, compute their
    features and write them to CSV, then compute features for the public test
    pairs; optionally re-calculate graph features with networkx."""
    # begin to do sampling
    ut.log("begin to make sample...")
    sink_dict = ut.read_as_dict(sink_dict_file)
    sink_dict_keys_set = set(sink_dict.keys())
    sink_dict = OrderedDict(
        sorted(sink_dict.items(), key=itemgetter(1), reverse=True))
    source_dict = ut.read_as_dict(source_dict_file)
    source_dict_keys_set = set(source_dict.keys())
    testing_source_ids, test_sink_ids, test_pair = ut.get_test_file_data(
        test_file)

    ut.log("random select more start node...")
    random_nodes = _random_start_point(
        source_dict, testing_source_ids,
        max_node_count - len(testing_source_ids), max_outdegree)
    start_points = testing_source_ids.union(random_nodes)

    ut.log("random select more sink node...")
    random_sink_nodes = _random_sink_point(
        sink_dict, test_sink_ids, sink_node_count - len(test_sink_ids), 1, 2000)
    end_points = test_sink_ids.union(random_sink_nodes)

    # graph
    ut.log("make positive pairs...")
    positive_pairs = []
    finished_source_node_list = list()
    positive_nodes_set = set()
    for source_node in start_points:
        if not source_node in source_dict_keys_set:
            break
        positive_nodes_set.add(source_node)
        for sink_node in source_dict[source_node]:
            if len(source_dict[source_node]) > max_outdegree:
                if sink_node in end_points:
                    positive_pairs.append((source_node, sink_node))
                    positive_nodes_set.add(sink_node)
            else:
                if sink_node in end_points:
                    positive_pairs.append((source_node, sink_node))
                    positive_nodes_set.add(sink_node)
        finished_source_node_list.append(source_node)
        ut.log("{}/{}".format(len(finished_source_node_list), max_node_count))

    # create calculator to calculate features
    calc = pre_calculate.calculator(source_dict, source_dict_keys_set,
                                    sink_dict, sink_dict_keys_set)
    positive_count = 0
    # saved_to = sample_pair_file_with_features+"{}.{}.{}.csv".format(max_node_count,sink_node_count,max_outdegree)

    # positive pairs
    ut.log("calculate positive pair features and save to file {}...".format(
        sample_pair_file_with_features + "{}.csv".format(max_node_count)))
    # ut.write_list_csv(.format(max_node_count), sample_source_pair_list)
    with open(sample_pair_file_with_features + "{}.csv".format(max_node_count),
              'w') as f:
        f.write("{}\n".format(",".join(HEADER)))
        featured_pairs = list()
        for pair in positive_pairs:
            start = pair[0]
            end = pair[1]
            if positive_count % 1000 == 0:
                ut.log("{}/{}".format(positive_count, len(positive_pairs)))
            this_pair = calc._calculate_pair_features(start, end, True)
            # sample_source_pair_list.append(this_pair)
            featured_pairs.append(this_pair)
            row_text = ",".join(map(str, this_pair))
            f.write("{}\n".format(row_text))
            positive_count += 1
        ut.log('\npositive pair completed')

        # # write file into graph DSG format
        # _write_graph_file_DGS(featured_pairs, positive_nodes_set,
        #                       sample_pair_file_with_features, max_node_count)

        # negative pairs
        # 0. get all sinks
        ut.log("generate negative pair features...")
        original_sink_list = list(end_points)
        sink_count = len(end_points)
        start_points_list = list(start_points)
        source_count = len(start_points)
        # positive_count = len(sample_source_pair_list)
        negative_count = 0
        negative_count_target = positive_count  # int(35 * positive_count / 65)
        while negative_count < negative_count_target:
            source_idx = random.randint(0, source_count - 1)
            nga_source = start_points_list[source_idx]
            sink_idx = random.randint(0, sink_count - 1)
            nga_sink = original_sink_list[sink_idx]
            if nga_source not in source_dict:
                break
            if nga_sink not in source_dict[nga_source]:
                this_pair = calc._calculate_pair_features(
                    nga_source, nga_sink, False)
                row_text = map(str, this_pair)
                f.write("{}\n".format(",".join(row_text)))
                # sample_source_pair_list.append(this_pair)
                negative_count += 1
                if negative_count % 1000 == 0:
                    ut.log("{}/{}".format(negative_count,
                                          negative_count_target))
                # if negative_count % 100000 == 0:
                #     ut.log('')
                #     ut.log("{}/{}".format(negative_count, positive_count))
        ut.log('\nnegative pair completed')

        # out of package pairs: real edges taken from the full training file
        ut.log("generate out of package pair features...")
        # original_sink_list = list(end_points)
        sink_count = len(end_points)
        start_points_list = list(start_points)
        # positive_count = len(sample_source_pair_list)
        out_of_package_count = 0
        out_of_package_count_target = positive_count
        original_train_pair = ut.read_as_list(ut.ORIGINAL_TRAIN_FILE, "\t")
        source_count = len(original_train_pair)
        while out_of_package_count < out_of_package_count_target:
            row_idx = random.randint(0, source_count - 1)
            row = original_train_pair[row_idx]
            if len(row) < 3:
                continue
            sink_idx = random.randint(1, len(row) - 1)
            source = row[0]
            sink = row[sink_idx]
            this_pair = calc._calculate_pair_features(source, sink, True)
            row_text = map(str, this_pair)
            f.write("{}\n".format(",".join(row_text)))
            out_of_package_count += 1
            if out_of_package_count % 1000 == 0:
                ut.log("{}/{}".format(out_of_package_count,
                                      out_of_package_count_target))
        ut.log('\nout of package pair completed')
        # ut.write_list_csv(sample_pair_file_with_features+"{}.csv".format(max_node_count), sample_source_pair_list)

    # test public features
    featured_test_pair_list = list()
    featured_test_pair_list.append(HEADER)
    for tp in test_pair:
        start = tp[0]
        end = tp[1]
        this_pair = calc._calculate_pair_features(start, end, False)
        featured_test_pair_list.append(this_pair)
    ut.write_list_csv(
        test_pair_file_with_features + "{}.csv".format(max_node_count),
        featured_test_pair_list)

    if use_networkx:
        apply_networkx(max_node_count, testing_source_ids.union(test_sink_ids))
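# Example invocation (a sketch; node counts, max outdegree and the source dict
# file name are assumptions: only the 10000 sample size and sink_dict.csv
# appear elsewhere in this script):
#
#   make_sample(max_node_count=10000,
#               sink_node_count=2000,
#               max_outdegree=50,
#               sink_dict_file=ut.DATA_DIR + "sink_dict.csv",
#               source_dict_file=ut.DATA_DIR + "source_dict.csv",
#               use_networkx=True)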
# NOTE: the left-hand side of this statement is elided in the source; a
# RandomForestClassifier is assumed here from the output file name below,
# and any further constructor arguments are unknown.
model = RandomForestClassifier(
    oob_score=True, criterion='entropy')

train_file = ut.DATA_DIR + "sample.with_feature.10000.networx.csv"
test_file = ut.DATA_DIR + "test-public_with_features.10000.networx.csv"
predice_save_to = ut.DATA_DIR + "kn_randomforestclassifier.2.csv"

# all source
source_dict = ut.read_as_dict(ut.ORIGINAL_TRAIN_FILE, "\t")
sink_dict = ut.read_as_dict(ut.DATA_DIR + "sink_dict.csv")
source_dict_keys_set = set(source_dict.keys())
sink_dict_keys_set = set(sink_dict.keys())
used_columns = [4, 5, 6, 8, 10]

ut.log("begin to train model...")
sample_with_feature = ut.read_as_list(train_file)[1:]
# augment the training rows with "out of package" pairs sampled from the
# original training file
sample_with_feature = sample_with_feature + make_out_of_package(
    int(len(source_dict_keys_set) / 10), source_dict, source_dict_keys_set,
    sink_dict, sink_dict_keys_set)
ut.write_list_csv(ut.DATA_DIR + "last_hope.csv", sample_with_feature)

# sample_count = len(sample_with_feature) - 1
# mid_idx = int(sample_count * 0.5)
# P_80 = int(sample_count * 0.8)
# P_20 = int(sample_count * 0.2)
kf = KFold(n_splits=2, shuffle=True)
X_list = list()
y_list = list()