def apply_networkx(node_count, test_nodes):
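    """Build a graph from the positively-labelled sample pairs (edge weight =
    1 / sink indegree) and hand it to feature_calculate to write
    networkx-augmented train and test feature CSVs."""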
    columns = [
        0,   # 0 source
        1,   # 1 sink
        4,   # 2 source outdegree
        6,   # 3 sink indegree
        10,  # 4 weighted_in_share_friends
        11,  # 5 weighted_out_share_friends
        16,  # 6 flag
    ]
    original_columns_titles = [HEADER[i] for i in columns]

    sample_data = ut.read_as_list(
        ut.DATA_DIR + "sample.with_feature.{}.csv".format(node_count))[1:]
    test_data = ut.read_as_list(
        ut.DATA_DIR +
        "test-public_with_features.{}.csv".format(node_count))[1:]

    column_filter = lambda x: [x[i] for i in columns]
    sample_data = list(map(column_filter, sample_data))
    test_data = list(map(column_filter, test_data))

    # NB: despite the name DG, this is an undirected nx.Graph, not a DiGraph
    DG = nx.Graph()
    DG.add_nodes_from(test_nodes)

    # map a positive row to a weighted edge: weight = 1 / sink indegree
    # (column 3 of the filtered row), so links into popular sinks count less
    trans_func = lambda x: [x[0], x[1], 1 / int(x[3])]

    weighted_edges = list()
    negative_nodes_set = set()

    for pair in sample_data:
        if pair[-1] == '1':
            weighted_edges.append(trans_func(pair))
        else:
            negative_nodes_set.add(pair[0])
            negative_nodes_set.add(pair[1])

    DG.add_weighted_edges_from(weighted_edges, weight='weight')
    DG.add_nodes_from(negative_nodes_set)

    ut.log("calculating networkx features for train data...")
    feature_calculate(
        DG, sample_data, original_columns_titles,
        ut.DATA_DIR + "sample.with_feature.{}.networx.csv".format(node_count))

    ut.log("calculating networkx features for test data...")
    feature_calculate(
        DG, test_data, original_columns_titles, ut.DATA_DIR +
        "test-public_with_features.{}.networx.csv".format(node_count))
def make_out_of_package(positive_count, source_dict, source_dict_keys_set,
                        sink_dict, sink_dict_keys_set):
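    """Draw positive_count real pairs straight from the original train file,
    compute their features, and pad each row with sentinel values to match
    the networkx-augmented row width."""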
    ut.log("generate out of package pair features...")

    out_of_package_count = 0
    out_of_package_count_target = positive_count
    original_train_pair = ut.read_as_list(ut.ORIGINAL_TRAIN_FILE, "\t")
    source_count = len(original_train_pair)

    calc = pre_calculate.calculator(source_dict, source_dict_keys_set,
                                    sink_dict, sink_dict_keys_set)
    out_of_package_pairs = list()

    while out_of_package_count < out_of_package_count_target:

        row_idx = random.randint(0, source_count - 1)
        row = original_train_pair[row_idx]
        if len(row) < 3:
            continue
        sink_idx = random.randint(1, len(row) - 1)

        source = row[0]
        sink = row[sink_idx]
        this_pair = calc._calculate_pair_features(source, sink, True)
        # pad with sentinel values so the row width matches the
        # networkx-augmented training rows these get appended to
        this_pair.extend([99999, 0, 0, 0])
        out_of_package_pairs.append(this_pair)
        out_of_package_count += 1
        if out_of_package_count % 1000 == 0:
            ut.log("{}/{}".format(out_of_package_count,
                                  out_of_package_count_target))

    return out_of_package_pairs
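
# A hedged usage sketch, mirroring the call in the driver script below, which
# requests one-tenth as many out-of-package pairs as there are distinct
# source nodes:
#
#   extra_rows = make_out_of_package(
#       int(len(source_dict_keys_set) / 10), source_dict,
#       source_dict_keys_set, sink_dict, sink_dict_keys_set)
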
def train_model(model, train_file, cross_valid_n=10):
    """Cross-validate `model` on the feature CSV with shuffled KFold splits
    and return the model fitted on the last fold."""
    ut.log("begin to train model...")
    sample_with_feature = ut.read_as_list(train_file)
    kf = KFold(n_splits=cross_valid_n, shuffle=True)

    X_list = list()
    y_list = list()
    for line in sample_with_feature[1:]:
        features = list(map(converToFloatIfPossible, line[2:-1]))
        X_list.append(features)
        y_list.append(line[-1])

    X = np.array(X_list)
    y = np.array(y_list)

    test_result = list()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        test_result.append(score)
        ut.log("Test #{} => Score: {}".format(len(test_result) , score))

    return model
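
# A hedged usage sketch, assuming any scikit-learn style classifier (the
# RandomForestClassifier configuration echoes the driver script below):
#
#   model = train_model(
#       RandomForestClassifier(oob_score=True, criterion='entropy'),
#       ut.DATA_DIR + "sample.with_feature.10000.networx.csv")
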
def predict(model, test_file, predice_save_to=ut.DATA_DIR + "predict.csv"):
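    """Score every row of test_file with the positive-class probability,
    write an (Id, Prediction) CSV to predice_save_to, and pickle the fitted
    model next to it."""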

    ut.log("begin to predict from file {}...".format(test_file))
    test_with_feature = ut.read_as_list(test_file)
    X_list = list()

    for line in test_with_feature[1:]:
        features = list(map(float, line[2:-1]))
        X_list.append(features)

    X = np.array(X_list)

    result = model.predict_proba(X)

    # find which predict_proba column corresponds to the positive class '1'
    positive_class_idx = None
    classes = model.classes_
    ut.log("classes: {}".format(classes))
    for i in range(len(classes)):
        if classes[i] == '1':
            positive_class_idx = i
            break
    ut.log("column index {} holds the positive-class probability".format(
        positive_class_idx))

    predict_list = list()
    predict_list.append(["Id", "Prediction"])

    for i in range(len(result)):
        prob = result[i][positive_class_idx]
        # floor near-zero probabilities so the submission never contains a
        # hard zero (note the asymmetric constants: below 0.001 becomes 0.01)
        prob = prob if prob > 0.001 else 0.01
        predict_list.append([i + 1, prob])

    ut.log("saving predict result to file {}...".format(predice_save_to))
    ut.write_list_csv(predice_save_to,predict_list)

    ut.log("saving model to file {}...".format(predice_save_to+".model.pkl"))
    joblib.dump(model, predice_save_to+".model.pkl")
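
# A hedged usage sketch, matching the file names set up in the driver script
# at the bottom of this file:
#
#   predict(model,
#           ut.DATA_DIR + "test-public_with_features.10000.networx.csv",
#           ut.DATA_DIR + "kn_randomforestclassifier.2.csv")
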
def make_sample(
        max_node_count: int,
        sink_node_count: int,
        max_outdegree: int,
        sink_dict_file,
        source_dict_file,
        test_file=ut.ORIGINAL_TEST_FILE,
        sample_pair_file_with_features=ut.SAMPLE_PAIR_WITH_FEATURE_FILE,
        test_pair_file_with_features=ut.TEST_FILE_WITH_FEATURES,
        use_networkx=False):
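    """Sample positive, negative, and out-of-package pairs around the test
    nodes, write their feature rows to train and test CSVs, and optionally
    append networkx-derived features via apply_networkx."""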

    # begin to do sampling
    ut.log("begin to make sample...")

    sink_dict = ut.read_as_dict(sink_dict_file)
    sink_dict_keys_set = set(sink_dict.keys())
    sink_dict = OrderedDict(
        sorted(sink_dict.items(), key=itemgetter(1), reverse=True))

    source_dict = ut.read_as_dict(source_dict_file)
    source_dict_keys_set = set(source_dict.keys())

    testing_source_ids, test_sink_ids, test_pair = ut.get_test_file_data(
        test_file)

    ut.log("random select more start node...")
    random_nodes = _random_start_point(
        source_dict, testing_source_ids,
        max_node_count - len(testing_source_ids), max_outdegree)
    start_points = testing_source_ids.union(random_nodes)

    ut.log("random select more sink node...")
    random_sink_nodes = _random_sink_point(
        sink_dict, test_sink_ids, sink_node_count - len(test_sink_ids), 1,
        2000)
    end_points = test_sink_ids.union(random_sink_nodes)

    # graph
    ut.log("make positive pairs...")
    positive_pairs = []
    finished_source_node_list = list()
    positive_nodes_set = set()
    for source_node in start_points:
        # skip start points that never occur as a source in the data
        # (a break here would silently drop every remaining start point)
        if source_node not in source_dict_keys_set:
            continue

        positive_nodes_set.add(source_node)

        for sink_node in source_dict[source_node]:
            if sink_node in end_points:
                positive_pairs.append((source_node, sink_node))
                positive_nodes_set.add(sink_node)

        finished_source_node_list.append(source_node)
        ut.log("{}/{}".format(len(finished_source_node_list), max_node_count))

    # create calculator to calculate features
    calc = pre_calculate.calculator(source_dict, source_dict_keys_set,
                                    sink_dict, sink_dict_keys_set)

    positive_count = 0

    # positive pairs
    ut.log("calculate positive pair features and save to file {}...".format(
        sample_pair_file_with_features + "{}.csv".format(max_node_count)))
    with open(sample_pair_file_with_features + "{}.csv".format(max_node_count),
              'w') as f:
        f.write("{}\n".format(",".join(HEADER)))

        featured_pairs = list()

        for pair in positive_pairs:
            start = pair[0]
            end = pair[1]
            if positive_count % 1000 == 0:
                ut.log("{}/{}".format(positive_count, len(positive_pairs)))

            this_pair = calc._calculate_pair_features(start, end, True)

            featured_pairs.append(this_pair)
            row_text = ",".join(map(str, this_pair))
            f.write("{}\n".format(row_text))
            positive_count += 1

        ut.log('\npositive pair completed')

        # negative pairs
        # 0. get all sinks
        ut.log("generate negative pair features...")
        original_sink_list = list(end_points)
        sink_count = len(end_points)
        start_points_list = list(start_points)
        source_count = len(start_points)
        negative_count = 0
        negative_count_target = positive_count

        while negative_count < negative_count_target:
            source_idx = random.randint(0, source_count - 1)
            nga_source = start_points_list[source_idx]

            sink_idx = random.randint(0, sink_count - 1)
            nga_sink = original_sink_list[sink_idx]

            # resample when the chosen source has no recorded outgoing edges
            # (a break here would end negative sampling early)
            if nga_source not in source_dict:
                continue

            if nga_sink not in source_dict[nga_source]:
                this_pair = calc._calculate_pair_features(
                    nga_source, nga_sink, False)

                row_text = map(str, this_pair)
                f.write("{}\n".format(",".join(row_text)))
                # sample_source_pair_list.append(this_pair)
                negative_count += 1

                if negative_count % 1000 == 0:
                    ut.log("{}/{}".format(negative_count,
                                          negative_count_target))

        ut.log('\nnegative pair completed')

        # out of package: extra positive pairs sampled directly from the
        # original train file
        ut.log("generate out of package pair features...")
        out_of_package_count = 0
        out_of_package_count_target = positive_count
        original_train_pair = ut.read_as_list(ut.ORIGINAL_TRAIN_FILE, "\t")
        source_count = len(original_train_pair)

        while out_of_package_count < out_of_package_count_target:

            row_idx = random.randint(0, source_count - 1)
            row = original_train_pair[row_idx]
            if len(row) < 3:
                continue
            sink_idx = random.randint(1, len(row) - 1)

            source = row[0]
            sink = row[sink_idx]
            this_pair = calc._calculate_pair_features(source, sink, True)
            row_text = map(str, this_pair)
            f.write("{}\n".format(",".join(row_text)))
            out_of_package_count += 1
            if out_of_package_count % 1000 == 0:
                ut.log("{}/{}".format(out_of_package_count,
                                      out_of_package_count_target))

        ut.log('\nout of package pair completed')

    # test public features
    featured_test_pair_list = list()
    featured_test_pair_list.append(HEADER)
    for tp in test_pair:
        start = tp[0]
        end = tp[1]
        this_pair = calc._calculate_pair_features(start, end, False)
        featured_test_pair_list.append(this_pair)

    ut.write_list_csv(
        test_pair_file_with_features + "{}.csv".format(max_node_count),
        featured_test_pair_list)

    if use_networkx:
        apply_networkx(max_node_count, testing_source_ids.union(test_sink_ids))
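
# A hedged usage sketch with illustrative sizes: 10000 matches the file names
# consumed by the driver script below; max_outdegree=500 and the
# source_dict.csv path are assumptions for illustration (sink_dict.csv is the
# path the script actually reads).
#
#   make_sample(max_node_count=10000, sink_node_count=2000, max_outdegree=500,
#               sink_dict_file=ut.DATA_DIR + "sink_dict.csv",
#               source_dict_file=ut.DATA_DIR + "source_dict.csv",
#               use_networkx=True)
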
# The statement that constructed the classifier was truncated in this excerpt;
# only its trailing keyword arguments survive. Reconstructed on the assumption
# (from those arguments and the output file name below) that it built a
# sklearn.ensemble.RandomForestClassifier bound to the name `model`:
model = RandomForestClassifier(oob_score=True,
                               criterion='entropy')
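
# Because oob_score=True, model.oob_score_ will expose an out-of-bag accuracy
# estimate after fitting, a useful cross-check on the KFold scores below.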
train_file = ut.DATA_DIR + "sample.with_feature.10000.networx.csv"
test_file = ut.DATA_DIR + "test-public_with_features.10000.networx.csv"
predice_save_to = ut.DATA_DIR + "kn_randomforestclassifier.2.csv"

# load the source and sink adjacency dictionaries
source_dict = ut.read_as_dict(ut.ORIGINAL_TRAIN_FILE, "\t")
sink_dict = ut.read_as_dict(ut.DATA_DIR + "sink_dict.csv")
source_dict_keys_set = set(source_dict.keys())
sink_dict_keys_set = set(sink_dict.keys())

used_columns = [4, 5, 6, 8, 10]

ut.log("begin to train model...")
sample_with_feature = ut.read_as_list(train_file)[1:]
sample_with_feature = sample_with_feature + make_out_of_package(
    int(len(source_dict_keys_set) / 10), source_dict, source_dict_keys_set,
    sink_dict, sink_dict_keys_set)

ut.write_list_csv(ut.DATA_DIR + "last_hope.csv", sample_with_feature)

kf = KFold(n_splits=2, shuffle=True)

X_list = list()
y_list = list()