Code example #1
def karate_test_scenario(deepwalk_path):
    """Compare a DeepWalk embedding with raw features on the Zachary karate network."""
    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'

    target = utils.read_target(y_path)  # labels, read here but unused below

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [['deepwalk'], ['logistic']]

    x_deepwalk = pd.read_csv(deepwalk_path, index_col=0)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/deepwalk_macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/deepwalk_micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/deepwalk_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/deepwalk_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
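Every snippet on this page assumes the same module-level context that is not shown: pandas/numpy/sklearn imports, the repository's own utils and run_detectors modules, and a module-level classifiers list. A minimal sketch of that context follows; the specific classifier configuration is an assumption, not the repository's actual definition.

import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize

import utils          # repository helpers: read_data, merge_results, stats_test, get_timestamp, ...
import run_detectors  # repository module providing run_all_datasets

# Hypothetical default; the real classifiers list is defined elsewhere in the repository.
classifiers = [LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)]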
Code example #2
def run_scenario(folder, embedding_path):
    """Compare DeepWalk embeddings of several sizes with a hyperbolic embedding and raw features."""
    y_path = '../../local_resources/{}/y.p'.format(folder)
    x_path = '../../local_resources/{}/X.p'.format(folder)
    sizes = [2, 4, 8, 16, 32, 64, 128]
    deepwalk_embeddings = []
    deepwalk_names = []
    dwpath = '../../local_resources/{0}/{1}'.format(folder, folder)
    for size in sizes:
        path = dwpath + str(size) + '.emd'
        de = pd.read_csv(path, header=None, index_col=0, skiprows=1, sep=" ")
        de.sort_index(inplace=True)
        deepwalk_embeddings.append(de.values)
        deepwalk_names.append(['deepwalk' + str(size)])

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = deepwalk_names + [['hyperbolic'], ['logistic']]

    embedding = pd.read_csv(embedding_path, index_col=0)
    X = deepwalk_embeddings + [embedding.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # write macro and micro p-values to separate files so the second write does not overwrite the first
    tests[0].to_csv('../../results/{0}/macro_pvalues{1}.csv'.format(folder, utils.get_timestamp()))
    tests[1].to_csv('../../results/{0}/micro_pvalues{1}.csv'.format(folder, utils.get_timestamp()))
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/{0}/macro{1}.csv'.format(folder, utils.get_timestamp())
    micro_path = '../../results/{0}/micro{1}.csv'.format(folder, utils.get_timestamp())
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
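A minimal invocation sketch for run_scenario, assuming the '{folder}/{folder}{size}.emd' naming convention above; the embedding file name is hypothetical:

if __name__ == '__main__':
    run_scenario('political_blogs',
                 '../../local_resources/political_blogs/embeddings/Win_test.csv')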
Code example #3
def test_embeddings():
    """Sanity-check a hyperbolic embedding with a random forest and assert a minimum macro score."""
    feature_path = '../local_resources/features_1in10000.tsv'
    rf_features = pd.read_csv(feature_path, sep='\t', index_col=0)
    emd = pd.read_csv('../local_resources/hyperbolic_embeddings/tf_test1.csv',
                      header=None,
                      index_col=0,
                      skiprows=1,
                      sep=" ")
    features, y = utils.get_classification_xy(rf_features)
    features = features.loc[emd.index, :]
    y = y.loc[emd.index].values
    names = np.array([['RF just emd']])
    n_folds = 10
    classifiers = [
        RandomForestClassifier(max_depth=2,
                               n_estimators=50,
                               bootstrap=True,
                               criterion='entropy',
                               max_features=0.1,
                               n_jobs=1)
    ]
    results = run_all_datasets([emd.values], y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    print('macro', results[0])
    print('micro', results[1])
    macro_path = 'tf_testing_1in10000' + utils.get_timestamp() + '.csv'
    micro_path = 'tf_micro_1in10000' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    assert (results[0]['mean'].values > 0.6).all()
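The function follows pytest's test_ naming convention, so it can be collected by pytest or run directly; a hedged sketch of the latter:

if __name__ == '__main__':
    test_embeddings()  # raises AssertionError if any mean macro score is at or below 0.6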
Code example #4
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network
    :return:
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size, neg_samples=5, skip_window=5,
                        num_pairs=1500,
                        statistics_interval=10.0,
                        initial_learning_rate=0.1, save_path=log_path, epochs=5, concurrent_steps=4)

        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(
            batch_size, utils.get_timestamp())

        embedding_in, embedding_out = HCE.main(params)

        visualisation.plot_poincare_embedding(embedding_in, y,
                                              '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        visualisation.plot_poincare_embedding(embedding_out, y,
                                              '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                                                  batch_size, utils.get_timestamp()))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv(
            '../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
                batch_size, utils.get_timestamp()),
            sep=',')
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # distinct file names so the micro p-values do not overwrite the macro ones
    tests[0].to_csv('../../results/political_blogs/batch_size_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

    return path
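batch_size_scenario takes no arguments and returns the path of the last input-embedding CSV it wrote, so a usage sketch is simply:

if __name__ == '__main__':
    last_path = batch_size_scenario()
    print('last embedding written to', last_path)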
Code example #5
def karate_deepwalk_grid_scenario():
    """
    Evaluate a grid of embeddings at different sizes, walk lengths, and walks per vertex
    for the karate network, to understand why the DeepWalk performance was so poor.
    """
    import os
    y_path = '../../local_resources/karate/y.p'
    x_path = '../../local_resources/karate/X.p'

    target = utils.read_target(y_path)  # labels, read here but unused below

    x, y = utils.read_data(x_path, y_path, threshold=0)

    folder = '../../local_resources/karate/gridsearch/'
    names = [[elem] for elem in os.listdir(folder)]

    embeddings = []
    for name in names:
        emb = pd.read_csv(folder + name[0],
                          header=None,
                          index_col=0,
                          skiprows=1,
                          sep=" ")
        emb.sort_index(inplace=True)
        embeddings.append(emb.values)

    names.append(['hyperbolic'])
    hyp_path = '../../local_resources/karate/embeddings/Win_20170808-185202.csv'
    hyp_emb = pd.read_csv(hyp_path, index_col=0)
    embeddings.append(hyp_emb.values)

    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers,
                                             n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # distinct file names so the micro p-values do not overwrite the macro ones
    tests[0].to_csv('../../results/karate/macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
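The grid itself is read from disk, so running the scenario only requires that the gridsearch folder is populated with .emd files; a hedged invocation sketch:

if __name__ == '__main__':
    karate_deepwalk_grid_scenario()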
Code example #6
def political_blogs_scenario(embedding_path):
    """Compare DeepWalk embeddings of several sizes with a hyperbolic embedding and raw features on polblogs."""
    # deepwalk_path = '../../local_resources/hyperbolic_embeddings/tf_test1.csv'

    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    sizes = [2, 4, 8, 16, 32, 64, 128]
    deepwalk_embeddings = []
    deepwalk_names = []
    dwpath = '../../local_resources/political_blogs/political_blogs'
    for size in sizes:
        path = dwpath + str(size) + '.emd'
        de = pd.read_csv(path, header=None, index_col=0, skiprows=1, sep=" ")
        de.sort_index(inplace=True)
        deepwalk_embeddings.append(de.values)
        deepwalk_names.append(['deepwalk' + str(size)])

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = deepwalk_names + [['hyperbolic'], ['logistic']]

    embedding = pd.read_csv(embedding_path, index_col=0)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = deepwalk_embeddings + [embedding.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # distinct file names so the micro p-values do not overwrite the macro ones
    tests[0].to_csv('../../results/political_blogs/macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
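A hedged invocation sketch for political_blogs_scenario, with a hypothetical embedding path in the CSV format the loader expects (index in column 0):

if __name__ == '__main__':
    political_blogs_scenario('../../local_resources/political_blogs/embeddings/Win_test.csv')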