Example 1
def income_different_size_embedding_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'

    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print('running embeddings of size', size)
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        results = run_all_datasets([x], y, names, regressors, n_folds)
        # all_results = utils.merge_results(results)
        all_results = pd.concat(results)
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print(results)
        path = '../results/income/thresh10_' + str(
            size) + '_' + utils.get_timestamp() + '.csv'
        results.to_csv(path, index=True)
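utils.read_embedding is not defined in these examples. Judging from how the grid-search code in Example 5 reads .emd files (header=None, index_col=0, skiprows=1, sep=" "), a minimal sketch could look like the following; the exact signature and the alignment against target are assumptions.

import pandas as pd


def read_embedding(emd_path, target):
    # Hypothetical sketch: load a word2vec text-format embedding and
    # align its rows with the labelled vertices in `target`.
    # skiprows=1 drops the word2vec header line '<n_vertices> <dimension>'.
    emd = pd.read_csv(emd_path, header=None, index_col=0, skiprows=1, sep=" ")
    emd.sort_index(inplace=True)
    return emd.loc[target.index].values  # assumes the indices line up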
Example 2
def karate_test_scenario(deepwalk_path):

    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'

    target = utils.read_target(y_path)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [['deepwalk'], ['logistic']]

    x_deepwalk = pd.read_csv(deepwalk_path, index_col=0)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/deepwalk_macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/deepwalk_micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/deepwalk_macro' + utils.get_timestamp(
    ) + '.csv'
    micro_path = '../../results/karate/deepwalk_micro' + utils.get_timestamp(
    ) + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
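utils.read_data is another external helper. The .p extensions and the commented-out x.toarray() calls suggest pickled scipy sparse features; a plausible sketch, with the pickle layout and the meaning of threshold both assumed, is:

import pickle

import numpy as np


def read_data(x_path, y_path, threshold=0):
    # Hypothetical sketch: unpickle features and labels, dropping rows
    # with fewer than `threshold` non-zero features.
    with open(x_path, 'rb') as f:
        x = pickle.load(f)  # assumed to be a scipy sparse matrix
    with open(y_path, 'rb') as f:
        y = pickle.load(f)
    if threshold > 0:
        keep = np.asarray((x > 0).sum(axis=1)).ravel() >= threshold
        x, y = x[keep, :], y[keep]
    return x, y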
Example 3
def tsne_plot():
    model = TSNE(n_components=2, random_state=0)

    x_path = '../../local_resources/income_dataset/X_thresh10.p'
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    emd_path = '../../local_results/dimension_32_num_10_length_80_context_10.emd'
    outpath = '../../local_results/figures/tsne.pdf'

    X, y = utils.read_data(x_path, y_path, threshold=10)

    target = utils.read_target(y_path)
    emd = pd.read_csv(emd_path, header=None, index_col=0, skiprows=1, sep=",")
    embedding = model.fit_transform(emd)

    # sb.set_context("notebook", font_scale=1.1)
    sns.set_style("ticks")

    print('embedding shape is', embedding.shape)

    df = pd.DataFrame(data=embedding, index=None, columns=['x', 'y'])
    labels = np.array(target.loc[emd.index].mean_income)
    df['label'] = labels
    df.to_csv('../../local_results/tsne.csv', index=None)

    plot = sns.lmplot(x='x',
                      y='y',
                      data=df,
                      fit_reg=False,
                      hue="label",
                      scatter_kws={
                          "marker": "D",
                          "s": 10
                      })

    plot.savefig(outpath)
Example 4
def karate_scenario():
    deepwalk_path = 'local_resources/zachary_karate/size8_walks1_len10.emd'

    y_path = 'local_resources/zachary_karate/y.p'
    x_path = 'local_resources/zachary_karate/X.p'

    target = utils.read_target(y_path)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    names = [['logistic'], ['deepwalk']]

    x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk, normalize(x, axis=0)]
    n_folds = 2
    results = run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('results/karate/deepwalk_macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('results/karate/deepwalk_micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = 'results/karate/deepwalk_macro' + utils.get_timestamp(
    ) + '.csv'
    micro_path = 'results/karate/deepwalk_micro' + utils.get_timestamp(
    ) + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
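utils.merge_results and utils.stats_test are also not shown. From the call sites, results holds a macro-F1 and a micro-F1 table and tests holds the matching p-value tables, so stats_test might be sketched as below; every detail of its internals is an assumption.

from itertools import combinations

import pandas as pd
from scipy.stats import ttest_rel


def stats_test(all_results):
    # Hypothetical sketch: paired t-tests between every pair of detectors,
    # run separately on the macro and micro F1 tables.
    means, pvalue_tables = [], []
    for table in all_results:  # assumed [macro_df, micro_df]; rows=detectors, cols=folds
        fold_cols = [c for c in table.columns if c != 'train']
        pvals = pd.DataFrame(index=table.index, columns=table.index, dtype=float)
        for a, b in combinations(table.index, 2):
            _, p = ttest_rel(table.loc[a, fold_cols], table.loc[b, fold_cols])
            pvals.loc[a, b] = pvals.loc[b, a] = p
        means.append(table[fold_cols].mean(axis=1))
        pvalue_tables.append(pvals)
    return means, pvalue_tables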
Example 5
def karate_deepwalk_grid_scenario():
    """
    Evaluates a grid of embeddings at different sizes, walk lengths and walks
    per vertex for the karate network, to try to understand why the DeepWalk
    performance was so poor.
    :return:
    """
    import os
    y_path = '../../local_resources/karate/y.p'
    x_path = '../../local_resources/karate/X.p'

    target = utils.read_target(y_path)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    folder = '../../local_resources/karate/gridsearch/'
    names = [[elem] for elem in os.listdir(folder)]

    embeddings = []
    for name in names:
        emb = pd.read_csv(folder + name[0],
                          header=None,
                          index_col=0,
                          skiprows=1,
                          sep=" ")
        emb.sort_index(inplace=True)
        embeddings.append(emb.values)

    names.append(['hyperbolic'])
    hyp_path = '../../local_resources/karate/embeddings/Win_20170808-185202.csv'
    hyp_emb = pd.read_csv(hyp_path, index_col=0)
    embeddings.append(hyp_emb.values)

    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers,
                                             n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
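The files read above follow the word2vec text format that gensim writes: a header line with the vertex count and embedding dimension, then one space-separated row per vertex, which is why the reader uses skiprows=1, index_col=0 and sep=" ". A size-2 embedding of the 34-vertex karate graph would start like this (values illustrative):

34 2
0 0.1832 -0.4411
1 0.0975 0.3120
2 -0.2514 0.1187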
Example 6
def scenario_vary_walks_and_context():
    print('reading data')
    x, y = utils.read_data('../../local_resources/income_dataset/X_thresh10.p',
                           '../../local_resources/income_dataset/y_thresh10.p', 0)
    n_vertices = len(y)
    s = datetime.now()
    g = BipartiteGraph(x)
    # print 'building edges'
    # g.build_edge_array()
    all_walks = pd.read_csv('../../local_results/thresh10_walks_length_100_num_walks_20.csv',
                            header=None).values
    target = utils.read_target('../../local_resources/income_dataset/y_thresh10.p')

    num_walks = [6, 8, 10, 12, 14, 16, 18, 20]
    print('running number of walks loop')
    for n_walks in num_walks:
        walks = all_walks[0:n_walks * n_vertices, 0:80]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_{}_length_80_context_10.emd'.format(n_walks)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=10)
        change_index(outpath, target)
        print('number of walks', n_walks, 'embeddings generated in', datetime.now() - s)

    walk_lengths = [40, 50, 60, 70, 80, 90, 100]
    print('running walk length loop')
    for walk_length in walk_lengths:
        walks = all_walks[0:10 * n_vertices, 0:walk_length]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_10_length_{}_context_10.emd'.format(walk_length)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=10)
        change_index(outpath, target)
        print('length', walk_length, 'embeddings generated in', datetime.now() - s)

    context_size = [8, 10, 12, 14, 16, 18, 20]
    print('running context size loop')
    for size in context_size:
        walks = all_walks[0:10 * n_vertices, 0:80]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_10_length_80_context_{}.emd'.format(size)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=size)
        change_index(outpath, target)
        print('context size', size, 'embeddings generated in', datetime.now() - s)
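change_index is called here and in the __main__ block below but never defined in these examples. A sketch consistent with reindex_embeddings (Example 8) and with the comma-separated files that tsne_plot later re-reads would be the following; the word2vec header handling is an assumption.

import pandas as pd


def change_index(emd_path, target):
    # Hypothetical sketch: replace the integer index in the first column
    # of a word2vec-format .emd file with the Twitter IDs in `target`.
    emd = pd.read_csv(emd_path, header=None, index_col=0, skiprows=1, sep=" ")
    emd.sort_index(inplace=True)
    df = pd.DataFrame(data=emd.values, index=target.index)
    df.index.name = None
    df.to_csv(emd_path)  # rewritten comma-separated, matching tsne_plot's sep=","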
Example 7
def nikos_test_scenario():
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print('running for size {}\n'.format(size))
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = pd.read_csv(emd_path, index_col=0)
        x = x.values  # as_matrix() was removed in pandas 1.0
        results = run_all_datasets([x], y, names, regressors, n_folds)
        all_results = pd.concat(results)
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print(results)
Example 8
def reindex_embeddings():
    """
    Changes the first column of embeddings from an index to a Twitter ID.
    :return:
    """
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    sizes = [16, 32, 64, 128]

    for size in sizes:
        print('running embeddings of size', size)
        emd_path = '../../local_results/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        df = pd.DataFrame(data=x, index=target.index)
        df.index.name = None  # 'del df.index.name' raises AttributeError on newer pandas
        df.to_csv(emd_path)
Example 9
def karate_results(embeddings, names, n_reps, train_size):
    deepwalk_path = '../../local_resources/zachary_karate/size8_walks1_len10.emd'

    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'

    target = utils.read_target(y_path)

    x, y = utils.read_data(x_path, y_path, threshold=0)

    # names = [['embedding'], ['logistic']]

    names.append(['logistic'])

    # x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    # X = [normalize(embedding, axis=0), normalize(x, axis=0)]
    X = embeddings + [normalize(x, axis=0)]
    # names = ['embedding']
    # X = embedding

    results = []
    for exp in zip(X, names):
        tmp = run_detectors.run_experiments(exp[0], y, exp[1], classifiers,
                                            n_reps, train_size)
        results.append(tmp)
    all_results = utils.merge_results(results, n_reps)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/tf_macro_pvalues' +
                    utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/tf_micro_pvalues' +
                    utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/tf_macro' + utils.get_timestamp(
    ) + '.csv'
    micro_path = '../../results/karate/tf_micro' + utils.get_timestamp(
    ) + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    return results
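karate_results takes a list of embedding matrices plus a parallel list of name lists, and appends a raw-feature logistic baseline itself. A hypothetical smoke test, with a random matrix standing in for a real embedding (the units of train_size are whatever run_detectors.run_experiments expects, which is not shown here):

import numpy as np

dummy = np.random.rand(34, 4)  # 34 karate vertices, 4-dimensional dummy embedding
results = karate_results(embeddings=[dummy], names=[['dummy_emd']],
                         n_reps=10, train_size=4)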
Example 10
def income_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_64.emd'

    target = utils.read_target(y_path)
    x = utils.read_embedding(emd_path, target)
    y = np.array(target['mean_income'])
    n_folds = 10
    # x, y = utils.read_data(x_path, y_path, threshold=1)
    results = run_all_datasets([x], y, names, regressors, n_folds)
    # all_results = utils.merge_results(results)
    all_results = pd.concat(results)
    all_results.rename(columns={n_folds: 'train'}, inplace=True)
    results, tests = t_tests(all_results)
    print(results)
    path = '../results/income/thresh10_' + utils.get_timestamp() + '.csv'
    results.to_csv(path, index=True)
Example 11
if __name__ == '__main__':
    scenario_build_income_embeddings(emd_dimension=2)
    target = utils.read_target('../../local_resources/income_dataset/y_thresh10.p')
    change_index('../../local_results/thresh10_num_10_length_80_dimension_2.emd', target)
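scenario_build_income_embeddings is not among the examples above. A plausible reconstruction, modelled directly on the pipeline in scenario_vary_walks_and_context (Example 6) and on the file name the __main__ block passes to change_index, is the following; the walk file, slice sizes and window size are assumptions.

def scenario_build_income_embeddings(emd_dimension):
    # Hypothetical sketch: embed the income graph at a given dimension,
    # mirroring scenario_vary_walks_and_context (Example 6).
    x, y = utils.read_data('../../local_resources/income_dataset/X_thresh10.p',
                           '../../local_resources/income_dataset/y_thresh10.p', 0)
    g = BipartiteGraph(x)
    walks = pd.read_csv('../../local_results/thresh10_walks_length_100_num_walks_20.csv',
                        header=None).values[0:10 * len(y), 0:80]
    np.random.shuffle(walks)
    outpath = '../../local_results/thresh10_num_10_length_80_dimension_{}.emd'.format(emd_dimension)
    g.learn_embeddings(walks, size=emd_dimension, outpath=outpath, window_size=10)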