def income_different_size_embedding_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print('running embeddings of size', size)
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        results = run_all_datasets([x], y, names, regressors, n_folds)
        # all_results = utils.merge_results(results)
        all_results = pd.concat([x for x in results])
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print(results)
        path = '../results/income/thresh10_' + str(size) + '_' + utils.get_timestamp() + '.csv'
        results.to_csv(path, index=True)

def karate_test_scenario(deepwalk_path):
    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [['deepwalk'], ['logistic']]
    x_deepwalk = pd.read_csv(deepwalk_path, index_col=0)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk.values, normalize(x, axis=0)]
    n_folds = 10
    results = run_detectors.run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/deepwalk_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/deepwalk_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/deepwalk_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/deepwalk_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

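# Illustrative call only. The embedding path is the one referenced elsewhere in this module
# (karate_results / karate_scenario); substitute whichever .emd file you want to evaluate:
# karate_test_scenario('../../local_resources/zachary_karate/size8_walks1_len10.emd')
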
def tsne_plot():
    model = TSNE(n_components=2, random_state=0)
    x_path = '../../local_resources/income_dataset/X_thresh10.p'
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    emd_path = '../../local_results/dimension_32_num_10_length_80_context_10.emd'
    outpath = '../../local_results/figures/tsne.pdf'
    X, y = utils.read_data(x_path, y_path, threshold=10)
    target = utils.read_target(y_path)
    emd = pd.read_csv(emd_path, header=None, index_col=0, skiprows=1, sep=",")
    embedding = model.fit_transform(emd)
    # sb.set_context("notebook", font_scale=1.1)
    sns.set_style("ticks")
    print('embedding shape is', embedding.shape)
    df = pd.DataFrame(data=embedding, index=None, columns=['x', 'y'])
    labels = np.array(target.loc[emd.index].mean_income)
    df['label'] = labels
    df.to_csv('../../local_results/tsne.csv', index=None)
    # keyword arguments keep this compatible with newer seaborn releases
    plot = sns.lmplot(x='x', y='y', data=df, fit_reg=False, hue="label",
                      scatter_kws={"marker": "D", "s": 10})
    plot.savefig(outpath)

def karate_scenario():
    deepwalk_path = 'local_resources/zachary_karate/size8_walks1_len10.emd'
    y_path = 'local_resources/zachary_karate/y.p'
    x_path = 'local_resources/zachary_karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [['deepwalk'], ['logistic']]  # order must match X below: embedding first, raw features second
    x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk, normalize(x, axis=0)]
    n_folds = 2
    results = run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('results/karate/deepwalk_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('results/karate/deepwalk_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = 'results/karate/deepwalk_macro' + utils.get_timestamp() + '.csv'
    micro_path = 'results/karate/deepwalk_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

def karate_deepwalk_grid_scenario():
    """
    Evaluates a grid of embeddings at different sizes, walk lengths and walks per vertex
    for the karate network. Trying to understand why the DeepWalk performance was so poor.
    :return:
    """
    import os
    y_path = '../../local_resources/karate/y.p'
    x_path = '../../local_resources/karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    folder = '../../local_resources/karate/gridsearch/'
    names = [[elem] for elem in os.listdir(folder)]
    embeddings = []
    for name in names:
        emb = pd.read_csv(folder + name[0], header=None, index_col=0, skiprows=1, sep=" ")
        emb.sort_index(inplace=True)
        embeddings.append(emb.values)
    names.append(['hyperbolic'])
    hyp_path = '../../local_resources/karate/embeddings/Win_20170808-185202.csv'
    hyp_emb = pd.read_csv(hyp_path, index_col=0)
    embeddings.append(hyp_emb.values)
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # write macro and micro p-values to distinct files so the second write does not clobber the first
    tests[0].to_csv('../../results/karate/macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)

def scenario_vary_walks_and_context():
    print('reading data')
    x, y = utils.read_data('../../local_resources/income_dataset/X_thresh10.p',
                           '../../local_resources/income_dataset/y_thresh10.p', 0)
    n_vertices = len(y)
    s = datetime.now()
    g = BipartiteGraph(x)
    # print 'building edges'
    # g.build_edge_array()
    all_walks = pd.read_csv('../../local_results/thresh10_walks_length_100_num_walks_20.csv',
                            header=None).values
    target = utils.read_target('../../local_resources/income_dataset/y_thresh10.p')
    num_walks = [6, 8, 10, 12, 14, 16, 18, 20]
    print('running number of walks loop')
    for n_walks in num_walks:
        walks = all_walks[0:n_walks * n_vertices, 0:80]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_{}_length_10_context_10.emd'.format(n_walks)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=10)
        change_index(outpath, target)
        print('number of walks', n_walks, 'embeddings generated in', datetime.now() - s, 's')
    walk_lengths = [40, 50, 60, 70, 80, 90, 100]
    print('running walk length loop')
    for walk_length in walk_lengths:
        walks = all_walks[0:10 * n_vertices, 0:walk_length]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_10_length_{}_context_10.emd'.format(walk_length)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=10)
        change_index(outpath, target)
        print('length', walk_length, 'embeddings generated in', datetime.now() - s, 's')
    context_size = [8, 10, 12, 14, 16, 18, 20]
    print('running context size loop')
    for size in context_size:
        walks = all_walks[0:10 * n_vertices, 0:80]
        print('walk shape is', walks.shape)
        outpath = '../../local_results/dimension_32_num_10_length_80_context_{}.emd'.format(size)
        np.random.shuffle(walks)
        g.learn_embeddings(walks, size=32, outpath=outpath, window_size=size)
        change_index(outpath, target)
        print('context size', size, 'embeddings generated in', datetime.now() - s, 's')

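# `change_index` (used above and in __main__) is defined elsewhere in the project. The sketch
# below only illustrates the assumed behaviour -- rewriting the integer vertex index in a
# gensim-style .emd file to the Twitter IDs held in target.index -- and is not the real helper;
# the name and the row-order assumption are illustrative.
def change_index_sketch(emd_path, target):
    # gensim word2vec format: first line is "n_vertices dim", rows are space separated
    emd = pd.read_csv(emd_path, header=None, index_col=0, skiprows=1, sep=" ")
    emd.sort_index(inplace=True)
    # assumption: row i of the embedding corresponds to row i of target
    emd.index = target.index
    emd.to_csv(emd_path)
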
def nikos_test_scenario():
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print('running for size {} \n'.format(size))
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = pd.read_csv(emd_path, index_col=0)
        x = x.values  # as_matrix() has been removed from pandas; .values is equivalent
        results = run_all_datasets([x], y, names, regressors, n_folds)
        all_results = pd.concat([x for x in results])
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print(results)

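# `t_tests` (used in the income scenarios) is defined elsewhere in the project. The sketch below
# only illustrates the assumed idea -- summarise per-fold scores and run paired t-tests between
# detectors -- and the name, signature and output format here are illustrative, not the real helper.
def t_tests_sketch(all_results):
    from scipy.stats import ttest_rel
    fold_cols = [c for c in all_results.columns if c != 'train']
    summary = all_results[fold_cols].mean(axis=1).to_frame('mean_score')
    pvals = pd.DataFrame(index=all_results.index, columns=all_results.index, dtype=float)
    for a in all_results.index:
        for b in all_results.index:
            # paired test over the cross-validation fold scores of detectors a and b
            pvals.loc[a, b] = ttest_rel(all_results.loc[a, fold_cols],
                                        all_results.loc[b, fold_cols]).pvalue
    return summary, pvals
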
def reindex_embeddings():
    """
    Changes the first column of embeddings from an index to a Twitter ID.
    :return:
    """
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print('running embeddings of size', size)
        emd_path = '../../local_results/income_dataset/thresh10_{0}.emd'.format(size)
        x = utils.read_embedding(emd_path, target)
        df = pd.DataFrame(data=x, index=target.index)
        df.index.name = None  # strip the index name so it is not written as a header cell
        df.to_csv(emd_path)

def karate_results(embeddings, names, n_reps, train_size):
    deepwalk_path = '../../local_resources/zachary_karate/size8_walks1_len10.emd'
    y_path = '../../local_resources/zachary_karate/y.p'
    x_path = '../../local_resources/zachary_karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    # names = [['embedding'], ['logistic']]
    names.append(['logistic'])  # raw-feature baseline
    # x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    # X = [normalize(embedding, axis=0), normalize(x, axis=0)]
    X = embeddings + [normalize(x, axis=0)]
    # names = ['embedding']
    # X = embedding
    results = []
    for exp in zip(X, names):
        tmp = run_detectors.run_experiments(exp[0], y, exp[1], classifiers, n_reps, train_size)
        results.append(tmp)
    all_results = utils.merge_results(results, n_reps)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('../../results/karate/tf_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/karate/tf_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/karate/tf_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/karate/tf_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    return results

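# Illustrative call only -- the embedding file is the one referenced at the top of karate_results,
# and the n_reps / train_size values are arbitrary examples:
# emb = pd.read_csv('../../local_resources/zachary_karate/size8_walks1_len10.emd',
#                   header=None, index_col=0, skiprows=1, sep=" ").sort_index().values
# karate_results([emb], [['deepwalk']], n_reps=10, train_size=4)
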
def income_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_64.emd'
    target = utils.read_target(y_path)
    x = utils.read_embedding(emd_path, target)
    y = np.array(target['mean_income'])
    n_folds = 10
    # x, y = utils.read_data(x_path, y_path, threshold=1)
    results = run_all_datasets([x], y, names, regressors, n_folds)
    # all_results = utils.merge_results(results)
    all_results = pd.concat([x for x in results])
    all_results.rename(columns={n_folds: 'train'}, inplace=True)
    results, tests = t_tests(all_results)
    print(results)
    path = '../results/income/thresh10_' + utils.get_timestamp() + '.csv'
    results.to_csv(path, index=True)

if __name__ == '__main__':
    scenario_build_income_embeddings(emd_dimension=2)
    target = utils.read_target('../../local_resources/income_dataset/y_thresh10.p')
    change_index('../../local_results/thresh10_num_10_length_80_dimension_2.emd', target)