def generate_karate_embedding():
    """Train a 2D hyperbolic (polar) embedding of the karate network.

    Reads the class labels, trains via HE.main, plots both the input (Win) and
    output (Wout) embeddings on the Poincare disc and writes both embedding
    matrices to CSV.

    :return: the path to the Win embedding CSV
    """
    import visualisation
    y_path = '../../local_resources/karate/y.p'
    targets = utils.read_pickle(y_path)
    y = np.array(targets['cat'])
    log_path = '../../local_resources/tf_logs/run4/'
    walk_path = '../../local_resources/karate/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=0.1, initial_learning_rate=1.0, save_path=log_path, epochs=10,
                    concurrent_steps=1)
    # take the timestamp once so every artefact of this run shares the same suffix
    # (previously each filename called utils.get_timestamp() separately and could drift)
    timestamp = utils.get_timestamp()
    path = '../../local_resources/karate/embeddings/tf_Win_polar' + '_' + timestamp + '.csv'
    embedding_in, embedding_out = HE.main(params)
    visualisation.plot_poincare_embedding(embedding_in, y,
                                          '../../results/karate/figs/poincare_polar_Win' + '_' + timestamp + '.pdf')
    visualisation.plot_poincare_embedding(embedding_out, y,
                                          '../../results/karate/figs/poincare_polar_Wout' + '_' + timestamp + '.pdf')
    # np.arange for the row index, consistent with the sibling embedding generators
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv('../../local_resources/karate/embeddings/tf_Wout_polar' + '_' + timestamp + '.csv', sep=',')
    return path
def generate_political_blogs_embedding():
    """Train a 2D hyperbolic embedding of the political blogs network.

    Trains via HCE.main, plots Win/Wout on the Poincare disc, writes both
    embedding matrices to CSV and then runs the downstream classification
    scenario on the Win embedding.

    :return: the path to the Win embedding CSV
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=10.0, initial_learning_rate=1.0, save_path=log_path, epochs=5,
                    concurrent_steps=4)
    # single timestamp shared by every output file of this run
    # (previously each filename called utils.get_timestamp() separately and could drift)
    timestamp = utils.get_timestamp()
    path = '../../local_resources/political_blogs/embeddings/Win' + '_' + timestamp + '.csv'
    embedding_in, embedding_out = HCE.main(params)
    visualisation.plot_poincare_embedding(embedding_in, y,
                                          '../../results/political_blogs/figs/poincare_polar_Win' + '_' + timestamp + '.pdf')
    visualisation.plot_poincare_embedding(embedding_out, y,
                                          '../../results/political_blogs/figs/poincare_polar_Wout' + '_' + timestamp + '.pdf')
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv('../../local_resources/political_blogs/embeddings/Wout' + '_' + timestamp + '.csv', sep=',')
    print('political blogs sample generated in: ', datetime.datetime.now() - s)
    political_blogs_scenario(path)
    return path
def generate_blogcatalog_cartesian_embedding():
    """Train a 128D cartesian hyperbolic embedding of the blogcatalog network.

    Trains via HCE.main, plots Win/Wout on the Poincare disc and writes both
    embedding matrices to CSV.

    :return: the path to the Win embedding CSV
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/blogcatalog/y.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/blogcatalog_cartesian/final_throw1'
    walk_path = '../../local_resources/blogcatalog/p025_q025_d128_walks.csv'
    size = 128  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=10, initial_learning_rate=0.2, save_path=log_path, epochs=5,
                    concurrent_steps=12)
    # single timestamp shared by every output file of this run
    # (previously each filename called utils.get_timestamp() separately and could drift)
    timestamp = utils.get_timestamp()
    path = '../../local_resources/blogcatalog/embeddings/Win_cartesian' + '_' + timestamp + '.csv'
    embedding_in, embedding_out = HCE.main(params)
    visualisation.plot_poincare_embedding(embedding_in, y,
                                          '../../results/blogcatalog/figs/poincare_Win_cartesian' + '_' + timestamp + '.pdf')
    visualisation.plot_poincare_embedding(embedding_out, y,
                                          '../../results/blogcatalog/figs/poincare_Wout_cartesian' + '_' + timestamp + '.pdf')
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv('../../local_resources/blogcatalog/embeddings/Wout_cartesian' + '_' + timestamp + '.csv', sep=',')
    print('blogcatalog cartesian embedding generated in: ', datetime.datetime.now() - s)
    return path
def batch_size_scenario():
    """
    Generate embeddings using different batch sizes for the ~1000 vertex polblogs network,
    then compare them with cross-validated classification and write macro/micro F1
    tables and significance tests to the results folder.

    :return: the path of the Win embedding produced by the last batch size
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/political_blogs/y.p'
    x_path = '../../local_resources/political_blogs/X.p'
    y = utils.read_pickle(y_path)
    log_path = '../../local_resources/tf_logs/polblogs/'
    walk_path = '../../local_resources/political_blogs/walks_n1_l10.csv'
    size = 2  # dimensionality of the embedding
    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
    embeddings = []
    for batch_size in batch_sizes:
        params = Params(walk_path, batch_size=batch_size, embedding_size=size, neg_samples=5, skip_window=5,
                        num_pairs=1500, statistics_interval=10.0, initial_learning_rate=0.1, save_path=log_path,
                        epochs=5, concurrent_steps=4)
        # one timestamp per batch-size run so all of its output files share a suffix
        timestamp = utils.get_timestamp()
        path = '../../local_resources/political_blogs/embeddings/Win_batch_{}_{}.csv'.format(batch_size, timestamp)
        embedding_in, embedding_out = HCE.main(params)
        visualisation.plot_poincare_embedding(embedding_in, y,
                                              '../../results/political_blogs/figs/poincare_polar_Win_batch_{}_{}.pdf'.format(
                                                  batch_size, timestamp))
        visualisation.plot_poincare_embedding(embedding_out, y,
                                              '../../results/political_blogs/figs/poincare_polar_Wout_batch_{}_{}.pdf'.format(
                                                  batch_size, timestamp))
        df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
        df_in.to_csv(path, sep=',')
        df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
        df_out.to_csv('../../local_resources/political_blogs/embeddings/Wout_batch_{}_{}.csv'.format(
            batch_size, timestamp), sep=',')
        print('political blogs embedding generated in: ', datetime.datetime.now() - s)
        embeddings.append(embedding_in)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [[str(batch_size)] for batch_size in batch_sizes]
    n_folds = 10
    results = run_detectors.run_all_datasets(embeddings, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results, n_folds)
    results, tests = utils.stats_test(all_results)
    # BUGFIX: both significance tables were previously written to the same
    # 'batch_size_pvalues' path, so the second write clobbered the first.
    # Ordering presumed to match results (0 = macro, 1 = micro) — confirm in utils.stats_test.
    tests[0].to_csv('../../results/political_blogs/batch_size_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('../../results/political_blogs/batch_size_micro_pvalues' + utils.get_timestamp() + '.csv')
    print('macro', results[0])
    print('micro', results[1])
    macro_path = '../../results/political_blogs/batch_size_macro' + utils.get_timestamp() + '.csv'
    micro_path = '../../results/political_blogs/batch_size_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
    return path
def simulated_tree_scenario(branching_factor, levels):
    """Embed a simulated tree with both deepwalk and the hyperbolic model.

    Builds the tree adjacency matrix, generates walks and a deepwalk embedding,
    trains the hyperbolic embedding via HE.main, plots both embeddings and
    writes the hyperbolic Win matrix to CSV.

    :param branching_factor: number of children per internal tree node
    :param levels: depth of the simulated tree
    :return: the path to the Win embedding CSV
    """
    import visualisation
    folder = '../../local_resources/simulated_trees'
    deepwalk_path = '../../local_resources/simulated_trees/deepwalk_z{}_l{}.emd'.format(branching_factor, levels)
    walk_path = '../../local_resources/simulated_trees/walks_long_z{}_l{}.emd'.format(branching_factor, levels)
    emb_path = create_adj_mat(folder, branching_factor, levels)
    generate_simulated_tree(emb_path, walk_path, deepwalk_path)
    deepwalk_emd = pd.read_csv(deepwalk_path, header=None, index_col=0, skiprows=1, sep=" ")
    s = datetime.datetime.now()
    y = generate_y(branching_factor, levels)
    log_path = '../../local_resources/tf_logs/sim_tree/'
    size = 2  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=0.1, initial_learning_rate=1.0, save_path=log_path, epochs=20,
                    concurrent_steps=4)
    # single timestamp shared by every output file of this run
    # (previously each filename called utils.get_timestamp() separately and could drift)
    timestamp = utils.get_timestamp()
    path = '../../local_resources/simulated_trees/embeddings/Win' + '_' + timestamp + '.csv'
    embedding_in, embedding_out = HE.main(params)
    visualisation.plot_deepwalk_embedding(
        deepwalk_emd.values, y,
        '../../results/simulated_trees/figs/deepwalk_z{}_l{}_{}.pdf'.format(branching_factor, levels, timestamp))
    visualisation.plot_poincare_embedding(
        embedding_in, y,
        '../../results/simulated_trees/figs/hyp_z{}_l{}_{}.pdf'.format(branching_factor, levels, timestamp))
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    return path
def run_embedding(folder, learning_rate, run_scenario=True, module=HE):
    """
    Generate an embeddings for a given graph
    :param folder: the name of the folder and also the graph
    :param learning_rate: initial learning rate passed to the embedding trainer
    :param run_scenario: True if cv results are required
    :param module: An alias for the module containing the specific embedding
    :return: the path to the embedding
    """
    import visualisation
    s = datetime.datetime.now()
    y_path = '../../local_resources/{}/y.p'.format(folder)
    targets = utils.read_pickle(y_path)
    y = np.array(targets['cat'])
    log_path = '../../local_resources/tf_logs/run1/'
    walk_path = '../../local_resources/{}/walks_n1_l10.csv'.format(folder)
    size = 4  # dimensionality of the embedding
    params = Params(walk_path, batch_size=4, embedding_size=size, neg_samples=5, skip_window=5, num_pairs=1500,
                    statistics_interval=10.0, initial_learning_rate=learning_rate, save_path=log_path, epochs=5,
                    concurrent_steps=4)
    # single timestamp shared by every output file of this run
    # (previously each filename called utils.get_timestamp() separately and could drift)
    timestamp = utils.get_timestamp()
    path = '../../local_resources/{0}/embeddings/Win_{1}.csv'.format(folder, timestamp)
    embedding_in, embedding_out = module.main(params)
    visualisation.plot_poincare_embedding(
        embedding_in, y,
        '../../results/all/embedding_figs/{}_Win_{}.pdf'.format(folder, timestamp))
    visualisation.plot_poincare_embedding(
        embedding_out, y,
        '../../results/all/embedding_figs/{}_Wout_{}.pdf'.format(folder, timestamp))
    df_in = pd.DataFrame(data=embedding_in, index=np.arange(embedding_in.shape[0]))
    df_in.to_csv(path, sep=',')
    df_out = pd.DataFrame(data=embedding_out, index=np.arange(embedding_out.shape[0]))
    df_out.to_csv('../../local_resources/{0}/embeddings/Wout_{1}.csv'.format(folder, timestamp), sep=',')
    print('{} embedding generated in: '.format(folder), datetime.datetime.now() - s)
    if run_scenario:
        MLD.run_scenario(folder, path)
    return path