import pandas as pd import numpy as np from tqdm import tqdm import helper pathways, interactome = helper.setup() interactome_e2n = helper.convert_edges_to_node(interactome, 'edge_weight', 'interactome_weight') interactome_en = helper.keep_edge_nodes(interactome_e2n, ['head', 'interactome_weight']) interactome_degrees = pd.read_csv( '../output/features_interactome_no_nearest_01.txt', delimiter='\t') interactome_features = pd.merge(interactome_degrees, interactome_en, left_on='name', right_on='head') interactome_features.drop('head', axis=1, inplace=True) num_folds = 2 create_additional_featuers = False for pathway in tqdm(pathways): pathway_dist_score = pd.read_csv( '../output/features_{}_03.txt'.format(pathway), delimiter='\t',
"TCR", "TGF_beta_Receptor", "TNFalpha", "Wnt"] interactome = pd.read_csv( '../data/pathlinker-signaling-children-reg-weighted.txt', delimiter='\t') return pathways, interactome # # Begin Script # pathways, interactome = setup() interactome_e2n = helper.convert_edges_to_node(interactome, 'edge_weight', 'interactome_weight') # pathlinker_weight_attributes = {} # pagerank_weight_attributes = {} # interactome_weight_attributes = {} for pathway_name in tqdm.tqdm(pathways): # pathway_df = pd.read_csv('../data/pathways/{}-edges.txt'. # format(pathway_name), # delimiter='\t') # pathway_df_n2e = helper.convert_edges_to_node(pathway_df) # node_features_df = '../output/features_{}'.format(pathway_name)
# Load the pathway names and the interactome table. Only `pathways` is used
# by the statements visible in this part of the script.
pathways, interactome = setup()

# For every pathway, attach PageRank and CycLinker values to the pathway's
# existing feature table.
for pathway in tqdm(pathways):
    # Feature table written by an earlier step (the "_02" file); it is
    # joined on its `name` column below.
    pathway_dist_en = pd.read_csv('../output/features_{}_02.txt'.
                                  format(pathway),
                                  delimiter='\t')
    # Tab-separated edge-flux file for this pathway (q = 0.50 per the
    # file name).
    pagerank_df = pd.read_csv('../data/pagerank/{}-q_0.50-edge-fluxes.txt'.
                              format(pathway),
                              delimiter='\t')
    # Tab-separated ranked-edge file for this pathway (k = 110000 per the
    # file name).
    cyclinker_df = pd.read_csv('../data/cyclinker/{}-k_110000-ranked-edges.txt'.
                               format(pathway),
                               delimiter='\t')

    # Convert both edge tables with the shared helper, reading the value from
    # 'edge_flux' / 'KSP index' and passing 'pagerank_value' /
    # 'cyclinker_value' as the third argument.
    # NOTE(review): presumably the third argument is the output column name
    # and each edge becomes its own node -- confirm against
    # helper.convert_edges_to_node.
    pagerank_df_e2n = helper.convert_edges_to_node(
        pagerank_df, 'edge_flux', 'pagerank_value')
    cyclinker_df_e2n = helper.convert_edges_to_node(
        cyclinker_df, 'KSP index', 'cyclinker_value')

    # NOTE(review): presumably keeps only the edge-node rows and the two
    # listed columns -- confirm against helper.keep_edge_nodes.
    pagerank_en = helper.keep_edge_nodes(
        pagerank_df_e2n, ['head', 'pagerank_value'])
    cyclinker_en = helper.keep_edge_nodes(
        cyclinker_df_e2n, ['head', 'cyclinker_value'])

    # Left joins: every row of the feature table is kept, and rows without a
    # matching `head` get NaN in the added columns.
    pathway_dist_ranks = pd.merge(
        pathway_dist_en, pagerank_en,
        left_on='name', right_on='head', how='left')
    # NOTE(review): if the first merge already contributed a `head` column,
    # this second merge has two overlapping `head` columns, which pandas
    # disambiguates with `_x` / `_y` suffixes -- confirm that downstream code
    # expects those names.
    pathway_dist_ranks = pd.merge(
        pathway_dist_ranks, cyclinker_en,
        left_on='name', right_on='head', how='left')
def main():
    """Write degree-based features for the interactome's edge nodes.

    The features are computed once on the whole interactome, not per
    pathway. The interactome table is passed through
    ``helper.convert_edges_to_node`` and loaded into a directed graph.
    Every node whose name matches ``.*_to_.*`` is then annotated with its
    ``degree`` and the max / min / avg "degree head tail" values returned by
    the ``cna`` helpers. One tab-separated row per such node is written to
    ``../output/features_interactome_no_nearest_01.txt``.

    Betweenness, Katz and nearest-k features are not computed here.
    """
    # setup() also returns the pathway list, but it is not needed here
    # because nothing in this function is pathway-specific.
    pathways, interactome = setup()
    label = 'interactome'

    edge_node_df = helper.convert_edges_to_node(
        interactome, weight_col='edge_weight')
    graph = nx.from_pandas_dataframe(
        edge_node_df,
        source='#tail',
        target='head',
        create_using=nx.DiGraph())

    print('calculating degree_attributes')
    degrees = cna.calculate_degree(graph)
    print('finished degree_attributes')
    nx.set_node_attributes(graph, 'degree', degrees)

    # (attribute name, calculator) pairs, in the order they are computed,
    # stored on the graph and written to the output file.
    head_tail_calculators = (
        ('max_degree_head_tail', cna.calculate_max_degree_head_tail),
        ('min_degree_head_tail', cna.calculate_min_degree_head_tail),
        ('avg_degree_head_tail', cna.calculate_avg_degree_head_tail),
    )
    head_tail_values = {attr: {} for attr, _calc in head_tail_calculators}

    # Only nodes whose name matches this pattern are annotated and exported.
    # NOTE(review): presumably these are the nodes that
    # helper.convert_edges_to_node creates for each original edge -- confirm.
    edge_node_pattern = re.compile('.*_to_.*')

    print('iterating over nodes')
    for node in tqdm.tqdm(graph.nodes_iter()):
        if not edge_node_pattern.match(node):
            continue
        for attr, calculate in head_tail_calculators:
            head_tail_values[attr][node] = calculate(graph, node)

    print('setting node attributes')
    for attr, _calc in head_tail_calculators:
        nx.set_node_attributes(graph, attr, head_tail_values[attr])

    out_path = '../output/features_{}_no_nearest_01.txt'.format(label)
    columns = ['name', 'degree']
    columns.extend(attr for attr, _calc in head_tail_calculators)
    # One '%s' slot per column, tab-separated, newline-terminated.
    row_template = '\t'.join(['%s'] * len(columns)) + '\n'

    with open(out_path, 'w') as out_file:
        out_file.write(row_template % tuple(columns))

        print('writing node info')
        for node in tqdm.tqdm(graph.nodes_iter()):
            if not edge_node_pattern.match(node):
                continue
            node_attrs = graph.node[node]
            values = [node]
            values.extend(node_attrs[col] for col in columns[1:])
            out_file.write(row_template % tuple(values))

    print("{} created".format(out_path))
def main():
    """Compute and export degree features for interactome edge nodes.

    Steps, in order:

    1. Load the interactome via ``setup()`` and pass it through
       ``helper.convert_edges_to_node``.
    2. Build a directed graph from the ``#tail`` -> ``head`` columns.
    3. Store ``degree`` on every node. For each node whose name matches
       ``.*_to_.*``, also store the max / min / avg "degree head tail"
       values returned by the ``cna`` helpers.
    4. Write one tab-separated row per matching node to
       ``../output/features_interactome_no_nearest_01.txt``.

    The computation runs on the entire interactome, not on individual
    pathways, so the pathway list returned by ``setup()`` is ignored.
    Betweenness, Katz and nearest-k features are not computed.
    """
    _pathways, interactome_df = setup()
    dataset = 'interactome'
    target_path = '../output/features_{}_no_nearest_01.txt'.format(dataset)

    converted = helper.convert_edges_to_node(
        interactome_df, weight_col='edge_weight')
    digraph = nx.from_pandas_dataframe(
        converted, source='#tail', target='head', create_using=nx.DiGraph())

    print('calculating degree_attributes')
    degree_by_node = cna.calculate_degree(digraph)
    print('finished degree_attributes')
    nx.set_node_attributes(digraph, 'degree', degree_by_node)

    # Matcher that selects the nodes to annotate and export.
    # NOTE(review): presumably these are the nodes that
    # helper.convert_edges_to_node creates for each original edge -- confirm.
    is_edge_node = re.compile('.*_to_.*').match

    highest, lowest, mean = {}, {}, {}
    print('iterating over nodes')
    for name in tqdm.tqdm(digraph.nodes_iter()):
        if is_edge_node(name):
            highest[name] = cna.calculate_max_degree_head_tail(digraph, name)
            lowest[name] = cna.calculate_min_degree_head_tail(digraph, name)
            mean[name] = cna.calculate_avg_degree_head_tail(digraph, name)

    print('setting node attributes')
    computed = (
        ('max_degree_head_tail', highest),
        ('min_degree_head_tail', lowest),
        ('avg_degree_head_tail', mean),
    )
    for key, mapping in computed:
        nx.set_node_attributes(digraph, key, mapping)

    header = (
        'name',
        'degree',
        'max_degree_head_tail',
        'min_degree_head_tail',
        'avg_degree_head_tail',
    )
    # Every field after 'name' is read from the node's attribute dict.
    attribute_fields = header[1:]
    line_format = '\t'.join('%s' for _field in header) + '\n'

    with open(target_path, 'w') as handle:
        handle.write(line_format % header)

        print('writing node info')
        for name in tqdm.tqdm(digraph.nodes_iter()):
            if is_edge_node(name):
                record = digraph.node[name]
                row = (name,) + tuple(
                    record[field] for field in attribute_fields)
                handle.write(line_format % row)

    print("{} created".format(target_path))