def getRecTFs(rec_tfs_file, costs=False):
    """
    Read the receptors and TFs listed in *rec_tfs_file*.

    The UniProt accession is expected in the first column and the node type
    ('receptor' or 'tf') in the second. When *costs* is exactly ``True``,
    columns 3 and 4 are read as well and parsed as the cost and z-score.

    *returns*: ``(receptors, tfs)`` -- two sets of accessions -- or, when
        *costs* is ``True``, ``(receptors, tfs, costs, zscores)`` where the
        last two are dicts mapping every accession in the file to a float.
    """
    # only the literal True switches on the 4-column mode
    if costs is not True:
        rows = utils.readColumns(rec_tfs_file, 1, 2)
        receptors = {acc for acc, node_type in rows if node_type == 'receptor'}
        tfs = {acc for acc, node_type in rows if node_type == 'tf'}
        return receptors, tfs

    rows = utils.readColumns(rec_tfs_file, 1, 2, 3, 4)
    receptors = {
        acc for acc, node_type, cost, zscore in rows if node_type == 'receptor'
    }
    tfs = {acc for acc, node_type, cost, zscore in rows if node_type == 'tf'}
    # costs / z-scores are kept for every row, whatever its node type
    node_costs = {acc: float(cost) for acc, node_type, cost, zscore in rows}
    node_zscores = {
        acc: float(zscore) for acc, node_type, cost, zscore in rows
    }
    return receptors, tfs, node_costs, node_zscores
def main(args):
    """
    Build a graph from an edge list and post it to GraphSpace.

    Parses *args* with parseArgs, reads the edges (and, when given, the node
    label mapping, the graph attributes and the weighted network), builds the
    node/edge popups, constructs the graph and posts it.
    """
    opts, args = parseArgs(args)

    # set of edges from another source such as a pathway
    prededges = set(utils.readColumns(opts.edges, 1, 2))

    node_labels = {}
    if opts.mapping_file is not None:
        node_labels = utils.readDict(opts.mapping_file, 1, 2)

    # attributes of nodes and edges from the graph_attr file
    graph_attr, attr_desc = {}, {}
    if opts.graph_attr:
        graph_attr, attr_desc = readGraphAttr(opts.graph_attr)

    if opts.net is not None:
        # the edge weight from the network is shown in the edge popup
        edge_weights = {
            (u, v): float(w)
            for u, v, w in utils.readColumns(opts.net, 1, 2, 3)
        }
        for edge in prededges:
            edge_desc = attr_desc.setdefault(edge, {})
            edge_desc["edge weight"] = edge_weights[edge]
        # optionally set the width of the edges by the network weight
        if opts.set_edge_width:
            graph_attr = set_edge_width(prededges, edge_weights, graph_attr,
                                        a=1, b=12)

    # TODO build the popups here. That way the popup building logic can be
    # separated from the GSGraph building logic
    popups = {}
    prednodes = {node for edge in prededges for node in edge}
    for node in prednodes:
        popups[node] = buildNodePopup(node, attr_val=attr_desc)
    for u, v in prededges:
        popups[(u, v)] = buildEdgePopup(u, v, node_labels=node_labels,
                                        attr_val=attr_desc)

    # Now post to graphspace!
    G = constructGraph(prededges, node_labels=node_labels,
                       graph_attr=graph_attr, popups=popups)

    # TODO add an option to build the 'graph information' tab legend/info
    # build the 'Graph Information' metadata
    desc = buildGraphDescription(opts.edges, opts.net)
    metadata = {
        'description': desc,
        'tags': opts.tag if opts.tag else [],
        'title': '',
    }
    G.set_data(metadata)
    G.set_name(opts.graph_name)

    post_graph_to_graphspace(G, opts.username, opts.password,
                             opts.graph_name,
                             apply_layout=opts.apply_layout,
                             layout_name=opts.layout_name,
                             group=opts.group,
                             make_public=opts.make_public)
def get_ctd_support(chemical, prednodes, ctd_support_file):
    """
    Count the CTD phosphorylation interactions supporting each network node.

    Columns 3, 4, 10 and 11 of *ctd_support_file* are read as CAS number,
    gene, interaction action and pubmed ids. Rows whose CAS number matches
    ``chemIDtoCAS[chemical]`` and whose action contains 'phosphorylation'
    are counted per gene; every node of *prednodes* whose gene (looked up in
    ``uniprot_to_gene``) has a count is given that count.

    *returns*: dict mapping node -> number of matching CTD interactions
    """
    print("Getting CTD support counts from %s" % (ctd_support_file))

    num_intxs_per_gene = {}
    ctd_rows = utils.readColumns(ctd_support_file, 3, 4, 10, 11)
    for cas, gene, interaction_action, pubmedids in ctd_rows:
        if chemIDtoCAS[chemical] == cas \
                and 'phosphorylation' in interaction_action:
            num_intxs_per_gene[gene] = num_intxs_per_gene.get(gene, 0) + 1

    num_intxs_per_node = {}
    for node in prednodes:
        gene = uniprot_to_gene[node]
        if gene in num_intxs_per_gene:
            num_intxs_per_node[node] = num_intxs_per_gene[gene]

    print(
        "\nOf the %d prots with (de)phosphorylation evidence in CTD, %d of the %d net nodes overlap"
        % (len(num_intxs_per_gene), len(num_intxs_per_node), len(prednodes)))

    # TODO optionally write these counts to a "<prefix>-ctd-support.txt" table
    return num_intxs_per_node
def getPvals(resultsprefix, scope, sig_cutoff_type='FDR', k=200):
    """
    Retrieve the corrected p-value of each chemical for a given *k*.

    The values are read from
    ``<resultsprefix>/stats/stat-sig-<scope>/bfcorr_pval_qval.txt``. The
    column is located by name in the header line, which is expected to look
    like: ``#chemical  k10-BFcorr-pval  k10-qval  k25-BFcorr-pval ...``

    *resultsprefix*: directory prefix of the results
    *scope*: name of the scope, used to build the path of the p-values file
    *sig_cutoff_type*: 'BF' to use the ``k<k>-BFcorr-pval`` column or 'FDR'
        to use the ``k<k>-qval`` column. Any other value prints a message
        and exits with status 1.
    *k*: the k whose column should be read (default 200)

    *returns*: dict mapping chemical -> p-value (float)
    *raises*: ValueError if the requested column is not in the header
    """
    print("Getting p-values for scope '%s'" % (scope))
    print("Using k %d" % (k))
    pvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (resultsprefix,
                                                               scope)
    with open(pvals_file, 'r') as file_handle:
        header = file_handle.readline().rstrip().split('\t')

    # TODO add an option to get the uncorrected pvals
    if sig_cutoff_type == 'BF':
        col_name = "k%d-BFcorr-pval" % k
    elif sig_cutoff_type == 'FDR':
        col_name = "k%d-qval" % k
    else:
        print(
            "please enter a valid value for --sig-cutoff-type. Valid options are: 'BF', 'FDR'"
        )
        sys.exit(1)

    if col_name not in header:
        # same exception type list.index would raise, with a usable message
        raise ValueError("column '%s' not found in the header of %s" %
                         (col_name, pvals_file))
    # readColumns takes 1-based column numbers
    pval_col = header.index(col_name) + 1

    chemical_pvals = {
        chemical: float(k_pval)
        for chemical, k_pval in utils.readColumns(pvals_file, 1, pval_col)
    }
    return chemical_pvals
def setup_sparse_network(network_file, node2idx_file=None, forced=False):
    """
    Load a network as a scipy sparse matrix, building and caching it if needed.

    The cached matrix is stored next to *network_file* with an ``.npz``
    suffix (``net.txt`` -> ``net.npz``; any other name gets ``.npz``
    appended). The node order is stored in *node2idx_file*.

    *network_file*: tab-delimited file with ``u  v  weight`` on each line
    *node2idx_file*: file with ``node  index`` on each line. Defaults to
        ``<sparse file>-node-ids.txt``
    *forced*: if anything other than ``False``, rebuild the sparse matrix
        even when the cached files exist

    *returns*: ``(W, prots)`` -- the CSR matrix and the list of node names
        ordered by their row/column index. Exits with status 1 if neither
        the cached files nor *network_file* can be used.
    """
    # Only swap a trailing '.txt'. str.replace would also rewrite '.txt'
    # elsewhere in the path, and for names without '.txt' it would leave the
    # cache path equal to the input path, so the next run would try to
    # load_npz the text file (save_npz appends '.npz' to the name on its own).
    if network_file.endswith('.txt'):
        sparse_net_file = network_file[:-len('.txt')] + '.npz'
    else:
        sparse_net_file = network_file + '.npz'
    if node2idx_file is None:
        node2idx_file = sparse_net_file + "-node-ids.txt"

    if forced is False and (os.path.isfile(sparse_net_file)
                            and os.path.isfile(node2idx_file)):
        print("Reading network from %s" % (sparse_net_file))
        W = sparse.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) // 2))
        print("Reading node names from %s" % (node2idx_file))
        node2idx = {
            n: int(n2) for n, n2 in utils.readColumns(node2idx_file, 1, 2)
        }
        idx2node = {n2: n for n, n2 in node2idx.items()}
        prots = [idx2node[n] for n in sorted(idx2node)]
    elif os.path.isfile(network_file):
        print("Reading network from %s" % (network_file))
        u, v, w = [], [], []
        with open(network_file, 'r') as f:
            # NOTE(review): the total is a fixed estimate of the line count
            # and only affects the progress bar
            for line in tqdm(f, total=120000000):
                line = line.rstrip()
                if not line:
                    # tolerate blank (e.g. trailing) lines
                    continue
                fields = line.split('\t')
                u.append(fields[0])
                v.append(fields[1])
                w.append(float(fields[2]))
        print("\tconverting uniprot ids to node indexes / ids")
        # first convert the uniprot ids to node indexes / ids
        prots = sorted(set(u) | set(v))
        node2idx = {prot: i for i, prot in enumerate(prots)}
        i = [node2idx[n] for n in u]
        j = [node2idx[n] for n in v]
        print("\tcreating sparse matrix")
        W = sparse.coo_matrix((w, (i, j)),
                              shape=(len(prots), len(prots))).tocsr()
        # make sure it is symmetric
        # NOTE(review): W + W.T doubles the weight of any edge that is
        # listed in both directions in a partially symmetric file -- confirm
        if (W.T != W).nnz != 0:
            print("### Matrix not symmetric!")
            W = W + W.T
            print("### Matrix converted to symmetric.")
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sparse.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join(
                ["%s\t%d\n" % (prot, i) for i, prot in enumerate(prots)]))
    else:
        print("Network %s not found. Quitting" % (network_file))
        sys.exit(1)

    return W, prots
def getProteins(paths='',
                ranked_edges='',
                max_k=200,
                max_prots=None,
                ties=False):
    """
    Get the proteins of a network from the paths or the ranked edges file.

    *paths*: paths file. When given, it is used instead of *ranked_edges*
    *ranked_edges*: ranked edges file with ``p1  p2  k`` in columns 1-3
    *max_k*: only paths/edges with a k up to this value are used
    *max_prots*: ranked edges only. Stop reading once more than this many
        proteins have been collected. ``None`` (default) means no limit.
    *ties*: paths only. Passed on to getPaths so that paths tied with the
        path at *max_k* are kept as well.

    *returns*: the set of proteins, or ``(proteins, num_paths)`` when *ties*
        is set, where num_paths is the number of paths kept (0 when the
        ranked edges file is used)
    """
    proteins = set()
    # keep track of the k or number of paths if ties is used
    num_paths = 0

    if paths:
        top_paths = getPaths(paths, max_k, ties)
        num_paths = len(top_paths)
        for path in top_paths:
            # add each of the proteins in the path to the set of proteins
            proteins.update(path.split('|'))
    else:
        # use the ranked edges file
        for p1, p2, k in utils.readColumns(ranked_edges, 1, 2, 3):
            if int(k) > max_k:
                break
            # comparing an int with None raises TypeError in Python 3,
            # so only apply the limit when one was given
            if max_prots is not None and len(proteins) > max_prots:
                break
            proteins.add(p1)
            proteins.add(p2)

    # TODO add an option to also return family nodes, i.e. drop the
    # source/target family nodes and split the remaining comma-joined
    # family nodes into individual proteins

    if ties:
        # return the total number of paths from keeping the ties
        return proteins, num_paths
    return proteins
def readGraphAttr(graph_attr_file):
    """
    Read the style attributes of nodes and edges from *graph_attr_file*.

    The first three tab-delimited columns are read:
      1: style name
      2: style value
      3: nodes/edges (joined by '|') to apply the style to. An edge is
         written as 'u-v', so '-' cannot appear in a node name.
    A fourth column (intended as a description for a popup or the legend)
    is not read yet.

    Examples:
      node attribute:  color       blue     p1|p2|p3     -
      edge attribute:  edge_style  dotted   p1-p2|p2-p3  -
      compound node:   parent      parent1  p1|p2|p3     -
        (p1, p2 and p3 get the parent attribute 'parent1'; style 'parent1'
         itself with e.g.:  color  blue  parent1  -)

    *returns*: ``(graph_attr, attr_desc)`` where graph_attr maps each node
        (str) or edge (2-tuple) to a {style name: style value} dict, and
        attr_desc is currently always an empty dict
    """
    graph_attr = {}
    # meant to hold descriptions used when building popups; not filled yet
    attr_desc = {}

    print(
        "Adding graph attributes from '%s' (must have 3 tab-delimited columns)"
        % (graph_attr_file))
    # TODO the last column (here always '-') can be given as a description
    rows = utils.readColumns(graph_attr_file, 1, 2, 3)
    print("\tread %d lines" % (len(rows)))

    # walk the file bottom-up so that the lines at the top of the file
    # overwrite the lines at the bottom
    for style, style_attr, items in reversed(rows):
        for item in items.split('|'):
            parts = item.split('-')
            if len(parts) > 2:
                print(
                    "Error: '-' found in node name for edge: %s. '-' is used to split an edge."
                    % (item))
                sys.exit(1)
            # two parts means this is an edge; otherwise it is a node name
            key = tuple(parts) if len(parts) == 2 else item
            graph_attr.setdefault(key, {})[style] = style_attr

    return graph_attr, attr_desc
def get_sig_chemicals(chemical_pvals, pval_col=5, sig_cutoff=0.05):
    """
    Get the chemicals whose p-value is below *sig_cutoff*.

    *chemical_pvals*: the file output by compute_stat_sig.py, with the
        chemical in column 1
    *pval_col*: column of the p-value to compare against the cutoff
    *sig_cutoff*: a chemical is significant if its p-value is strictly
        less than this value

    *returns*: list of significant chemicals, in file order
    """
    print(
        "Getting the significant chemicals with a pval cutoff of %s from %s" %
        (str(sig_cutoff), chemical_pvals))
    rows = utils.readColumns(chemical_pvals, 1, pval_col)
    sig_chemicals = [
        chemical for chemical, xk_pval in rows if float(xk_pval) < sig_cutoff
    ]
    print("%d chemicals are significant" % (len(sig_chemicals)))
    return sig_chemicals
def readNetwork(paths=None, ranked_edges=None, k_limit=200, no_k=False):
    """
    Read the PathLinker paths or ranked_edges output.

    *paths*: PathLinker paths file (``k  score  path`` with the nodes of the
        path joined by '|'). Each edge gets the k of the first path it
        appears in. Reading stops at the first path with k > *k_limit*.
    *ranked_edges*: PathLinker ranked edges file (``u  v  k``). When both
        *paths* and *ranked_edges* are given, the ranked edges win.
    *k_limit*: only edges with a k up to this value are kept
    *no_k*: if anything other than ``False``, *ranked_edges* is treated as
        a plain edge list (e.g. a pathway) without a k column and *paths*
        is ignored

    *returns*: dict mapping (u, v) -> k (``None`` for every edge when
        *no_k* is used). An empty dict is returned when no usable input
        file is given.
    """
    # start from an empty result so that calling without a usable input
    # does not fail with an UnboundLocalError at the return below
    prededges = {}

    if no_k is False:
        if paths is not None:
            # predicted paths from pathlinker
            for k, path_score, path in utils.readColumns(paths, 1, 2, 3):
                k = int(k)
                if k > k_limit:
                    break
                nodes = path.split('|')
                for edge in zip(nodes, nodes[1:]):
                    # keep the k of the first path the edge was seen in
                    prededges.setdefault(edge, k)

        if ranked_edges is not None:
            # predicted edges from pathlinker
            lines = utils.readColumns(ranked_edges, 1, 2, 3)
            prededges = {(u, v): int(k)
                         for u, v, k in lines
                         if int(k) <= k_limit}
    elif ranked_edges:
        # set of edges from another source such as a pathway.
        # keep the edges in a dictionary to work with the rest of the code
        lines = utils.readColumns(ranked_edges, 1, 2)
        prededges = {(u, v): None for u, v in lines}

    return prededges
def get_already_run_terms(alg_runners, **kwargs):
    """
    For each algorithm, find the taxon - term pairs that already have results.

    The results file of each runner is
    ``<out_dir>/loso<params_str><postfix>.txt``.
      - If it exists and ``kwargs['forcealg']`` is set, it is removed along
        with its '-stats.txt' file (and its '-ranks.txt' file for '_bounds'
        algorithms), since results are appended to it for each taxon.
      - Otherwise, if ``kwargs['write_prec_rec']`` is set with a single
        ``kwargs['goterm']``, nothing is skipped (only the full prec_rec
        file will be written).
      - Otherwise, if it exists, the taxon (column 1) - term (column 2)
        pairs in it are collected so they can be skipped.

    *returns*: defaultdict mapping alg name -> {taxon: set of terms}
    """
    alg_taxon_terms_to_skip = defaultdict(dict)

    for run_obj in alg_runners:
        alg = run_obj.name
        # define the output file path to see if it already exists
        exp_type = "loso"
        out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                      run_obj.params_str,
                                      kwargs.get("postfix", ""))

        if os.path.isfile(out_file) and kwargs['forcealg']:
            print(
                "Removing %s as results will be appended to it for each taxon"
                % (out_file))
            os.remove(out_file)
            # the ranks file is only written for sinksource_bounds
            companions = (('-ranks.txt', '_bounds' in alg),
                          ('-stats.txt', True))
            for suffix, applies in companions:
                extra_file = out_file.replace('.txt', suffix)
                if applies and os.path.isfile(extra_file):
                    print("\tAlso removing %s" % (extra_file))
                    os.remove(extra_file)
            continue

        # with --write-prec-rec and a single term, only the full prec_rec
        # file will be written, so there is nothing to skip
        if kwargs['write_prec_rec'] and len(kwargs['goterm']) == 1:
            continue

        if not os.path.isfile(out_file):
            continue

        print("WARNING: %s results file already exists. Appending to it" %
              (out_file))
        # check which results already exist and append to the rest
        print("Reading results from %s " % (out_file))
        taxon_terms_completed = utils.readColumns(out_file, 1, 2)
        finished = {}
        for taxon, term in taxon_terms_completed:
            finished.setdefault(taxon, set()).add(term)
        alg_taxon_terms_to_skip[alg] = finished
        print("\t%d taxon - term pairs already finished" %
              (len(taxon_terms_completed)))

    return alg_taxon_terms_to_skip
def getPaths(paths_file, max_k=200, ties=False, scores=False):
    """
    Read the top *max_k* paths from *paths_file* (``k  score  path``).

    *ties*: keep reading past *max_k* while the path score (rounded to 6
        decimal places) equals the score of the previously kept path
    *scores*: return a dict of path -> score string ("%0.6f") instead of a
        list of paths

    *returns*: list of path strings, or a {path: score} dict if *scores*
    """
    paths = {} if scores else []
    last_score = None

    for k, score, path in utils.readColumns(paths_file, 1, 2, 3):
        # use 6 decimal places
        score = "%0.6f" % (float(score))
        tied_with_last = ties and last_score == score
        # stop at the first path beyond max_k that is not tied with the
        # previously kept path
        if int(k) > max_k and not tied_with_last:
            break

        if scores:
            paths[path] = score
        else:
            paths.append(path)
        last_score = score

    return paths
def getFamilyNodes(version, interactome_file=None):
    """
    Get the set of family nodes (comma-joined node names) of the interactome.

    ``inputs/<version>/family-nodes.txt`` is used as a cache: if it exists
    it is simply read. Otherwise the first two columns of *interactome_file*
    are scanned for family nodes, which are then written to that file.

    *returns*: the set of family nodes in the interactome
    """
    family_nodes_file = "inputs/%s/family-nodes.txt" % (version)

    # reuse the previously written file when there is one
    if os.path.isfile(family_nodes_file):
        return utils.readItemSet(family_nodes_file)

    print(
        "%s does not exist. Getting the family nodes from the interactome" %
        (family_nodes_file))
    print("Reading the interactome from %s" % (interactome_file))
    family_nodes = set()
    for U, V in utils.readColumns(interactome_file, 1, 2):
        for N in (U, V):
            # a family node is several accessions joined by ','
            if ',' in N:
                family_nodes.add(N)

    print("Writing family nodes to %s" % (family_nodes_file))
    with open(family_nodes_file, 'w') as out:
        out.write('\n'.join(family_nodes))

    return family_nodes
def splitRecTFsFamilyNodes(chemicals, version, interactome_file):
    """
    Split the receptor/TF family nodes of the interactome into the nodes
    listed in the rec-tfs files of the given chemicals.

    Steps:
      1. Read the ``u  v  weight`` edges of *interactome_file* and collect
         the family nodes (node names containing ',').
      2. For every chemical, read ``inputs/<version>/rec-tfs/<chemical>-rec-tfs.txt``
         and map each family node to the receptors/TFs whose name occurs
         in the family node's name. A few hard-coded families
         (``map_family_to_prot``) override this mapping.
      3. Replace every edge touching a mapped family node by edges to the
         mapped nodes; the weight of a new edge is the max over all the
         original edges that produced it.
      4. Write the new interactome to ``inputs/<version>/<version>-interactome.txt``
         and the family -> node mapping (with gene names) to
         ``inputs/<version>/family-split-rec-tfs.txt``.
      5. Call addRecTFsFamilyNodes for the hard-coded ToxCast family nodes.

    *chemicals*: iterable of chemical IDs used to build the rec-tfs file names
    *version*: interactome version, used to build the input/output paths
    *interactome_file*: the interactome to split

    Nothing is returned; the results are written to the files above.
    """
    # leave some nodes as family nodes as that's how they are in the toxcast data
    map_family_to_prot = {
        # FOS,JUN,FOSL1,FOSL2,JUNB,JUND,FOSB: FOS,JUN
        "P01100,P05412,P15407,P15408,P17275,P17535,P53539": ["P01100,P05412"],
        # FOS,JUN,SP1: FOS,JUN
        "P01100,P05412,P08047": ["P01100,P05412"],
        # FOS,JUN: FOS,JUN
        "P01100,P05412": ["P01100,P05412"],
        # TCF7,TCF7L1,TCF7L2,LEF1: TCF7,TCF7L1,TCF7L2,LEF1
        "P36402,Q9HCS4,Q9NQB0,Q9UJU2": ["P36402,Q9HCS4,Q9NQB0,Q9UJU2"],
        # FOXO3,FOXO4,FOXO1: FOXO3,FOXO4,FOXO1
        "O43524,P98177,Q12778": ["O43524,P98177,Q12778"],
    }

    # '%%s' leaves a '%s' placeholder that is filled with the chemical below
    rec_tfs_file = "inputs/%s/rec-tfs/%%s-rec-tfs.txt" % (version)
    interactomes_dir = "inputs/%s" % (version)
    t_utils.checkDir(interactomes_dir)
    new_interactome_file = "%s/%s-interactome.txt" % (interactomes_dir,
                                                      version)
    # get the set of family nodes from the interactome
    print("Reading the interactome from %s" % (interactome_file))
    lines = utils.readColumns(interactome_file, 1, 2, 3)
    family_nodes = set(
        [N for U, V, w in lines for N in (U, V) if len(N.split(',')) > 1])

    print(
        "Splitting the source/target family nodes of all chemicals in the interactome and writing to %s"
        % (new_interactome_file))
    # set of family nodes to split from all chemicals
    family_to_split = {}
    for chemical in tqdm(chemicals):
        rec, tfs = t_utils.getRecTFs(rec_tfs_file % (chemical))
        for N in family_nodes:
            for n in rec.union(tfs):
                # NOTE(review): this is a substring test on the comma-joined
                # family name, not membership in its split list of
                # accessions -- confirm this is intended
                if n in N:
                    if N not in family_to_split:
                        family_to_split[N] = set()
                    family_to_split[N].add(n)

    # leave some tfs as family nodes because that's how they're listed in toxcast
    # (this overwrites whatever was collected above for those families)
    family_to_split.update(map_family_to_prot)

    # NOTE(review): split_rec and split_tfs are filled below but not read
    # again in this function
    split_rec = set()
    split_tfs = set()
    # (u, v, weight-string) rows of the interactome that will be written
    new_interactome = []
    all_new_edges = set()
    # it's a bit ad hoc because the weight of the family edge is the max of the individual edges,
    # and now we're setting the edge weight of the split edges to be the max of the individual edges and the family edge
    new_edge_weights = {}
    #new_edge_ev = {}
    # there could be multiple family edges contributing to a single edge
    for U, V, w in lines:
        new_edges = set()
        # split up the rec/tf family nodes
        if U in family_to_split and V in family_to_split:
            split_rec.add(U)
            split_tfs.add(V)
            for u in family_to_split[U]:
                for v in family_to_split[V]:
                    new_edges.add((u, v))
        elif U in family_to_split:
            split_rec.add(U)
            for u in family_to_split[U]:
                new_edges.add((u, V))
        elif V in family_to_split:
            split_tfs.add(V)
            for v in family_to_split[V]:
                new_edges.add((U, v))
        # otherwise leave the edge as it is
        else:
            new_interactome.append((U, V, w))
            continue

        all_new_edges.update(new_edges)
        # collect every weight that contributes to each split edge
        for (u, v) in new_edges:
            if (u, v) not in new_edge_weights:
                new_edge_weights[(u, v)] = set()
            new_edge_weights[(u, v)].add(float(w))
            # for now, don't write the evidence to each of the new networks to save on space
            # the evidence is present in the original interactome and the evidence file
            #if (u,v) not in new_edge_ev:
            #    new_edge_ev[(u,v)] = set()
            #new_edge_ev[(u,v)].update(set(ev.split('|')))

    # the split edges are appended after the untouched edges; iterating a
    # set means their order in the output file is not fixed
    for u, v in all_new_edges:
        w = max(new_edge_weights[(u, v)])
        #ev = '|'.join(new_edge_ev[(u,v)])
        new_interactome.append((u, v, "%0.6f" % w))

    # now write the new interactome
    print("Writing the new interactome with rec/tf family nodes split to %s" %
          (new_interactome_file))
    with open(new_interactome_file, 'w') as out:
        out.write('\n'.join(['\t'.join(line)
                             for line in new_interactome]) + '\n')

    # also write the family nodes that were split
    mapping = getUniprotToGeneMapping(version)
    # also write the mapping from the rec/tf family node to the proteins it came from
    # columns: family node, split nodes ('|'-joined), family gene name,
    # gene names of the split nodes ('|'-joined)
    out_file = "inputs/%s/family-split-rec-tfs.txt" % (version)
    print(
        "Writing a mapping of the split family rec/tfs and the protein hits they came from to: %s"
        % (out_file))
    with open(out_file, 'w') as out:
        out.write('\n'.join([
            "%s\t%s\t%s\t%s" %
            (N, '|'.join(family_to_split[N]), mapping[N], '|'.join(
                [mapping[n] for n in family_to_split[N]]))
            for N in sorted(family_to_split)
        ]) + '\n')

    print("A total of %d family nodes were split" % (len(family_to_split)))

    # add the zscore penalty to the few family nodes in the ToxCast data
    # (each value of map_family_to_prot is a one-element list)
    toxcast_family_nodes = [N[0] for N in map_family_to_prot.values()]
    addRecTFsFamilyNodes(chemicals,
                         version,
                         family_nodes=toxcast_family_nodes,
                         costs=True)
def setup_post_to_graphspace(config_map, selected_goid, alg='fastsinksource', name_postfix='', tags=None, taxon=None, goid_summary_file=None, num_neighbors=1, nodes_to_post=None, **kwargs): input_settings, alg_settings, \ output_settings, out_pref, kwargs = \ plot_utils.setup_variables( config_map, **kwargs) input_dir = input_settings['input_dir'] dataset = input_settings['datasets'][0] for arg in [ 'ssn_target_only', 'ssn_target_ann_only', 'ssn_only', 'string_target_only', 'string_nontarget_only', 'limit_to_taxons_file', 'add_target_taxon', 'oracle_weights', 'rem_neg_neighbors', 'youngs_neg', 'sp_leaf_terms_only' ]: kwargs[arg] = dataset.get(arg) uniprot_taxon_file = "%s/%s" % (input_dir, dataset['taxon_file']) # don't need it since we are re-running the alg anyway # # predictions file: # results_dir = "%s/%s/%s" % ( # output_settings['output_dir'], dataset['net_version'], dataset['exp_name']) # alg_params = alg_settings[alg] # combos = [dict(zip(alg_params.keys(), val)) # for val in itertools.product( # *(alg_params[param] for param in alg_params))] # # TODO allow for multiple # if len(combos) > 1: # print("%d combinations for %s. Using the first one" % (len(combos), alg)) # param_combo = combos[0] # # first get the parameter string for this runner # params_str = runner.get_runner_params_str(alg, dataset, param_combo) # prec_rec_str = "prec-rec%s-%s" % (taxon, selected_goid) # exp_type = 'loso' # pred_file = "%s/%s/%s%s%s%s.txt" % (results_dir, alg, exp_type, params_str, kwargs.get('postfix',''), prec_rec_str) # if not os.path.isfile(pred_file): # print("\tPredictions file not found: %s. 
Quitting" % (pred_file)) # sys.exit(1) # print("\treading %s" % (pred_file)) # df = pd.read_csv(pred_file, sep='\t') # print(df.head()) out_dir = "outputs/viz/graphspace/%s-%s/" % (dataset['net_version'].split( '/')[-1], dataset['exp_name'].split('/')[-1]) os.makedirs(out_dir, exist_ok=True) print("storing net and ann files to %s" % (out_dir)) # TODO allow posting without STRING net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx = \ load_net_ann_datasets( out_dir, taxon, dataset, input_settings, alg_settings, uniprot_taxon_file, **kwargs) W = new_net_obj.W prots = ann_obj.prots # also run the alg to get the full prediction scores # TODO get them from a file? alg_settings = {alg: alg_settings[alg]} alg_settings[alg]['should_run'] = [True] kwargs['verbose'] = True alg_runners = run_eval_algs.setup_runners(alg_settings, new_net_obj, ann_obj, output_settings['output_dir'], **kwargs) run_obj = alg_runners[0] run_obj.goids_to_run = [selected_goid] train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon( taxon, ann_obj, species_to_uniprot_idx, eval_ann_obj=eval_ann_obj, **kwargs) # now run the loso evaluation for this term, and get the scores back eval_loso.run_and_eval_algs(run_obj, ann_obj, train_ann_mat, test_ann_mat, taxon=taxon, **kwargs) term_scores = np.ravel( run_obj.goid_scores[ann_obj.goid2idx[selected_goid]].toarray()) print("top 10 scores for %s, %s:" % (taxon, selected_goid)) taxon_prots_idx = list(species_to_uniprot_idx[taxon]) taxon_prots = [prots[i] for i in taxon_prots_idx] taxon_term_scores = term_scores[taxon_prots_idx] print('\n'.join(["%s\t%0.4e" % ( ann_obj.prots[taxon_prots_idx[i]], taxon_term_scores[i]) \ for i in np.argsort(taxon_term_scores)[::-1][:10]])) pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file']) #selected_goid = "15643" # toxic substance binding #selected_goid = "9405" # pathogenesis #selected_goid = "98754" # detoxification selected_goname = None # build a dictionary of the evidencecode for each 
prot uniprot_to_evidencecode = defaultdict(set) annotated_prots = set() neg_prots = set() if goid_summary_file is None: goid_summary_file = pos_neg_file.replace("bp-", '').replace("mf-", '') if '-list' in pos_neg_file: goid_summary_file = goid_summary_file.replace( "-list", "-summary-stats") elif '.gz' in pos_neg_file: goid_summary_file = goid_summary_file.replace( ".tsv.gz", "-summary-stats.tsv") else: goid_summary_file = goid_summary_file.replace( ".tsv", "-summary-stats.tsv") df_summary = pd.read_csv(goid_summary_file, sep='\t') goid_names = dict(zip(df_summary['GO term'], df_summary['GO term name'])) #goid_num_anno = dict(zip(df_summary['GO term'], df_summary['# positive examples'])) print("GO name: %s" % (goid_names[selected_goid])) selected_goname = goid_names[selected_goid].replace(' ', '-')[0:20] # load the GAIN propagation to get the evidence code ev_codes_file = dataset.get('ev_codes_file') if ev_codes_file is not None: for orf, goid, goname, hierarchy, evidencecode, annotation_type in utils.readColumns( ev_codes_file, 1, 2, 3, 4, 5, 6): if selected_goid[:3] == "GO:": goid = "GO:" + "0" * (7 - len(goid)) + goid if goid != selected_goid: continue selected_goname = goname.replace(' ', '-')[0:20] if annotation_type != '1': continue uniprot_to_evidencecode[orf].add(evidencecode) # limit it to the current taxon if taxon is not None: print("Getting species of each prot from %s" % (uniprot_taxon_file)) #print("Limiting the prots to those for taxon %s (%s)" % (taxon, selected_species[taxon])) print("Limiting the prots to those for taxon %s" % (taxon)) # for each of the 19 species, leave out their annotations # and see how well we can retrieve them uniprot_to_species = utils.readDict(uniprot_taxon_file, 1, 2) if taxon not in species_to_uniprot_idx: print("Error: taxon ID '%d' not found" % (taxon)) sys.exit() # also limit the proteins to those in the network print("\t%d prots for taxon %s." 
% (len(taxon_prots_idx), taxon)) goid_idx = ann_obj.goid2idx[selected_goid] pos, neg = alg_utils.get_goid_pos_neg(train_ann_mat, goid_idx) non_taxon_annotated_prots = set([prots[i] for i in pos]) non_taxon_neg_prots = set([prots[i] for i in neg]) print("\t%d non-taxon pos, %d non-taxon neg" % (len(non_taxon_annotated_prots), len(non_taxon_neg_prots))) pos, neg = alg_utils.get_goid_pos_neg(test_ann_mat, goid_idx) annotated_prots = set([prots[i] for i in pos]) neg_prots = set([prots[i] for i in neg]) print("\t%d taxon pos, %d taxon neg" % (len(annotated_prots), len(neg_prots))) print("\t%d annotated prots for %s (%s)" % (len(annotated_prots), selected_goname, selected_goid)) #conf_cutoff = 0.2 conf_cutoff = -1 predicted_prots = set() ranks = {} scores = {} first_zero_rank = None for i, idx in enumerate(np.argsort(taxon_term_scores)[::-1]): rank = i + 1 prot = prots[taxon_prots_idx[idx]] predicted_prots.add(prot) score = taxon_term_scores[idx] scores[prot] = score if taxon is not None: ranks[prot] = rank if score == 0 and first_zero_rank is None: first_zero_rank = rank else: ranks[prot] = rank # move the score between 0 and 1 if it's genemania (normally between -1 and 1) # as the score is used to set the opacity # TODO fix genemania #if alg == "genemania": # pred_cut_conf[gene] = local_conf # local_conf = ((float(local_conf) - -1) / float(1--1)) * (1-0) + 0 #pred_local_conf[gene] = local_conf print("\t%d prots with a score" % (len(taxon_term_scores))) print("Rank of first zero score: %d" % (first_zero_rank)) print("Ranks of left-out positives:") for gene in sorted(annotated_prots, key=ranks.get): print("%s\t%d" % (gene, ranks[gene])) print("Including top 30 ranked-proteins of left-out species") top_30 = sorted(set(taxon_prots) & set(ranks.keys()), key=ranks.get)[:30] if ev_codes_file is not None: print("Evidence codes of top 30:") for i, gene in enumerate(top_30): if gene in uniprot_to_evidencecode: print("%s\t%s\t%s" % (i, gene, uniprot_to_evidencecode[gene])) top_30 
= set(top_30) if taxon is not None: print( "Getting the induced subgraph of the neighbors of the %d annotated nodes" % (len(annotated_prots))) prededges = set() if nodes_to_post is not None: print("Getting neighbors of %s" % (', '.join(nodes_to_post))) nodes_to_add_neighbors = set(nodes_to_post) else: nodes_to_add_neighbors = annotated_prots.copy() | top_30 node2idx = ann_obj.node2idx for i in range(opts.num_neighbors): #print("Adding neighbors %d" % (i+1)) curr_nodes_to_add_neighbors = nodes_to_add_neighbors.copy() nodes_to_add_neighbors = set() print("adding %sneighbors of %d nodes" % ("positive ", len(curr_nodes_to_add_neighbors))) for u in curr_nodes_to_add_neighbors: #neighbors = set(nx.all_neighbors(G, u)) neighbors = set( [prots[v] for v in get_mat_neighbors(W, node2idx[u])]) if opts.node_to_post is None: # UPDATE 2018-10: try adding just the positive neighbors of the node # TODO make this a command-line option neighbors = neighbors & (non_taxon_annotated_prots | annotated_prots | top_30) #if len(neighbors) > 15 and nodes_to_post is None: # print("\tskipping adding neighbors of %s. len(neighbors): %d" % (u, len(neighbors))) # continue nodes_to_add_neighbors.update(neighbors) prededges.update(set([(u, v) for v in neighbors])) else: print( "Getting the induced subgraph of the %d annotated and %d predicted proteins" % (len(annotated_prots), len(predicted_prots))) print("not yet implemented. quitting") sys.exit() # prededges = set(G.subgraph(annotated_prots.union(predicted_prots)).edges()) prededges = set([tuple(sorted((u, v))) for u, v in prededges]) # TODO I should also show the disconnected nodes prednodes = set([n for edge in prededges for n in edge]) print("\t%d nodes, %d edges" % (len(prednodes), len(prededges))) if len(prededges) > 1000 or len(prednodes) > 500: print("\nToo many nodes/edges. Not posting to GraphSpace. 
Quitting") sys.exit() #graph_attr_file = "" #graph_attr, attr_desc = readGraphAttr() # add the edge weight from the network to attr_desc which will be used for the popup # set the edges as the neighbors of the annotated genes #prededges = set() # get the induced subgraph of the annotated nodes and predicted nodes #for n in func_prots: # if not G.has_node(n): # continue # for neighbor in G.neighbors(n): # prededges.add((n, neighbor)) graph_attr = {n: {} for n in prednodes} attr_desc = {n: {} for n in prednodes} print("Reading gene names and species for each protein from %s" % (uniprot_taxon_file)) #prot_species = utils.readDict(uniprot_taxon_file, 1, 2) uniprot_to_gene = utils.readDict(uniprot_taxon_file, 1, 4) # there can be multiple gene names. Just show the first one for now uniprot_to_gene = { n: gene.split(' ')[0] for n, gene in uniprot_to_gene.items() } node_labels = {} print("building graphspace object") # get the abbreviation of the species names species_names, net_taxons = eval_loso.get_selected_species( species_to_uniprot_idx, kwargs['limit_to_taxons_file']) sp_abbrv = { t: ''.join(subs[0] for subs in sp_name.split(' ')[:2]) for t, sp_name in species_names.items() } # for each node, add the prediction values for n in tqdm(prednodes): # set the name of the node to be the gene name and add the k to the label gene_name = uniprot_to_gene.get(n, n) curr_taxon = uniprot_to_species[n] species_short_name = sp_abbrv[curr_taxon] # add the species to the front of the gene name label = "%s-%s" % (species_short_name, gene_name) uniprot_to_gene[n] = label #node_labels[n] = "%s\n%d" % (label, min(ranks[n], 43)) if n in annotated_prots else label node_labels[n] = "%s\n%d" % ( label, ranks[n] if ranks[n] < first_zero_rank else first_zero_rank) if n in taxon_prots else label # maybe put the labels below the nodes? 
# helps with visualizing the background opacity graph_attr[n]['text-valign'] = 'bottom' # add the strain name to the popup attr_desc[n]['Strain'] = species_names[curr_taxon] if n in predicted_prots: # don't need to normalize because the confidence values are already between 0 and 1 if taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots): pass else: # UPDATE: use the node rank instead of the node score #graph_attr[n]['background-opacity'] = pred_local_conf[n] if n not in ranks: graph_attr[n]['background-opacity'] = scores[n] else: #graph_attr[n]['background-opacity'] = scores[n] graph_attr[n]['background-opacity'] = max([ 0.9 - (ranks[n] / float(first_zero_rank)), float(scores[n]) ]) attr_desc[n]["%s rank" % (alg_names[alg])] = ranks[n] attr_desc[n]["%s prediction score" % (alg_names[alg])] = "%0.4f" % (scores[n]) #elif n in annotated_prots or (taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots)) \ # or n in neg_prots: #if n in pred_local_conf: # graph_attr[n]['background-opacity'] = pred_local_conf[n] # attr_desc[n]["Local prediction confidence"] = pred_local_conf[n] # also add the annotation to the popup if n in uniprot_to_evidencecode: codes = uniprot_to_evidencecode[n] # TODO add bullet points to the list #attr_desc[n]["Evidence code"] = ''.join(["%s (%s)\n" % (c, evidence_code_name[c]) for c in codes]) # order it by exp, comp, then elec evidence_codes = ''.join([ "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes if evidence_code_type[c] == 'experimental' ]) evidence_codes += ''.join([ "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes if evidence_code_type[c] == 'computational' ]) evidence_codes += ''.join([ "<li>%s (%s)</li>" % (c, evidence_code_name[c]) for c in codes if evidence_code_type[c] == 'electronic' ]) attr_desc[n]["Evidence code"] = "<ul>%s</ul>" % (evidence_codes) # set the width of the edges by the network weight edge_weights = defaultdict(float) for u, v in tqdm(prededges): e = (u, v) 
if e not in attr_desc: attr_desc[e] = {} if e not in graph_attr: graph_attr[e] = {} #attr_desc[e]["edge weight"] = G.adj[u][v]]['weight'] if net_obj.multi_net: #attr_desc[e]["Final edge weight"] = "%0.1f" % (W[node2idx[u]][:,node2idx[v]].A.flatten()[0]) edge_type_weights = [] # add the weights for the individual string networks for i in range(len(net_obj.net_names)): net_name = net_obj.net_names[i] net_name = "SSN (E-value <= 0.1)" if 'eval-e0_1' in net_name else net_name net = net_obj.sparse_networks[i] w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0] if w != 0: #attr_desc[e][net_name] = "%0.1f" % (w) edge_type_weights.append("<li>%s: %0.1f</li>" % (net_name, w)) edge_weights[e] += w * net_obj.swsn_weights[i] attr_desc[e]["Edge weights by type"] = "<ul>%s</ul>" % (''.join( sorted(edge_type_weights))) else: attr_desc[e]["Edge weight"] = "%0.1f" % ( W[node2idx[u]][:, node2idx[v]].A.flatten()[0]) # make the edges somewhat opaque for a better visual style graph_attr[e]['opacity'] = 0.7 # set the width of the edges by the network weight #edge_weights = {(u,v): float(W[node2idx[u]][:,node2idx[v]].A.flatten()[0]) for u,v in prededges} for e, w in edge_weights.items(): attr_desc[e]["Final edge weight"] = "%0.1f" % (w) # TODO set the min and max as parameters or something #max_weight = 180 if net_obj.multi_net: max_weight = net_obj.swsn_weights[0] * 180 print(max_weight) else: max_weight = 180 for e in edge_weights: if edge_weights[e] > max_weight: edge_weights[e] = max_weight graph_attr = gs.set_edge_width(prededges, edge_weights, graph_attr, a=1, b=12, min_weight=1, max_weight=max_weight) H = nx.Graph() H.add_edges_from(prededges) # see which DB the edge came from to set the edge color print("Getting the edge type from networks") if net_obj.multi_net: print("\tFrom both STRING and SEQ_SIM") seq_sim_edges = set() for u, v in prededges: # get the SSN weight of this edge. 
Should be the first network net = net_obj.sparse_networks[0] w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0] if w != 0: # these are all undirected, so just store the sorted version u, v = tuple(sorted((u, v))) # give these the default color graph_attr[(u, v)]['color'] = edge_type_color['default'] seq_sim_edges.add((u, v)) # string_edges = set() # temp_version = '2017_10-string' # net = f_settings.NETWORK_template % (temp_version, temp_version) # for u,v in utils.readColumns(net, 1, 2): # #if (u,v) not in prededges: # if not H.has_edge(u,v): # continue # # give these the default color # u,v = tuple(sorted((u,v))) # graph_attr[(u,v)]['color'] = edge_type_color['string'] # string_edges.add((u,v)) string_edges = prededges.difference(seq_sim_edges) print("\t%d edges from seq-sim, %d edges from STRING" % (len(seq_sim_edges), len(string_edges))) # set the color to STRING if it didn't come from sequence similarity for e in string_edges: #if 'color' not in graph_attr[e]: graph_attr[e]['color'] = edge_type_color['string'] #elif 'STRING' in f_settings.NETWORK_VERSION_INPUTS[version]: # for e in graph_attr: # graph_attr[e]['color'] = edge_type_color['string'] else: for e in graph_attr: graph_attr[e]['color'] = edge_type_color['default'] # apply the evidence code style to each protein for n in prednodes: if n in annotated_prots: graph_attr[n]['color'] = node_type_color['annotation'] elif taxon and n in non_taxon_annotated_prots: graph_attr[n]['color'] = node_type_color['non-taxon-annotation'] elif taxon and n in non_taxon_neg_prots: graph_attr[n]['color'] = node_type_color[ 'non-taxon-neg-annotation'] elif n in neg_prots: graph_attr[n]['color'] = node_type_color['neg-annotation'] elif n in predicted_prots: graph_attr[n]['color'] = node_type_color['prediction'] if n in uniprot_to_evidencecode: curr_style = "" for evidencecode in uniprot_to_evidencecode[n]: curr_type = evidence_code_type[evidencecode] if curr_type == "experimental": curr_style = 
annotation_type_styles[curr_type] break elif curr_style == "computational": continue else: curr_style = annotation_type_styles[curr_type] graph_attr[n].update(curr_style) # temporary fix to get the non-target positive examples if n in non_taxon_annotated_prots: graph_attr[n].update(annotation_type_styles['experimental']) # TODO build the popups here. That way the popup building logic can be separated from the # GSGraph building logic popups = {} prednodes = set([n for edge in prededges for n in edge]) for n in prednodes: popups[n] = gs.buildNodePopup(n, attr_val=attr_desc) for u, v in prededges: popups[(u, v)] = gs.buildEdgePopup(u, v, node_labels=uniprot_to_gene, attr_val=attr_desc) # Now post to graphspace! print("Building GraphSpace graph") G = gs.constructGraph(prededges, node_labels=node_labels, graph_attr=graph_attr, popups=popups) # TODO add an option to build the 'graph information' tab legend/info # build the 'Graph Information' metadata #desc = gs.buildGraphDescription(opts.edges, opts.net) desc = '' metadata = {'description': desc, 'tags': [], 'title': ''} if tags is not None: metadata['tags'] = tags G.set_data(metadata) if 'graph_exp_name' in dataset: graph_exp_name = dataset['graph_exp_name'] else: graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1], dataset['net_version'].split('/')[-1]) graph_name = "%s-%s-%s-%s%s" % (selected_goname, selected_goid, alg, graph_exp_name, name_postfix) G.set_name(graph_name) # rather than call it from here and repeat all the options, return G, and then call this after #post_graph_to_graphspace(G, opts.username, opts.password, opts.graph_name, apply_layout=opts.apply_layout, layout_name=opts.layout_name, # group=opts.group, make_public=opts.make_public) return G, graph_name
def get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25",
                      summary_file="network_summaries.csv",
                      scope="permute-dir-undir",
                      forced=False):
    """
    Aggregate summary statistics for the response network of every chemical.

    The table is cached as a CSV at
    ``outputs/<version>/weighted/stats/summary/<summary_file>``. When that
    file already exists and *forced* is False it is simply read back;
    otherwise the edgelinker paths files are parsed, the metrics are counted
    and the CSV is (re)written.

    *scope* selects the ``stat-sig-<scope>`` directory from which the
    optional p-values and q-values are read.

    Returns a dataframe containing the counted metrics for each chemical.
    """
    toxcast_data = t_utils.loadToxcastData(t_settings.INTERACTOMES[version])
    t_settings.set_version(version)
    inputs_dir = t_settings.INPUTSPREFIX
    outputs_dir = "outputs/%s/weighted" % (version)
    chemicals = utils.readItemList("%s/chemicals.txt" % (inputs_dir), 1)

    chem_rec = toxcast_data.chemical_rec
    chem_tfs = toxcast_data.chemical_tfs
    chem_prot_hit_vals = toxcast_data.chemical_protein_hit

    paths_dir = "%s/edgelinker" % (outputs_dir)
    paths_template = "%s/%%s-paths.txt" % (paths_dir)
    out_dir = "%s/stats/summary" % outputs_dir
    t_utils.checkDir(out_dir)
    summary_file = "%s/%s" % (out_dir, summary_file)

    # fast path: reuse the cached table
    if os.path.isfile(summary_file) and not forced:
        print(
            "Reading network summary stats from '%s'. Set forced to True to overwrite it."
            % (summary_file))
        return pd.read_csv(summary_file, index_col=0)

    print("Reading in the stats from the response networks in", paths_dir)
    all_names, _ = t_utils.getChemicalNameMaps()

    # one {chemical: value} dict per metric; the key order here is the
    # column order of the returned dataframe
    table = {
        "name": {chem: all_names[chem] for chem in chemicals},
        "prots": {},
        "num_paths": {},
        "pvals": {},
        "qvals": {},
        "num_edges": {},
        "avg_path_lengths": {},
        "net_rec": {},
        "net_tfs": {},
        "hit_rec": {},
        "hit_tfs": {},
        "net_hits": {},
        "net_nonhits": {},
        'hits': {},
        'nonhits': {},
        "inter_net_hits": {},
        "inter_net_nonhits": {},
        "inter_hits": {},
        "inter_nonhits": {},
    }

    # p-value of each chemical, taken from the column whose header is "200"
    pvals_file = "%s/stats/stat-sig-%s/gpd-pval.txt" % (outputs_dir, scope)
    if os.path.isfile(pvals_file):
        with open(pvals_file, 'r') as file_handle:
            header = file_handle.readline().rstrip().split('\t')
        pval_col = header.index("200") + 1
        table["pvals"] = {
            chem: pval
            for chem, pval in utils.readColumns(pvals_file, 1, pval_col)
        }

    # also get the q-value for each chemical
    qvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (outputs_dir,
                                                                scope)
    if os.path.isfile(qvals_file):
        table["qvals"] = t_utils.getPvals(outputs_dir,
                                          scope,
                                          sig_cutoff_type="FDR")

    for chem in tqdm(chemicals):
        paths = t_utils.getPaths(paths_template % chem, max_k=200, ties=True)
        net_prots = set()
        net_edges = set()
        lengths = []
        for path in paths:
            nodes = path.split('|')
            # path length is the number of edges in a path
            lengths.append(len(nodes) - 1)
            net_prots.update(nodes)
            net_edges.update(zip(nodes, nodes[1:]))

        table["prots"][chem] = len(net_prots)
        table["num_paths"][chem] = len(paths)
        table["avg_path_lengths"][chem] = np.mean(lengths)
        table["num_edges"][chem] = len(net_edges)

        rec = chem_rec[chem]
        tfs = chem_tfs[chem]
        table["hit_rec"][chem] = len(rec)
        table["hit_tfs"][chem] = len(tfs)
        table["net_rec"][chem] = len(net_prots.intersection(rec))
        table["net_tfs"][chem] = len(net_prots.intersection(tfs))

        # split the assayed proteins into hits (1) and nonhits (0) to count
        # how many of each are in the network
        hit_vals = chem_prot_hit_vals[chem]
        hits = {p for p, hit_val in hit_vals.items() if hit_val == 1}
        nonhits = {p for p, hit_val in hit_vals.items() if hit_val == 0}
        net_hits = hits.intersection(net_prots)
        net_nonhits = nonhits.intersection(net_prots)
        table['hits'][chem] = len(hits)
        table['nonhits'][chem] = len(nonhits)
        table["net_hits"][chem] = len(net_hits)
        table["net_nonhits"][chem] = len(net_nonhits)

        # subtract the rec and tfs to get just the intermediate hits and nonhits
        endpoints = rec.union(tfs)
        table["inter_hits"][chem] = len(hits.difference(endpoints))
        table["inter_nonhits"][chem] = len(nonhits.difference(endpoints))
        table["inter_net_hits"][chem] = len(net_hits.difference(endpoints))
        table["inter_net_nonhits"][chem] = len(
            net_nonhits.difference(endpoints))

    # write these metrics to a file
    df = pd.DataFrame(table)
    print("Writing: ", summary_file)
    df.to_csv(summary_file,
              header=True,
              columns=[
                  'name', 'prots', 'num_paths', 'num_edges',
                  'avg_path_lengths', 'hits', 'nonhits', 'net_hits',
                  'net_nonhits', 'hit_rec', 'hit_tfs', 'net_rec', 'net_tfs',
                  'inter_net_hits', 'inter_net_nonhits', 'inter_hits',
                  'inter_nonhits', 'pvals', 'qvals'
              ])
    return df
def build_graph_and_post(
        version,
        interactome,
        rec_tfs_file,
        RESULTSPREFIX,
        paths_file,
        chemical,
        max_k=200,
        graph_name="test",
        graph_attr_file=None,
        ev_file=None,
        datadir="/home/jeffl/svnrepo/data",
        **kwargs):
    """
    Build the GraphSpace graph of a response network and post it.

    Steps: read the interactome into the module-level PPIEDGES / PPIWEIGHTS,
    read the predicted edges from *paths_file* (limited to *max_k*), read the
    sources/targets from *rec_tfs_file*, load the optional styles in
    *graph_attr_file*, mark nodes with CTD support, collect the evidence for
    every edge, construct the graph, lay it out on a grid, optionally dump
    its JSON, and finally post it under *graph_name*.

    *version*, *RESULTSPREFIX* and *datadir* are accepted but not used by
    this function body.

    Keys read from **kwargs:
        mapping_file     (required) UniProt ID -> gene name mapping file.
        ctd_support_file (optional) passed to get_ctd_support().
        case_study       (optional) True to switch to the case-study colors.
        out_pref         (optional) prefix for the -graph.json/-style.json dump.
        tags             (optional) list of tags; defaults to [].
        username, password, apply_layout, layout_name, group, make_public
                         (required) forwarded to post_graph_to_graphspace().
    All of kwargs is also forwarded to constructGraph().
    """
    global PPIEDGES, PPIWEIGHTS, uniprot_to_gene

    lines = utils.readColumns(interactome, 1, 2, 3)
    PPIEDGES = [(u, v) for u, v, w in lines]
    PPIWEIGHTS = {(u, v): float(w) for u, v, w in lines}

    prededges = readNetwork(paths=paths_file, k_limit=max_k)

    lines = utils.readColumns(rec_tfs_file, 1, 2)
    sources = {
        acc
        for acc, node_type in lines
        if node_type.lower() in ['source', 'receptor']
    }
    targets = {
        acc
        for acc, node_type in lines if node_type.lower() in ['target', 'tf']
    }

    # TODO temporary fix for family nodes
    prednodes = {node for t, h in prededges for node in (t, h)}

    uniprot_to_gene = utils.readDict(kwargs['mapping_file'], 1, 2)

    # get attributes of nodes and edges from the graph_attr file
    graph_attr = {}
    # description of a style, style_attr tuple
    attr_desc = {}
    if graph_attr_file is not None:
        graph_attr, attr_desc = gs_base.readGraphAttr(graph_attr_file)

    if kwargs.get('ctd_support_file'):
        num_intxs_per_node = get_ctd_support(chemical, prednodes,
                                             kwargs['ctd_support_file'])
        # set the double border attribute for nodes with any CTD support.
        # A node may have no style entry yet (always the case when no
        # graph_attr_file was given), so create one on demand instead of
        # raising a KeyError.
        for n in num_intxs_per_node:
            node_style = graph_attr.setdefault(n, {})
            node_style['style'] = 'double'
            node_style['border_color'] = 'maroon'
            node_style['border_width'] = 10

    # set the case study colors - all nodes are gray by default
    # NOTE(review): the nesting of the two updates below was reconstructed
    # from a source with lost indentation; confirm that the edge colors are
    # meant to apply to every case study and not only when a graph_attr_file
    # is given.
    if kwargs.get('case_study') is True:
        if graph_attr_file is not None:
            # if there are other colors present, then make the sources and
            # targets gray by default because they could have other colors
            NODE_COLORS.update(CASESTUDY_NODE_COLORS)
        EDGE_COLORS.update(CASESTUDY_EDGE_COLORS)

    # get the evidence supporting each edge
    evidence, edge_types, edge_dir = gs_utils.getEvidence(
        prededges.keys(), evidence_file=ev_file)

    # Now post to graphspace!
    G = constructGraph(prededges,
                       sources,
                       targets,
                       node_labels=uniprot_to_gene,
                       evidence=evidence,
                       edge_types=edge_types,
                       edge_dir=edge_dir,
                       graph_attr=graph_attr,
                       attr_desc=attr_desc,
                       **kwargs)
    print("Graph has %d nodes and %d edges" %
          (G.number_of_nodes(), G.number_of_edges()))

    # put the parent nodes and the nodes in the parent nodes in a grid layout automatically
    print("Setting the x and y coordinates of each node in a grid layout")
    # relabel the nodes to their names
    graph_attr = {
        uniprot_to_gene.get(n, n): attr
        for n, attr in graph_attr.items()
    }
    layout = gs_utils.grid_layout(G, graph_attr)
    for node, (x, y) in layout.items():
        G.set_node_position(node_name=node, x=x, y=y)

    # before posting, see if we want to write the Graph's JSON to a file
    out_pref = kwargs.get('out_pref')
    if out_pref is not None:
        print(
            "Writing graph and style JSON files to:\n\t%s-graph.json \n\t%s-style.json"
            % (out_pref, out_pref))
        with open(out_pref + "-graph.json", 'w') as out:
            json.dump(G.get_graph_json(), out, indent=2)
        with open(out_pref + "-style.json", 'w') as out:
            json.dump(G.get_style_json(), out, indent=2)

    G.set_tags(kwargs.get('tags', []))
    G.set_name(graph_name)
    gs_base.post_graph_to_graphspace(G,
                                     kwargs['username'],
                                     kwargs['password'],
                                     graph_name,
                                     apply_layout=kwargs['apply_layout'],
                                     layout_name=kwargs['layout_name'],
                                     group=kwargs['group'],
                                     make_public=kwargs['make_public'])
def addEdgeDir(interactome_file,
               dir_trumps_undir=True,
               evidence_file=None,
               new_interactome_file=None):
    """
    Add the edge direction ("Dir" or "Undir") as a 4th column of an interactome.

    Reads the (tail, head, weight) columns of *interactome_file*, drops
    self-edges, looks up the direction of every edge in the evidence file and
    writes the result to *new_interactome_file*. Directed edges are written
    once; undirected edges are written in both directions. An undirected
    edge that is also present as a directed edge is removed ("trumped").

    interactome_file: path of the interactome to read.
    dir_trumps_undir: accepted for backward compatibility; not used by this
        function body (trumping is always applied).
    evidence_file: evidence file passed to getev.getEdgeDir(). If None it is
        chosen from the version string ('2018_01', '2017_01', otherwise
        2016_05) found in *interactome_file*.
    new_interactome_file: path to write. If None, *interactome_file* is
        overwritten.

    Exits via sys.exit() if the combined edge count is inconsistent.
    """
    if new_interactome_file is None:
        new_interactome_file = interactome_file
        print("Re-writing %s with edge direction as a fourth column" %
              (interactome_file))
    else:
        print("Reading %s and adding edge direction as a fourth column to %s" %
              (interactome_file, new_interactome_file))

    edges = set(utils.readColumns(interactome_file, 1, 2, 3))
    # ensure there are no self edges
    num_edges = len(edges)
    edges = [(u, v, w) for u, v, w in edges if u != v]
    if len(edges) != num_edges:
        print("%d self-edges removed" % (num_edges - len(edges)))
    edge_weights = {(u, v): w for u, v, w in edges}

    # also add the direction to the interactome
    print("Getting the edge direction of all edges in the interactome")
    if evidence_file is None:
        # try to get the edge direction automatically.
        # This must be a single if/elif/else chain: with two independent
        # `if`s a 2018_01 interactome falls through to the final `else` and
        # is silently given the 2016_05 evidence.
        if '2018_01' in interactome_file:
            evidence_version = "2018_01pathlinker"
        elif '2017_01' in interactome_file:
            evidence_version = "2017_01pathlinker"
        else:
            evidence_version = "2016_05pathlinker"
        evidence_file = getev.getEvidenceFile(evidence_version,
                                              t_settings.DATADIR)

    # the evidence file has if the edges are directed or not.
    # get that information here using get_interaction_evidence.py
    edge_dir = getev.getEdgeDir(edge_weights.keys(),
                                evidence_file,
                                split_family_nodes=True,
                                add_ev_to_family_edges=True)

    # UPDATE 2017-12-07: After splitting the family receptors and TFs,
    # some of the undirected edges that are trumped by directed edges are
    # back again. Make sure they're removed
    G = nx.Graph()
    dirG = nx.DiGraph()
    for u, v in edge_weights:
        if edge_dir[(u, v)] is True:
            dirG.add_edge(u, v)
        else:
            G.add_edge(u, v)

    # now remove trumped edges
    trumped_edges = 0
    for u, v in dirG.edges():
        if G.has_edge(u, v):
            trumped_edges += 1
            G.remove_edge(u, v)
    print("%d undir edges trumped by dir edges" % (trumped_edges))
    print("%d directed, %d undirected edges" %
          (dirG.number_of_edges(), G.number_of_edges()))

    combG = nx.DiGraph()
    combG.add_edges_from(dirG.edges(), type="Dir")
    combG.add_edges_from(G.to_directed().edges(), type="Undir")
    print("%d total edges" % (combG.number_of_edges()))

    # sanity check: each undirected edge must appear in both directions and
    # must not overlap with a directed edge
    if combG.number_of_edges() != (dirG.number_of_edges() +
                                   (G.number_of_edges() * 2)):
        print(
            "ERROR: len(new_edges) != dirG.number_of_edges() + (G.number_of_edges()*2):"
        )
        print("%d != %d + (%d*2)" %
              (combG.number_of_edges(), dirG.number_of_edges(),
               G.number_of_edges()))
        sys.exit()

    # now write the interactome file again
    with open(new_interactome_file, 'w') as out:
        # edges(data=True) works on both NetworkX 1.x and 2.x, unlike the
        # 1.x-only `combG.edge[u][v]` attribute
        for u, v, attrs in combG.edges(data=True):
            # the reverse copy of an undirected edge has no weight of its own
            if (u, v) in edge_weights:
                w = edge_weights[(u, v)]
            else:
                w = edge_weights[(v, u)]
            out.write("%s\t%s\t%s\t%s\n" % (u, v, w, attrs['type']))
help='Also store a pdf of the figures') (opts, args) = parser.parse_args() for i, version in enumerate(opts.version): print("") print("-" * 30) t_settings.set_version(version) chemicals = sorted( utils.readItemList("%s/chemicals.txt" % (t_settings.INPUTSPREFIX))) interactome = t_settings.INTERACTOME print( "Getting the weight, cost and direction of each edge from the interactome %s" % (interactome)) lines = utils.readColumns(interactome, 1, 2, 3, 4) edge_weights = {(u, v): float(w) for u, v, w, d in lines} edge_dir = {(u,v): True if d.lower() in ['true','t','dir','directed'] else False \ for u,v,w,d in lines} # get the evidence from get_interaction_evidence.py #evidence_file = getev.getEvidenceFile(evidence_version, t_settings.DATADIR) #edge_dir = getev.getEdgeDir(edge_weights.keys(), evidence_file, split_family_nodes=False, add_ev_to_family_edges=False) # TODO try just the directed edges #weights = [edge_weights[e] for e in edge_weights if edge_dir[e] is True] weights = edge_weights.values() # for now, don't show the costs from the edges of weight 0 costs = [-log(max(0.000001, w)) for w in weights if w != 0] #costs = [cost for cost in costs if cost < 4.5] # also incorporate the penalty into the plot penalty = log(t_settings.EDGE_PENALTIES.get(version, 1))
def permute_and_run_edgelinker(opts, random_index):
    """
    Create one random instance (index *random_index*) and run edgelinker on it.

    Either the receptor/TF sets of every chemical are replaced by random
    ones drawn from opts.permute_rec_tfs (the original interactome is then
    used), or, by default, the interactome is permuted and written to
    ``<opts.out_dir>/networks/permuted-network<random_index>.txt``.
    Edgelinker is then run once for all chemicals (multi_run), and
    optionally the score counts are written (opts.write_score_counts) and
    the generated files are deleted (opts.cleanup).

    Returns None. Returns early, without running anything, when the final
    score-counts file already exists (and not opts.forced) or when the
    chemical-k-median-scores.txt file is missing. Calls sys.exit() when the
    interactome does not have a valid 4th (directed/undirected) column.
    """
    if opts.write_score_counts:
        rand_scores_k = "%s/rand-networks/rand-%d-med-scores-k.txt" % (
            opts.write_score_counts, random_index)
        # if the final score counts file already exists, then don't do anything
        if os.path.isfile(rand_scores_k) and not opts.forced:
            print("%s already exists. Skipping." % (rand_scores_k))
            return
        chemical_k_scores = "%s/chemical-k-median-scores.txt" % (
            opts.write_score_counts)
        if not os.path.isfile(chemical_k_scores):
            print(
                "Error: %s does not exist. Run compute_stat_sig.py with the --write-counts option to write it. Quitting"
                % (chemical_k_scores))
            return

    t_utils.checkDir("%s/networks" % (opts.out_dir))
    rec_tfs_file_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (opts.inputs_dir)
    chemicals = sorted(
        utils.readItemList("%s/chemicals.txt" % opts.inputs_dir, col=1))
    if opts.single_chem:
        chemicals = opts.single_chem

    if opts.permute_rec_tfs is not None:
        # if specified, "permute" the sets of receptors and tfs for each
        # chemical instead of the interactome
        print("Writing random sets of rec/tfs for each chemical to %s" %
              (opts.out_dir))
        rec_tfs_file_template = "%s/%%s/%d-random-rec-tfs.txt" % (
            opts.out_dir, random_index)
        all_rec, all_tfs = t_utils.getRecTFs(opts.permute_rec_tfs)
        # random.sample() no longer accepts a set (TypeError since
        # Python 3.11); sample from a sorted list instead
        all_rec = sorted(all_rec)
        all_tfs = sorted(all_tfs)
        for chemical in tqdm(chemicals, disable=opts.verbose):
            out_file = rec_tfs_file_template % (chemical)
            if os.path.isfile(out_file) and not opts.forced:
                continue
            rec, tfs, costs, zscores = t_utils.getRecTFs(
                t_settings.REC_TFS_FILE % (opts.inputs_dir, chemical),
                costs=True)
            rec = list(rec)
            tfs = list(tfs)
            out_dir = "%s/%s" % (opts.out_dir, chemical)
            t_utils.checkDir(out_dir)
            random_rec = random.sample(all_rec, len(rec))
            random_tfs = random.sample(all_tfs, len(tfs))
            # apply the costs to the random rec and tfs.
            # Write into copies and always read from the untouched originals:
            # a random node may also be one of the original rec/tfs, and
            # updating a single dict in place would then hand on an
            # already-overwritten value.
            rand_costs = dict(costs)
            rand_zscores = dict(zscores)
            for orig, rand in zip(rec + tfs, random_rec + random_tfs):
                rand_costs[rand] = costs[orig]
                rand_zscores[rand] = zscores[orig]
            t_utils.writeRecTFs(out_file,
                                random_rec,
                                random_tfs,
                                costs=rand_costs,
                                zscores=rand_zscores)
        # use the original interactome
        permuted_network_out_file = opts.interactome
        print("Using the original interactome %s" %
              (permuted_network_out_file))
    else:
        # default is to permute the interactome
        permuted_network_out_file = '%s/networks/permuted-network%d.txt' % (
            opts.out_dir, random_index)
        if not os.path.isfile(permuted_network_out_file) or opts.forced:
            # don't log transform. The weights will be log transformed by the
            # edgelinker code.
            # UPDATE: 2017-12-07: use the direction of the edges from the
            # fourth column of the interactome instead of splitting based on
            # if the edge is bidirected or not
            G = nx.DiGraph()
            dir_edges = []
            undir_edges = []
            lines = utils.readColumns(opts.interactome, 1, 2, 3, 4)
            if len(lines) == 0:
                print(
                    "ERROR: interactome should have 4 columns: a, b, w, and True/False for directed/undirected. Quitting"
                )
                sys.exit()
            for u, v, w, directed in lines:
                G.add_edge(u, v, weight=float(w))
                edge_type = directed.lower()
                if edge_type in ["true", "t", "dir", 'directed']:
                    dir_edges.append((u, v))
                elif edge_type not in ["false", 'f', 'undir', 'undirected']:
                    print(
                        "ERROR: Unknown directed edge type '%s'. 4th column should be T/F to indicdate directed/undirected"
                        % (edge_type))
                    print("Quitting.")
                    sys.exit()
                elif u < v:
                    # keep a single copy of each undirected edge
                    undir_edges.append((u, v))

            if opts.undirected:
                # swap all edges as undirected edges
                permG = permute_network.permute_network(
                    G.to_undirected(), num_iterations=opts.num_iterations)
                permG = permG.to_directed()
            elif opts.split_by_weight:
                # split the edges into bins by weight and swap the directed
                # and undirected edges separately if specified by the user
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    split_weight=opts.split_by_weight,
                    num_iterations=opts.num_iterations)
            elif opts.swap_phys_sig_sep:
                # swap the directed and undirected edges separately
                permG = permute_network.permute_network(
                    G,
                    swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    num_iterations=opts.num_iterations,
                    edge_lists=(undir_edges, dir_edges))
            else:
                # if none of the options are specified, then swap everything
                # as directed edges
                permG = permute_network.permute_network(
                    G, num_iterations=opts.num_iterations)
            print("Writing %s" % (permuted_network_out_file))
            nx.write_weighted_edgelist(permG,
                                       permuted_network_out_file,
                                       comments='#',
                                       delimiter='\t')
        else:
            print("Using %s" % (permuted_network_out_file))

    # now run edgelinker on each of the chemicals using the permuted network
    # TODO if version is netpath/kegg, use the different type of input file
    in_files = []
    out_files = []
    for chemical in tqdm(chemicals, disable=opts.verbose):
        rec_tfs_file = rec_tfs_file_template % (chemical)
        in_files.append(os.path.abspath(rec_tfs_file))
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)
        out_pref = "%s/%d-random" % (out_dir, random_index)
        out_files.append(os.path.abspath(out_pref))

    # the python implementation of edgelinker is taking too long, so run the
    # java implementation below.
    # write the in and out files to the networks dir
    edgelinker_in_files = '%s/networks/permuted-network%d-infiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_in_files, 'w') as out:
        out.write('\n'.join(in_files))
    edgelinker_out_files = '%s/networks/permuted-network%d-outfiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_out_files, 'w') as out:
        out.write('\n'.join(out_files))

    # a single multi-run call covers every chemical, so report the batch
    # instead of the (possibly unbound) loop variables
    print("Running edgelinker on %d chemicals: %s" %
          (len(chemicals), edgelinker_out_files))
    run_edgelinker.runEdgeLinker(permuted_network_out_file,
                                 edgelinker_in_files,
                                 edgelinker_out_files,
                                 opts.k,
                                 edge_penalty=EDGE_PENALTY,
                                 rec_tfs_penalty=REC_TFS_PENALTY,
                                 multi_run=True)

    if opts.write_score_counts:
        # now that edgelinker has been run on all of the chemical
        # sources/targets, get the path counts for the chemical network's
        # path scores. compute_stat_sig is called directly rather than in a
        # subprocess to avoid re-importing the libraries
        print(
            "Writing the counts for each of the scores for random index: '%d'"
            % (random_index))
        stat_sig = compute_stat_sig.StatSig(random_paths_dir=opts.out_dir,
                                            k_limit=opts.k,
                                            num_random=(random_index,
                                                        random_index),
                                            out_dir=opts.write_score_counts)
        stat_sig.write_rand_counts(chemicals=chemicals, forced=opts.forced)

    if opts.cleanup:
        print(
            "Deleting the generated permuted network and the edgelinker output files"
        )
        # never delete the original interactome
        if permuted_network_out_file != opts.interactome:
            os.remove(permuted_network_out_file)
        os.remove(edgelinker_in_files)
        # remove the individual output files
        for cyc_out_file in out_files:
            os.remove(cyc_out_file + "-paths.txt")
            os.remove(cyc_out_file + "-ranked-edges.txt")
        os.remove(edgelinker_out_files)