def get_guild_based_synergy():
    """Run the GUILD checks for single drugs and for pairwise combinations.

    Loads the interaction network named by ``CONFIG["network_file"]``
    (``only_lcc=True``), fetches the drug info restricted to the nodes of
    that network and passes both to ``guild_drugs`` and
    ``guild_combinations``.

    Returns:
        None.  Any output is produced by the two callees.
    """
    # Get network
    network = wrappers.get_network(
        network_file=CONFIG.get("network_file"), only_lcc=True)
    nodes = set(network.nodes())

    # Get drug info (only for drugs/targets covered by the network nodes)
    drug_to_values = get_drug_info(nodes=nodes)

    # Expression data is intentionally not loaded here: the callees receive
    # None for the expression matrix and both of its index maps.  To use
    # expression, load it with wrappers.get_expression_info(
    #     gexp_file=CONFIG.get("gexp_file"), process=set(["z", "abs"]),
    #     dump_file=CONFIG.get("gexp_dump"))
    gexp_norm, gene_to_idx, cell_line_to_idx = None, None, None

    # Check individual drugs
    guild_drugs(drug_to_values, nodes, gexp_norm, gene_to_idx,
                cell_line_to_idx)

    # Check pairwise drug combinations
    guild_combinations(drug_to_values, nodes, gexp_norm, gene_to_idx,
                       cell_line_to_idx)

    # NOTE: the per-combination GUILD summary writer that used to follow an
    # unconditional ``return`` at this point was unreachable and has been
    # removed; create_feature_file() is used for that output instead.
def _median_abs(values):
    """Return the median of the absolute values."""
    return numpy.median(numpy.abs(values))


def _summarize_or_na(values, reducers):
    """Apply every reducer to ``values``; give "NA" per reducer if empty."""
    if len(values) == 0:
        return ["NA"] * len(reducers)
    return [reducer(values) for reducer in reducers]


def _matrix_values_for(matrix, row_to_idx, col_to_idx, genes, cell_line):
    """Return the matrix entries of the known ``genes`` in the cell line's
    column, or an empty list when the cell line or all genes are unknown."""
    if cell_line not in col_to_idx:
        return []
    indices = [row_to_idx[gene] for gene in genes if gene in row_to_idx]
    if not indices:
        return []
    return matrix[indices, col_to_idx[cell_line]]


def _nested_items(gene_to_cell_line_to_value, genes, cell_line):
    """Yield (gene, entry) for each gene that has an entry for the cell line."""
    for gene in genes:
        if gene in gene_to_cell_line_to_value:
            per_cell_line = gene_to_cell_line_to_value[gene]
            if cell_line in per_cell_line:
                yield gene, per_cell_line[cell_line]


def _nested_values(gene_to_cell_line_to_value, genes, cell_line,
                   component=None):
    """List the entries (or ``entry[component]``) found for the genes."""
    return [entry if component is None else entry[component]
            for _, entry in _nested_items(gene_to_cell_line_to_value, genes,
                                          cell_line)]


def _sum_matrix_by_target(target_sets, target_to_idx, matrix, row_to_idx,
                          col_to_idx, cell_line):
    """Sum matrix values into one slot per target over all target sets."""
    sums = [0] * len(target_to_idx)
    if cell_line not in col_to_idx:
        return sums
    column = col_to_idx[cell_line]
    for targets in target_sets:
        for target in targets:
            if target in row_to_idx:
                sums[target_to_idx[target]] += matrix[row_to_idx[target],
                                                      column]
    return sums


def _sum_nested_by_target(target_sets, target_to_idx,
                          gene_to_cell_line_to_value, cell_line,
                          component=None):
    """Sum nested-dict entries into one slot per target over all target sets."""
    sums = [0] * len(target_to_idx)
    for targets in target_sets:
        for target, entry in _nested_items(gene_to_cell_line_to_value,
                                           targets, cell_line):
            sums[target_to_idx[target]] += (
                entry if component is None else entry[component])
    return sums


def create_feature_file():
    """Write the feature table for the task selected in ``CONFIG["task"]``.

    The output is a space separated text file whose header is
    ``comb.id cell.line`` followed by the feature names, with one row per
    (drug combination, cell line).  Combinations are formed from all drug
    pairs and kept only if they appear in the synergy info.  Unless the
    task is ``"ch2-test"``, a row is written only for cell lines listed for
    that combination in the synergy info.  Features that cannot be computed
    are written as ``NA``.

    After the regular rows, every (combination, cell line) of the synergy
    info that did not get a row is printed and appended as an all-zero row
    (original note: "to amend problematic ch2 test file").

    The output path is read from ``feature_file_train`` (task ending in
    ``-train``), ``feature_file_test_ch1`` (``ch1-test``) or
    ``feature_file_test_ch2`` (``ch2-test``).

    Raises:
        ValueError: if the configured task is none of the above.
    """
    # Get drug info.  (get_drug_info_with_predicted_targets() was tried here
    # and, per the original note, worsens the performance.)
    drug_to_values = get_drug_info(nodes=None)
    # Get cell line info
    cell_line_to_value = get_cell_line_info()
    # Get synergy info
    combination_to_values = get_synergy_info()
    cell_line_to_synergy, combination_to_synergy = \
        get_synergy_values_per_cell_line_and_combination()
    # Get gexp info
    gexp_norm, gene_to_idx, cell_line_to_idx = wrappers.get_expression_info(
        gexp_file=CONFIG.get("gexp_file"), process=set(["z"]),
        dump_file=CONFIG.get("gexp_dump"))
    # Get methylation info (loaded through the same expression reader)
    meth, meth_gene_to_idx, meth_cell_line_to_idx = \
        wrappers.get_expression_info(
            gexp_file=CONFIG.get("methylation_file"), process=set(["z"]),
            dump_file=CONFIG.get("methylation_dump"))
    # Get mutation info
    gene_to_cell_line_to_mutation = get_mutation_info()
    # Get CNV info; each entry holds (cnv value, zygosity)
    gene_to_cell_line_to_cnv = get_cnv_info(CONFIG.get("cnv_file"))
    # Get cancer gene & pathway info
    pathway_to_genes = get_pathway_info(nodes=None)
    gene_sets = (pathway_to_genes["kegg"], pathway_to_genes["census"])
    genes_pathway = set()
    for genes in gene_sets:
        genes_pathway |= genes
    genes_pathway = list(genes_pathway)
    # Get network
    network = wrappers.get_network(
        network_file=CONFIG.get("network_file"), only_lcc=True)
    network_nodes = set(network.nodes())
    # Get GUILD info
    combination_to_guild_values = get_guild_based_synergy_scores(
        list(drug_to_values), genes_pathway, gexp_norm, gene_to_idx,
        cell_line_to_idx)
    # Get drug similarity info
    combination_to_similarity = get_drug_similarity(drug_to_values)

    task = CONFIG.get("task")
    if task.endswith("-train"):
        out_file = CONFIG.get("feature_file_train")
    elif task == "ch1-test":
        out_file = CONFIG.get("feature_file_test_ch1")
    elif task == "ch2-test":
        out_file = CONFIG.get("feature_file_test_ch2")
    else:
        raise ValueError("Uknown task: " + task)
    # For ch2-test every known cell line gets a row; otherwise only the
    # cell lines listed for the combination in the synergy info do.
    only_listed_cell_lines = task != "ch2-test"

    summary_features = [
        "gexpA.med", "gexpA.amed", "gexpB.med", "gexpB.amed",
        "mutA", "mutB", "cnvA", "cnvB",
        "metA.med", "metA.amed", "metB.med", "metB.amed",
        "cell.med", "comb.med", "sim.target", "sim.chemical",
        "kA", "kB", "dAB", "guild.common", "guild.med", "guild.max",
        "kegg.inA", "kegg.inB", "kegg.gexp.med", "kegg.gexp.max",
        "kegg.mut.med", "kegg.mut.max", "kegg.cnv.med", "kegg.cnv.max",
        "kegg.cnvA", "kegg.cnvB",
        "cosmic.inA", "cosmic.inB", "cosmic.gexp.med", "cosmic.gexp.max",
        "cosmic.mut.med", "cosmic.mut.max", "cosmic.cnv.med",
        "cosmic.cnv.max", "cosmic.cnvA", "cosmic.cnvB",
    ]

    drugs = list(drug_to_values)
    seen_combinations = set()

    # Get all targets (of every drug that takes part in at least one pair)
    targets_all = set()
    for i, drug1 in enumerate(drugs):
        for drug2 in drugs[i + 1:]:
            targets_all |= drug_to_values[drug1][0] | drug_to_values[drug2][0]
    targets_all = list(targets_all)
    target_to_idx = dict((target, i) for i, target in enumerate(targets_all))

    # One involvement column per pathway key
    targets_pathway = list(pathway_to_genes)

    # Create header: per-target gexp/mutation/cnv/zygosity/methylation
    # columns, then per-pathway columns, then the summary features.
    header = [prefix + target
              for prefix in (".g", ".m", ".c", ".z", ".e")
              for target in targets_all]
    header += [".p" + pathway.replace("_", ".")
               for pathway in targets_pathway]
    header += summary_features

    def build_row(comb_id, cell_line, targets1, targets2):
        """Return the feature values of one row, in header order."""
        target_sets = (targets1, targets2)
        row = []

        # Per-target columns: GEXP, MUT, CNV, ZYG, METH.  A target shared
        # by both drugs is counted once per drug.
        row.extend(_sum_matrix_by_target(
            target_sets, target_to_idx, gexp_norm, gene_to_idx,
            cell_line_to_idx, cell_line))
        row.extend(_sum_nested_by_target(
            target_sets, target_to_idx, gene_to_cell_line_to_mutation,
            cell_line))
        for component in (0, 1):  # 0: cnv value, 1: zygosity
            row.extend(_sum_nested_by_target(
                target_sets, target_to_idx, gene_to_cell_line_to_cnv,
                cell_line, component))
        row.extend(_sum_matrix_by_target(
            target_sets, target_to_idx, meth, meth_gene_to_idx,
            meth_cell_line_to_idx, cell_line))

        # Pathway involvement: number of targets of A plus targets of B
        row.extend(len(targets1 & pathway_to_genes[pathway]) +
                   len(targets2 & pathway_to_genes[pathway])
                   for pathway in targets_pathway)

        # GEXP (median, median of absolute values) per drug
        for targets in target_sets:
            row.extend(_summarize_or_na(
                _matrix_values_for(gexp_norm, gene_to_idx, cell_line_to_idx,
                                   targets, cell_line),
                (numpy.median, _median_abs)))
        # MUT (max) per drug
        for targets in target_sets:
            row.extend(_summarize_or_na(
                _nested_values(gene_to_cell_line_to_mutation, targets,
                               cell_line),
                (numpy.max,)))
        # CNV (max) per drug
        for targets in target_sets:
            row.extend(_summarize_or_na(
                _nested_values(gene_to_cell_line_to_cnv, targets, cell_line,
                               0),
                (numpy.max,)))
        # METH (median, median of absolute values) per drug
        for targets in target_sets:
            row.extend(_summarize_or_na(
                _matrix_values_for(meth, meth_gene_to_idx,
                                   meth_cell_line_to_idx, targets, cell_line),
                (numpy.median, _median_abs)))

        # MEDIAN SYNERGY per cell line / combination
        row.append(cell_line_to_synergy[cell_line]
                   if cell_line in cell_line_to_synergy else "NA")
        row.append(combination_to_synergy[comb_id]
                   if comb_id in combination_to_synergy else "NA")

        # SIMILARITY (two values)
        similarity = ["NA"] * 2
        if comb_id in combination_to_similarity:
            similarity = combination_to_similarity[comb_id]
        row.extend(similarity)

        # Interaction network based features (max degree of A / B)
        for targets in target_sets:
            degrees = [network.degree(target) for target in targets
                       if target in network_nodes]
            row.extend(_summarize_or_na(degrees, (numpy.max,)))

        # Interaction network based distance between A-B (shortest of all)
        distances = [
            network_utilities.get_shortest_path_length_between(
                network, target1, target2)
            for target1 in targets1 if target1 in network_nodes
            for target2 in targets2 if target2 in network_nodes
        ]
        row.extend(_summarize_or_na(distances, (numpy.min,)))

        # GUILD (three values)
        guild = ["NA"] * 3
        if comb_id in combination_to_guild_values:
            values_guild = combination_to_guild_values[comb_id]
            if cell_line in values_guild:
                guild = values_guild[cell_line]
        row.extend(guild)

        # KEGG / COSMIC
        for genes in gene_sets:
            # INVOLVEMENT of A and of B
            for targets in target_sets:
                row.append(len(targets & genes))
            # GEXP: the values are gathered from scratch for each gene set,
            # so indices collected by earlier sections cannot leak in.
            row.extend(_summarize_or_na(
                _matrix_values_for(gexp_norm, gene_to_idx, cell_line_to_idx,
                                   genes, cell_line),
                (numpy.median, numpy.max)))
            # MUT
            row.extend(_summarize_or_na(
                _nested_values(gene_to_cell_line_to_mutation, genes,
                               cell_line),
                (numpy.median, numpy.max)))
            # CNV
            row.extend(_summarize_or_na(
                _nested_values(gene_to_cell_line_to_cnv, genes, cell_line, 0),
                (numpy.median, numpy.max)))
            # CNV of the targets of A / B that belong to the gene set
            for targets in target_sets:
                row.extend(_summarize_or_na(
                    _nested_values(gene_to_cell_line_to_cnv, targets & genes,
                                   cell_line, 0),
                    (numpy.mean,)))
        return row

    with open(out_file, 'w') as f:
        f.write("comb.id cell.line %s\n" % " ".join(header))
        for i, drug1 in enumerate(drugs):
            for drug2 in drugs[i + 1:]:
                # Combination ids are ordered case-insensitively
                if drug1.lower() < drug2.lower():
                    comb_id = "%s.%s" % (drug1, drug2)
                else:
                    comb_id = "%s.%s" % (drug2, drug1)
                if comb_id not in combination_to_values:
                    continue
                cell_line_to_mono_values = combination_to_values[comb_id]
                targets1 = drug_to_values[drug1][0]
                targets2 = drug_to_values[drug2][0]
                for cell_line in cell_line_to_value:
                    if (only_listed_cell_lines and
                            cell_line not in cell_line_to_mono_values):
                        continue
                    seen_combinations.add((comb_id, cell_line))
                    row = build_row(comb_id, cell_line, targets1, targets2)
                    f.write("%s %s %s\n" % (comb_id, cell_line,
                                            " ".join(map(str, row))))

        print("Not in seen combinations:")
        # Placeholder rows are sized from the header, so this also works
        # when no regular row was produced above.
        zero_row = " ".join(["0"] * len(header))
        for comb_id, cell_line_to_mono_values in \
                combination_to_values.items():
            for cell_line in cell_line_to_mono_values:
                if (comb_id, cell_line) not in seen_combinations:
                    print("%s %s" % (comb_id, cell_line))
                    # to amend problematic ch2 test file
                    f.write("%s %s %s\n" % (comb_id, cell_line, zero_row))
    return
# ================================================== # -- load data # PPI graph if args.ppi == 'STRING': G = nx.read_edgelist( "resources/BioNEV/data/STRING_PPI/STRING_PPI.edgelist") nodes = pd.read_csv( 'resources/BioNEV/data/STRING_PPI/node_list.txt', sep='\t', index_col=0) drug_target_dict = load_DTI() covid_protein_list = make_SARSCOV2_PPI() # ppid2id nodes['id'] = nodes.index ppid2id = nodes.set_index('STRING_id').to_dict()['id'] # len 15131 else: network_file = "2016data/network/network.sif" G = wrappers.get_network(network_file, only_lcc=True) # provide drugs objects drugs = load_drugs_from("2016data/target/drug_to_geneids.pcl.all") # provide disease objects diseases = load_diseases_from("2016data/disease/disease_genes.tsv") # ================================================== # ================================================== # -- graph embedding - graphrep # try loading pretrained embedding file_name = f"{args.ppi}_PPI_{args.embs}_embs.txt" emb_folder = 'saved/embs' if args.embs == 'struc2vec': if exists(join(emb_folder, file_name)):