Exemple #1
0
def get_guild_based_synergy():
    """Check individual drugs and pairwise drug combinations on the network.

    Loads the network named by the CONFIG entry "network_file" (with
    only_lcc=True), fetches the drug info restricted to the nodes of that
    network and hands it to guild_drugs() and guild_combinations().

    Expression data is intentionally not loaded here: the three
    expression-related arguments are passed as None to both calls.

    Writing per-combination GUILD summary statistics is no longer done by
    this function; create_feature_file is used for that instead. The old
    code for it sat after an unconditional return and could never run, so it
    has been dropped.

    Returns:
        None.
    """
    # Get network
    network = wrappers.get_network(network_file=CONFIG.get("network_file"),
                                   only_lcc=True)
    nodes = set(network.nodes())
    # Get drug info (only for drugs/targets relative to the network nodes)
    drug_to_values = get_drug_info(nodes=nodes)
    # Expression info is not used in this mode; the placeholders keep the
    # call signatures of guild_drugs / guild_combinations satisfied.
    gexp_norm, gene_to_idx, cell_line_to_idx = None, None, None
    # Check individual drugs
    guild_drugs(drug_to_values, nodes, gexp_norm, gene_to_idx,
                cell_line_to_idx)
    # Check pairwise drug combinations
    guild_combinations(drug_to_values, nodes, gexp_norm, gene_to_idx,
                       cell_line_to_idx)
Exemple #2
0
def create_feature_file():
    """Write the feature table for drug combinations across cell lines.

    Collects drug, cell line, synergy, expression, methylation, mutation,
    CNV, pathway, network, GUILD and drug-similarity information through the
    project helpers and writes one space-separated row per
    (combination, cell line) pair. Each row starts with "comb.id cell.line"
    followed by the feature values; unavailable values are written as "NA".

    The output path is taken from CONFIG according to the "task" entry:
    a task ending in "-train" uses "feature_file_train", "ch1-test" uses
    "feature_file_test_ch1" and "ch2-test" uses "feature_file_test_ch2".

    Only combinations present in the synergy info are written. For
    "ch2-test" every known cell line is emitted for such a combination; for
    all other tasks only the cell lines listed for that combination are.
    Any (combination, cell line) pair from the synergy info that was not
    written this way gets an all-zero row appended at the end.

    Raises:
        ValueError: if the configured task is none of the above.
    """
    # Get drug info; element [0] of each value is the set of drug targets.
    # (get_drug_info_with_predicted_targets() was tried here instead and
    # worsens the performance.)
    drug_to_values = get_drug_info(nodes=None)
    # Get cell line info
    cell_line_to_value = get_cell_line_info()
    # Get synergy info
    combination_to_values = get_synergy_info()
    (cell_line_to_synergy,
     combination_to_synergy) = get_synergy_values_per_cell_line_and_combination()
    # Get gexp info
    gexp_norm, gene_to_idx, cell_line_to_idx = wrappers.get_expression_info(
        gexp_file=CONFIG.get("gexp_file"),
        process=set(["z"]),
        dump_file=CONFIG.get("gexp_dump"))  # process=set(["z", "abs"])
    # Get methylation info (same loader, pointed at the methylation files)
    meth, meth_gene_to_idx, meth_cell_line_to_idx = wrappers.get_expression_info(
        gexp_file=CONFIG.get("methylation_file"),
        process=set(["z"]),
        dump_file=CONFIG.get("methylation_dump"))
    # Get mutation info
    gene_to_cell_line_to_mutation = get_mutation_info()
    # Get CNV info; each per-cell-line entry holds (cnv value, zygosity)
    gene_to_cell_line_to_cnv = get_cnv_info(CONFIG.get("cnv_file"))
    # Get cancer gene & pathway info
    pathway_to_genes = get_pathway_info(nodes=None)
    genes_pathway = set()
    for genes in (pathway_to_genes["kegg"], pathway_to_genes["census"]):
        genes_pathway |= genes
    genes_pathway = list(genes_pathway)
    # Get network
    network = wrappers.get_network(network_file=CONFIG.get("network_file"),
                                   only_lcc=True)
    network_nodes = set(network.nodes())
    # Get GUILD info
    combination_to_guild_values = get_guild_based_synergy_scores(
        list(drug_to_values.keys()), genes_pathway, gexp_norm, gene_to_idx,
        cell_line_to_idx)
    # Get drug similarity info
    combination_to_similarity = get_drug_similarity(drug_to_values)
    # Choose the output file for the configured task
    task = CONFIG.get("task")
    if task.endswith("-train"):
        out_file = CONFIG.get("feature_file_train")
    elif task == "ch1-test":
        out_file = CONFIG.get("feature_file_test_ch1")
    elif task == "ch2-test":
        out_file = CONFIG.get("feature_file_test_ch2")
    else:
        raise ValueError("Unknown task: " + task)
    # Summary features; their order must match the order in which the
    # values are appended to each row below.
    fixed_features = [
        "gexpA.med", "gexpA.amed", "gexpB.med", "gexpB.amed", "mutA", "mutB",
        "cnvA", "cnvB", "metA.med", "metA.amed", "metB.med", "metB.amed",
        "cell.med", "comb.med", "sim.target", "sim.chemical", "kA", "kB",
        "dAB", "guild.common", "guild.med", "guild.max", "kegg.inA",
        "kegg.inB", "kegg.gexp.med", "kegg.gexp.max", "kegg.mut.med",
        "kegg.mut.max", "kegg.cnv.med", "kegg.cnv.max", "kegg.cnvA",
        "kegg.cnvB", "cosmic.inA", "cosmic.inB", "cosmic.gexp.med",
        "cosmic.gexp.max", "cosmic.mut.med", "cosmic.mut.max",
        "cosmic.cnv.med", "cosmic.cnv.max", "cosmic.cnvA", "cosmic.cnvB"
    ]
    drugs = list(drug_to_values.keys())
    seen_combinations = set()
    # Get all targets (union over every unordered pair of drugs)
    targets_all = set()
    for i, drug1 in enumerate(drugs):
        for j, drug2 in enumerate(drugs):
            if i >= j:
                continue
            targets1 = drug_to_values[drug1][0]
            targets2 = drug_to_values[drug2][0]
            targets_all |= targets1 | targets2
    targets_all = list(targets_all)
    target_to_idx = dict((target, i) for i, target in enumerate(targets_all))
    # One "involvement" column per pathway key
    targets_pathway = list(pathway_to_genes)
    # Create header: per-target columns for gexp (.g), mutation (.m),
    # cnv (.c), zygosity (.z) and methylation (.e), then per-pathway
    # columns (.p), then the summary features.
    features = (
        [".g" + x for x in targets_all] +
        [".m" + x for x in targets_all] +
        [".c" + x for x in targets_all] +
        [".z" + x for x in targets_all] +
        [".e" + x for x in targets_all] +
        [".p" + x.replace("_", ".") for x in targets_pathway] +
        fixed_features)
    # The context manager closes the file even if a lookup below fails.
    with open(out_file, 'w') as f:
        f.write("comb.id cell.line %s\n" % " ".join(features))
        for i, drug1 in enumerate(drugs):
            for j, drug2 in enumerate(drugs):
                if i >= j:
                    continue
                # Combination ids put the case-insensitively smaller name
                # first.
                if drug1.lower() < drug2.lower():
                    comb_id = "%s.%s" % (drug1, drug2)
                else:
                    comb_id = "%s.%s" % (drug2, drug1)
                if comb_id not in combination_to_values:
                    continue
                if task != "ch2-test":
                    cell_line_to_mono_values = combination_to_values[comb_id]
                targets1 = drug_to_values[drug1][0]
                targets2 = drug_to_values[drug2][0]
                for cell_line in cell_line_to_value:
                    # Except for ch2-test, restrict to the cell lines that
                    # are listed for this combination.
                    if (task != "ch2-test"
                            and cell_line not in cell_line_to_mono_values):
                        continue
                    seen_combinations.add((comb_id, cell_line))
                    feature_values = []
                    # GEXP categorized
                    values = [0] * len(targets_all)
                    if cell_line in cell_line_to_idx:
                        for targets in (targets1, targets2):
                            for target in targets:
                                if target in gene_to_idx:
                                    values[target_to_idx[target]] += gexp_norm[
                                        gene_to_idx[target],
                                        cell_line_to_idx[cell_line]]
                    feature_values.extend(values)
                    # MUT categorized
                    values = [0] * len(targets_all)
                    for targets in (targets1, targets2):
                        for target in targets:
                            if target in gene_to_cell_line_to_mutation:
                                d = gene_to_cell_line_to_mutation[target]
                                if cell_line in d:
                                    values[target_to_idx[target]] += d[cell_line]
                    feature_values.extend(values)
                    # CNV & ZYG categorized
                    values = [0] * len(targets_all)
                    values2 = [0] * len(targets_all)
                    for targets in (targets1, targets2):
                        for target in targets:
                            if target in gene_to_cell_line_to_cnv:
                                d = gene_to_cell_line_to_cnv[target]
                                if cell_line in d:
                                    # d contains cnv value and zygosity
                                    idx = target_to_idx[target]
                                    values[idx] += d[cell_line][0]
                                    values2[idx] += d[cell_line][1]
                    feature_values.extend(values)
                    feature_values.extend(values2)
                    # METH categorized
                    values = [0] * len(targets_all)
                    if cell_line in meth_cell_line_to_idx:
                        for targets in (targets1, targets2):
                            for target in targets:
                                if target in meth_gene_to_idx:
                                    values[target_to_idx[target]] += meth[
                                        meth_gene_to_idx[target],
                                        meth_cell_line_to_idx[cell_line]]
                    feature_values.extend(values)
                    # KEGG / COSMIC INVOLVEMENT categorized: number of
                    # targets of A plus number of targets of B per pathway
                    values = [0] * len(targets_pathway)
                    for k, pathway in enumerate(targets_pathway):
                        values[k] += len(targets1 & pathway_to_genes[pathway])
                        values[k] += len(targets2 & pathway_to_genes[pathway])
                    feature_values.extend(values)
                    # GEXP (median and median of absolute values, per drug)
                    for targets in (targets1, targets2):
                        indices = []
                        for target in targets:
                            if target in gene_to_idx:
                                indices.append(gene_to_idx[target])
                        if not indices or cell_line not in cell_line_to_idx:
                            vals = ["NA"] * 2
                        else:
                            values = gexp_norm[indices,
                                               cell_line_to_idx[cell_line]]
                            vals = [
                                numpy.median(values),
                                numpy.median(numpy.abs(values))
                            ]
                        feature_values.extend(vals)
                    # MUT (max per drug)
                    for targets in (targets1, targets2):
                        values = []
                        for target in targets:
                            if target in gene_to_cell_line_to_mutation:
                                d = gene_to_cell_line_to_mutation[target]
                                if cell_line in d:
                                    values.append(d[cell_line])
                        if not values:
                            val = "NA"
                        else:
                            val = numpy.max(values)
                        feature_values.append(val)
                    # CNV (max per drug)
                    for targets in (targets1, targets2):
                        values = []
                        for target in targets:
                            if target in gene_to_cell_line_to_cnv:
                                d = gene_to_cell_line_to_cnv[target]
                                if cell_line in d:
                                    values.append(d[cell_line][0])
                        if not values:
                            val = "NA"
                        else:
                            val = numpy.max(values)
                        feature_values.append(val)
                    # METH (median and median of absolute values, per drug)
                    for targets in (targets1, targets2):
                        indices = []
                        for target in targets:
                            if target in meth_gene_to_idx:
                                indices.append(meth_gene_to_idx[target])
                        if (not indices
                                or cell_line not in meth_cell_line_to_idx):
                            vals = ["NA"] * 2
                        else:
                            values = meth[indices,
                                          meth_cell_line_to_idx[cell_line]]
                            vals = [
                                numpy.median(values),
                                numpy.median(numpy.abs(values))
                            ]
                        feature_values.extend(vals)
                    # MEDIAN SYNERGY per cell line / combination
                    vals = ["NA"] * 2
                    if cell_line in cell_line_to_synergy:
                        vals[0] = cell_line_to_synergy[cell_line]
                    if comb_id in combination_to_synergy:
                        vals[1] = combination_to_synergy[comb_id]
                    feature_values.extend(vals)
                    # SIMILARITY
                    vals = ["NA"] * 2
                    if comb_id in combination_to_similarity:
                        vals = combination_to_similarity[comb_id]
                    feature_values.extend(vals)
                    # Interaction network based features (degree A/B)
                    for targets in (targets1, targets2):
                        values = []
                        for target in targets:
                            if target in network_nodes:
                                values.append(network.degree(target))
                        if not values:
                            val = "NA"
                        else:
                            val = numpy.max(values)
                        feature_values.append(val)
                    # Interaction network based distance between A-B
                    # (shortest over all target pairs found in the network)
                    values = []
                    for target1 in targets1:
                        if target1 not in network_nodes:
                            continue
                        for target2 in targets2:
                            if target2 not in network_nodes:
                                continue
                            d = network_utilities.get_shortest_path_length_between(
                                network, target1, target2)
                            values.append(d)
                    if not values:
                        val = "NA"
                    else:
                        val = numpy.min(values)
                    feature_values.append(val)
                    # GUILD
                    vals = ["NA"] * 3
                    if comb_id in combination_to_guild_values:
                        values_guild = combination_to_guild_values[comb_id]
                        if cell_line in values_guild:
                            vals = values_guild[cell_line]
                    feature_values.extend(vals)
                    # KEGG / COSMIC
                    for genes in (pathway_to_genes["kegg"],
                                  pathway_to_genes["census"]):
                        # INVOLVEMENT
                        for targets in (targets1, targets2):
                            feature_values.append(len(targets & genes))
                        # GEXP
                        # Start from an empty list: without this reset
                        # `indices` would still contain the methylation row
                        # indices collected for drug B above (and, on the
                        # second pass, the previous gene set's indices),
                        # which do not belong to this gene set.
                        indices = []
                        for target in genes:
                            if target in gene_to_idx:
                                indices.append(gene_to_idx[target])
                        if not indices or cell_line not in cell_line_to_idx:
                            vals = ["NA"] * 2
                        else:
                            values = gexp_norm[indices,
                                               cell_line_to_idx[cell_line]]
                            vals = [numpy.median(values), numpy.max(values)]
                        feature_values.extend(vals)
                        # MUT
                        values = []
                        for target in genes:
                            if target in gene_to_cell_line_to_mutation:
                                d = gene_to_cell_line_to_mutation[target]
                                if cell_line in d:
                                    values.append(d[cell_line])
                        if not values:
                            vals = ["NA"] * 2
                        else:
                            vals = [numpy.median(values), numpy.max(values)]
                        feature_values.extend(vals)
                        # CNV
                        values = []
                        for target in genes:
                            if target in gene_to_cell_line_to_cnv:
                                d = gene_to_cell_line_to_cnv[target]
                                if cell_line in d:
                                    values.append(d[cell_line][0])
                        if not values:
                            vals = ["NA"] * 2
                        else:
                            vals = [numpy.median(values), numpy.max(values)]
                        feature_values.extend(vals)
                        # CNV target (mean over the drug's targets that are
                        # in this gene set)
                        for targets in (targets1, targets2):
                            values = []
                            for target in targets & genes:
                                if target in gene_to_cell_line_to_cnv:
                                    d = gene_to_cell_line_to_cnv[target]
                                    if cell_line in d:
                                        values.append(d[cell_line][0])
                            if not values:
                                val = "NA"
                            else:
                                val = numpy.mean(values)
                            feature_values.append(val)
                    f.write(
                        "%s %s %s\n" %
                        (comb_id, cell_line, " ".join(map(str, feature_values))))
        print("Not in seen combinations:")
        # Pad with all-zero rows to amend the problematic ch2 test file. The
        # row width comes from the header so this also works when no regular
        # row was written above.
        zero_row = " ".join(["0"] * len(features))
        for comb_id, cell_line_to_mono_values in combination_to_values.items():
            for cell_line in cell_line_to_mono_values:
                if (comb_id, cell_line) not in seen_combinations:
                    print("%s %s" % (comb_id, cell_line))
                    f.write("%s %s %s\n" % (comb_id, cell_line, zero_row))
    return
# ==================================================
# -- load data
# PPI graph
# Two mutually exclusive data sources are supported, selected by args.ppi.
# NOTE: the two branches define different module-level names (only `G` is
# common to both), so later code may only rely on the names created by the
# branch that actually ran.
if args.ppi == 'STRING':
    # Graph is read from a plain edge list file.
    G = nx.read_edgelist(
        "resources/BioNEV/data/STRING_PPI/STRING_PPI.edgelist")
    # Tab-separated node table; its first column becomes the index.
    nodes = pd.read_csv(
        'resources/BioNEV/data/STRING_PPI/node_list.txt', sep='\t', index_col=0)
    drug_target_dict = load_DTI()
    covid_protein_list = make_SARSCOV2_PPI()
    # ppid2id: maps each 'STRING_id' value to the node table's index value
    # (the index is first copied into an 'id' column so that it survives
    # re-indexing by 'STRING_id').
    nodes['id'] = nodes.index
    ppid2id = nodes.set_index('STRING_id').to_dict()['id']  # len 15131
else:
    # Any other args.ppi value falls back to the 2016 network data.
    network_file = "2016data/network/network.sif"
    # presumably only_lcc=True keeps only the largest connected component
    # -- TODO confirm against wrappers.get_network
    G = wrappers.get_network(network_file, only_lcc=True)

    # provide drugs objects
    drugs = load_drugs_from("2016data/target/drug_to_geneids.pcl.all")

    # provide disease objects
    diseases = load_diseases_from("2016data/disease/disease_genes.tsv")
# ==================================================

# ==================================================
# -- graph embedding - graphrep
# try loading pretrained embedding
# The embedding file name encodes both the PPI source and the embedding
# method chosen on the command line.
file_name = f"{args.ppi}_PPI_{args.embs}_embs.txt"
# Folder in which the embedding files are looked up.
emb_folder = 'saved/embs'
if args.embs == 'struc2vec':
    if exists(join(emb_folder, file_name)):