예제 #1
0
def _parse_source_list(source_arg):
    """Turn the raw ``--source`` string into a list of stripped curie IDs.

    A value that does not start with ``[`` is treated as a single ID.  A
    bracketed value is parsed as a Python list literal; when it contains no
    quote characters at all (e.g. ``[A:1,B:2]``) every element is quoted
    first so that ``ast.literal_eval`` can read it.

    :param source_arg: str, the value given on the command line
    :return: list of str
    """
    # startswith() (rather than indexing [0]) keeps an empty string from
    # raising IndexError.
    if not source_arg.startswith("["):
        return [source_arg]

    # Only add quotes when the user supplied none.  Testing for the exact
    # substring "','" mis-detected already-quoted input such as "['a']" or
    # "['a', 'b']" and double-quoted it into a syntax error.
    if "'" not in source_arg and '"' not in source_arg:
        source_arg = (source_arg.replace(",", "','")
                      .replace("[", "['")
                      .replace("]", "']"))

    return [source.strip() for source in ast.literal_eval(source_arg)]


def main():
    """Command-line entry point for the Fisher-exact enrichment question.

    Parses the command-line arguments, then either prints the question
    description (``--describe``) or runs ``QuestionFisher.answer`` and prints
    the enriched targets, as plain text or, with ``--json``, through a
    ``FormatOutput.FormatResponse`` object.

    :return: None
    """
    parser = argparse.ArgumentParser(
        description=
        "Answers questions of the form: 'what pathways are most enriched by $protein_list?'",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-s',
                        '--source',
                        type=str,
                        help="source curie ID",
                        default="UniProtKB:Q96M43")
    parser.add_argument('-t',
                        '--target',
                        type=str,
                        help="target node type",
                        default="pathway")
    parser.add_argument('-y',
                        '--type',
                        type=str,
                        help="source node type",
                        default="protein")
    parser.add_argument(
        '-j',
        '--json',
        action='store_true',
        help=
        'Flag specifying that results should be printed in JSON format (to stdout)',
        default=False)
    parser.add_argument(
        '-r',
        '--rel_type',
        type=str,
        help='Only do the Fisher exact test along edges of this type',
        default=None)
    parser.add_argument(
        '--describe',
        action='store_true',
        help='Print a description of the question to stdout and quit',
        default=False)
    parser.add_argument('--num_show',
                        type=int,
                        help='Maximum number of results to return',
                        default=20)

    # Parse and check args
    args = parser.parse_args()
    target_type = args.target
    use_json = args.json
    source_list = _parse_source_list(args.source)

    # Initialize the question class
    Q = QuestionFisher()

    if args.describe:
        print(Q.describe())
        return None

    # Initialize the response class
    # NOTE(review): the meaning of the literal 6 is not visible here; confirm
    # against FormatOutput.FormatResponse.
    response = FormatOutput.FormatResponse(6)
    response.response.table_column_names = [
        "target name", "target ID", "P value"
    ]

    q_answer = Q.answer(source_list,
                        args.type,
                        target_type,
                        use_json=use_json,
                        num_show=args.num_show,
                        rel_type=args.rel_type)

    if not q_answer:  # nothing to report
        return None  # All messages printed out; safe to quit

    p_dict, target_list = q_answer

    # Plain-text output: one "<name> <p value>" line per target
    if not use_json:
        for target_name in target_list:
            target_description = RU.get_node_property(
                target_name, "name", node_label=target_type)
            print("%s %f" % (target_description, p_dict[target_name]))
        return None

    # JSON output: one result (sub-graph plus table row) per target
    for target_name in target_list:
        target_description = RU.get_node_property(target_name,
                                                  "name",
                                                  node_label=target_type)
        confidence = p_dict[target_name]
        # populate the graph
        graph = RU.get_graph_from_nodes([target_name])
        res = response.add_subgraph(
            graph.nodes(data=True),
            graph.edges(data=True),
            "The target %s is enriched by %s." %
            (target_description, str(source_list)),
            confidence,
            return_result=True)
        res.essence = "%s" % target_description  # populate with essence of question result
        # Row order matches table_column_names set above
        res.row_data = [
            "%s" % target_description,
            "%s" % target_name,
            "%f" % confidence,
        ]
    response.print()
예제 #2
0
    def answer(drug_id,
               use_json=False,
               num_show=20,
               rev=True,
               conservative=True):
        """
        Answers the question 'what diseases does $drug commonly treat?'

        The drug's name is looked up in the KG, the conditions associated
        with it are fetched from COHD, ordered by 'concept_count' and cut to
        the first num_show.  They are then printed as text or, with
        use_json, reported through a FormatOutput.FormatResponse.

        :param drug_id: KG drug node id
        :param use_json: bool, use JSON output
        :param num_show: int, number to display
        :param rev: bool, True to order by most frequent, False by least frequent
        :param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD
        :return: 1 when, in JSON mode, none of the conditions could be mapped
            to a KG node (an error response is printed); None otherwise
        """

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # get the description
        drug_description = RU.get_node_property(drug_id,
                                                'name',
                                                name_type='id')

        # Get the conditions that COHD says it's used to treat
        conditions_treated = COHDUtilities.get_conditions_treating(
            drug_description, conservative=conservative)

        # sort the condition ids by count and reduce to the top n
        ranked_ids = sorted(
            conditions_treated,
            key=lambda cohd_id: conditions_treated[cohd_id]['concept_count'],
            reverse=rev)
        top_ids = ranked_ids[:num_show]

        # return the results
        if not use_json:
            if rev:
                to_print = "The most common conditions "
            else:
                to_print = "The least common conditions "
            to_print += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description
            for cohd_id in top_ids:
                condition = conditions_treated[cohd_id]
                to_print += "condition: %s\t count %d \t frequency %f \n" % (
                    condition['associated_concept_name'],
                    condition['concept_count'],
                    condition['concept_frequency'])
            print(to_print)
            return None

        #  otherwise, you want a JSON output
        #  Attempt to map the COHD names to the KG (this takes some time). TODO: find further speed improvements
        drug_as_graph = RU.get_node_as_graph(drug_id)
        drug_node_info = list(drug_as_graph.nodes(data=True))[0][1]

        # Lookups tried in order for each condition name: exact name first,
        # then lower-cased; phenotypic_feature before disease each time.
        lookups = ((False, "phenotypic_feature"), (False, "disease"),
                   (True, "phenotypic_feature"), (True, "disease"))

        cohd_to_kg_id = dict()
        kg_to_cohd_id = dict()
        for cohd_id in top_ids:
            condition_name = conditions_treated[cohd_id][
                'associated_concept_name']
            cohd_to_kg_id[cohd_id] = None
            for lowercase, label in lookups:
                # A failed lookup just means "try the next variant", so the
                # error is deliberately swallowed here.
                try:
                    query_name = condition_name.lower(
                    ) if lowercase else condition_name
                    cohd_to_kg_id[cohd_id] = RU.get_id_from_property(
                        query_name, 'name', label=label)
                    kg_to_cohd_id[cohd_to_kg_id[cohd_id]] = cohd_id
                    break
                except Exception:
                    continue

        # get the graph (one call) of all the nodes that were mapped
        KG_names = [
            cohd_to_kg_id[cohd_id] for cohd_id in top_ids
            if cohd_to_kg_id[cohd_id] is not None
        ]

        if not KG_names:
            error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description
            error_code = "EmptyResult"
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        all_conditions_graph = RU.get_graph_from_nodes(KG_names)

        # Get the info of the mapped nodes, keyed by COHD id
        cohd_id_to_info = dict()
        for _, data in all_conditions_graph.nodes(data=True):
            cohd_id_to_info[kg_to_cohd_id[data['properties']['id']]] = data

        # for each condition, return the results (with the nice sub-graph if the cohd id's were mapped)
        for cohd_id in top_ids:
            condition = conditions_treated[cohd_id]
            frequency = condition['concept_frequency']
            to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \
                       "%f out of all patients treated with %s (count=%d)." % (
                           drug_description,
                           condition['associated_concept_name'], frequency,
                           drug_description, condition['concept_count'])

            if cohd_to_kg_id[cohd_id] is None:
                # Unmapped condition: report it against the drug node alone.
                # The drug graph fetched above is reused instead of being
                # re-fetched on every iteration.
                to_print += " This condition is not in our " \
                            "Knowledge graph, so no graph is shown."
                response.add_subgraph(drug_as_graph.nodes(data=True),
                                      drug_as_graph.edges(data=True),
                                      to_print, frequency)
                continue

            disease_node_info = cohd_id_to_info[cohd_id]
            nodes = [(2, disease_node_info), (1, drug_node_info)]
            # Synthetic drug -> condition 'treats' edge attributed to COHD
            edges = [(1, 2, {
                'id': 3,
                'properties': {
                    'is_defined_by': 'RTX',
                    'predicate': 'treats',
                    'provided_by': 'COHD',
                    'relation': 'treats',
                    'seed_node_uuid': '-1',
                    'source_node_uuid': drug_node_info['properties']['UUID'],
                    'target_node_uuid':
                    disease_node_info['properties']['UUID']
                },
                'type': 'treats'
            })]
            response.add_subgraph(nodes, edges, to_print, frequency)
        response.print()
예제 #3
0
    def answer(tissue_id,
               input_protein_list,
               use_json=False,
               num_show=20,
               rev=True):
        """
        Reports the proteins that Lil'GIM correlates with a set of proteins in a tissue.

        The tissue and each protein are first resolved to KG ids when they
        are not already ids.  Lil'GIM is then queried, the result is ordered
        by correlation and cut to the first num_show, and finally printed as
        text or, with use_json, reported through a
        FormatOutput.FormatResponse.

        :param tissue_id: id of the tissue node, or a value that can be resolved to an anatomical_entity id
        :param input_protein_list: list of protein ids (or values resolvable to protein ids); not modified
        :param use_json: bool, use JSON output
        :param num_show: int, maximum number of correlated proteins to report
        :param rev: bool, True to order by highest correlation first, False by lowest first
        :return: 1 when the Lil'GIM query raised (an error response is printed); None otherwise
        """

        def lookup_name(protein_id):
            # Name of the protein node with this id; raising is left to the caller.
            return RU.get_node_property(protein_id,
                                        "name",
                                        node_label="protein",
                                        name_type="id")

        def name_or_id(protein_id):
            # Best-effort display name: fall back to the raw id when the lookup fails.
            try:
                return lookup_name(protein_id)
            except Exception:
                return protein_id

        # Initialize the response class
        response = FormatOutput.FormatResponse(6)

        # Make sure everything exists in the graph
        if not RU.node_exists_with_property(tissue_id, "id"):
            tissue_id = RU.get_node_property(tissue_id,
                                             "id",
                                             node_label="anatomical_entity")

        # Resolve into a new list so the caller's list is left untouched
        protein_ids = []
        for protein in input_protein_list:
            if not RU.node_exists_with_property(protein, "id"):
                protein = RU.get_node_property(protein,
                                               "id",
                                               node_label="protein")
            protein_ids.append(protein)

        # Initialize the QueryLilGim class
        q = QueryLilGIM.QueryLilGIM()

        # get the description
        tissue_description = RU.get_node_property(
            tissue_id, 'name', node_label="anatomical_entity")

        # Get the correlated proteins
        try:
            correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(
                tissue_id, tuple(protein_ids))
        except Exception:
            error_message = "Lil'GIM is experiencing a problem."
            error_code = "LilGIMerror"
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

        # (protein id, correlation) pairs, sorted by correlation, top n only
        top_correlated = sorted(correlated_proteins_dict.items(),
                                key=lambda pair: pair[1],
                                reverse=rev)[0:num_show]

        # return the results
        if not use_json:
            # "A", "A, and B", "A, B, and C", ... ("" for an empty list)
            names = [name_or_id(protein) for protein in protein_ids]
            if len(names) > 1:
                protein_descriptions = "%s, and %s" % (", ".join(
                    names[:-1]), names[-1])
            else:
                protein_descriptions = "".join(names)

            if rev:
                to_print = "In the tissue: %s, the proteins that correlate most with %s" % (
                    tissue_description, protein_descriptions)
            else:
                to_print = "In the tissue: %s, the proteins that correlate least with %s" % (
                    tissue_description, protein_descriptions)
            to_print += " according to Lil'GIM, are:\n"
            for protein, corr in top_correlated:
                to_print += "protein: %s\t correlation %f\n" % (
                    name_or_id(protein), corr)
            print(to_print)
            return None

        #  otherwise, you want a JSON output
        # just get the ones that are actually in the KG (name lookup succeeds).
        # TODO: do something with the ones that are not in the KG
        in_KG = []
        for protein, corr in top_correlated:
            try:
                name = lookup_name(protein)
            except Exception:
                continue
            # keep the name so it does not have to be looked up a second time
            in_KG.append((protein, name, corr))

        # Return the results
        full_g = RU.get_graph_from_nodes(
            [protein for protein, _, _ in in_KG], node_property_label="id")
        id2node = dict()
        for _, node in full_g.nodes(data=True):
            id2node[node['properties']['id']] = node
        for protein, name, corr in in_KG:
            to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % (
                tissue_description, name, corr)
            response.add_subgraph([(protein, id2node[protein])], [],
                                  to_print, corr)
        response.print()