def make_one_hop_query():
    """Write a one-hop, therapeutic-wildcard query to ``one_hop.json``.

    The query is built for the single gene ``ENSEMBL:ENSG00000121879`` with
    both ``therapeutic_wildcard`` and ``one_hop`` switched on, and is then
    serialized as JSON into the current working directory.
    """
    query_options = {
        "genes": ["ENSEMBL:ENSG00000121879"],
        "therapeutic_wildcard": True,
        "one_hop": True,
    }
    one_hop_query = build_query(**query_options)

    with open('one_hop.json', 'w') as out_file:
        json.dump(one_hop_query, out_file)
def make_drug_wildcard_query():
    """Write one drug-wildcard query per survival horizon to JSON files.

    For each horizon of 1, 2, 5 and 10 years a query is built for gene
    ``ENSEMBL:ENSG00000012048`` and disease ``MONDO:0007254`` whose outcome
    threshold is ``365 * years``.  Each query is written to
    ``dw_brac1_<years>yr.json`` in the current working directory.
    """
    for years in (1, 2, 5, 10):
        # The outcome threshold is expressed as 365 units per year.
        outcome = ("EFO:0000714", ">=", 365 * years)
        wildcard_query = build_query(
            genes=["ENSEMBL:ENSG00000012048"],
            disease="MONDO:0007254",
            outcome=outcome,
            therapeutic_wildcard=True,
        )

        target_path = 'dw_brac1_{}yr.json'.format(years)
        with open(target_path, 'w') as out_file:
            json.dump(wildcard_query, out_file)
def make_standard_probablistic_query_one_gene():
    """Build a standard probabilistic query with one gene and one drug.

    The query combines gene ``ENSEMBL:ENSG00000141510``, therapeutic
    ``CHEMBL:CHEMBL88`` and disease ``MONDO:0007254`` with the outcome
    condition ``EFO:0000714 >= 500``.  It is written as JSON to
    ``standard_one_gene.json`` in the current working directory.
    """
    query_options = {
        "genes": ["ENSEMBL:ENSG00000141510"],
        "therapeutic": "CHEMBL:CHEMBL88",
        "disease": "MONDO:0007254",
        "outcome": ("EFO:0000714", ">=", 500),
    }
    standard_query = build_query(**query_options)

    with open('standard_one_gene.json', 'w') as out_file:
        json.dump(standard_query, out_file)
def make_gene_wildcard_query():
    """Build a gene wildcard query and write it to ``gene_wildcard.json``.

    The therapeutic (``CHEMBL:CHEMBL88``), disease (``MONDO:0007254``) and
    outcome (``EFO:0000714 >= 500``) are fixed; the gene is left open by
    requesting a single gene wildcard.
    """
    query_options = {
        "therapeutic": "CHEMBL:CHEMBL88",
        "disease": "MONDO:0007254",
        "outcome": ("EFO:0000714", ">=", 500),
        "num_gene_wildcards": 1,
    }
    wildcard_query = build_query(**query_options)

    with open('gene_wildcard.json', 'w') as out_file:
        json.dump(wildcard_query, out_file)
예제 #5
0
# How many random queries to generate
NUM_QUERIES = 10

# Fix the seed so the same drugs are drawn on every run
random.seed(111)

# Get client
#client = get_client()

# Fetch the curies exposed by the TRAPI interface
logger.info('Getting curies.')
curies = TrapiInterface().get_curies()
#curies = client.curies()
logger.info('Got curies.')

# Build one-hop queries that pair one randomly drawn drug with a single
# gene wildcard.
drug_curies = list(curies["biolink:Drug"].keys())
queries = []
for _ in range(NUM_QUERIES):
    # build_query receives the drawn drug wrapped in a one-element list.
    drug = [random.choice(drug_curies)]
    q = build_query(
        therapeutic=drug,
        num_gene_wildcards=1,
        one_hop=True,
    )
    # Echo every generated query so the run can be inspected by eye.
    print(json.dumps(q, indent=2))
    queries.append(q)

# Persist the generated queries as a pickle
with open('random_gene_one_hop_queries.pk', 'wb') as pickle_file:
    pickle.dump(queries, pickle_file)
# How many random queries to generate
NUM_QUERIES = 10

# Fix the seed so the same genes are drawn on every run
random.seed(111)

# Get client
#client = get_client()

# Fetch the curies exposed by the TRAPI interface
logger.info('Getting curies.')
curies = TrapiInterface().get_curies()
#curies = client.curies()
logger.info('Got curies.')

# Build one-hop queries that pair one randomly drawn gene with a
# therapeutic wildcard.
gene_curies = list(curies["biolink:Gene"].keys())
queries = []
for _ in range(NUM_QUERIES):
    genes = [random.choice(gene_curies)]
    q = build_query(
        genes=genes,
        therapeutic_wildcard=True,
        one_hop=True,
    )
    # Echo every generated query so the run can be inspected by eye.
    print(json.dumps(q, indent=2))
    queries.append(q)

# Persist the generated queries as a pickle
with open('random_drug_one_hop_queries.pk', 'wb') as pickle_file:
    pickle.dump(queries, pickle_file)
예제 #7
0
    def _answer_query_using_CHP_client(
            self, query_graph: QueryGraph,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """Answer a single gene/drug edge of ``query_graph`` with the CHP client.

        Only the first edge of ``query_graph`` is processed.  One end of the
        edge must resolve to a drug and the other to a gene; an end is either
        pinned (its ``id`` holds one curie or a list of curies) or a wildcard
        (``id`` is ``None`` and its first category decides drug vs. gene).
        Every gene/drug pair is queried for disease ``MONDO:0007254`` with the
        outcome ``EFO:0000714 >= self.CHP_survival_threshold`` and the returned
        probability is stored on a ``paired_with`` edge whose subject is the
        gene and whose object is the drug.

        Args:
            query_graph: Query graph whose first edge is answered.
            log: Response object that receives debug/warning/error messages.

        Returns:
            The knowledge graph holding the answer edges and their nodes.  It
            is returned empty whenever an error has been logged.
        """
        qedge_key = next(iter(query_graph.edges))
        log.debug(
            f"Processing query results for edge {qedge_key} by using CHP client"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        gene_label_list = ['gene']
        drug_label_list = ['drug', 'chemicalsubstance']

        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]

        # At least one end of the edge has to be pinned to a curie.
        if source_qnode.id is None and target_qnode.id is None:
            log.error(f"Both ends of edge {qedge_key} are None",
                      error_code="BadEdge")
            return final_kg

        allowed_labels = drug_label_list + gene_label_list
        resolved, source_pass_nodes, source_category = self._chp_resolve_qnode(
            source_qnode, source_qnode_key, allowed_labels, "NotAllowable",
            log)
        if not resolved:
            return final_kg
        # NOTE(review): the target side has always reported non-allowable
        # curies as "CategoryError" while the source side uses "NotAllowable";
        # kept as-is so callers matching on the code are unaffected.
        resolved, target_pass_nodes, target_category = self._chp_resolve_qnode(
            target_qnode, target_qnode_key, allowed_labels, "CategoryError",
            log)
        if not resolved:
            return final_kg

        # A pinned end is typed by its first accepted curie, a wildcard end
        # by its normalized category.
        if source_pass_nodes is not None:
            source_is_drug = source_pass_nodes[0] in self.allowable_drug_curies
        else:
            source_is_drug = source_category in drug_label_list
        if target_pass_nodes is not None:
            target_is_drug = target_pass_nodes[0] in self.allowable_drug_curies
        else:
            target_is_drug = target_category in drug_label_list

        if source_is_drug == target_is_drug:
            source_category_temp = 'drug' if source_is_drug else 'gene'
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg

        disease = 'MONDO:0007254'
        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
        # curie -> qnode key, in insertion order, for the nodes added below.
        source_dict = dict()
        target_dict = dict()

        if source_pass_nodes is not None and target_pass_nodes is not None:
            # Both ends pinned: one single query per (source, target) pair.
            for source_curie, target_curie in itertools.product(
                    source_pass_nodes, target_pass_nodes):
                if source_is_drug:
                    gene_curie, drug_curie = target_curie, source_curie
                else:
                    gene_curie, drug_curie = source_curie, target_curie

                # The CHP query uses the 'CHEMBL:' prefix; the knowledge
                # graph keeps the curie exactly as it was given.
                q = build_query(genes=[gene_curie],
                                therapeutic=drug_curie.replace(
                                    'CHEMBL.COMPOUND:', 'CHEMBL:'),
                                disease=disease,
                                outcome=outcome)
                response = self.client.query(q)
                max_probability = self.client.get_outcome_prob(response)
                swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                    gene_curie, drug_curie, "paired_with", max_probability)

                source_dict[source_curie] = source_qnode_key
                target_dict[target_curie] = target_qnode_key
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
        else:
            # Exactly one end is pinned; the other is a category wildcard.
            source_is_pinned = source_pass_nodes is not None
            if source_is_pinned:
                pinned_nodes = source_pass_nodes
                pinned_is_drug = source_is_drug
                wildcard_category = target_category
            else:
                pinned_nodes = target_pass_nodes
                pinned_is_drug = target_is_drug
                wildcard_category = source_category

            # The candidate partners depend only on the wildcard category, so
            # they are resolved once instead of once per pinned curie.
            if pinned_is_drug:
                partners = [
                    curie for curie in self.allowable_gene_curies
                    if self._chp_curie_has_category(curie, wildcard_category)
                ]
            else:
                # The synonymizer is queried with the 'CHEMBL.COMPOUND:'
                # form while the CHP query needs the 'CHEMBL:' form.
                partners = [
                    curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                    for curie in self.allowable_drug_curies
                    if self._chp_curie_has_category(
                        curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'),
                        wildcard_category)
                ]

            for pinned_curie in pinned_nodes:
                if pinned_is_drug:
                    therapeutic = pinned_curie.replace('CHEMBL.COMPOUND:',
                                                       'CHEMBL:')
                    queries = [
                        build_query(
                            genes=[gene],
                            therapeutic=therapeutic,
                            disease=disease,
                            outcome=outcome,
                        ) for gene in partners
                    ]
                else:
                    genes = [pinned_curie]
                    queries = [
                        build_query(
                            genes=genes,
                            therapeutic=drug,
                            disease=disease,
                            outcome=outcome,
                        ) for drug in partners
                    ]

                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)

                for result, partner in zip(res["message"], partners):
                    if pinned_is_drug:
                        gene_curie, drug_curie = partner, pinned_curie
                    else:
                        # Back to the knowledge-graph form of the drug curie.
                        partner = partner.replace('CHEMBL:',
                                                  'CHEMBL.COMPOUND:')
                        gene_curie, drug_curie = pinned_curie, partner
                    prob = self.client.get_outcome_prob(result)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        gene_curie, drug_curie, "paired_with", prob)

                    if source_is_pinned:
                        source_dict[pinned_curie] = source_qnode_key
                        target_dict[partner] = target_qnode_key
                    else:
                        source_dict[partner] = source_qnode_key
                        target_dict[pinned_curie] = target_qnode_key

                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)

        # Add the nodes to our answer knowledge graph (sources first).
        for node_dict in (source_dict, target_dict):
            for curie, qnode_key in node_dict.items():
                swagger_node_key, swagger_node = self._convert_to_swagger_node(
                    curie)
                final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        return final_kg

    def _chp_resolve_qnode(self, qnode, qnode_key, allowed_labels,
                           not_allowable_code, log):
        """Validate one end of the edge for ``_answer_query_using_CHP_client``.

        Args:
            qnode: Query node to validate.
            qnode_key: Key of ``qnode`` in the query graph (used in messages).
            allowed_labels: Normalized category labels a wildcard node may use.
            not_allowable_code: ``error_code`` logged when none of the node's
                curies is accepted by ``self._check_id``.
            log: Response object that receives warnings and errors.

        Returns:
            ``(ok, pass_nodes, category)``.  ``ok`` is ``False`` when the
            caller must stop and return its empty knowledge graph.  For a
            pinned node ``pass_nodes`` is the non-empty collection of accepted
            curies and ``category`` is ``None``; for a wildcard node
            ``pass_nodes`` is ``None`` and ``category`` is the normalized
            first category.
        """
        if qnode.id is None:
            category = None
            # A wildcard node without any category cannot be typed; report it
            # like any other unsupported category instead of crashing.
            if qnode.category:
                category = qnode.category[0].replace('biolink:', '').replace(
                    '_', '').lower()
            if category in allowed_labels:
                return True, None, category
            log.error(
                f"The category of query node {qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return False, None, None

        has_error, pass_nodes, not_pass_nodes = self._check_id(qnode.id, log)
        if has_error:
            return False, None, None

        if len(pass_nodes) == 0:
            # Nothing usable: the wording depends on one curie vs. several.
            if isinstance(qnode.id, str):
                log.error(
                    f"The curie id of {qnode.id} is not allowable based on CHP client",
                    error_code=not_allowable_code)
            else:
                log.error(
                    f"The curie ids of {qnode.id} are not allowable based on CHP client",
                    error_code=not_allowable_code)
            return False, None, None

        # Some curies passed: carry on with those and warn about the rest.
        if len(not_pass_nodes) == 1:
            log.warning(
                f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
            )
        elif len(not_pass_nodes) != 0:
            log.warning(
                f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
            )
        return True, pass_nodes, None

    def _chp_curie_has_category(self, curie, category) -> bool:
        """Tell whether the synonymizer lists ``category`` for ``curie``.

        ``category`` is compared against the synonymizer's category names
        after removing the ``biolink:`` prefix and underscores and
        lower-casing them.  Curies the synonymizer does not know (``None``
        entry) never match.
        """
        if self.synonymizer.get_canonical_curies(curie)[curie] is None:
            return False
        all_categories = self.synonymizer.get_canonical_curies(
            curie, return_all_categories=True)[curie]['all_categories']
        return any(
            category == name.replace('biolink:', '').replace('_', '').lower()
            for name in all_categories.keys())
import pickle
import numpy as np

from chp.trapi_interface import TrapiInterface

# Route INFO-level messages from this script to the root handler
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Fetch the curies exposed by the TRAPI interface
logger.info('Getting curies.')
curies = TrapiInterface().get_curies()
logger.info('Got curies.')

# Ten evenly spaced survival-time thresholds from 350 to 5000 (inclusive)
survival_times = list(np.linspace(350, 5000, 10))
genes = list(curies["gene"])

# One query per (gene, survival threshold) pair, with no therapeutic
queries = []
for gene_curie in tqdm.tqdm(genes, desc='Building gene queries', leave=False):
    queries.extend(
        build_query(
            genes=[gene_curie],
            disease='MONDO:0007254',
            outcome=('EFO:0000714', '>=', st),
        ) for st in survival_times)

# Persist the generated queries as a pickle
with open('simple_queries_1.pk', 'wb') as pickle_file:
    pickle.dump(queries, pickle_file)
logger.setLevel(logging.INFO)

# Fetch the curies exposed by the TRAPI interface
logger.info('Getting curies.')
curies = TrapiInterface().get_curies()
#curies = client.curies()
logger.info('Got curies.')

# Build queries: every entry shares the same gene, disease and outcome
# condition; only the presence of a therapeutic differs.
queries = [
    # One gene curie
    build_query(
        genes=['ENSEMBL:ENSG00000155657'],
        disease='MONDO:0007254',
        outcome=('EFO:0000714', '>=', 1000),
    ),
    # One gene one drug curie
    build_query(
        genes=['ENSEMBL:ENSG00000155657'],
        therapeutic='CHEMBL:CHEMBL83',
        disease='MONDO:0007254',
        outcome=('EFO:0000714', '>=', 1000),
    ),
]

# Two gene one drug curie
queries.append(
    build_query(
# Route INFO-level messages from this script to the root handler
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Fetch the curies exposed by the TRAPI interface
logger.info('Getting curies.')
curies = TrapiInterface().get_curies()
logger.info('Got curies.')

# Ten evenly spaced survival-time thresholds from 350 to 5000 (inclusive)
survival_times = list(np.linspace(350, 5000, 10))
genes = list(curies["gene"])
drugs = list(curies["chemical_substance"])

# One query per (gene, drug, survival threshold) combination; the progress
# bar advances once per gene.
queries = []
for gene_curie in tqdm.tqdm(genes, desc='Building gene queries', leave=False):
    for drug_curie in drugs:
        queries.extend(
            build_query(
                genes=[gene_curie],
                therapeutic=drug_curie,
                disease='MONDO:0007254',
                outcome=('EFO:0000714', '>=', st),
            ) for st in survival_times)

# Report how many queries were generated
print(len(queries))
# Persist the generated queries as a pickle
with open('all_simple_queries.pk', 'wb') as pickle_file:
    pickle.dump(queries, pickle_file)