Ejemplo n.º 1
0
    def _setup_handler(self):
        # Only do the rest of this if a query is passed
        if self.init_query is not None:
            # Setup queries
            self._setup_queries()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)
class ChpBrainApiConfig(AppConfig):
    """Application configuration that eagerly builds the shared CHP reasoners.

    All of the work below lives directly in the class body, so it runs once,
    when this ``class`` statement is executed (i.e. when the module is
    imported). The resulting objects are exposed as class attributes:
    ``bkb_handler``, ``dynamic_reasoner`` and ``joint_reasoner``.

    NOTE(review): ``AppConfig`` is presumably Django's application config
    base class -- confirm against the file's imports.
    """
    # Emitted at class-creation time, before the expensive setup below.
    logger.warning('Running CHP Brain API Configuration. May take a minute.')
    name = 'chp_core_brain'

    # Settings used for distributed reasoning.
    # The hosts-file lookup below is currently disabled; the values are
    # hard-coded to "no hosts file" and zero processes per host instead.
    #parent_dir = os.path.dirname(os.path.realpath(__file__))
    #HOSTS_FILENAME = os.path.join(parent_dir, 'hosts')
    #NUM_PROCESSES_PER_HOST = multiprocessing.cpu_count()
    #if not os.path.exists(HOSTS_FILENAME):
    hosts_filename = None
    num_processes_per_host = 0

    # Instantiate the BKB data handler shared by both reasoners.
    bkb_handler = BkbDataHandler(disease='tcga_gbm',
                                 bkb_major_version='darwin',
                                 bkb_minor_version='2.0')

    logger.info('Instantiating reasoners.')
    # Instantiate the reasoners; both receive the same handler and settings.
    dynamic_reasoner = ChpDynamicReasoner(
        bkb_handler=bkb_handler,
        hosts_filename=hosts_filename,
        num_processes_per_host=num_processes_per_host)
    joint_reasoner = ChpJointReasoner(
        bkb_handler=bkb_handler,
        hosts_filename=hosts_filename,
        num_processes_per_host=num_processes_per_host)
Ejemplo n.º 3
0
    def _setup_handler(self):
        self.default_survival_target = {
            "EFO:0000714": {
                "op": '>=',
                "value": 970
            }
        }

        # Only do the rest of this if a query is passed
        if self.init_query is not None:
            # Setup queries
            self._setup_queries()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)
    def _setup_handler(self):
        # Only do the rest of this if a message is passed
        if self.messages is not None:
            # Setup messages
            self._setup_messages()

            # Instiatate Reasoners
            if 'default' in self.message_dict:
                if self.dynamic_reasoner is None:
                    self.dynamic_reasoner = ChpDynamicReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)
            if 'simple' in self.message_dict:
                if self.joint_reasoner is None:
                    self.joint_reasoner = ChpJointReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)
 def setUpClass(cls):
     """Load the one-hop sample queries and build the shared reasoners.

     The JSON fixtures are read from paths relative to the current working
     directory and stored on the class, followed by one BKB data handler
     and the two reasoners created from it.
     """
     super(TestOneHopHandler, cls).setUpClass()
     # Sample query graphs, stored under the attribute named on the left.
     fixtures = (
         ('standard_queries', 'query_samples/onehop/standard_queries.json'),
         ('wildcard_queries', 'query_samples/onehop/wildcard_queries.json'),
     )
     for attr_name, fixture_path in fixtures:
         with open(fixture_path, 'r') as fixture_file:
             setattr(cls, attr_name, json.load(fixture_file))
     cls.bkb_handler = BkbDataHandler()
     cls.dynamic_reasoner = ChpDynamicReasoner(cls.bkb_handler)
     cls.joint_reasoner = ChpJointReasoner(cls.bkb_handler)
Ejemplo n.º 6
0
class WildCardHandlerMixin:
    def _setup_handler(self):
        # Only do the rest of this if a query is passed
        if self.init_query is not None:
            # Setup queries
            self._setup_queries()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)

    def _setup_queries(self):
        if type(self.init_query) == list:
            self.query_dict = defaultdict(list)
            self.query_map = []
            for query in self.init_query:
                self.query_map.append(query["query_id"])
                self.query_dict[self._get_wildcard_type(query)].append(self._setup_single_query(query))
        else:
            self.query_dict[self._get_wildcard_type(query)].append(self._setup_single_query(query))

    def _get_wildcard_type(self, query):
        wildcard_type = None
        for node_id, node in query["query_graph"]["nodes"].items():
            if 'id' not in node:
                if wildcard_type is None:
                    wildcard_type = node['category']
                else:
                    sys.exit('You can only have one contribution target. Make sure to leave only one node with a black curie.')
        if wildcard_type == BIOLINK_DRUG:
            return 'drug'
        elif wildcard_type == BIOLINK_GENE:
            return 'gene'
        else:
            raise ValueError('Did not understand wildcard type {}.'.format(wildcard_type))

    def _extract_chp_query(self, query, query_type):
        evidence = {}
        targets = []
        dynamic_evidence = {}
        dynamic_targets = {}
        # ensure we are using all nodes/edges
        total_nodes = 0
        total_edges = 0

        # get phenotype node
        targets = list()
        acceptable_target_curies = ['EFO:0000714']
        self.implicit_survival_node = False
        for node_key in query["query_graph"]['nodes'].keys():
            node = query["query_graph"]['nodes'][node_key]
            if node['category'] == BIOLINK_PHENOTYPIC_FEATURE and node['id'] in acceptable_target_curies:
                target_id = node_key
                total_nodes += 1
        if total_nodes == 0:
            # Use Default Survival
            self.implicit_survival_node = True
            total_nodes += 1
            #acceptable_target_curies_print = ','.join(acceptable_target_curies)
            #sys.exit("Survival Node not found. Node category must be '{}' and id must be in: {}".format(BIOLINK_PHENOTYPIC_FEATURE,
            #                                                                                            acceptable_target_curies_print))
        elif total_nodes > 1:
            sys.exit('Too many target nodes')

        # get disease node info and ensure only 1 disease:
        acceptable_disease_curies = ['MONDO:0007254']
        for node_key in query["query_graph"]['nodes'].keys():
            node = query["query_graph"]['nodes'][node_key]
            if node['category'] == BIOLINK_DISEASE and node['id'] in acceptable_disease_curies:
                disease_id = node_key
                for edge_key in query["query_graph"]['edges'].keys():
                    edge = query["query_graph"]['edges'][edge_key]
                    if edge['predicate'] == BIOLINK_DISEASE_TO_PHENOTYPIC_FEATURE_PREDICATE and edge['subject'] == disease_id and edge['object'] == target_id:
                        if 'properties' in edge.keys():
                            days = edge['properties']['days']
                            qualifier = edge['properties']['qualifier']
                        else:
                            days = 970
                            qualifier = '>='
                        total_edges += 1
                if total_edges > 1:
                    sys.exit('Disease has too many outgoing edges')
                total_nodes += 1

        if self.implicit_survival_node:
            days=970
            qualifier = '>='
            total_edges += 1

        if total_nodes  == 1:
            acceptable_disease_curies_print = ','.join(acceptable_disease_curies)
            sys.exit("Disease node not found. Node type must be '{}' and curie must be in: {}".format(BIOLINK_DISEASE,
                                                                                                      acceptable_disease_curies_print))
        elif total_nodes > 2:
            sys.exit('Too many disease nodes')
        # set BKB target
        dynamic_targets['EFO:0000714'] = {
            "op": qualifier,
            "value": days,
        }
        truth_target = ('EFO:0000714', '{} {}'.format(qualifier, days))

        # get evidence
        for node_key in query["query_graph"]['nodes'].keys():
            # genes
            node = query["query_graph"]['nodes'][node_key]
            if node['category'] == BIOLINK_GENE:
                # check for appropriate gene node structure
                gene_id = node_key
                for edge_key in query["query_graph"]['edges'].keys():
                    edge = query["query_graph"]['edges'][edge_key]
                    if edge['predicate'] == BIOLINK_GENE_TO_DISEASE_PREDICATE and edge['subject'] == gene_id and edge['object'] == disease_id:
                        total_edges += 1
                if total_edges == total_nodes - 1:
                    sys.exit("Gene and disease edge not found. Edge type must be '{}'".format(BIOLINK_GENE_TO_DISEASE_PREDICATE))
                elif total_edges > total_nodes:
                    sys.exit('Gene has too many outgoing edges')
                # check for appropriate gene node curie
                if query_type != 'gene':
                    gene_curie = node['id']
                    if gene_curie in self.curies[BIOLINK_GENE]:
                        gene = gene_curie
                    else:
                        sys.exit('Invalid ENSEMBL Identifier. Must be in form ENSEMBL:<ID>.')
                    evidence["_" + gene] = 'True'
                total_nodes += 1
            # drugs
            if node['category'] == BIOLINK_DRUG:
                # check for appropriate drug node structure
                drug_id = node_key
                for edge_key in query["query_graph"]['edges'].keys():
                    edge = query["query_graph"]['edges'][edge_key]
                    if edge['predicate'] == BIOLINK_CHEMICAL_TO_DISEASE_OR_PHENOTYPIC_FEATURE_PREDICATE and edge['subject'] == drug_id and edge['object'] == disease_id:
                        total_edges += 1
                if total_edges == total_nodes - 1:
                    sys.exit("Drug and disease edge not found. Edge type must be '{}'".format(BIOLINK_CHEMICAL_TO_DISEASE_OR_PHENOTYPIC_FEATURE_PREDICATE))
                elif total_edges > total_nodes:
                    sys.exit('Drug has too many outgoing edges')
                # check for appropriate drug node curie
                if query_type != 'drug':
                    drug_curie = node['id']
                    if drug_curie in self.curies[BIOLINK_DRUG]:
                        drug = drug_curie
                    else:
                        sys.exit('Invalid CHEMBL Identifier: {}. Must be in form CHEMBL:<ID>'.format(drug_curie))
                    evidence['_' + drug] = 'True'
                total_nodes += 1

        # Temporary solution to no evidence linking
        if len(evidence.keys()) == 0 and len(dynamic_evidence.keys()) == 0:
            self.no_evidence_probability_check = True
        else:
            self.no_evidence_probability_check = False

        # produce BKB query
        chp_query = Query(
            evidence=evidence,
            targets=targets,
            dynamic_evidence=dynamic_evidence,
            dynamic_targets=dynamic_targets,
            type='updating')
        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        chp_query.query_id = query["query_id"] if 'query_id' in query else None
        return chp_query

    def _run_query(self, chp_query, query_type):
        """ Runs build BKB query to calculate probability of survival.
            A probability is returned to specificy survival time w.r.t a drug.
            Contributions for each gene are calculuated and classified under
            their true/false target assignments.
        """

        # temporary solution to no evidence linking
        if not self.no_evidence_probability_check:
            if query_type == 'gene':
                chp_query = self.dynamic_reasoner.run_query(chp_query, bkb_type='drug')
            elif query_type == 'drug':
                chp_query = self.dynamic_reasoner.run_query(chp_query, bkb_type='gene')
            chp_res_dict = chp_query.result.process_updates()
            chp_res_norm_dict = chp_query.result.process_updates(normalize=True)
            #chp_query.result.summary()
            chp_res_contributions = chp_query.result.process_inode_contributions()
            chp_query.truth_prob = max([0, chp_res_norm_dict[chp_query.truth_target[0]][chp_query.truth_target[1]]])

            # Collect all source inodes and process patient hashes
            patient_contributions = defaultdict(lambda: defaultdict(int))
            for target, contrib_dict in chp_res_contributions.items():
                target_comp_name, target_state_name = target
                for inode, contrib in contrib_dict.items():
                    comp_name, state_name = inode
                    if '_Source_' in comp_name:
                        # Split source state name to get patient hashes
                        source_hashes_str = state_name.split('_')[-1]
                        source_hashes = [int(source_hash) for source_hash in source_hashes_str.split(',')]
                        hash_len = len(source_hashes)
                        # Process patient contributions
                        for _hash in source_hashes:
                            # Normalize to get relative contribution
                            patient_contributions[target][_hash] += contrib/hash_len #/ chp_res_dict[target_comp_name][target_state_name]

        else:
            # probability of survival
            num_survived = 0
            num_all = len(self.dynamic_reasoner.raw_patient_data.keys())
            str_op = chp_query.dynamic_targets['EFO:0000714']['op']
            opp_op = get_opposite_operator(str_op)
            op = get_operator(str_op)
            days = chp_query.dynamic_targets['EFO:0000714']['value']
            for patient, pat_dict in self.dynamic_reasoner.raw_patient_data.items():
                if op(pat_dict['survival_time'], days):
                    num_survived += 1
            chp_query.truth_prob = num_survived/num_all

            # patient_contributions
            patient_contributions = defaultdict(lambda: defaultdict(int))
            for patient, pat_dict in self.dynamic_reasoner.raw_patient_data.items():
                if op(pat_dict['survival_time'], days):
                    if num_survived == 0:
                        patient_contributions[('EFO:0000714', '{} {}'.format(str_op, days))][patient] = 0
                    else:
                        patient_contributions[('EFO:0000714', '{} {}'.format(str_op, days))][patient] = chp_query.truth_prob/num_survived
                else:
                    if num_survived == 0:
                        patient_contributions[('EFO:0000714', '{} {}'.format(opp_op, days))][patient] = (1-chp_query.truth_prob)/num_all
                    else:
                        patient_contributions[('EFO:0000714', '{} {}'.format(opp_op, days))][patient] = (1-chp_query.truth_prob)/(num_all-num_survived)

        # Now iterate through the patient data to translate patient contributions to drug/gene contributions
        wildcard_contributions = defaultdict(lambda: defaultdict(int))
        for target, patient_contrib_dict in patient_contributions.items():
            for patient, contrib in patient_contrib_dict.items():
                if query_type == 'gene':
                    for gene_curie in self.dynamic_reasoner.raw_patient_data[patient]["gene_curies"]:
                        wildcard_contributions[gene_curie][target] += contrib
                elif query_type == 'drug':
                    for drug_curie in self.dynamic_reasoner.raw_patient_data[patient]["drug_curies"]:
                        wildcard_contributions[drug_curie][target] += contrib

        # normalize gene contributions by the target and take relative difference
        for curie in wildcard_contributions.keys():
            truth_target_gene_contrib = 0
            nontruth_target_gene_contrib = 0
            for target, contrib in wildcard_contributions[curie].items():
                if target[0] == chp_query.truth_target[0] and target[1] == chp_query.truth_target[1]:
                    truth_target_gene_contrib += contrib / chp_query.truth_prob
                else:
                    nontruth_target_gene_contrib += contrib / (1 - chp_query.truth_prob)
            wildcard_contributions[curie]['relative'] = truth_target_gene_contrib - nontruth_target_gene_contrib

        chp_query.report = None
        chp_query.wildcard_contributions = wildcard_contributions

        return chp_query

    def _construct_trapi_response(self, chp_query, query_type):
        # Get orginal query
        if len(self.init_query) == 1:
            query = self.init_query[0]
            query_id = None
        else:
            for _query in self.init_query:
                if _query["query_id"] == chp_query.query_id:
                    query = _query
                    query_id = query["query_id"]
                    break

        # Construct first result which is the result of the standard probablistic query.
        kg = copy.deepcopy(query["query_graph"])
        # Process Nodes
        node_pairs = defaultdict(None)
        contrib_qg_id = None
        for node_key in list(kg["nodes"].keys())[:]:
            qg_node_curie = kg['nodes'][node_key].pop('id', None)
            if qg_node_curie is not None:
                kg['nodes'][qg_node_curie] = kg['nodes'].pop(node_key)
                if kg['nodes'][qg_node_curie]['category'] == BIOLINK_GENE:
                    kg['nodes'][qg_node_curie]['name'] = self.curies["biolink:Gene"][qg_node_curie][0]
                elif kg['nodes'][qg_node_curie]['category'] == BIOLINK_DRUG:
                    kg['nodes'][qg_node_curie]['name'] = self.curies["biolink:Drug"][qg_node_curie][0]
                node_pairs[node_key] = qg_node_curie
            else:
                kg["nodes"].pop(node_key)

        if not self.implicit_survival_node:
            # Process Edges
            edge_pairs = dict()
            knowledge_edges = 0
            for edge_key in list(kg['edges'].keys())[:]:
                subject_node = kg['edges'][edge_key]['subject']
                if kg['edges'][edge_key]['predicate'] == BIOLINK_GENE_TO_DISEASE_PREDICATE and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_GENE and query_type == 'gene':
                    kg['edges'].pop(edge_key)
                elif kg['edges'][edge_key]['predicate'] == BIOLINK_CHEMICAL_TO_DISEASE_OR_PHENOTYPIC_FEATURE_PREDICATE and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_DRUG and query_type == 'drug':
                    kg['edges'].pop(edge_key)
                else:
                    kg_id = 'kge{}'.format(knowledge_edges)
                    knowledge_edges += 1
                    kg['edges'][kg_id] = kg['edges'].pop(edge_key)
                    kg['edges'][kg_id]['subject'] = node_pairs[kg['edges'][kg_id]['subject']]
                    kg['edges'][kg_id]['object'] = node_pairs[kg['edges'][kg_id]['object']]
                    edge_pairs[edge_key] = kg_id
                    if kg['edges'][kg_id]['predicate'] == BIOLINK_DISEASE_TO_PHENOTYPIC_FEATURE_PREDICATE:
                        if 'properties' in kg['edges'][kg_id].keys():
                            kg['edges'][kg_id].pop('properties')
                        kg['edges'][kg_id]['attributes'] = [{'name':'Probability of Survival',
                                                             'type':BIOLINK_PROBABILITY,
                                                             'value':chp_query.truth_prob}]

            # Put first result of standard prob query of only curie nodes (i.e. no wildcard nodes where used as evidence)
            results = []
            results.append({'edge_bindings':dict(),
                            'node_bindings':dict()})
            for edge_pair_key in edge_pairs:
                results[0]['edge_bindings'][edge_pair_key] = [{ 'id': str(edge_pairs[edge_pair_key])}]
            for node_pair_key in node_pairs:
                results[0]['node_bindings'][node_pair_key] = [{ 'id': str(node_pairs[node_pair_key])}]

        else:
            knowledge_edges = 0
            kg['edges'] = {}
            results = []

        # Build relative contribution results and added associated edges into knowledge graph
        unsorted_wildcard_contributions = []
        for wildcard, contrib_dict in chp_query.wildcard_contributions.items():
            unsorted_wildcard_contributions.append((contrib_dict['relative'], wildcard))
        sorted_wildcard_contributions = [(contrib,wildcard) for contrib, wildcard in sorted(unsorted_wildcard_contributions, key=lambda x: abs(x[0]), reverse=True)]

        for contrib, wildcard in sorted_wildcard_contributions[:self.max_results]:
            rg = copy.deepcopy(query["query_graph"])
            _node_pairs = {}
            _edge_pairs = {}
            # Process node pairs
            for node_id, node in rg["nodes"].items():
                if node["category"] == BIOLINK_GENE and query_type == 'gene':
                    kg["nodes"][wildcard] = copy.deepcopy(node)
                    kg["nodes"][wildcard].update({"name": self.curies[BIOLINK_GENE][wildcard][0]})
                    _node_pairs[node_id] = wildcard
                elif node["category"] == BIOLINK_DRUG and query_type == 'drug':
                    kg["nodes"][wildcard] = copy.deepcopy(node)
                    kg["nodes"][wildcard].update({"name": self.curies[BIOLINK_DRUG][wildcard][0]})
                    _node_pairs[node_id] = wildcard
                else:
                    _node_pairs[node_id] = node_pairs[node_id]
            # Process edge pairs
            for edge_id, edge in rg["edges"].items():
                subject_node = edge['subject']
                if query_type == 'gene'  and edge["predicate"] == BIOLINK_GENE_TO_DISEASE_PREDICATE and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_GENE:
                    knowledge_edges += 1
                    kg_edge_id = 'kge{}'.format(knowledge_edges)
                    kg["edges"][kg_edge_id] = copy.deepcopy(edge)
                    kg["edges"][kg_edge_id]["subject"] = _node_pairs[kg["edges"][kg_edge_id]["subject"]]
                    kg["edges"][kg_edge_id]["object"] = _node_pairs[kg["edges"][kg_edge_id]["object"]]
                    #kg["edges"][kg_edge_id]["value"] = contrib
                    kg["edges"][kg_edge_id]["attributes"] = [{'name':'Contribution',
                                                              'type':BIOLINK_CONTRIBUTION,
                                                              'value':contrib}]
                    _edge_pairs[edge_id] = kg_edge_id
                elif query_type == 'drug'  and edge["predicate"] == BIOLINK_CHEMICAL_TO_DISEASE_OR_PHENOTYPIC_FEATURE_PREDICATE and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_DRUG:
                    knowledge_edges += 1
                    kg_edge_id = 'kge{}'.format(knowledge_edges)
                    kg["edges"][kg_edge_id] = copy.deepcopy(edge)
                    kg["edges"][kg_edge_id]["subject"] = _node_pairs[kg["edges"][kg_edge_id]["subject"]]
                    kg["edges"][kg_edge_id]["object"] = _node_pairs[kg["edges"][kg_edge_id]["object"]]
                    #kg["edges"][kg_edge_id]["value"] = contrib
                    kg["edges"][kg_edge_id]["attributes"] = [{'name':'Contribution',
                                                              'type':BIOLINK_CONTRIBUTION,
                                                              'value':contrib}]
                    _edge_pairs[edge_id] = kg_edge_id
                else:
                    _edge_pairs[edge_id] = edge_pairs[edge_id]
            # Process node and edge binding results
            _res = {"edge_bindings": {},
                    "node_bindings": {}}
            for edge_pair_key in _edge_pairs:
                _res["edge_bindings"][edge_pair_key] = [{ "id": str(_edge_pairs[edge_pair_key])}]
            for node_pair_key in _node_pairs:
                _res["node_bindings"][node_pair_key] = [{ "id": str(_node_pairs[node_pair_key])}]
            results.append(_res)

        # query response
        trapi_message = {'query_graph': query["query_graph"],
                        'knowledge_graph': kg,
                        'results': results}
        trapi_response = {'message' : trapi_message}
        return query_id, trapi_response
Ejemplo n.º 7
0
class OneHopHandlerMixin:
    """ OneHopeHandler is the handler for 1-hop queries. That is
        query graphs (QGs) that consists of 2 nodes and a single edge.

        :param query: the query graph sent by the ARA.
        :type query: dict
        :param hosts_filename: a filename for a stored QG. Defaults to None
        :type hosts_filename: str
        :param num_processes_per_host: Not implemented thoroughly, but would be
            used for distributed reasoning.
        :type num_processes_per_host: int
        :param max_results: specific to 1-hop queries, specifies the number of
            wildcard genes to return.
        :type max_results: int
    """
    def _setup_handler(self):
        self.default_survival_target = {
            "EFO:0000714": {
                "op": '>=',
                "value": 970
            }
        }

        # Only do the rest of this if a query is passed
        if self.init_query is not None:
            # Setup queries
            self._setup_queries()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)

    def _setup_queries(self):
        if type(self.init_query) == list:
            self.query_dict = defaultdict(list)
            self.query_map = []
            for query in self.init_query:
                self.query_map.append(query["query_id"])
                self.query_dict[self._get_wildcard_type(query)].append(
                    self._setup_single_query(query))
        else:
            self.query_dict[self._get_wildcard_type(query)].append(
                self._setup_single_query(query))

    def _get_wildcard_type(self, query):
        wildcard_type = None
        for node_id, node in query["query_graph"]["nodes"].items():
            if 'id' not in node:
                if wildcard_type is None:
                    wildcard_type = node['category']
                else:
                    sys.exit(
                        'You can only have one contribution target. Make sure to leave only one node with a black curie.'
                    )
        if wildcard_type == BIOLINK_DRUG:
            return 'drug'
        elif wildcard_type == BIOLINK_GENE:
            return 'gene'
        else:
            raise ValueError(
                'Did not understand wildcard type {}.'.format(wildcard_type))

    def check_query(self):
        """ Placeholder for query validation; not implemented yet.

            No check is performed at the moment: the method ignores the
            stored query and unconditionally reports it as valid.

            :returns: always ``True``.
            :rtype: bool
        """
        return True

    def _extract_chp_query(self, query, query_type=None):
        evidence = {}
        dynamic_targets = {}

        if len(query["query_graph"]['nodes']) > 2 or len(
                query["query_graph"]['edges']) > 1:
            sys.exit('1 hop quries can only have 2 nodes and 1 edge')

        # check edge for source and target
        edge_key = list(query["query_graph"]["edges"].keys())[0]
        edge = query["query_graph"]['edges'][edge_key]
        if 'subject' not in edge.keys() or 'object' not in edge.keys():
            sys.exit(
                'Edge must have both a \'subject\' and and \'object\' key')
        subject = edge['subject']
        obj = edge['object']

        # Get non-wildcard node
        if query_type == 'gene':
            if query["query_graph"]['nodes'][subject][
                    'category'] != BIOLINK_GENE:
                sys.exit('Subject node must be \'category\' {}'.format(
                    BIOLINK_GENE))
            drug_curie = query["query_graph"]['nodes'][obj]['id']
            if drug_curie not in self.curies[BIOLINK_DRUG]:
                sys.exit('Invalid CHEMBL Identifier. Must be CHEMBL:<ID>')
            evidence['_{}'.format(drug_curie)] = 'True'
        elif query_type == 'drug':
            if query["query_graph"]['nodes'][subject][
                    'category'] != BIOLINK_DRUG:
                sys.exit('Subject node must be \'category\' {}'.format(
                    BIOLINK_DRUG))
            gene_curie = query["query_graph"]['nodes'][obj]['id']
            if gene_curie not in self.curies[BIOLINK_GENE]:
                sys.exit('Invalid ENSEMBL Identifier. Must be ENSEMBL:<ID>')
            evidence['_{}'.format(gene_curie)] = 'True'

        # default survival time
        dynamic_targets.update(self.default_survival_target)
        truth_target = ('EFO:0000714', '{} {}'.format(
            self.default_survival_target["EFO:0000714"]["op"],
            self.default_survival_target["EFO:0000714"]["value"]))

        chp_query = Query(evidence=evidence,
                          targets=None,
                          dynamic_evidence=None,
                          dynamic_targets=dynamic_targets,
                          type='updating')
        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        chp_query.query_id = query["query_id"] if 'query_id' in query else None
        return chp_query

    def _run_query(self, chp_query, query_type):
        """ Run a wildcard query and score each gene/drug by its contribution.

            The dynamic reasoner is run on the query, the normalized
            probability of the truth target is stored on it, and every inode
            contribution coming from a '_Source_' component is split evenly
            over the patient hashes listed in its state name. Those patient
            shares are credited to each gene (or drug) curie recorded for the
            patient and finally reduced to one 'relative' score per curie:
            contribution to the truth target minus contribution to every
            other target, each divided by that target's unnormalized update.

            :param chp_query: query to run; must carry ``truth_target``.
            :param query_type: 'gene' or 'drug', i.e. the wildcard category.
            :return: the reasoned query with ``truth_prob``, ``report`` and
                ``wildcard_contributions`` set.
        """
        # Note the crossover: a 'gene' wildcard runs with bkb_type='drug' and
        # a 'drug' wildcard with bkb_type='gene'. Any other type skips the
        # reasoner call entirely.
        if query_type == 'gene':
            chp_query = self.dynamic_reasoner.run_query(chp_query,
                                                        bkb_type='drug')
        elif query_type == 'drug':
            chp_query = self.dynamic_reasoner.run_query(chp_query,
                                                        bkb_type='gene')

        query_result = chp_query.result
        raw_updates = query_result.process_updates()
        normalized_updates = query_result.process_updates(normalize=True)
        inode_contributions = query_result.process_inode_contributions()

        truth_comp = chp_query.truth_target[0]
        truth_state = chp_query.truth_target[1]
        # Clamp at zero so a negative update never becomes a probability.
        chp_query.truth_prob = max(
            0, normalized_updates[truth_comp][truth_state])

        # Spread every source-inode contribution over its patient hashes.
        per_patient = defaultdict(lambda: defaultdict(int))
        for target, inode_contribs in inode_contributions.items():
            # Targets are (component name, state name) pairs.
            target_comp_name, target_state_name = target
            for (comp_name, state_name), contrib in inode_contribs.items():
                if '_Source_' not in comp_name:
                    continue
                # The hashes are the comma separated tail of the state name.
                hash_tokens = state_name.split('_')[-1].split(',')
                patient_hashes = [int(token) for token in hash_tokens]
                share = contrib / len(patient_hashes)
                for patient_hash in patient_hashes:
                    per_patient[target][patient_hash] += share

        # Pick which curie list of the raw patient record gets the credit.
        if query_type == 'gene':
            curie_field = "gene_curies"
        elif query_type == 'drug':
            curie_field = "drug_curies"
        else:
            curie_field = None

        wildcard_contributions = defaultdict(lambda: defaultdict(int))
        if curie_field is not None:
            for target, shares in per_patient.items():
                for patient, share in shares.items():
                    patient_record = self.dynamic_reasoner.raw_patient_data[
                        patient]
                    for curie in patient_record[curie_field]:
                        wildcard_contributions[curie][target] += share

        # Scale by each target's raw update and take truth minus non-truth.
        for per_target in wildcard_contributions.values():
            on_truth = 0
            off_truth = 0
            for target, contrib in per_target.items():
                is_truth = (target[0] == truth_comp
                            and target[1] == truth_state)
                scaled = contrib / raw_updates[target[0]][target[1]]
                if is_truth:
                    on_truth += scaled
                else:
                    off_truth += scaled
            per_target['relative'] = on_truth - off_truth

        chp_query.report = None
        chp_query.wildcard_contributions = wildcard_contributions

        return chp_query

    def _construct_trapi_response(self, chp_query, query_type):
        """ Build the TRAPI response for a reasoned wildcard query.

            The single query-graph edge is replaced by one knowledge-graph
            edge per ranked wildcard curie (largest absolute 'relative'
            contribution first, at most ``self.max_results`` of them), each
            carrying a 'Contribution' attribute, and one result binding is
            emitted per ranked curie.

            :param chp_query: reasoned query carrying ``query_id`` and
                ``wildcard_contributions``.
            :param query_type: 'gene' or 'drug', the wildcard category.
            :return: ``(query_id, trapi_response)``; ``query_id`` is None
                when ``self.init_query`` holds exactly one query.
            :raises ValueError: if ``query_type`` is unsupported, or if no
                initial query matches ``chp_query.query_id``.
        """
        # The wildcard takes one category and the fixed (object) node the
        # other. Rejecting unknown types up front avoids emitting results
        # bound to knowledge-graph edges that were never created.
        if query_type == 'gene':
            wildcard_category, fixed_category = BIOLINK_GENE, BIOLINK_DRUG
        elif query_type == 'drug':
            wildcard_category, fixed_category = BIOLINK_DRUG, BIOLINK_GENE
        else:
            raise ValueError(
                'Did not understand wildcard type {}.'.format(query_type))

        # Get original query
        if len(self.init_query) == 1:
            query = self.init_query[0]
            query_id = None
        else:
            for candidate in self.init_query:
                if candidate["query_id"] == chp_query.query_id:
                    query = candidate
                    query_id = candidate["query_id"]
                    break
            else:
                # Previously this fell through with `query` unbound.
                raise ValueError('No initial query matches query_id {!r}.'.format(
                    chp_query.query_id))

        kg = copy.deepcopy(query["query_graph"])

        # Take the first query edge out of the graph; its subject is treated
        # as the wildcard node and its object as the fixed node.
        edge_key = list(kg['edges'].keys())[0]
        edge = kg['edges'].pop(edge_key)
        subject = edge['subject']
        obj = edge['object']

        # Re-key the fixed node by its curie and attach its display name.
        non_wildcard_curie = kg['nodes'][obj].pop('id')
        kg['nodes'][non_wildcard_curie] = kg['nodes'].pop(obj)
        kg['nodes'][non_wildcard_curie]['name'] = self._get_curie_name(
            fixed_category, non_wildcard_curie)[0]

        # The wildcard placeholder itself does not belong in the kg.
        kg['nodes'].pop(subject)

        # Rank wildcards by the magnitude of their relative contribution.
        ranked_contributions = sorted(
            ((contrib_dict['relative'], wildcard)
             for wildcard, contrib_dict in
             chp_query.wildcard_contributions.items()),
            key=lambda pair: abs(pair[0]),
            reverse=True)

        results = []
        for edge_count, (contrib, wildcard) in enumerate(
                ranked_contributions[:self.max_results]):
            kg_edge_id = 'kge{}'.format(edge_count)
            kg['nodes'][wildcard] = {
                "name": self._get_curie_name(wildcard_category, wildcard)[0],
                "category": wildcard_category
            }
            kg['edges'][kg_edge_id] = {
                "predicate": BIOLINK_CHEMICAL_TO_GENE_PREDICATE,
                "subject": wildcard,
                "object": non_wildcard_curie,
                "attributes": [{
                    'name': 'Contribution',
                    'type': BIOLINK_CONTRIBUTION,
                    'value': contrib
                }]
            }
            results.append({
                'node_bindings': {
                    obj: [{
                        'id': non_wildcard_curie
                    }],
                    subject: [{
                        'id': wildcard
                    }]
                },
                'edge_bindings': {
                    edge_key: [{
                        'id': kg_edge_id
                    }]
                }
            })

        # query response
        trapi_message = {
            'query_graph': query["query_graph"],
            'knowledge_graph': kg,
            'results': results
        }
        trapi_response = {'message': trapi_message}
        return query_id, trapi_response
class DefaultHandlerMixin:
    def _setup_handler(self):
        # Only do the rest of this if a message is passed
        if self.messages is not None:
            # Setup messages
            self._setup_messages()

            # Instiatate Reasoners
            if 'default' in self.message_dict:
                if self.dynamic_reasoner is None:
                    self.dynamic_reasoner = ChpDynamicReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)
            if 'simple' in self.message_dict:
                if self.joint_reasoner is None:
                    self.joint_reasoner = ChpJointReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)

    def _setup_messages(self):
        self.message_dict = defaultdict(list)
        for message in self.messages:
            if self._is_simple_message(message):
                self.message_dict['simple'].append(message)
            else:
                self.message_dict['default'].append(message)

    def _is_simple_message(self, message):
        """ Check if this is a {0 or 1} drug, {0 or 1} gene, one outcome standard message.
        """
        _found_outcome = False
        _found_disease = False
        _found_gene = False
        _found_drug = False
        query_graph = message.query_graph
        for node_key, node in query_graph.nodes.items():
            if node.categories[0] == BIOLINK_PHENOTYPIC_FEATURE_ENTITY:
                # If we've already found the target and there's another phenotypic feature, then this isn't simple.
                if _found_outcome:
                    return False
                else:
                    _found_outcome = True
            if node.categories[0] == BIOLINK_DISEASE_ENTITY:
                # If we've already found disease and there's another disease, then this isn't simple.
                if _found_disease:
                    return False
                else:
                    _found_disease = True
            if node.categories[0] == BIOLINK_GENE_ENTITY:
                if _found_gene:
                    return False
                else:
                    _found_gene = True
            if node.categories[0] == BIOLINK_DRUG_ENTITY:
                if _found_drug:
                    return False
                else:
                    _found_drug = True
        return True

    def _extract_chp_query(self, message, message_type=None):
        """ Translate a message's query graph into a ``ChpQuery``.

            The phenotypic-feature node becomes the dynamic (survival) target,
            gene nodes become meta evidence and drug nodes dynamic evidence.
            The survival threshold defaults to ``>= 970`` and is overridden by
            a 'survival_time' constraint found on a disease -> phenotype edge.

            :param message: message providing ``query_graph``.
            :param message_type: unused; kept for interface compatibility.
            :return: the populated query with ``truth_target`` set to
                ``(target curie, '<operator> <value>')``.
            :raises ValueError: if the graph has no phenotypic feature node.
        """
        # Initialize Chp Query
        chp_query = ChpQuery(reasoning_type='updating')
        query_graph = message.query_graph

        # Find the phenotype (outcome) node; the last one wins if several.
        target_id = None
        for node_key, node in query_graph.nodes.items():
            if node.categories[0] == BIOLINK_PHENOTYPIC_FEATURE_ENTITY:
                target_id = node_key
        if target_id is None:
            raise ValueError(
                'Query graph has no phenotypic feature (outcome) node.')
        # Read the target curie from the phenotype node itself. The previous
        # code used whatever node the last loop happened to end on, which was
        # only right when the phenotype node came last in the graph.
        target_curie = query_graph.nodes[target_id].ids[0]

        survival_value = 970
        survival_operator = '>='

        # Pick up an explicit survival constraint from disease -> phenotype
        # edges, if any.
        for node_key, node in query_graph.nodes.items():
            if node.categories[0] == BIOLINK_DISEASE_ENTITY:
                for edge in query_graph.edges.values():
                    if self.check_predicate_support(
                            edge.predicates[0], BIOLINK_HAS_PHENOTYPE_ENTITY
                    ) and edge.subject == node_key and edge.object == target_id:
                        survival_time_constraint = edge.find_constraint(
                            name='survival_time')
                        if survival_time_constraint is not None:
                            survival_value = survival_time_constraint.value
                            survival_operator = survival_time_constraint.operator
                            # 'matches' is spelled '==' on the BKB side.
                            if survival_operator == 'matches':
                                survival_operator = '=='

        # set BKB target
        chp_query.add_dynamic_target(target_curie, survival_operator,
                                     survival_value)
        truth_target = (target_curie, '{} {}'.format(survival_operator,
                                                     survival_value))

        # get evidence
        for node in query_graph.nodes.values():
            # genes
            if node.categories[0] == BIOLINK_GENE_ENTITY:
                chp_query.add_meta_evidence(node.ids[0], 'True')
            # drugs
            if node.categories[0] == BIOLINK_DRUG_ENTITY:
                chp_query.add_dynamic_evidence(node.ids[0], '==', 'True')

        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        return chp_query

    def _run_query(self, chp_query, query_type):
        if query_type == 'simple':
            chp_query = self.joint_reasoner.run_query(chp_query)
            # If a probability was found for the target
            if len(chp_query.result) > 0:
                # If a probability was found for the truth target
                if chp_query.truth_target in chp_query.result:
                    total_unnormalized_prob = 0
                    for target, contrib in chp_query.result.items():
                        prob = max(0, contrib)
                        total_unnormalized_prob += prob
                    chp_query.truth_prob = max([
                        0, chp_query.result[(chp_query.truth_target)]
                    ]) / total_unnormalized_prob
                else:
                    chp_query.truth_prob = 0
            else:
                chp_query.truth_prob = -1
            chp_query.report = None
        else:
            chp_query = self.dynamic_reasoner.run_query(chp_query)
            chp_res_dict = chp_query.result.process_updates(normalize=True)
            try:
                chp_query.truth_prob = max([
                    0, chp_res_dict[chp_query.truth_target[0]][
                        chp_query.truth_target[1]]
                ])
            except KeyError:
                # May need to come back and fix this.
                chp_query.truth_prob = -1

            chp_query.report = None
        return chp_query

    def _construct_trapi_message(self, chp_query, message, query_type=None):

        # update target node info and form edge pair combos for results graph

        qg = message.query_graph
        kg = message.knowledge_graph
        node_bindings = {}
        for qnode_key, qnode in qg.nodes.items():
            if qnode.categories[0] == BIOLINK_GENE_ENTITY:
                knode_key = kg.add_node(
                    qnode.ids[0],
                    self.curies[BIOLINK_GENE_ENTITY.get_curie()][qnode.ids[0]]
                    [0],
                    qnode.categories[0].get_curie(),
                )
            elif qnode.categories[0] == BIOLINK_DRUG_ENTITY:
                knode_key = kg.add_node(
                    qnode.ids[0],
                    self.curies[BIOLINK_DRUG_ENTITY.get_curie()][qnode.ids[0]]
                    [0],
                    qnode.categories[0].get_curie(),
                )
            else:
                knode_key = kg.add_node(
                    qnode.ids[0],
                    qnode.ids[0],
                    qnode.categories[0].get_curie(),
                )
            node_bindings[qnode_key] = [knode_key]

        edge_bindings = {}
        for qedge_key, qedge in qg.edges.items():
            kedge_key = kg.add_edge(
                node_bindings[qedge.subject][0],
                node_bindings[qedge.object][0],
                predicate=qedge.predicates[0].get_curie(),
                relation=qedge.relation,
            )
            edge_bindings[qedge_key] = [kedge_key]
            # Add Attribute
            if self.check_predicate_support(qedge.predicates[0],
                                            BIOLINK_HAS_PHENOTYPE_ENTITY):
                kg.edges[kedge_key].add_attribute(
                    attribute_type_id='Probability of Survival',
                    value=chp_query.truth_prob,
                    value_type_id=BIOLINK_HAS_CONFIDENCE_LEVEL_ENTITY.
                    get_curie(),
                )
        # Proces results
        message.results.add_result(
            node_bindings,
            edge_bindings,
        )
        return message
Ejemplo n.º 9
0
 def setUp(self):
     """Build the 'coulomb' 1.0 BKB data handler and a dynamic reasoner on it."""
     handler = BkbDataHandler(bkb_major_version='coulomb',
                              bkb_minor_version='1.0')
     self.bkb_handler = handler
     self.dynamic_reasoner = ChpDynamicReasoner(handler)
Ejemplo n.º 10
0
class TestDynamicReasoner(unittest.TestCase):
    """Smoke tests for ``ChpDynamicReasoner``.

    Each test runs one survival query (EFO:0000714 >= 1000) with a different
    set of gene/drug evidence and prints the result summary; no values are
    asserted, so a test only fails if the reasoner raises.
    """

    def setUp(self):
        """Build the 'coulomb' 1.0 BKB data handler and a reasoner on it."""
        self.bkb_handler = BkbDataHandler(bkb_major_version='coulomb',
                                          bkb_minor_version='1.0')
        self.dynamic_reasoner = ChpDynamicReasoner(self.bkb_handler)

    def _run_and_summarize(self, evidence, **run_kwargs):
        """Run a survival query with ``evidence`` and print its summary.

        ``run_kwargs`` are forwarded unchanged to ``run_query``.
        """
        survival_target = {"EFO:0000714": {"op": '>=', "value": 1000}}
        query = Query(evidence=evidence, dynamic_targets=survival_target)
        query = self.dynamic_reasoner.run_query(query, **run_kwargs)
        query.result.summary(include_contributions=False)

    def test_dynamic_reasoner_one_gene(self):
        """One gene as evidence."""
        self._run_and_summarize({'_ENSEMBL:ENSG00000155657': 'True'})

    def test_dynamic_reasoner_one_gene_one_drug(self):
        """One gene and one drug as evidence."""
        self._run_and_summarize({
            '_ENSEMBL:ENSG00000155657': 'True',
            'CHEMBL:CHEMBL83': 'True',
        })

    def test_dynamic_reasoner_two_gene_one_drug(self):
        """Two genes and one drug as evidence."""
        self._run_and_summarize({
            '_ENSEMBL:ENSG00000155657': 'True',
            '_ENSEMBL:ENSG00000241973': 'True',
            'CHEMBL:CHEMBL83': 'True',
        })

    def test_dynamic_reasoner_one_drug_survival(self):
        """One drug as evidence, run with bkb_type='drug'."""
        self._run_and_summarize({
            '_CHEMBL:CHEMBL83': 'True',
        }, bkb_type='drug')

    def test_dynamic_reasoner_two_drug_survival(self):
        """Two drugs as evidence, run with bkb_type='drug'."""
        self._run_and_summarize({
            '_CHEMBL:CHEMBL83': 'True',
            '_CHEMBL:CHEMBL1201247': 'True',
        }, bkb_type='drug')
Ejemplo n.º 11
0
class DefaultHandlerMixin:
    def _setup_handler(self):
        # Only do the rest of this if a query is passed
        if self.init_query is not None:
            # Setup queries
            self._setup_queries()

            # Instiatate Reasoners
            if 'default' in self.query_dict:
                if self.dynamic_reasoner is None:
                    self.dynamic_reasoner = ChpDynamicReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)
            if 'simple' in self.query_dict:
                if self.joint_reasoner is None:
                    self.joint_reasoner = ChpJointReasoner(
                        bkb_handler=self.bkb_data_handler,
                        hosts_filename=self.hosts_filename,
                        num_processes_per_host=self.num_processes_per_host)

    def _setup_queries(self):
        if type(self.init_query) == list:
            self.query_dict = defaultdict(list)
            self.query_map = []
            for query in self.init_query:
                self.query_map.append(query["query_id"])
                if self._is_simple_query(query):
                    self.query_dict['simple'].append(
                        self._setup_single_query(query))
                else:
                    self.query_dict['default'].append(
                        self._setup_single_query(query))
        else:
            if self._is_simple_query(self.init_query):
                self.query_dict = {
                    "simple": [self._setup_single_query(self.init_query)]
                }
            else:
                self.query_dict = {
                    "default": [self._setup_single_query(self.init_query)]
                }

    def _is_simple_query(self, query):
        """ Check if this is a {0 or 1} drug, {0 or 1} gene, one outcome standard query.
        """
        _found_outcome = False
        _found_disease = False
        _found_gene = False
        _found_drug = False
        for node_key, node in query["query_graph"]["nodes"].items():
            if node["category"] == BIOLINK_PHENOTYPIC_FEATURE:
                # If we've already found the target and there's another phenotypic feature, then this isn't simple.
                if _found_outcome:
                    return False
                else:
                    _found_outcome = True
            if node['category'] == BIOLINK_DISEASE:
                # If we've already found disease and there's another disease, then this isn't simple.
                if _found_disease:
                    return False
                else:
                    _found_disease = True
            if node["category"] == BIOLINK_GENE:
                if _found_gene:
                    return False
                else:
                    _found_gene = True
            if node['category'] == BIOLINK_DRUG:
                if _found_drug:
                    return False
                else:
                    _found_drug = True
        return True

    def _extract_chp_query(self, query, query_type=None):
        """ Translate a query-graph dict into a BKB ``Query``.

            The phenotypic-feature node becomes the dynamic (survival) target,
            gene nodes become evidence keyed by ``'_' + curie`` and drug nodes
            evidence keyed by their curie. The survival threshold defaults to
            ``>= 970`` and is overridden by the 'properties' ('days',
            'qualifier') of a disease -> phenotype edge.

            :param query: query dict with a 'query_graph' and optionally a
                'query_id'.
            :param query_type: unused; kept for interface compatibility.
            :return: the ``Query`` with ``truth_target`` set to
                ``(target curie, '<qualifier> <days>')`` and ``query_id`` set
                to the query's id or None.
            :raises ValueError: if the graph has no phenotypic feature node.
        """
        evidence = {}
        targets = []
        dynamic_evidence = {}
        dynamic_targets = {}
        nodes = query["query_graph"]['nodes']

        # Find the phenotype (outcome) node; the last one wins if several.
        target_id = None
        for node_key, node in nodes.items():
            if node['category'] == BIOLINK_PHENOTYPIC_FEATURE:
                target_id = node_key
        if target_id is None:
            raise ValueError(
                'Query graph has no phenotypic feature (outcome) node.')
        # Read the target curie from the phenotype node itself. The previous
        # code used whatever node the last loop happened to end on, which was
        # only right when the phenotype node came last in the graph.
        target_curie = nodes[target_id]["id"]

        # Defaults apply whenever no disease -> phenotype edge is found;
        # before, that case left both names unbound.
        days = 970
        qualifier = '>='
        for node_key, node in nodes.items():
            if node['category'] == BIOLINK_DISEASE:
                for edge in query["query_graph"]['edges'].values():
                    if (edge['predicate'] ==
                            BIOLINK_DISEASE_TO_PHENOTYPIC_FEATURE_PREDICATE
                            and edge['subject'] == node_key
                            and edge['object'] == target_id):
                        if 'properties' in edge:
                            days = edge['properties']['days']
                            qualifier = edge['properties']['qualifier']
                        else:
                            days = 970
                            qualifier = '>='

        # set BKB target
        dynamic_targets[target_curie] = {
            "op": qualifier,
            "value": days,
        }
        truth_target = (target_curie, '{} {}'.format(qualifier, days))

        # get evidence
        for node in nodes.values():
            # genes are keyed with a leading underscore
            if node['category'] == BIOLINK_GENE:
                evidence["_" + node['id']] = 'True'
            # drugs are keyed by their plain curie
            if node['category'] == BIOLINK_DRUG:
                evidence[node["id"]] = 'True'

        # produce BKB query
        chp_query = Query(evidence=evidence,
                          targets=targets,
                          dynamic_evidence=dynamic_evidence,
                          dynamic_targets=dynamic_targets,
                          type='updating')
        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        chp_query.query_id = query.get("query_id")
        return chp_query

    def _run_query(self, chp_query, query_type):
        if query_type == 'simple':
            chp_query = self.joint_reasoner.run_query(chp_query)
            # If a probability was found for the target
            if len(chp_query.result) > 0:
                # If a probability was found for the truth target
                if chp_query.truth_target in chp_query.result:
                    total_unnormalized_prob = 0
                    for target, contrib in chp_query.result.items():
                        prob = max(0, contrib)
                        total_unnormalized_prob += prob
                    chp_query.truth_prob = max([
                        0, chp_query.result[(chp_query.truth_target)]
                    ]) / total_unnormalized_prob
                else:
                    chp_query.truth_prob = 0
            else:
                chp_query.truth_prob = -1
            chp_query.report = None
        else:
            chp_query = self.dynamic_reasoner.run_query(chp_query)
            chp_res_dict = chp_query.result.process_updates(normalize=True)
            chp_query.truth_prob = max([
                0, chp_res_dict[chp_query.truth_target[0]][
                    chp_query.truth_target[1]]
            ])
            chp_query.report = None
        return chp_query

    def _construct_trapi_response(self, chp_query, query_type=None):
        # Get orginal query
        if len(self.init_query) == 1:
            query = self.init_query[0]
            query_id = None
        else:
            for _query in self.init_query:
                if _query["query_id"] == chp_query.query_id:
                    query = _query
                    query_id = query["query_id"]
                    break

        kg = copy.deepcopy(query["query_graph"])
        # update target node info and form edge pair combos for results graph

        node_pairs = dict()
        for node_key in list(kg['nodes'].keys())[:]:
            qg_node_curie = kg['nodes'][node_key].pop('id')
            kg['nodes'][qg_node_curie] = kg['nodes'].pop(node_key)
            node_pairs[node_key] = qg_node_curie
            if kg['nodes'][qg_node_curie]['category'] == BIOLINK_GENE:
                kg['nodes'][qg_node_curie]['name'] = self._get_curie_name(
                    BIOLINK_GENE, qg_node_curie)[0]
            elif kg['nodes'][qg_node_curie]['category'] == BIOLINK_DRUG:
                kg['nodes'][qg_node_curie]['name'] = self._get_curie_name(
                    BIOLINK_DRUG, qg_node_curie)[0]

        edge_pairs = dict()
        knowledge_edges = 0
        for edge_key in list(kg['edges'].keys())[:]:
            kg_id = 'kge{}'.format(knowledge_edges)
            knowledge_edges += 1
            kg['edges'][kg_id] = kg['edges'].pop(edge_key)
            kg['edges'][kg_id]['subject'] = node_pairs[kg['edges'][kg_id]
                                                       ['subject']]
            kg['edges'][kg_id]['object'] = node_pairs[kg['edges'][kg_id]
                                                      ['object']]
            edge_pairs[edge_key] = kg_id
            if kg['edges'][kg_id][
                    'predicate'] == BIOLINK_DISEASE_TO_PHENOTYPIC_FEATURE_PREDICATE:
                if 'properties' in kg['edges'][kg_id].keys():
                    kg['edges'][kg_id].pop('properties')
                kg['edges'][kg_id]['attributes'] = [{
                    'name':
                    'Probability of Survival',
                    'type':
                    BIOLINK_PROBABILITY,
                    'value':
                    chp_query.truth_prob
                }]

        results = []
        results.append({
            'edge_bindings': {},
            'node_bindings': {},
        })
        for edge_pair_key in edge_pairs:
            results[0]['edge_bindings'][edge_pair_key] = [{
                'id':
                edge_pairs[edge_pair_key]
            }]
        for node_pair_key in node_pairs:
            results[0]['node_bindings'][node_pair_key] = [{
                'id':
                node_pairs[node_pair_key]
            }]

        # query response
        trapi_message = {
            'query_graph': query["query_graph"],
            'knowledge_graph': kg,
            'results': results
        }
        trapi_response = {'message': trapi_message}
        return query_id, trapi_response
    def setUpClass(cls):
        """Create the default BKB data handler and both reasoners once per class.

        NOTE(review): unittest calls ``setUpClass`` on the class itself, so it
        is expected to be a classmethod; no ``@classmethod`` decorator is
        visible in this excerpt -- confirm against the full test module.
        """
        super(TestBaseHandler, cls).setUpClass()

        handler = BkbDataHandler()
        cls.bkb_handler = handler
        cls.dynamic_reasoner = ChpDynamicReasoner(handler)
        cls.joint_reasoner = ChpJointReasoner(handler)
Ejemplo n.º 13
0
 def setUpClass(cls):
     """Create the 'tcga_brca' BKB data handler and a dynamic reasoner on it.

     NOTE(review): no ``@classmethod`` decorator is visible in this excerpt;
     confirm it is present in the full test module.
     """
     super(TestDynamicReasoner, cls).setUpClass()
     handler = BkbDataHandler(disease='tcga_brca')
     cls.bkb_handler = handler
     cls.dynamic_reasoner = ChpDynamicReasoner(handler)
class WildCardHandlerMixin:
    def _setup_handler(self):
        # Only do the rest of this if a query is passed
        if self.messages is not None:
            # Setup messages
            self._setup_messages()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)

    def _setup_messages(self):
        if type(self.messages) == list:
            self.message_dict = defaultdict(list)
            for message in self.messages:
                self.message_dict[self._get_wildcard_type(message)].append(message)

    def _get_wildcard_type(self, message):
        """Classify a message by the category of its first wildcard node.

        A wildcard node is a query-graph node whose ``ids`` is ``None``; the
        first such node with a non-``None`` leading category decides.

        :returns: ``'drug'`` or ``'gene'``.
        :raises ValueError: if the wildcard category is neither of the two
            (including when no wildcard category was found).
        """
        wildcard_categories = (
            qnode.categories[0]
            for _, qnode in message.query_graph.nodes.items()
            if qnode.ids is None
        )
        wildcard_type = next(
            (category for category in wildcard_categories if category is not None),
            None,
        )
        if wildcard_type == BIOLINK_DRUG_ENTITY:
            return 'drug'
        if wildcard_type == BIOLINK_GENE_ENTITY:
            return 'gene'
        raise ValueError('Did not understand wildcard type {}.'.format(wildcard_type))

    def _extract_chp_query(self, message, message_type):
        """Translate a wildcard TRAPI message into a CHP BKB query.

        The survival target ``EFO:0000714`` defaults to ``>= 970`` and is
        overridden by a ``survival_time`` constraint found on a supported
        disease -> survival-node edge. Gene and drug nodes whose type is not
        the wildcard being solved for are added as meta evidence.

        Side effects: sets ``self.implicit_survival_node`` and
        ``self.no_evidence_probability_check``.

        :param message: message whose ``query_graph`` exposes ``nodes`` and
            ``edges`` mappings.
        :param message_type: wildcard type of the message (``'gene'`` or
            ``'drug'``); nodes of that type are not used as evidence.
        :returns: the populated ``ChpQuery`` with ``truth_target`` attached.
        :raises ValueError: if an evidence gene/drug curie is not present in
            ``self.curies``.
        """
        # Initialize CHP BKB Query
        chp_query = ChpQuery(reasoning_type='updating')
        query_graph = message.query_graph

        # Tallies of the nodes/edges recognised below.
        # NOTE(review): only the "no survival node" test reads them; nothing
        # validates that every node/edge of the query graph was used.
        total_nodes = 0
        total_edges = 0

        # Find the survival (phenotype) node, if the query has one.
        target_id = None
        acceptable_target_curies = ['EFO:0000714']
        self.implicit_survival_node = False
        for node_key, node in query_graph.nodes.items():
            if node.categories[0] == BIOLINK_PHENOTYPIC_FEATURE_ENTITY and node.ids[0] in acceptable_target_curies:
                target_id = node_key
                total_nodes += 1
        if total_nodes == 0:
            # No explicit survival node: the default survival target is used.
            self.implicit_survival_node = True
            total_nodes += 1

        # Default survival target; a 'survival_time' edge constraint overrides it.
        survival_value = 970
        survival_operator = '>='

        # Find the disease node and read the survival constraint from its
        # edge to the survival node.
        disease_id = None
        acceptable_disease_curies = ['MONDO:0007254']
        for node_key, node in query_graph.nodes.items():
            if not (node.categories[0] == BIOLINK_DISEASE_ENTITY and node.ids[0] in acceptable_disease_curies):
                continue
            disease_id = node_key
            for _, edge in query_graph.edges.items():
                # target_id stays None when the survival node is implicit, in
                # which case no edge can point at it.
                if not (self.check_predicate_support(edge.predicates[0], BIOLINK_HAS_PHENOTYPE_ENTITY)
                        and edge.subject == disease_id
                        and target_id is not None
                        and edge.object == target_id):
                    continue
                survival_time_constraint = edge.find_constraint(name='survival_time')
                if survival_time_constraint is not None:
                    survival_value = survival_time_constraint.value
                    survival_operator = survival_time_constraint.operator
                    if survival_operator == 'matches':
                        survival_operator = '=='
                total_edges += 1
            total_nodes += 1

        if self.implicit_survival_node:
            # Count the implied disease -> survival edge.
            total_edges += 1

        # set BKB target
        chp_query.add_dynamic_target('EFO:0000714', survival_operator, survival_value)
        truth_target = ('EFO:0000714', '{} {}'.format(survival_operator, survival_value))

        # Gather gene/drug evidence. Each entry is
        # (node category, predicate linking it to the disease, wildcard type name).
        evidence_specs = (
            (BIOLINK_GENE_ENTITY, BIOLINK_GENE_ASSOCIATED_WITH_CONDITION_ENTITY, 'gene'),
            (BIOLINK_DRUG_ENTITY, BIOLINK_TREATS_ENTITY, 'drug'),
        )
        for node_key, node in query_graph.nodes.items():
            for entity, predicate, kind in evidence_specs:
                if not (node.categories[0] == entity):
                    continue
                # check for appropriate node structure
                for _, edge in query_graph.edges.items():
                    if (self.check_predicate_support(edge.predicates[0], predicate)
                            and edge.subject == node_key
                            and edge.object == disease_id):
                        total_edges += 1
                # Nodes of the wildcard type are what is being solved for, so
                # only the other type contributes evidence.
                if message_type != kind:
                    curie = node.ids[0]
                    # Reject unknown curies explicitly; previously this either
                    # crashed on an unbound local or silently re-used the
                    # curie of an earlier node.
                    if curie not in self.curies[entity.get_curie()]:
                        raise ValueError('Unsupported {} curie {}.'.format(kind, curie))
                    chp_query.add_meta_evidence(curie, 'True')
                total_nodes += 1

        # Temporary solution to no evidence linking
        self.no_evidence_probability_check = (
            len(chp_query.evidence.keys()) == 0
            and len(chp_query.dynamic_evidence.keys()) == 0
        )

        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        return chp_query

    def _run_query(self, chp_query, query_type):
        """ Runs build BKB query to calculate probability of survival.
            A probability is returned to specificy survival time w.r.t a drug.
            Contributions for each gene are calculuated and classified under
            their true/false target assignments.
        """

        # temporary solution to no evidence linking
        if not self.no_evidence_probability_check:
            if query_type == 'gene':
                chp_query = self.dynamic_reasoner.run_query(chp_query, bkb_type='drug')
            elif query_type == 'drug':
                chp_query = self.dynamic_reasoner.run_query(chp_query, bkb_type='gene')
            chp_res_dict = chp_query.result.process_updates()
            chp_res_norm_dict = chp_query.result.process_updates(normalize=True)
            #chp_query.result.summary()
            chp_res_contributions = chp_query.result.process_inode_contributions()
            try:
                chp_query.truth_prob = max([0, chp_res_dict[chp_query.truth_target[0]][chp_query.truth_target[1]]])
            except KeyError:
                # May need to come back and fix this.
                chp_query.truth_prob = -1

            # Collect all source inodes and process patient hashes
            patient_contributions = defaultdict(lambda: defaultdict(int))
            for target, contrib_dict in chp_res_contributions.items():
                target_comp_name, target_state_name = target
                for inode, contrib in contrib_dict.items():
                    comp_name, state_name = inode
                    if '_Source_' in comp_name:
                        # Split source state name to get patient hashes
                        source_hashes_str = state_name.split('_')[-1]
                        source_hashes = [int(source_hash) for source_hash in source_hashes_str.split(',')]
                        hash_len = len(source_hashes)
                        # Process patient contributions
                        for _hash in source_hashes:
                            # Normalize to get relative contribution
                            patient_contributions[target][_hash] += contrib/hash_len #/ chp_res_dict[target_comp_name][target_state_name]

        else:
            # probability of survival
            num_survived = 0
            num_all = len(self.dynamic_reasoner.raw_patient_data.keys())
            str_op = chp_query.dynamic_targets['EFO:0000714']['op']
            opp_op = get_opposite_operator(str_op)
            op = get_operator(str_op)
            days = chp_query.dynamic_targets['EFO:0000714']['value']
            for patient, pat_dict in self.dynamic_reasoner.raw_patient_data.items():
                if op(pat_dict['survival_time'], days):
                    num_survived += 1
            chp_query.truth_prob = num_survived/num_all

            # patient_contributions
            patient_contributions = defaultdict(lambda: defaultdict(int))
            for patient, pat_dict in self.dynamic_reasoner.raw_patient_data.items():
                if op(pat_dict['survival_time'], days):
                    if num_survived == 0:
                        patient_contributions[('EFO:0000714', '{} {}'.format(str_op, days))][patient] = 0
                    else:
                        patient_contributions[('EFO:0000714', '{} {}'.format(str_op, days))][patient] = chp_query.truth_prob/num_survived
                else:
                    if num_survived == 0:
                        patient_contributions[('EFO:0000714', '{} {}'.format(opp_op, days))][patient] = (1-chp_query.truth_prob)/num_all
                    else:
                        patient_contributions[('EFO:0000714', '{} {}'.format(opp_op, days))][patient] = (1-chp_query.truth_prob)/(num_all-num_survived)

        # Now iterate through the patient data to translate patient contributions to drug/gene contributions
        wildcard_contributions = defaultdict(lambda: defaultdict(int))
        for target, patient_contrib_dict in patient_contributions.items():
            for patient, contrib in patient_contrib_dict.items():
                if query_type == 'gene':
                    for gene_curie in self.dynamic_reasoner.raw_patient_data[patient]["gene_curies"]:
                        wildcard_contributions[gene_curie][target] += contrib
                elif query_type == 'drug':
                    for drug_curie in self.dynamic_reasoner.raw_patient_data[patient]["drug_curies"]:
                        wildcard_contributions[drug_curie][target] += contrib

        # normalize gene contributions by the target and take relative difference
        for curie in wildcard_contributions.keys():
            truth_target_gene_contrib = 0
            nontruth_target_gene_contrib = 0
            for target, contrib in wildcard_contributions[curie].items():
                if target[0] == chp_query.truth_target[0] and target[1] == chp_query.truth_target[1]:
                    truth_target_gene_contrib += contrib / chp_query.truth_prob
                else:
                    nontruth_target_gene_contrib += contrib / (1 - chp_query.truth_prob)
            wildcard_contributions[curie]['relative'] = truth_target_gene_contrib - nontruth_target_gene_contrib

        chp_query.report = None
        chp_query.wildcard_contributions = wildcard_contributions

        return chp_query

    def _construct_trapi_message(self, chp_query, message, query_type=None):

        # update target node info and form edge pair combos for results graph

        qg = message.query_graph
        kg = message.knowledge_graph

        # Process Standard QUery as first result.
        # Process Nodes
        node_bindings = {}
        contrib_qg_id = None
        for qnode_key, qnode in qg.nodes.items():
            if qnode.ids is not None:
                if qnode.categories[0] == BIOLINK_GENE_ENTITY:
                    knode_key = kg.add_node(
                            qnode.ids[0],
                            self.curies[BIOLINK_GENE_ENTITY.get_curie()][qnode.ids[0]][0],
                            qnode.categories[0].get_curie(),
                            )
                elif qnode.categories[0] == BIOLINK_DRUG_ENTITY:
                    knode_key = kg.add_node(
                            qnode.ids[0],
                            self.curies[BIOLINK_DRUG_ENTITY.get_curie()][qnode.ids[0]][0],
                            qnode.categories[0].get_curie(),
                            )
                else:
                    knode_key = kg.add_node(
                            qnode.ids[0],
                            qnode.ids[0],
                            qnode.categories[0].get_curie(),
                            )
                node_bindings[qnode_key] = [knode_key]
        if not self.implicit_survival_node:
            # Process Edges
            edge_bindings = {}
            knowledge_edges = 0
            for qedge_key, qedge in qg.edges.items():
                if not qedge.subject in node_bindings or not qedge.object in node_bindings:
                    continue
                kedge_key = kg.add_edge(
                        node_bindings[qedge.subject][0],
                        node_bindings[qedge.object][0],
                        predicate=qedge.predicates[0].get_curie(),
                        relation=qedge.relation,
                        )
                edge_bindings[qedge_key] = [kedge_key]
                # Add Attribute
                if self.check_predicate_support(qedge.predicates[0], BIOLINK_HAS_PHENOTYPE_ENTITY):
                    kg.edges[kedge_key].add_attribute(
                            attribute_type_id='Probability of Survival',
                            value=chp_query.truth_prob,
                            value_type_id=BIOLINK_HAS_CONFIDENCE_LEVEL_ENTITY.get_curie(),
                            )
                '''
                subject_node = kg['edges'][edge_key]['subject']
                if kg['edges'][edge_key]['predicate'] == BIOLINK_GENE_ENTITY_TO_DISEASE_PREDICATE, is_slot=True) and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_GENE_ENTITY and query_type == 'gene':
                    kg['edges'].pop(edge_key)
                elif kg['edges'][edge_key]['predicate'] == BIOLINK_CHEMICAL_TO_DISEASE_OR_PHENOTYPIC_FEATURE_PREDICATE, is_slot=True) and query['query_graph']['nodes'][subject_node]['category'] == BIOLINK_DRUG_ENTITY and query_type == 'drug':
                    kg['edges'].pop(edge_key)
                else:
                    kg_id = 'kge{}'.format(knowledge_edges)
                    knowledge_edges += 1
                    kg['edges'][kg_id] = kg['edges'].pop(edge_key)
                    kg['edges'][kg_id]['subject'] = node_pairs[kg['edges'][kg_id]['subject']]
                    kg['edges'][kg_id]['object'] = node_pairs[kg['edges'][kg_id]['object']]
                    edge_pairs[edge_key] = kg_id
                    if kg['edges'][kg_id]['predicate'] == BIOLINK_DISEASE_ENTITY_TO_PHENOTYPIC_FEATURE_PREDICATE, is_slot=True):
                        if 'properties' in kg['edges'][kg_id].keys():
                            kg['edges'][kg_id].pop('properties')
                        kg['edges'][kg_id]['attributes'] = [{'name':'Probability of Survival',
                                                             'type':BIOLINK_PROBABILITY,
                                                             'value':chp_query.truth_prob}]
                '''
            # Proces results
            message.results.add_result(
                    node_bindings,
                    edge_bindings,
                    )
            '''
            # Put first result of standard prob query of only curie nodes (i.e. no wildcard nodes where used as evidence)
            results = []
            results.append({'edge_bindings':dict(),
                            'node_bindings':dict()})
            for edge_pair_key in edge_pairs:
                results[0]['edge_bindings'][edge_pair_key] = [{ 'id': str(edge_pairs[edge_pair_key])}]
            for node_pair_key in node_pairs:
                results[0]['node_bindings'][node_pair_key] = [{ 'id': str(node_pairs[node_pair_key])}]
            '''
        #else:
        #    knowledge_edges = 0
        #    kg['edges'] = {}
        #    results = []

        # Build relative contribution results and added associated edges into knowledge graph
        unsorted_wildcard_contributions = []
        for wildcard, contrib_dict in chp_query.wildcard_contributions.items():
            unsorted_wildcard_contributions.append((contrib_dict['relative'], wildcard))
        sorted_wildcard_contributions = [(contrib,wildcard) for contrib, wildcard in sorted(unsorted_wildcard_contributions, key=lambda x: abs(x[0]), reverse=True)]

        for contrib, wildcard in sorted_wildcard_contributions[:self.max_results]:
            #TODO: Fix this!
            if wildcard == 'missing':
                continue
            #rg = copy.deepcopy(query["query_graph"])
            _node_bindings = {}
            _edge_bindings = {}
            # Process node bindings
            bad_wildcard = False
            for qnode_id, qnode in qg.nodes.items():
                if qnode.categories[0] == BIOLINK_GENE_ENTITY and query_type == 'gene':
                    try:
                        knode_id = kg.add_node(
                                wildcard,
                                self.curies[BIOLINK_GENE_ENTITY.get_curie()][wildcard][0],
                                qnode.categories[0].get_curie(),
                                )
                        _node_bindings[qnode_id] = [knode_id]
                    except KeyError:
                        logger.info("Couldn't find {} in curies[{}]".format(wildcard, BIOLINK_GENE_ENTITY.get_curie()))
                        bad_wildcard = True
                elif qnode.categories[0] == BIOLINK_DRUG_ENTITY and query_type == 'drug':
                    knode_id = kg.add_node(
                            wildcard,
                            self.curies[BIOLINK_DRUG_ENTITY.get_curie()][wildcard][0],
                            qnode.categories[0].get_curie(),
                            )
                    _node_bindings[qnode_id] = [knode_id]
                else:
                    _node_bindings[qnode_id] = node_bindings[qnode_id]
            if bad_wildcard:
                continue
            # Process edge bindings
            for qedge_id, qedge in qg.edges.items():
                subject_node = qedge.subject
                object_node = qedge.object
                if query_type == 'gene' and self.check_predicate_support(qedge.predicates[0], BIOLINK_GENE_ASSOCIATED_WITH_CONDITION_ENTITY) and qg.nodes[subject_node].categories[0] == BIOLINK_GENE_ENTITY:
                    kedge_id = kg.add_edge(
                            _node_bindings[qedge.subject][0],
                            _node_bindings[qedge.object][0],
                            predicate=qedge.predicates[0],
                            relation=qedge.relation,
                            )
                    kg.edges[kedge_id].add_attribute(
                            attribute_type_id='Contribution',
                            value=contrib,
                            value_type_id=BIOLINK_HAS_EVIDENCE_ENTITY.get_curie(),
                            )
                    _edge_bindings[qedge_id] = [kedge_id]
                elif query_type == 'gene' and self.check_predicate_support(qedge.predicates[0], BIOLINK_CONDITION_ASSOCIATED_WITH_GENE_ENTITY) and qg.nodes[object_node].categories[0] == BIOLINK_GENE_ENTITY:
                    kedge_id = kg.add_edge(
                            _node_bindings[qedge.subject][0],
                            _node_bindings[qedge.object][0],
                            predicate=qedge.predicates[0],
                            relation=qedge.relation,
                            )
                    kg.edges[kedge_id].add_attribute(
                            attribute_type_id='Contribution',
                            value=contrib,
                            value_type_id=BIOLINK_HAS_EVIDENCE_ENTITY.get_curie(),
                            )
                    _edge_bindings[qedge_id] = [kedge_id]
                elif query_type == 'drug' and self.check_predicate_support(qedge.predicates[0], BIOLINK_TREATS_ENTITY) and qg.nodes[subject_node].categories[0] == BIOLINK_DRUG_ENTITY:
                    kedge_id = kg.add_edge(
                            _node_bindings[qedge.subject][0],
                            _node_bindings[qedge.object][0],
                            predicate=qedge.predicates[0],
                            relation=qedge.relation,
                            )
                    kg.edges[kedge_id].add_attribute(
                            attribute_type_id='Contribution',
                            value=contrib,
                            value_type_id=BIOLINK_HAS_EVIDENCE_ENTITY.get_curie(),
                            )
                    _edge_bindings[qedge_id] = [kedge_id]
                elif query_type == 'drug' and self.check_predicate_support(qedge.predicates[0], BIOLINK_TREATED_BY_ENTITY) and qg.nodes[object_node].categories[0] == BIOLINK_DRUG_ENTITY:
                    kedge_id = kg.add_edge(
                            _node_bindings[qedge.subject][0],
                            _node_bindings[qedge.object][0],
                            predicate=qedge.predicates[0],
                            relation=qedge.relation,
                            )
                    kg.edges[kedge_id].add_attribute(
                            attribute_type_id='Contribution',
                            value=contrib,
                            value_type_id=BIOLINK_HAS_EVIDENCE_ENTITY.get_curie(),
                            )
                    _edge_bindings[qedge_id] = [kedge_id]
                else:
                    _edge_bindings[qedge_id] = edge_bindings[qedge_id]
            # Process node and edge binding results
            message.results.add_result(
                    _node_bindings,
                    _edge_bindings,
                    )
        return message
Ejemplo n.º 15
0
class OneHopHandlerMixin:
    """ OneHopHandlerMixin is the handler for 1-hop queries, that is,
        query graphs (QGs) that consist of 2 nodes and a single edge.

        :param query: the query graph sent by the ARA.
        :type query: dict
        :param hosts_filename: a filename for a stored QG. Defaults to None
        :type hosts_filename: str
        :param num_processes_per_host: Not thoroughly implemented, but would be
            used for distributed reasoning.
        :type num_processes_per_host: int
        :param max_results: specific to 1-hop queries, specifies the number of
            wildcard genes to return.
        :type max_results: int
    """
    def _setup_handler(self):
        self.default_survival_target = {
            "EFO:0000714": {
                "op": '>=',
                "value": 970
            }
        }

        # Only do the rest of this if a query is passed
        if self.messages is not None:
            # Setup queries
            self._setup_messages()

            # Instiatate Reasoners
            if self.dynamic_reasoner is None:
                self.dynamic_reasoner = ChpDynamicReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)
            if self.joint_reasoner is None:
                self.joint_reasoner = ChpJointReasoner(
                    bkb_handler=self.bkb_data_handler,
                    hosts_filename=self.hosts_filename,
                    num_processes_per_host=self.num_processes_per_host)

    def _setup_messages(self):
        self.message_dict = defaultdict(list)
        for message in self.messages:
            self.message_dict[self._get_onehop_type(message)].append(message)

    def _get_onehop_type(self, message):
        wildcard_type = None
        for node_id, node in message.query_graph.nodes.items():
            if node.ids is None:
                if wildcard_type is None:
                    wildcard_type = node.categories[0]
        # If standard onehop query
        if wildcard_type is None:
            return 'standard'
        elif wildcard_type == BIOLINK_DRUG_ENTITY:
            return 'drug'
        elif wildcard_type == BIOLINK_GENE_ENTITY:
            return 'gene'
        else:
            raise ValueError(
                'Did not understand wildcard type {}.'.format(wildcard_type))

    def check_query(self):
        """ Placeholder for query validation; currently not implemented.

            :returns: always ``True`` (no validation is performed).
            :rtype: bool
        """
        return True

    @staticmethod
    def _process_predicate_proxy(qedge):
        dynamic_targets = {}
        predicate_proxy_constraint = qedge.find_constraint('predicate_proxy')
        if predicate_proxy_constraint is None:
            predicate_proxy = get_default_predicate_proxy()
            proxy_constraint = qedge.find_constraint(predicate_proxy)
        else:
            predicate_proxy = predicate_proxy_constraint.value[0]
            proxy_constraint = qedge.find_constraint(predicate_proxy)
        if proxy_constraint is None:
            proxy_operator = get_default_operator(predicate_proxy)
            proxy_value = get_default_value(predicate_proxy)
        else:
            proxy_operator = proxy_constraint.operator
            proxy_value = proxy_constraint.value
        # Setup dynamic target
        dynamic_targets[predicate_proxy] = {
            "op": proxy_operator,
            "value": proxy_value,
        }
        return dynamic_targets

    @staticmethod
    def _process_predicate_context(qedge, message_type):
        evidence = {}
        dynamic_evidence = {}
        predicate_context_constraint = qedge.find_constraint(
            'predicate_context')
        if predicate_context_constraint is not None:
            for context in predicate_context_constraint.value:
                context_curie = get_biolink_entity(context)
                context_constraint = qedge.find_constraint(context)
                if context_constraint is None:
                    raise ValueError(
                        'Provided no context details for {}'.format(context))
                if context_curie == BIOLINK_GENE_ENTITY:
                    if message_type == 'gene':
                        if type(context_constraint.value) is list:
                            for _curie in context_constraint.value:
                                dynamic_evidence[_curie] = {
                                    "op": '==',
                                    "value": 'True',
                                }
                        else:
                            dynamic_evidence[context_constraint.value] = {
                                "op": '==',
                                "value": 'True',
                            }
                    else:
                        if type(context_constraint.value) is list:
                            for _curie in context_constraint.value:
                                evidence['_{}'.format(_curie)] = 'True'
                        else:
                            evidence['_{}'.format(_curie)] = 'True'
                elif context_curie == BIOLINK_DRUG_ENTITY:
                    if message_type == 'drug':
                        if type(context_constraint.value) is list:
                            for _curie in context_constraint.value:
                                dynamic_evidence[_curie] = {
                                    "op": '==',
                                    "value": 'True',
                                }
                        else:
                            dynamic_evidence[context_constraint.value] = {
                                "op": '==',
                                "value": 'True',
                            }
                    else:
                        if type(context_constraint.value) is list:
                            for _curie in context_constraint.value:
                                evidence['_{}'.format(_curie)] = 'True'
                        else:
                            evidence['_{}'.format(_curie)] = 'True'
                else:
                    raise ValueError(
                        'Unsupported context type: {}'.format(context_curie))
        return evidence, dynamic_evidence

    def _extract_chp_query(self, message, message_type):
        """Translate a one-hop TRAPI message into a CHP ``Query``.

        Evidence comes from the curie-bearing gene/drug nodes (which
        categories count depends on ``message_type``) plus any predicate
        context on the edge; the dynamic target comes from the edge's
        predicate proxy.

        :param message: message whose ``query_graph`` exposes ``nodes`` and
            ``edges`` mappings.
        :param message_type: ``'standard'``, ``'gene'`` or ``'drug'``.
        :returns: an ``'updating'`` query with ``truth_target`` attached.
        :raises ValueError: if the query graph has no edge.
        """
        evidence = {}
        dynamic_evidence = {}
        query_graph = message.query_graph

        # Node categories that contribute evidence: a standard query uses
        # both genes and drugs, a wildcard query only the non-wildcard type.
        if message_type == 'standard':
            evidence_categories = (BIOLINK_GENE_ENTITY, BIOLINK_DRUG_ENTITY)
        elif message_type == 'gene':
            evidence_categories = (BIOLINK_DRUG_ENTITY,)
        elif message_type == 'drug':
            evidence_categories = (BIOLINK_GENE_ENTITY,)
        else:
            evidence_categories = ()
        for _, qnode in query_graph.nodes.items():
            if any(qnode.categories[0] == category for category in evidence_categories):
                evidence['_{}'.format(qnode.ids[0])] = 'True'

        # Grab the first edge of the query graph; fail with a clear error
        # instead of an unbound local when there is none.
        first_edge = next(iter(query_graph.edges.items()), None)
        if first_edge is None:
            raise ValueError('One-hop query graph does not contain an edge.')
        _, qedge = first_edge

        # Process predicate proxy
        dynamic_targets = self._process_predicate_proxy(qedge)
        # Process predicate context
        _evidence, _dynamic_evidence = self._process_predicate_context(
            qedge, message_type)
        evidence.update(_evidence)
        dynamic_evidence.update(_dynamic_evidence)
        #TODO: Probably need a more robust solution for when no context is provided in wildcard queries and you need it.

        # The truth target is the first dynamic target, rendered as
        # (name, "<op> <value>").
        target = list(dynamic_targets.keys())[0]
        truth_target = (target,
                        '{} {}'.format(dynamic_targets[target]["op"],
                                       dynamic_targets[target]["value"]))
        chp_query = Query(evidence=evidence,
                          targets=None,
                          dynamic_evidence=dynamic_evidence,
                          dynamic_targets=dynamic_targets,
                          type='updating')
        # Set some other helpful attributes
        chp_query.truth_target = truth_target
        return chp_query

    def _run_query(self, chp_query, query_type):
        """Run ``chp_query`` through the reasoners and attach the results.

        ``query_type == 'standard'``: the joint reasoner is run and only
        ``truth_prob`` and ``report`` are set on the returned query.

        Any other ``query_type`` (the code handles ``'gene'`` and ``'drug'``)
        additionally produces ``wildcard_contributions``:

        * If the query carries no evidence, the joint reasoner supplies the
          truth probability and every patient in
          ``joint_reasoner.patient_data`` is assigned a share of it (or of its
          complement) depending on whether the patient satisfies each dynamic
          target.
        * Otherwise the dynamic reasoner is run (``bkb_type='drug'`` for gene
          queries, ``bkb_type='gene'`` for drug queries) and the contribution
          of every ``_Source_`` inode is split evenly among the patient hashes
          encoded in its state name.

        Patient contributions are then summed per gene/drug curie using
        ``dynamic_reasoner.raw_patient_data`` and a ``'relative'`` score
        (normalised truth contribution minus normalised non-truth
        contribution) is stored for each curie.

        Args:
            chp_query: Query object exposing ``truth_target``, ``evidence``
                and ``dynamic_targets``.
            query_type: ``'standard'``, ``'gene'`` or ``'drug'``.

        Returns:
            The query object returned by the reasoner with ``truth_prob`` and
            ``report`` set, plus ``wildcard_contributions`` for non-standard
            queries. ``truth_prob`` is ``-1`` when the joint reasoner returned
            an empty result and ``0`` when the truth target is absent from it.
        """

        def joint_truth_prob(query):
            """Normalised probability of the truth target in a joint result."""
            if len(query.result) == 0:
                return -1
            if query.truth_target not in query.result:
                return 0
            total = sum(max(0, contrib) for _, contrib in query.result.items())
            if total == 0:
                # Every contribution is non-positive, so the truth target has
                # no probability mass; avoid a 0/0 division.
                return 0
            return max(0, query.result[query.truth_target]) / total

        if query_type == 'standard':
            chp_query = self.joint_reasoner.run_query(chp_query)
            chp_query.truth_prob = joint_truth_prob(chp_query)
            chp_query.report = None
            return chp_query

        patient_contributions = defaultdict(lambda: defaultdict(int))
        if len(chp_query.evidence) == 0:
            # No evidence on the query: the joint reasoner gives the
            # probability of the target directly.
            chp_query = self.joint_reasoner.run_query(chp_query)
            chp_query.truth_prob = joint_truth_prob(chp_query)
            # NOTE(review): a truth_prob of -1 (empty result) flows into the
            # arithmetic below unchanged, exactly as before.
            truth_prob = chp_query.truth_prob

            num_all = len(self.joint_reasoner.patient_data)
            num_matched = truth_prob * num_all

            # Resolve each dynamic target once. The contribution keys must use
            # the operator *string* so that they can be compared with
            # ``truth_target`` during normalisation; formatting the operator
            # function would produce keys that never match.
            # get_opposite_operator is assumed to return an operator string.
            proxies = []
            for predicate_proxy, proxy_info in chp_query.dynamic_targets.items():
                op_str = proxy_info["op"]
                proxy_value = proxy_info["value"]
                proxies.append((
                    predicate_proxy,
                    get_operator(op_str),
                    proxy_value,
                    (predicate_proxy, '{} {}'.format(op_str, proxy_value)),
                    (predicate_proxy,
                     '{} {}'.format(get_opposite_operator(op_str),
                                    proxy_value)),
                ))

            for patient, feature_dict in self.joint_reasoner.patient_data.items():
                for (predicate_proxy, proxy_op, proxy_value, matched_key,
                     unmatched_key) in proxies:
                    if proxy_op(feature_dict[predicate_proxy], proxy_value):
                        if num_matched == 0:
                            share = 0
                        else:
                            share = truth_prob / num_matched
                        patient_contributions[matched_key][patient] = share
                    elif num_matched == 0:
                        # Nobody is expected to match, so the complement is
                        # spread over all patients (dividing by num_matched
                        # here would always raise ZeroDivisionError).
                        patient_contributions[unmatched_key][patient] = (
                            1 - truth_prob) / num_all
                    else:
                        patient_contributions[unmatched_key][patient] = (
                            1 - truth_prob) / (num_all - num_matched)
        else:
            if query_type == 'gene':
                chp_query = self.dynamic_reasoner.run_query(chp_query,
                                                            bkb_type='drug')
            elif query_type == 'drug':
                chp_query = self.dynamic_reasoner.run_query(chp_query,
                                                            bkb_type='gene')
            # The un-normalised updates are not used below; the call is kept
            # in case process_updates has side effects - TODO confirm and drop.
            chp_query.result.process_updates()
            chp_res_norm_dict = chp_query.result.process_updates(normalize=True)
            chp_res_contributions = chp_query.result.process_inode_contributions()
            target_comp, target_state = chp_query.truth_target
            chp_query.truth_prob = max(
                0, chp_res_norm_dict[target_comp][target_state])

            # Collect all source inodes and process patient hashes.
            for target, contrib_dict in chp_res_contributions.items():
                for (comp_name, state_name), contrib in contrib_dict.items():
                    if '_Source_' not in comp_name:
                        continue
                    # The last '_'-separated field of the state name is a
                    # comma-separated list of patient hashes.
                    source_hashes = [
                        int(source_hash)
                        for source_hash in state_name.split('_')[-1].split(',')
                    ]
                    # Each patient in the source gets an equal share.
                    share = contrib / len(source_hashes)
                    for _hash in source_hashes:
                        patient_contributions[target][_hash] += share

        # Translate patient contributions into gene/drug contributions.
        wildcard_contributions = defaultdict(lambda: defaultdict(int))
        curie_field = {
            'gene': 'gene_curies',
            'drug': 'drug_curies'
        }.get(query_type)
        if curie_field is not None:
            raw_patient_data = self.dynamic_reasoner.raw_patient_data
            for target, patient_contrib_dict in patient_contributions.items():
                for patient, contrib in patient_contrib_dict.items():
                    for curie in raw_patient_data[int(patient)][curie_field]:
                        wildcard_contributions[curie][target] += contrib

        # Normalise contributions by the target probability and take the
        # relative difference. A probability of exactly 0 (or 1) leaves no
        # mass to normalise the truth (or non-truth) side by, so that side is
        # skipped instead of dividing by zero.
        truth_target = chp_query.truth_target
        truth_prob = chp_query.truth_prob
        for contrib_by_target in wildcard_contributions.values():
            truth_contrib = 0
            nontruth_contrib = 0
            for target, contrib in contrib_by_target.items():
                is_truth = (target[0] == truth_target[0]
                            and target[1] == truth_target[1])
                if is_truth:
                    if truth_prob != 0:
                        truth_contrib += contrib / truth_prob
                elif truth_prob != 1:
                    nontruth_contrib += contrib / (1 - truth_prob)
            contrib_by_target['relative'] = truth_contrib - nontruth_contrib

        chp_query.report = None
        chp_query.wildcard_contributions = wildcard_contributions

        return chp_query

    def _construct_trapi_message(self, chp_query, message, query_type):
        """Fill ``message`` with knowledge-graph nodes, edges and results.

        Query nodes that carry explicit ids are added to the knowledge graph
        first. Then:

        * ``query_type == 'standard'``: one knowledge-graph edge is added per
          query edge, annotated with ``chp_query.truth_prob``, and a single
          result holding all bindings is added.
        * otherwise: the entries of ``chp_query.wildcard_contributions`` are
          ranked by the absolute value of their ``'relative'`` score and, for
          each of the top ``self.max_results``, the wildcard gene/drug node,
          its edges (annotated with the contribution) and one result are
          added. Wildcards with no entry in ``self.curies`` are logged and
          skipped.

        Args:
            chp_query: Query that was run; supplies ``truth_prob`` or
                ``wildcard_contributions``.
            message: Message whose ``query_graph``, ``knowledge_graph`` and
                ``results`` are read and updated in place.
            query_type: ``'standard'``, ``'gene'`` or ``'drug'``.

        Returns:
            The same ``message`` object.

        Raises:
            ValueError: If a query node with ids has a category other than
                gene, drug or disease.
        """
        qg = message.query_graph
        kg = message.knowledge_graph

        # Bind every query node that was given explicit ids; nodes without ids
        # are the wildcard and are bound per result further down.
        node_bindings = {}
        for qnode_id, qnode in qg.nodes.items():
            if qnode.ids is None:
                continue
            category = qnode.categories[0]
            curie = qnode.ids[0]
            if category == BIOLINK_GENE_ENTITY:
                name = self.curies[BIOLINK_GENE_ENTITY.get_curie()][curie][0]
            elif category == BIOLINK_DRUG_ENTITY:
                name = self.curies[BIOLINK_DRUG_ENTITY.get_curie()][curie][0]
            elif category == BIOLINK_DISEASE_ENTITY:
                # TODO: Add diseases to curies; until then the curie doubles
                # as the node name.
                name = curie
            else:
                # Previously this fell through and bound the node to the key
                # of the preceding node (or failed with UnboundLocalError).
                raise ValueError(
                    'Unsupported category for query node {}.'.format(qnode_id))
            node_bindings[qnode_id] = [
                kg.add_node(curie, name, category.get_curie())
            ]

        if query_type == 'standard':
            edge_bindings = {}
            for qedge_key, qedge in qg.edges.items():
                kedge_key = kg.add_edge(
                    node_bindings[qedge.subject][0],
                    node_bindings[qedge.object][0],
                    predicate=qedge.predicates[0].get_curie(),
                    relation=qedge.relation,
                )
                edge_bindings[qedge_key] = [kedge_key]
                # Add Attribute
                kg.edges[kedge_key].add_attribute(
                    attribute_type_id='Probability of Survival',
                    value=chp_query.truth_prob,
                    value_type_id=BIOLINK_HAS_CONFIDENCE_LEVEL_ENTITY.
                    get_curie(),
                )
            message.results.add_result(
                node_bindings,
                edge_bindings,
            )
            return message

        # Rank wildcards by the magnitude of their relative contribution.
        ranked_contributions = sorted(
            ((contrib_dict['relative'], wildcard) for wildcard, contrib_dict in
             chp_query.wildcard_contributions.items()),
            key=lambda pair: abs(pair[0]),
            reverse=True)

        # Category that the wildcard fills for this query type.
        wildcard_entity = None
        if query_type == 'gene':
            wildcard_entity = BIOLINK_GENE_ENTITY
        elif query_type == 'drug':
            wildcard_entity = BIOLINK_DRUG_ENTITY

        for contrib, wildcard in ranked_contributions[:self.max_results]:
            _node_bindings = {}
            _edge_bindings = {}
            # Process node bindings
            bad_wildcard = False
            for qnode_id, qnode in qg.nodes.items():
                category = qnode.categories[0]
                if wildcard_entity is None or not category == wildcard_entity:
                    _node_bindings[qnode_id] = node_bindings[qnode_id]
                    continue
                try:
                    name = self.curies[wildcard_entity.get_curie()][wildcard][0]
                except KeyError:
                    # Same handling for genes and drugs: a wildcard with no
                    # known name is reported and left out of the results.
                    logger.info("Couldn't find %s in curies[%s]", wildcard,
                                wildcard_entity.get_curie())
                    bad_wildcard = True
                    break
                _node_bindings[qnode_id] = [
                    kg.add_node(wildcard, name, category.get_curie())
                ]
            if bad_wildcard:
                continue
            # Process edge bindings
            for qedge_id, qedge in qg.edges.items():
                # NOTE(review): the standard branch passes
                # predicates[0].get_curie() while this branch passes the
                # predicate object itself - confirm which form kg.add_edge
                # expects.
                kedge_id = kg.add_edge(
                    _node_bindings[qedge.subject][0],
                    _node_bindings[qedge.object][0],
                    predicate=qedge.predicates[0],
                    relation=qedge.relation,
                )
                kg.edges[kedge_id].add_attribute(
                    attribute_type_id='Contribution',
                    value=contrib,
                    value_type_id=BIOLINK_HAS_EVIDENCE_ENTITY.get_curie(),
                )
                _edge_bindings[qedge_id] = [kedge_id]
            # Process node and edge binding results
            message.results.add_result(
                _node_bindings,
                _edge_bindings,
            )

        return message