def checkForEmptyResponseBody(self):
    """
    Log when the Knowledge Provider response body is missing or its knowledge
    graph contains no nodes or edges, so users can see why nothing came back.

    :return: True when the body or the knowledge graph is empty, False otherwise
    """
    # no body at all: the KP never returned a message
    if self.responseBody is None:
        self.logs.append(clsLogEvent(
            identifier=self.name,
            level="ERROR",
            code="Error",
            message=f"Knowledge Provider {self.url} did not return a message"
        ))
        return True

    message = self.responseBody['message']
    # todo, this is a bug, some of the knowledge providers return a None,
    # which seems to valid by TRAPI v1.1
    if message['knowledge_graph'] is None:
        message['knowledge_graph'] = {'edges': {}, 'nodes': {}}
    knowledge_graph = message['knowledge_graph']

    # both collections populated -> the response is usable
    if knowledge_graph['edges'] and knowledge_graph['nodes']:
        return False

    self.logs.append(clsLogEvent(
        identifier=self.name,
        level="WARNING",
        code="KPEmptyResponse",
        message=f"Knowledge Provider {self.url} returned an empty knowledge graph"
    ))
    return True
def dispatch(self):
    """
    Run all predicate-search solution managers, aggregate the multi-hop case
    solutions they produced (merging their logs), then execute those
    multi-hops with a second dispatch pass.

    Side effects: populates self.caseMultiHops and appends to self.logs.
    :return: None
    """
    if self.dispatchList is not None:
        self.logs.append(
            clsLogEvent(
                identifier="",
                level="DEBUG",
                code="",
                message=
                f"Executing {len(self.dispatchList)} predicate searches."))
        query_dispatch(objects=self.dispatchList,
                       method=self.dispatchMode,
                       parentId=self.dispatchId,
                       abort_time=int(self.started_time +
                                      self.processing_timeout))

    # all solution managers have identified case solutions to use, now we
    # aggregate and dispatch those
    # NOTE(review): the loop below iterates self.dispatchList unguarded —
    # presumably dispatch() is never called with dispatchList=None; confirm.
    self.caseMultiHops = []
    for case_solution_manager in self.dispatchList:
        # also merge any logs
        self.logs += case_solution_manager.logs
        if case_solution_manager.dispatchList:
            self.caseMultiHops += case_solution_manager.dispatchList

    # (debugging snippets that trimmed/filtered caseMultiHops to specific
    # cases were commented out in the original and are omitted here)

    query_dispatch(
        objects=self.caseMultiHops,
        method=self.dispatchMode,
        parentId=self.dispatchId,
        abort_time=int(self.started_time + self.processing_timeout),
        # abort_time=10,
        pool_size=5)
    self.logs.append(
        clsLogEvent(
            identifier="",
            level="DEBUG",
            code="",
            message=
            f"Fully ran {len([s for s in self.caseMultiHops if s.finished])} Case Solutions."
        ))
    return
def execute(self):
    """
    Sends a query to the Knowledge Provider and sets the responseBody property
    :return: None
    :raises: ValueError when no URL is configured; requests exceptions on
             timeout or HTTP error (re-raised after logging)
    """

    def _append_log(level, code, message):
        # every log entry from this method carries this KP's identifier
        self.logs.append(clsLogEvent(
            identifier=self.name,
            level=level,
            code=code,
            message=message
        ))

    self.validateRequestBody()
    try:
        if self.url is None:
            raise ValueError(f"No URL identified for Knowledge Provider '{self.name}'")
        _append_log("DEBUG", "", f"Sending request to Knowledge Provider {self.url}...")
        response = request_with_global_timeout(
            method="post",
            url=self.url,
            # global_timeout=None,
            global_timeout=self.timeoutSeconds,
            json=self.requestBody
        )
        response.raise_for_status()
    except requests.exceptions.Timeout:
        _append_log("ERROR", "KPTimeout",
                    f"Knowledge Provider {self.url} timed out after {self.timeoutSeconds} seconds")
        raise
    except requests.exceptions.HTTPError:
        _append_log("ERROR", "KPNotAvailable",
                    f"Knowledge Provider {self.url} returned HTTP error code {str(response.status_code)}")
        raise

    _append_log("DEBUG", "",
                f"KP 200 response in {response.elapsed}. Content size: {len(response.content)}.")
    self.responseBody = response.json()
    self.validateResponseBody()
    self.checkForEmptyResponseBody()
def runner(object):
    """
    Execute a dispatched object's .execute(), recording wall-clock run time
    on the object and logging start/completion/failure; the object is always
    returned, even on failure (the exception is swallowed into its logs).
    """
    startTime = time.time()
    status = "Started"
    try:
        logging.debug(printer(object=object, status=status))
        object.execute()
        status = "Completed"
    except Exception as tb:
        status = "Failed"
        print(traceback.format_exc())
        logging.error(str(tb))
        object.logs.append(
            clsLogEvent(
                level="ERROR",
                code="",
                message=
                f"Unexpected error in object with dispatchId {object.dispatchId}: {tb}"
            ))
        # raise clsDelayedException(tb)
    finally:
        # record the elapsed time whether the run succeeded or not
        object.dispatchRunTimeSeconds = time.time() - startTime
        message = printer(object=object, status=status)
        if status == "Completed":
            logging.debug(message)
        elif status == "Failed":
            logging.error(message)
        logging.debug(
            f"{findObjectType(object)} with dispatchId {object.dispatchId} took {object.dispatchRunTimeSeconds} seconds"
        )
    return object
def execute(self, no_results=False):
    """
    Execute this case solution: build the knowledge graph and query graph for
    its one or two KP paths, logging success or any unhandled exception.
    Always marks the solution finished.

    :param no_results: kept for interface compatibility (result generation is
        currently disabled in this method)
    """
    self.logs.append(clsLogEvent(
        identifier=self.id,
        level="DEBUG",
        code="",
        message=f"Executing Solution ID '{self.id}' for Case ID '{self.caseId}'"
    ))
    try:
        path_count = len(self.paths)
        if path_count == 1:
            self.generateKnowledgeGraphForOnePath()
            self.generateQueryGraphForOnePath()
        elif path_count == 2:
            self.createCaseSolutions()
            self.generateKnowledgeGraphForTwoPaths()
            self.generateQueryGraphForTwoPaths()
        else:
            raise AttributeError("Number of knowledge provider paths supported is either 1 or 2")

        self.updateKnowledgeProviderLogs()
        self.logs.append(clsLogEvent(
            identifier=self.id,
            level="DEBUG",
            code="",
            message=f"Case executed successfully."
        ))
        self.successful = True
    except Exception as e:
        print(traceback.format_exc())
        self.logs.append(clsLogEvent(
            identifier=self.id,
            level="ERROR",
            code="Error",
            message=f"Unhandled {type(e).__name__} Exception: {e}"
        ))
    finally:
        self.finished = True
def userResponseBodyValidation(self):
    """
    Evaluate whether the JSON body sent to the client conforms to the proper
    output standard (TRAPI), after checking xARA provenance on every edge.

    :return: dict with "isValid" (bool) and "error" (validation error or None)
    """
    validity_status = {"isValid": True, "error": None}

    # ensure all edge attribute lists end with the Explanatory Agent
    # provenance attribute; provenance problems are logged but do NOT
    # invalidate the response
    try:
        knowledge_graph = self.userResponseBody['message']['knowledge_graph']
        if knowledge_graph and "edges" in knowledge_graph:
            for edge_id, edge in knowledge_graph['edges'].items():
                last_attribute = edge['attributes'][-1]
                assert last_attribute["attribute_type_id"] == "biolink:aggregator_knowledge_source", "Edge missing xARA provenance data"
                assert last_attribute["attribute_source"] == "infores:explanatory-agent", "Edge missing xARA provenance data"
    except Exception as e:
        self.logs.append(
            clsLogEvent(
                identifier="",
                level="DEBUG",
                code="",
                message=f"Provenance assertion failure: {e}").dict())

    # only a TRAPI schema violation marks the body invalid
    try:
        reasoner_validator.validate(self.userResponseBody, "Response", trapi_version)
    except ValidationError as e:
        validity_status = {"isValid": False, "error": e}

    return validity_status
def execute(self):
    """
    Run the full multi-hop pipeline: execute every hop, generate results,
    then merge knowledge graphs and results. Aborts quietly (DEBUG log) on
    AbortMultihopException, logs any other exception as an ERROR, and always
    marks the run finished.
    """
    try:
        self.execute_hops()
        self.generate_results()
        # query graphs are no longer merged: the QG must remain constant
        # (except for derived queries)
        self.merge_knowledge_graphs()
        self.merge_results()
        self.successful = True
    except AbortMultihopException as e:
        self.logs.append(
            clsLogEvent(identifier=self.dispatchId,
                        level="DEBUG",
                        code="",
                        message=f"Aborting Multihop execution: {e}"))
    except Exception as e:
        self.logs.append(
            clsLogEvent(
                identifier=self.dispatchId,
                level="ERROR",
                code="Error",
                message=f"Unhandled {type(e).__name__} Exception: {e}"))
    self.finished = True
def generateResultsForOnePath(self):
    """
    Find the best-matching explanation case solution for this single path and
    use it to build this solution's results.
    """
    finder = self.explanation_solution_finder
    self.explanation_case_solution = finder.search(
        self.similarSubject,
        self.similarObject,
        self.similarPredicate,
        self.knowledge_graph,
        self.paths[0].knowledgeProvider.name,
        similarity_threshold=self.explanation_similarity,
    )
    # share this solution's log sink with the explanation case
    self.explanation_case_solution.logs = self.logs
    self.logs.append(clsLogEvent(
        identifier=self.id,
        level="DEBUG",
        code="",
        message=f"Executing explanation case '{self.explanation_case_solution.case_id}'."
    ))
    self.results = self.explanation_case_solution.create_results_and_explain(self)
def validateResponseBody(self):
    """
    Verify the KP response body is a valid TRAPI response.

    :return: None
    :raises requests.exceptions.InvalidSchema: when validation fails, so the
        caller can attribute the failure to the KP; the original
        ValidationError is chained as the cause.
    """
    try:
        # reasoner_validator.validate_Response(self.responseBody)
        reasoner_validator.validate(self.responseBody, "Response", trapi_version)
    except ValidationError as e:
        self.logs.append(clsLogEvent(
            identifier=self.name,
            level="ERROR",
            code="KPMalformedResponse",
            message=f"Knowledge Provider {self.url} did not return a valid TRAPI v1.2 response"
        ))
        # BUGFIX: the original raised the bare exception class with no message
        # and dropped the causal ValidationError; instantiate with context and
        # chain the cause so debugging retains the validator's detail. Still a
        # requests exception so callers know it's the KP's fault.
        raise requests.exceptions.InvalidSchema(
            f"Knowledge Provider {self.url} did not return a valid TRAPI v1.2 response"
        ) from e
def findMostSimilarCases(self):
    """
    For every triplet in the query path, look up the most similar case
    problems and record their case ids in self.triplets_case_ids (one list
    per triplet, in hop order).
    """
    self.triplets_case_ids = []
    hop_total = len(self.query_path.triplets)
    for i, triplet in enumerate(self.query_path.triplets):
        source_category = triplet.source_node.data["categories"][0]
        target_category = triplet.target_node.data["categories"][0]
        predicate = triplet.predicate.data["predicates"][0]
        logging.debug(f"Finding similar cases for {triplet}")

        # one- and multi-hop queries search different case origins
        origins = self.one_hop_origin if self.is_one_hop else self.multiple_hop_origin
        triplet_case_ids = self.case_problem_searcher.get_global_sim_triplets(
            source_category, target_category, predicate, origins)

        found = self.case_problem_searcher.selected_case_ids_and_similarities
        self.logs.append(clsLogEvent(
            identifier=f"Hop {i+1} of {hop_total} - {triplet}",
            level="DEBUG",
            code="",
            message=f"Identified {len(found)} Case Problems: (Case ID, Similarity) '{found}'"
        ))
        logging.debug(f"Identified {len(found)} Case Problems: (Case ID, Similarity) '{found}'")
        self.triplets_case_ids.append(triplet_case_ids)
def execute_hops(self):
    """
    Execute each hop's case solution in order. For every hop after the first,
    seed the current solution's unknown subject/object curie ids from the
    previous hop's KP results (capped at 50 ids per node, sorted for
    determinism).

    :raises AbortMultihopException: when the previous hop's KP returned no
        results, making further hops pointless.
    """
    for i, case_solution in enumerate(self.case_solutions):
        # if we are on a second+ hop, then try to populate its unknown node ids
        if i > 0:
            previous_solution = self.case_solutions[i - 1]
            response_body = previous_solution.paths[
                -1].knowledgeProvider.responseBody
            if response_body is None or len(
                    response_body["message"]["results"]) <= 0:
                raise AbortMultihopException(
                    f"Previous hop KP did not return results: '{previous_solution.paths[-1].knowledgeProvider.url}'"
                )

            # build a list of all ids in the previous case's response for each
            # node to be used in the current case
            response_node_ids = {}
            for result in response_body["message"]["results"]:
                for node_id, node_bindings in result[
                        "node_bindings"].items():
                    if node_id not in response_node_ids:
                        response_node_ids[node_id] = set()
                    for node_binding in node_bindings:
                        if "id" in node_binding:
                            response_node_ids[node_id].add(
                                node_binding["id"])

            # for each current case node see if it does not have any ids set
            # already. If it doesn't set them.
            if case_solution.subjectCurieIds is None:
                if case_solution.subject_query_graph_node_id in response_node_ids:
                    # sorted so the 50-id cap is deterministic
                    subject_new_ids = sorted(
                        list(response_node_ids[
                            case_solution.subject_query_graph_node_id]))
                    if len(subject_new_ids) > 50:
                        self.logs.append(
                            clsLogEvent(
                                identifier=self.dispatchId,
                                level="INFO",
                                code="Info",
                                message=
                                f"Too many previous hop subject ids to provide {len(subject_new_ids)}, reducing to 50."
                            ))
                        subject_new_ids = subject_new_ids[:50]
                    case_solution.subjectCurieIds = subject_new_ids

            if case_solution.objectCurieIds is None:
                if case_solution.object_query_graph_node_id in response_node_ids:
                    object_new_ids = sorted(
                        list(response_node_ids[
                            case_solution.object_query_graph_node_id]))
                    if len(object_new_ids) > 50:
                        self.logs.append(
                            clsLogEvent(
                                identifier=self.dispatchId,
                                level="INFO",
                                code="Info",
                                message=
                                f"Too many previous hop object ids to provide {len(object_new_ids)}, reducing to 50."
                            ))
                        object_new_ids = object_new_ids[:50]
                    case_solution.objectCurieIds = object_new_ids

        # run this hop (first hop runs with whatever ids it was built with)
        case_solution.execute(no_results=False)
def create_results_and_explain(self, case_solution):
    """
    Build one TRAPI result per knowledge-graph edge, scoring each edge by its
    KP-supplied similarity-score attribute normalized against the maximum
    score observed across all edges.

    :param case_solution: object holding the query_graph and knowledge_graph
        (assumed one-hop: one edge, two nodes — TODO confirm)
    :return: list of result OrderedDicts with edge/node bindings, score, and
        explanation attributes
    """
    query_graph = case_solution.query_graph
    knowledge_graph = case_solution.knowledge_graph
    e00_id = list(query_graph["edges"].keys())[0]
    node_ids = sorted(list(query_graph["nodes"].keys()))
    n00_id = node_ids[0]
    n01_id = node_ids[1]

    results = []

    score_attribute_lower = self.score_attribute.lower()
    # MolePro changed their attribute value to "CMAP:similarity score" (no
    # underscores) so test for that, as well.
    score_attribute_lower_no_underscores = score_attribute_lower.replace("_", " ")

    # first pass on edges to get all scores
    similarity_scores = []
    for edge_id, edge in knowledge_graph['edges'].items():
        for attribute in edge["attributes"]:
            attribute_type_id_lower = attribute['attribute_type_id'].lower()
            if attribute_type_id_lower == score_attribute_lower or attribute_type_id_lower == score_attribute_lower_no_underscores:
                similarity_scores.append(float(attribute['value']))

    if len(similarity_scores) > 0:
        min_similarity_score = min(similarity_scores)
        max_similarity_score = max(similarity_scores)
    else:
        # sentinels: with no scores at all, every edge scores -1 / -1 == 1.0
        min_similarity_score = -1
        max_similarity_score = -1

    # sorting for deterministic results. This can go away after index isn't
    # used for rank.
    for index, (edgeId, edge) in enumerate(sorted(list(knowledge_graph['edges'].items()), key=lambda i: i[0])):
        if index == 0:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Starting explanations for {len(knowledge_graph['edges'])} edges."
            ))
        if index % 50 == 0 and index > 0:
            logging.debug(f"{self.case_id} edge {index} of {len(knowledge_graph['edges'])}")
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Explained {index} of {len(knowledge_graph['edges'])} edges."
            ))

        result = OrderedDict()
        result['edge_bindings'] = {
            e00_id: [{
                "id": edgeId,
            }]
        }
        result['node_bindings'] = {
            n00_id: [{"id": edge['subject']}],
            n01_id: [{"id": edge['object']}],
        }

        similarity_score = None
        for attribute in edge["attributes"]:
            attribute_type_id_lower = attribute['attribute_type_id'].lower()
            if attribute_type_id_lower == score_attribute_lower or attribute_type_id_lower == score_attribute_lower_no_underscores:
                similarity_score = float(attribute['value'])
        if similarity_score is None:
            similarity_score = -1

        # BUGFIX: guard the normalization — when every KP-reported score is
        # exactly 0, max_similarity_score is 0 and the original division
        # raised ZeroDivisionError. Score such edges 0.0 instead.
        if max_similarity_score != 0:
            result['score'] = similarity_score / max_similarity_score
        else:
            result['score'] = 0.0
        result['attributes'] = [
            {
                "original_attribute_name": "Explanation Rationale",
                "attribute_type_id": "biolink:description",
                "value": "The score was obtained based on the similarity score produced by CMAP (portals.broadinstitute.org/cmap).",
                "description": "Describes to user the Rationale for explaining the ranking"
            },
            {
                "original_attribute_name": "Explanation Text",
                "attribute_type_id": "biolink:has_evidence",
                "value": f"This result was given a score of {result['score']} based on the range (i.e, {min_similarity_score} to {max_similarity_score}) observed in the current set of results for the attribute {self.score_attribute} supplied by {self.kp_name}.",
                "description": "Describes the reason this specific edge receives a score w.r.t Rationale"
            }
        ]
        results.append(result)

    self.logs.append(clsLogEvent(
        identifier=self.case_id,
        level="DEBUG",
        code="",
        message=f"Explained all {len(knowledge_graph['edges'])} edges."
    ))
    return results
def create_results_and_explain(self, case_solution):
    """
    Build one result per knowledge-graph edge, delegating the explanation
    text and score for each edge to identify_edge().

    :param case_solution: object holding the query_graph and knowledge_graph
    :return: list of result OrderedDicts
    """
    query_graph = case_solution.query_graph
    knowledge_graph = case_solution.knowledge_graph
    edge_key = list(query_graph["edges"].keys())[0]
    ordered_node_ids = sorted(query_graph["nodes"].keys())
    subject_key = ordered_node_ids[0]
    object_key = ordered_node_ids[1]

    results = []
    # sorted for deterministic ordering; this can go away once index is no
    # longer tied to rank
    ordered_edges = sorted(knowledge_graph['edges'].items(), key=lambda pair: pair[0])
    for index, (edgeId, edge) in enumerate(ordered_edges):
        if index == 0:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Starting explanations for {len(knowledge_graph['edges'])} edges."
            ))
        if index % 50 == 0 and index > 0:
            logging.debug(f"{self.case_id} edge {index} of {len(knowledge_graph['edges'])}")
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Explained {index} of {len(knowledge_graph['edges'])} edges."
            ))

        explanation, score = self.identify_edge(edgeId)

        entry = OrderedDict()
        entry['edge_bindings'] = {edge_key: [{"id": edgeId}]}
        entry['node_bindings'] = {
            subject_key: [{"id": edge['subject']}],
            object_key: [{"id": edge['object']}],
        }
        entry['score'] = score
        entry['attributes'] = [
            {
                "original_attribute_name": "Explanation Rationale",
                "attribute_type_id": "biolink:description",
                "value": self.rationale,
                "description": "Describes to user the Rationale for explaining the ranking"
            },
            {
                "original_attribute_name": "Explanation Text",
                "attribute_type_id": "biolink:has_evidence",
                "value": explanation,
                "description": "Describes the reason this specific edge receives a score w.r.t Rationale"
            }
        ]
        results.append(entry)

    self.logs.append(clsLogEvent(
        identifier=self.case_id,
        level="DEBUG",
        code="",
        message=f"Explained all {len(knowledge_graph['edges'])} edges."
    ))
    return results
def findCaseSolutions(self):
    """
    Load case solutions from the database for every case id found per
    triplet, trim each triplet's solution list to the smallest common length,
    and build one clsMultiHop per aligned solution index into
    self.dispatchList.

    Side effects: populates self.triplets_case_solutions and
    self.dispatchList; appends to self.logs.
    """
    case_solution_sorter = clsCaseSolutionRetrival()
    # if isNullOrEmptyList(self.caseIds): return

    # union of case ids across all triplets, fetched in one query
    all_case_ids = set()
    for case_ids in self.triplets_case_ids:
        all_case_ids |= set(case_ids)

    case_id_to_solutions = {}
    with self.app.app_context():
        results = db.session.execute(
            statement=self.sqlFindCaseSolutions,
            params={
                "caseIds": list(all_case_ids)
            }
        ).fetchall()
        # index rows by case id (row column 1)
        for result in results:
            case_id = result[1]
            if case_id not in case_id_to_solutions:
                case_id_to_solutions[case_id] = []
            case_id_to_solutions[case_id].append(result)

    caseSolutionDispatchId = 1
    triplets_case_solutions = []
    for i, triplet in enumerate(self.query_path.triplets):
        case_ids = self.triplets_case_ids[i]
        triplet_case_solutions = []
        for caseId in case_ids:
            results = case_id_to_solutions.get(caseId, [])
            # logging.debug(f"Getting case solutions for Case ID {caseId}")
            if isNullOrEmptyList(results):
                # NOTE(review): this message is missing its closing quote
                logging.error(f"No case solutions retrieved for Case ID '{caseId}")
                continue
            sorted_results = case_solution_sorter.retrieveRows_xARA_CaseSolutions(results)
            for result in sorted_results:
                triplet_case_solutions.append(
                    self.buildCaseSolutionFromResult(
                        triplet,
                        caseSolutionDispatchId=self.dispatchId + caseSolutionDispatchId,
                        result=self.populateResultKPInfo(result)
                    )
                )
                caseSolutionDispatchId += 1
        triplets_case_solutions.append(triplet_case_solutions)

        # source_category = triplet.given_node.data["categories"][0]
        # opposite_category = triplet.opposite_node.data["categories"][0]
        # predicate = triplet.predicate.data["predicates"][0]
        self.logs.append(clsLogEvent(
            identifier=f"Hop {i+1} of {len(self.query_path.triplets)} - {triplet}",
            level="DEBUG",
            code="",
            message=f"Identified {len(triplet_case_solutions)} Case Solutions (first 50): '{triplet_case_solutions[:50]}...'"
        ))
        logging.debug(f"Identified {len(triplet_case_solutions)} Case Solutions (first 50): '{triplet_case_solutions[:50]}...'")
    self.triplets_case_solutions = triplets_case_solutions

    # trim the solutions down to the smallest set size so they can be
    # iterated with no missing steps
    smallest_solutions = min([len(s) for s in self.triplets_case_solutions])
    logging.debug(f"Reducing triplet solutions length to {smallest_solutions}")
    for i in range(len(self.triplets_case_solutions)):
        self.triplets_case_solutions[i] = self.triplets_case_solutions[i][:smallest_solutions]

    self.dispatchList = []
    for solution_index in range(len(self.triplets_case_solutions[0])):
        # one multi-hop per aligned solution index: take the solution at this
        # index from every triplet to form the hop path
        path_solutions = []
        for triplet_index, triplet in enumerate(self.query_path.triplets):
            triplet_case_solution = self.triplets_case_solutions[triplet_index][solution_index]
            path_solutions.append(triplet_case_solution)
        # NOTE(review): triplet_index here is the leftover value from the loop
        # above (always the last triplet's index), so the 1000-multiplier is
        # constant per query — confirm this id scheme is intended.
        multi_hop = clsMultiHop(self.dispatchId + 10000 * (solution_index+1) + 1000 * (triplet_index + 1), "", path_solutions)
        self.dispatchList.append(multi_hop)
def create_results_and_explain(self, case_solution):
    """
    Build one TRAPI result per knowledge-graph edge by searching article
    sentences that mention both the edge's subject and object, then running a
    relationship-classification model over the closest sentence pairs in bulk.

    Observed scoring: 0.0 when no usable references are found, 0.5 when
    references exist but the closest sentences are too far apart, 1.0 when
    the model confirms a relationship.

    :param case_solution: object holding query_graph and knowledge_graph
    :return: list of result OrderedDicts
    :raises TimeoutError: when the run exceeds its internal time budget
    """
    query_graph = case_solution.query_graph
    knowledge_graph = case_solution.knowledge_graph
    # assumes a one-hop query graph: one edge, two nodes — TODO confirm
    e00_id = list(query_graph["edges"].keys())[0]
    node_ids = sorted(list(query_graph["nodes"].keys()))
    n00_id = node_ids[0]
    n01_id = node_ids[1]
    results = []
    start_time = time.time()

    # sorting edge IDs for deterministic results.
    edges = sorted(list(knowledge_graph['edges'].items()), key=lambda i: i[0])

    # bulk get all canonical identifiers for all node ids
    node_identifiers = set()
    for index, (edgeId, edge) in enumerate(edges):
        edge_subject = edge["subject"]
        edge_object = edge["object"]
        node_identifiers.add(edge_subject)
        node_identifiers.add(edge_object)
    logging.debug(f"Getting node identifiers")
    self.logs.append(clsLogEvent(
        identifier=self.case_id,
        level="DEBUG",
        code="",
        message=f"Getting all ({len(node_identifiers)}) node identifiers."
    ))
    node_normalizer_provider = clsNodeNormalizerProvider(sorted(list(node_identifiers)))
    node_normalizer_provider.get_identifiers()
    identifiers = node_normalizer_provider.identifiers

    # bulk get all synonyms
    logging.debug(f"Getting node synonyms")
    self.logs.append(clsLogEvent(
        identifier=self.case_id,
        level="DEBUG",
        code="",
        message=f"Getting all node synonyms."
    ))
    name_resolution_provider = clsNameResolutionProvider(sorted(list(identifiers.values())))
    name_resolution_provider.get_synonyms()

    # collect all edges that have acceptable sentences to bulk predict later
    predictable_edges = []
    for index, (edgeId, edge) in enumerate(edges):
        # # TODO: Remove! Debugging only!
        # if index > 10:
        #     break
        if index == 0:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Starting explanations for {len(knowledge_graph['edges'])} edges."
            ))
        if index % 50 == 0 and index > 0:
            logging.debug(f"{self.case_id} edge {index} of {len(knowledge_graph['edges'])}")
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Explained {index} of {len(knowledge_graph['edges'])} edges."
            ))
        # 3-minute budget for this reference-gathering phase
        if time.time() > start_time + 3.0 * 60:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="ERROR",
                code="",
                message=f"Explanation timeout."
            ))
            # break
            raise TimeoutError("Explanation execution ran too long")

        result = OrderedDict()
        result['edge_bindings'] = {
            e00_id: [{
                "id": edgeId,
            }]
        }
        # TODO: Some KGs may have reversed edge subjects and objects compared
        # to the result node bindings! Should use result node bindings instead.
        result['node_bindings'] = {
            n00_id: [{"id": edge['subject']}],
            n01_id: [{"id": edge['object']}],
        }

        edge_subject = edge["subject"]
        subject_categories = knowledge_graph['nodes'][edge_subject]['categories']
        subject_categories_lower = [s.lower() for s in subject_categories]
        edge_object = edge["object"]
        object_categories = knowledge_graph['nodes'][edge_object]['categories']
        object_categories_lower = [o.lower() for o in object_categories]
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Explaining edge {index+1}: {edgeId} Predicate {edge['predicate']}, Subject: {edge_subject} {subject_categories}, Object: {edge_object} {object_categories}."
        ))

        # if either the subject's or object's categories don't have tables
        # representing them, stop.
        if len(set(subject_categories_lower) & self.valid_categories) == 0 or len(set(object_categories_lower) & self.valid_categories) == 0:
            result['score'] = 0.0
            result['attributes'] = ExplanationX00002.no_found_references_attributes
            results.append(result)
            continue

        subject_synonyms = name_resolution_provider.synonyms[identifiers[edge_subject]]
        logging.debug(f"{self.case_id} edge {index+1} of {len(knowledge_graph['edges'])} getting subject article references")
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Edge {edgeId} getting subject article references."
        ))
        subject_references = self.find_references(subject_synonyms, subject_categories)
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Edge {edgeId} Subject Synonyms: {subject_synonyms}, References: {len(subject_references)}."
        ))
        if len(subject_references) <= 0:
            result['score'] = 0.0
            result['attributes'] = ExplanationX00002.no_found_references_attributes
            results.append(result)
            continue

        object_synonyms = name_resolution_provider.synonyms[identifiers[edge_object]]
        logging.debug(f"{self.case_id} edge {index + 1} of {len(knowledge_graph['edges'])} getting object article references")
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Edge {edgeId} getting object article references."
        ))
        object_references = self.find_references(object_synonyms, object_categories)
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Edge {edgeId} Object Synonyms: {object_synonyms}, References: {len(object_references)}."
        ))
        # (a commented-out sample of the reference record shape lived here;
        # keys: ID, Article_Number, Article_ID, SENT_ID, SENT_TEXT, TOKS)
        if len(object_references) <= 0:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Edge {edgeId} No object references found, stopping."
            ))
            result['score'] = 0.0
            result['attributes'] = ExplanationX00002.no_found_references_attributes
            results.append(result)
            continue

        # find matching article ids between the two lists of subject and
        # object results
        matching_reference_pairs_by_article = {}
        for subject_reference in subject_references:
            for object_reference in object_references:
                if subject_reference["Article_ID"] == object_reference["Article_ID"]:
                    if subject_reference["Article_ID"] not in matching_reference_pairs_by_article:
                        matching_reference_pairs_by_article[subject_reference["Article_ID"]] = []
                    matching_reference_pairs_by_article[subject_reference["Article_ID"]].append((subject_reference, object_reference))
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Edge {edgeId} Matched subject/object articles: {len(matching_reference_pairs_by_article)}."
        ))
        if len(matching_reference_pairs_by_article) <= 0:
            result['score'] = 0.0
            attributes = copy.deepcopy(ExplanationX00002.no_matching_reference_pairs_attributes)
            # get all unique PMIDs to return to the user
            pmids = set()
            for subject_reference in subject_references:
                pmids.add(subject_reference['Article_ID'])
            for object_reference in object_references:
                pmids.add(object_reference['Article_ID'])
            attributes[1]["value"] += ", ".join(pmids)
            result['attributes'] = attributes
            results.append(result)
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Edge {edgeId} No matched articles found, stopping."
            ))
            continue

        article_information = self.get_article_information(list(matching_reference_pairs_by_article.keys()))

        result['attributes'] = []
        score = 0.5
        pairs_to_predict = []
        for article_id, reference_pairs in matching_reference_pairs_by_article.items():
            # 4.5-minute overall budget including this pairing phase
            if time.time() > start_time + 4.5 * 60:
                self.logs.append(clsLogEvent(
                    identifier=self.case_id,
                    level="ERROR",
                    code="",
                    message=f"Explanation timeout."
                ))
                raise TimeoutError("Explanation execution ran too long")
            # pick the subject/object sentence pair with the smallest sentence
            # distance, ties broken by article number
            pair_distances = []
            for reference_pair in reference_pairs:
                distance = abs(reference_pair[0]["SENT_ID"] - reference_pair[1]["SENT_ID"])
                pair_distances.append((reference_pair, distance, reference_pair[0]["Article_Number"]))
            pair_distances.sort(key=lambda x: (x[1], x[2]))
            closest_reference_pair_data = pair_distances[0]
            closest_reference_pair = closest_reference_pair_data[0]
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Edge {edgeId} Article {article_id} closest pair distance {closest_reference_pair_data[1]}."
            ))
            if closest_reference_pair_data[1] > 9:
                self.logs.append(clsLogEvent(
                    identifier=self.case_id,
                    level="DEBUG",
                    code="",
                    message=f"Edge {edgeId} Sentence distance too great: {closest_reference_pair_data[1]}, stopping."
                ))
                # if False:
                attributes = copy.deepcopy(ExplanationX00002.reference_distance_too_large_attributes)
                attributes[1]["value"] = attributes[1]["value"].format(closest_reference_pair[0]["TOKS"], closest_reference_pair[1]["TOKS"], article_id)
                result['attributes'] += attributes
                result['score'] = score
                # result['attributes'] = attributes
                # results.append(result)
                continue
            pairs_to_predict.append((article_id, closest_reference_pair))
            # (a commented-out per-pair prototype of the sentence wrapping and
            # relationship classification lived here; it was superseded by the
            # bulk prediction pass below)
        if len(pairs_to_predict) > 0:
            self.logs.append(clsLogEvent(
                identifier=self.case_id,
                level="DEBUG",
                code="",
                message=f"Edge {edgeId} explanation queued."
            ))
            predictable_edges.append((edge, result, article_information, pairs_to_predict))
        else:
            results.append(result)

    if len(predictable_edges) > 0:
        # build one classification input per queued sentence pair
        prediction_sentences = []
        for edge, result, article_information, pairs_to_predict in predictable_edges:
            for article_id, closest_reference_pair in pairs_to_predict:
                # combine two sentences and wrap the matched text in special
                # characters
                subject_pattern = re.compile(re.escape(str(closest_reference_pair[0]['TOKS'])), re.IGNORECASE)
                subject_special_sentence = subject_pattern.sub(f"<<{closest_reference_pair[0]['TOKS']}>>", closest_reference_pair[0]['SENT_TEXT'])
                object_pattern = re.compile(re.escape(str(closest_reference_pair[1]['TOKS'])), re.IGNORECASE)
                object_special_sentence = object_pattern.sub(f"<<{closest_reference_pair[1]['TOKS']}>>", closest_reference_pair[1]['SENT_TEXT'])
                instance = f"{subject_special_sentence} {object_special_sentence}"
                prediction_sentences.append(instance)
        t0 = time.time()
        # single bulk model call for every queued sentence
        relation_labels = relationship_classification(prediction_sentences)
        t1 = time.time()
        logging.debug(f"Predicting {len(prediction_sentences)} sentences took {t1-t0} seconds.")
        self.logs.append(clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Predicting {len(prediction_sentences)} sentences took {t1-t0} seconds."
        ))
        # for i, (edge, article_id, closest_reference_pair, result) in enumerate(predictable_edges):
        prediction_index = 0
        for i, (edge, result, article_information, pairs_to_predict) in enumerate(predictable_edges):
            for j, (article_id, closest_reference_pair) in enumerate(pairs_to_predict):
                relation_label = relation_labels[prediction_index]
                # NOTE(review): edgeId below is the leftover loop variable
                # from the earlier edge loop, so these log lines always name
                # the LAST edge rather than the edge being classified —
                # confirm whether that is intended.
                self.logs.append(clsLogEvent(
                    identifier=self.case_id,
                    level="DEBUG",
                    code="",
                    message=f"Edge {edgeId} Beginning relationship classification..."
                ))
                # relation_label = relationship_classification(instance)
                self.logs.append(clsLogEvent(
                    identifier=self.case_id,
                    level="DEBUG",
                    code="",
                    message=f"Edge {edgeId} Relationship classification finished {relation_label}"
                ))
                # (commented-out random-label debugging stub removed)
                if relation_label == "No RELATION":
                    attributes = copy.deepcopy(ExplanationX00002.reference_model_fail_attributes)
                    attributes[1]["value"] = attributes[1]["value"].format(closest_reference_pair[0]["TOKS"], closest_reference_pair[1]["TOKS"], article_id)
                    result['attributes'] += attributes
                else:
                    article = article_information.get(article_id, {"Title": "Article Not Found", "Author": "Article Not Found"})
                    attributes = copy.deepcopy(ExplanationX00002.reference_model_pass_attributes)
                    attributes[1]["value"] = attributes[1]["value"].format(subject_name=closest_reference_pair[0]["TOKS"],
                                                                           object_name=closest_reference_pair[1]["TOKS"],
                                                                           relation_label=relation_label, article_title=article["Title"], article_author=article["Author"])
                    result['attributes'] += attributes
                    score = 1.0
                    result['score'] = score
                prediction_index += 1
            # NOTE(review): if every pair for this edge classified as
            # "No RELATION" and no earlier branch set it, result may lack a
            # 'score' key here — verify downstream consumers tolerate that.
            results.append(result)

    self.logs.append(clsLogEvent(
        identifier=self.case_id,
        level="DEBUG",
        code="",
        message=f"Explained all {len(knowledge_graph['edges'])} edges."
    ))
    return results
def create_results_and_explain(self, case_solution):
    """
    Build one zero-scored result with no explanation attributes for every
    knowledge-graph edge (placeholder/no-op explanation case).

    :param case_solution: object holding the query_graph and knowledge_graph
    :return: list of result OrderedDicts
    """
    query_graph = case_solution.query_graph
    knowledge_graph = case_solution.knowledge_graph
    edge_key = list(query_graph["edges"].keys())[0]
    ordered_node_ids = sorted(query_graph["nodes"].keys())
    subject_key = ordered_node_ids[0]
    object_key = ordered_node_ids[1]

    results = []
    # sorted for deterministic ordering; can go away once index is no longer
    # tied to rank
    ordered_edges = sorted(knowledge_graph['edges'].items(), key=lambda pair: pair[0])
    for index, (edgeId, edge) in enumerate(ordered_edges):
        if index == 0:
            self.logs.append(
                clsLogEvent(
                    identifier=self.case_id,
                    level="DEBUG",
                    code="",
                    message=
                    f"Starting explanations for {len(knowledge_graph['edges'])} edges."
                ))
        if index % 50 == 0 and index > 0:
            logging.debug(
                f"{self.case_id} edge {index} of {len(knowledge_graph['edges'])}"
            )
            self.logs.append(
                clsLogEvent(
                    identifier=self.case_id,
                    level="DEBUG",
                    code="",
                    message=
                    f"Explained {index} of {len(knowledge_graph['edges'])} edges."
                ))

        entry = OrderedDict()
        entry['edge_bindings'] = {edge_key: [{"id": edgeId}]}
        entry['node_bindings'] = {
            subject_key: [{"id": edge['subject']}],
            object_key: [{"id": edge['object']}],
        }
        entry['score'] = 0.0
        entry['attributes'] = []
        results.append(entry)

    self.logs.append(
        clsLogEvent(
            identifier=self.case_id,
            level="DEBUG",
            code="",
            message=f"Explained all {len(knowledge_graph['edges'])} edges."
        ))
    return results