def __init__(self):
    """Open a Neo4j driver pointed at the KG2c database."""
    # Connect to neo4j database
    config = RTXConfiguration()
    config.live = 'KG2c'
    self.driver = GraphDatabase.driver(config.neo4j_bolt,
                                       auth=(config.neo4j_username,
                                             config.neo4j_password))
def __init__(self):
    """Open a Neo4j driver pointed at the KG2c database, announcing the endpoint used."""
    # Connect to neo4j database
    rtxc = RTXConfiguration()
    rtxc.live = 'KG2c'
    print(f"You're using '{rtxc.neo4j_bolt}'", flush=True)
    self.driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                       auth=(rtxc.neo4j_username,
                                             rtxc.neo4j_password))
def run_neo4j_query(cypher, kg_name, data_type):
    """Run a cypher query against the given KG's Neo4j and return the result rows.

    :param cypher: Cypher query string to execute
    :param kg_name: Name of the knowledge graph to query (e.g. "KG1", "KG2")
    :param data_type: Human-readable label for what is being grabbed (used in log output only)
    :return: List of result-row dicts from neo4j's .data()
    """
    rtx_config = RTXConfiguration()
    if kg_name != "KG1":
        # Config defaults to KG1; flip 'live' for any other KG
        rtx_config.live = kg_name
    driver = GraphDatabase.driver(rtx_config.neo4j_bolt,
                                  auth=(rtx_config.neo4j_username, rtx_config.neo4j_password))
    try:
        with driver.session() as session:
            start = time.time()
            print(f"Grabbing {data_type} from {kg_name} neo4j...")
            results = session.run(cypher).data()
            print(f"...done. Query took {round((time.time() - start) / 60, 2)} minutes.")
    finally:
        # Fix: original only closed the driver on success, leaking it if the query raised
        driver.close()
    return results
def _run_cypher_query(cypher_query: str, kg='KG2') -> List[Dict[str, any]]:
    """Run a cypher query against the given KG's Neo4j.

    NOTE(review): the annotation uses builtin `any`; `typing.Any` is presumably intended —
    left as-is to avoid a new import.

    :param cypher_query: Cypher query string to execute
    :param kg: Which KG to query ('KG2' flips the config from its KG1 default)
    :return: List of result-row dicts, or [] if any error occurred
    """
    rtxc = RTXConfiguration()
    if kg == 'KG2':
        rtxc.live = "KG2"
    try:
        driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                      auth=(rtxc.neo4j_username, rtxc.neo4j_password))
        try:
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
        finally:
            # Fix: original left the driver open if the query raised
            driver.close()
    except Exception:
        tb = traceback.format_exc()
        print(f"Encountered an error interacting with {kg} neo4j. {tb}")
        return []
    else:
        return query_results
def run_neo4j_query(cypher, kg_name):
    """Run a cypher query (grabbing node pairs) against the given KG's Neo4j.

    :param cypher: Cypher query string to execute
    :param kg_name: "KG1" or "KG2"
    :return: List of result-row dicts from neo4j's .data()
    """
    rtx_config = RTXConfiguration()
    if kg_name == "KG2":
        # Flip into KG2 mode if this is a KG2 query (otherwise we're already set to use KG1)
        rtx_config.live = "KG2"
    driver = GraphDatabase.driver(rtx_config.neo4j_bolt,
                                  auth=(rtx_config.neo4j_username, rtx_config.neo4j_password))
    try:
        with driver.session() as session:
            start = time.time()
            print(f"Grabbing node pairs from {kg_name} neo4j...")
            results = session.run(cypher).data()
            print(
                f"...done. Query took {round((time.time() - start) / 60, 2)} minutes."
            )
    finally:
        # Fix: original only closed the driver on success, leaking it if the query raised
        driver.close()
    return results
def _setup_rtx_config_local(kg2pre_neo4j_endpoint: str, synonymizer_name: str):
    """
    This function creates a config_local.json file based off of configv2.json, but
    modified for our needs (pointed at the KG2pre Neo4j and synonymizer for this build).
    """
    logging.info(
        "Creating a config_local.json file pointed to the right synonymizer and KG2pre Neo4j.."
    )
    # First remove any existing configv2.json or config_local.json
    subprocess.call(["rm", "-f", f"{CODE_DIR}/configv2.json"])
    # Save a copy of any pre-existing config_local.json so we don't overwrite it
    if pathlib.Path(f"{CODE_DIR}/config_local.json").exists():
        subprocess.check_call([
            "mv", f"{CODE_DIR}/config_local.json",
            f"{CODE_DIR}/config_local.json_KG2CBUILDTEMP"
        ])
    RTXConfiguration()  # Regenerates configv2.json with the latest version
    with open(f"{CODE_DIR}/configv2.json") as configv2_file:
        rtx_config_dict = json.load(configv2_file)
    # Point to the 'right' KG2 Neo4j (the one specified in the KG2c config) and
    # synonymizer (we always use simple name)
    kg2_section = rtx_config_dict["Contextual"]["KG2"]
    kg2_section["neo4j"]["bolt"] = f"bolt://{kg2pre_neo4j_endpoint}:7687"
    for path_info in rtx_config_dict["Contextual"].values():
        # Only need name, not full path
        path_info["node_synonymizer"]["path"] = f"/something/{synonymizer_name}"
    # Save our new config_local.json file
    with open(f"{CODE_DIR}/config_local.json", "w+") as config_local_file:
        json.dump(rtx_config_dict, config_local_file)
class TestPatchKG(TestCase):
    """Spot-checks that disease->phenotype relations from BioLink exist in the Neo4j KG."""

    rtxConfig = RTXConfiguration()

    def test_add_disease_has_phenotype_relations(self):
        conn = Neo4jConnection(self.rtxConfig.neo4j_bolt,
                               self.rtxConfig.neo4j_username,
                               self.rtxConfig.neo4j_password)
        disease_nodes = conn.get_disease_nodes()
        # generate random number array (sample of 10 disease indexes)
        random_indexes = random_int_list(0, len(disease_nodes) - 1, 10)
        # query BioLink for phenotypes of each sampled disease
        relation_array = []
        for random_index in random_indexes:
            d_id = disease_nodes[random_index]
            for hp_id in QueryBioLink.map_disease_to_phenotype(d_id):
                relation_array.append({"d_id": d_id, "p_id": hp_id})
        # query Neo4j Database: each relation should appear exactly once
        for relation_item in relation_array:
            self.assertEqual(conn.count_has_phenotype_relation(relation_item), 1)
        conn.close()
def _run_cypher_query(cypher_query: str, kp: str, log: Response) -> List[Dict[str, any]]:
    """Run a cypher query against the given KP's Neo4j, logging any failure.

    NOTE(review): the annotation uses builtin `any`; `typing.Any` is presumably intended —
    left as-is to avoid a new import.

    :param cypher_query: Cypher query string to execute
    :param kp: Knowledge provider name ("KG2" flips config from its KG1 default)
    :param log: Response object used for error logging
    :return: List of result-row dicts, or [] if any error occurred
    """
    rtxc = RTXConfiguration()
    if kp == "KG2":  # Flip into KG2 mode if that's our KP (rtx config is set to KG1 info by default)
        rtxc.live = "KG2"
    try:
        driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                      auth=(rtxc.neo4j_username, rtxc.neo4j_password))
        try:
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
        finally:
            # Fix: original left the driver open if the query raised
            driver.close()
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered an error interacting with {kp} neo4j. {tb}",
                  error_code=error_type.__name__)
        return []
    else:
        return query_results
def addNewResponse(self, response, query):
    """Persist a response (and its results) to the cache DB; returns the new response_id."""
    session = self.session
    n_results = len(response.result_list) if response.result_list is not None else 0
    rtxConfig = RTXConfiguration()
    response.tool_version = rtxConfig.version
    storedResponse = Response(
        response_datetime=datetime.now(),
        restated_question=response.restated_question_text,
        query_type=query["known_query_type_id"],
        terms=str(query["terms"]),
        tool_version=rtxConfig.version,
        result_code=response.result_code,
        message=response.message,
        n_results=n_results,
        response_object=pickle.dumps(ast.literal_eval(repr(response))))
    session.add(storedResponse)
    session.flush()
    #print("Returned response_id is "+str(storedResponse.response_id))
    response.id = "http://rtx.ncats.io/api/rtx/v1/response/" + str(storedResponse.response_id)
    self.addNewResults(storedResponse.response_id, response)
    #### After updating all the ids, store an updated object
    storedResponse.response_object = pickle.dumps(ast.literal_eval(repr(response)))
    session.commit()
    return storedResponse.response_id
def __init__(self, mapfile='node_cui_map.csv', mysql_timeout=30):
    """Connect to SemMedDB/UMLS MySQL and load the node->CUI map file (if present)."""
    # self.smdb = QuerySemMedDB("rtxdev.saramsey.org",3306,"rtx_read","rtxd3vT3amXray","semmeddb", mysql_timeout)
    # self.umls = QueryUMLSSQL("rtxdev.saramsey.org",3406, "rtx_read","rtxd3vT3amXray","umls")
    rtxConfig = RTXConfiguration()
    self.smdb = QuerySemMedDB(rtxConfig.mysql_semmeddb_host,
                              rtxConfig.mysql_semmeddb_port,
                              rtxConfig.mysql_semmeddb_username,
                              rtxConfig.mysql_semmeddb_password,
                              "semmeddb", mysql_timeout)
    self.umls = QueryUMLSSQL(rtxConfig.mysql_umls_host,
                             rtxConfig.mysql_umls_port,
                             rtxConfig.mysql_umls_username,
                             rtxConfig.mysql_umls_password,
                             "umls")
    self.semrep_url = "http://rtxdev.saramsey.org:5000/semrep/convert?string="
    self.timeout_sec = 120
    self.mg = QueryMyGene()
    try:
        df = pandas.read_csv(mapfile, converters={'cuis': ast.literal_eval})
        cui_dict = {}
        if 'cuis' in df.columns and 'id' in df.columns:
            # Invert the map: each CUI points at the list of node ids that mention it
            for row_idx in range(len(df)):
                for df_cui in df['cuis'][row_idx]:
                    cui_dict.setdefault(df_cui, []).append(df['id'][row_idx])
        self.map_df = df
        self.cui_dict = cui_dict
    except FileNotFoundError:
        # No map file available; fall back to an empty CUI map
        self.cui_dict = {}
def __init__(self):
    """Set up the ResponseCache store: MySQL on production servers, SQLite otherwise."""
    self.rtxConfig = RTXConfiguration()
    self.databaseName = "ResponseCache"
    self.engine_type = 'mysql' if self.rtxConfig.is_production_server else 'sqlite'
    self.connect()
def _answer_query_using_plover(
        qg: QueryGraph, log: ARAXResponse
) -> Tuple[Dict[str, Dict[str, Set[Union[str, int]]]], int]:
    """POST the query graph to the Plover service and return (answer dict, HTTP status)."""
    rtxc = RTXConfiguration()
    rtxc.live = "Production"
    log.debug(f"Sending query to Plover")
    response = requests.post(f"{rtxc.plover_url}/query",
                             json=qg.to_dict(),
                             headers={'accept': 'application/json'})
    if response.status_code != 200:
        # Pass the failure back to the caller via an empty answer + status code
        log.warning(
            f"Plover returned a status code of {response.status_code}. Response was: {response.text}"
        )
        return dict(), response.status_code
    log.debug(f"Got response back from Plover")
    return response.json(), response.status_code
def estimate_percent_nodes_covered_by_ultrafast_ngd(kg: str):
    """Estimate what percent of the given KG's nodes are covered by the local NGD
    (curie -> PMIDs) database, overall and broken down by preferred node type.

    :param kg: Name of the knowledge graph to sample ('KG2' flips config from its KG1 default)
    """
    print(f"Estimating the percent of {kg} nodes covered by the local NGD system..")
    rtxc = RTXConfiguration()
    if kg == 'KG2':
        rtxc.live = "KG2"
    #curie_to_pmid_db = SqliteDict(f"./curie_to_pmids.sqlite")
    curie_to_pmids_path = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX',
                                            'KnowledgeSources', 'NormalizedGoogleDistance'])
    # BUG FIX: original called rtxc.curie_to_pmids_path.sep('/'), which raises
    # AttributeError (str has no .sep method). .split('/')[-1] extracts the db filename,
    # matching how this config path is handled elsewhere in the codebase.
    curie_to_pmid_db = SqliteDict(
        f"{curie_to_pmids_path}{os.path.sep}{rtxc.curie_to_pmids_path.split('/')[-1]}")
    percentages_mapped = []
    num_batches = 20
    batch_size = 4000
    all_nodes_mapped_by_type = dict()
    for number in range(num_batches):
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)
        # Use synonymizer to get their canonicalized info
        synonymizer = NodeSynonymizer()
        canonical_curie_info = synonymizer.get_canonical_curies(list(random_node_ids))
        recognized_curies = {input_curie for input_curie in canonical_curie_info
                             if canonical_curie_info.get(input_curie)}
        # See if those canonical curies are in our local database
        num_mapped_to_pmids = 0
        for input_curie in recognized_curies:
            canonical_curie = canonical_curie_info[input_curie].get('preferred_curie')
            preferred_type = canonical_curie_info[input_curie].get('preferred_type')
            if preferred_type not in all_nodes_mapped_by_type:
                all_nodes_mapped_by_type[preferred_type] = {'covered': 0, 'not_covered': 0}
            if canonical_curie and canonical_curie in curie_to_pmid_db:
                num_mapped_to_pmids += 1
                all_nodes_mapped_by_type[preferred_type]['covered'] += 1
            else:
                all_nodes_mapped_by_type[preferred_type]['not_covered'] += 1
        percentage_mapped = (num_mapped_to_pmids / len(random_node_ids)) * 100
        percentages_mapped.append(percentage_mapped)
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Estimated coverage of {kg} nodes: {round(average)}%.")
    # Report per-type coverage, highest percentage first
    node_type_percentages_dict = dict()
    for node_type, coverage_info in all_nodes_mapped_by_type.items():
        num_covered = coverage_info['covered']
        num_total = coverage_info['covered'] + coverage_info['not_covered']
        percentage = round((num_covered / num_total) * 100)
        node_type_percentages_dict[node_type] = percentage
    for node_type, percentage in sorted(node_type_percentages_dict.items(),
                                        key=lambda item: item[1], reverse=True):
        print(f"  {node_type}: {percentage}%")
def addNewResponse(self, response, query):
    """Fill in default response metadata, persist the response and its results, and
    return the new response_id."""
    session = self.session
    n_results = len(response.result_list) if response.result_list is not None else 0
    #### Add result metadata
    if response.result_list is not None:
        for result in response.result_list:
            if result.reasoner_id is None:
                result.reasoner_id = "RTX"
    #### Update the response with current information
    rtxConfig = RTXConfiguration()
    if response.tool_version is None:
        response.tool_version = rtxConfig.version
    if response.schema_version is None:
        response.schema_version = "0.8.0"
    if response.reasoner_id is None:
        response.reasoner_id = "RTX"
    response.n_results = n_results
    response.type = "medical_translator_query_response"
    response.context = "https://raw.githubusercontent.com/biolink/biolink-model/master/context.jsonld"
    response.datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if response.restated_question_text is None:
        response.restated_question_text = ""
    if response.original_question_text is None:
        response.original_question_text = ""
    termsString = "{}"
    if query is not None and "terms" in query:
        termsString = stringifyDict(query["terms"])
    storedResponse = Response(
        response_datetime=datetime.now(),
        restated_question=response.restated_question_text,
        query_type=query["query_type_id"],
        terms=termsString,
        tool_version=rtxConfig.version,
        result_code=response.response_code,
        message=response.message,
        n_results=n_results,
        response_object=pickle.dumps(ast.literal_eval(repr(response))))
    session.add(storedResponse)
    session.flush()
    response.id = "https://rtx.ncats.io/api/rtx/v1/response/" + str(storedResponse.response_id)
    self.addNewResults(storedResponse.response_id, response)
    #### After updating all the ids, store an updated object
    storedResponse.response_object = pickle.dumps(ast.literal_eval(repr(response)))
    session.commit()
    return storedResponse.response_id
def _run_cypher_query(cypher_query: str) -> List[Dict[str, any]]:
    # This function sends a cypher query to the KG2 neo4j and returns results
    # NOTE(review): the annotation uses builtin `any`; `typing.Any` is presumably
    # intended — left as-is to avoid a new import.
    rtxc = RTXConfiguration()
    rtxc.live = "KG2"
    try:
        driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                      auth=(rtxc.neo4j_username, rtxc.neo4j_password))
        try:
            with driver.session() as session:
                print(f"Sending cypher query to KG2 neo4j ({rtxc.neo4j_bolt})")
                query_results = session.run(cypher_query).data()
                print(f"Got {len(query_results)} results back from neo4j")
        finally:
            # Fix: original left the driver open if the query raised
            driver.close()
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        print(f"Encountered an error interacting with KG2 neo4j. {tb}")
        return []
    else:
        return query_results
def create_message(self, describe=False):
    """
    Creates a basic empty Message object with basic boilerplate metadata
    :return: Response object with execution information and the new message object inside the data envelope
    :rtype: Response
    """

    # Internal documentation setup
    #allowable_parameters = { 'action': { 'None' } }
    # can't get this name at run-time, need to manually put it in per https://www.python.org/dev/peps/pep-3130/
    allowable_parameters = {'dsl_command': '`create_message()`'}
    if describe:
        allowable_parameters['brief_description'] = """The `create_message` method creates a basic empty Message object with basic boilerplate metadata such as reasoner_id, schema_version, etc. filled in. This DSL command takes no arguments"""
        return allowable_parameters

    #### Define a default response
    response = Response()
    self.response = response

    #### Create the top-level message
    response.info("Creating an empty template ARAX Message")
    message = Message()
    self.message = message

    #### Fill it with default information
    message.id = None
    message.type = "translator_reasoner_message"
    message.reasoner_id = "ARAX"
    message.tool_version = RTXConfiguration().version
    message.schema_version = "0.9.3"
    message.message_code = "OK"
    message.code_description = "Created empty template Message"
    message.context = "https://raw.githubusercontent.com/biolink/biolink-model/master/context.jsonld"
    #### Why is this _datetime ?? FIXME
    message._datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    #### Create an empty master knowledge graph
    message.knowledge_graph = KnowledgeGraph()
    message.knowledge_graph.nodes = []
    message.knowledge_graph.edges = []

    #### Create an empty query graph
    message.query_graph = QueryGraph()
    message.query_graph.nodes = []
    message.query_graph.edges = []

    #### Create empty results
    message.results = []
    message.n_results = 0

    #### Return the response
    response.data['message'] = message
    return response
def size_of_given_type_in_KP(self, node_type, use_cypher_command=True, kg='KG1'):
    """
    find all nodes of a certain type in KP
    :param node_type: the query node type
    :param use_cypher_command: Boolean (True or False). If True, it used cypher command
        to query all nodes otherwise used kgNodeIndex
    :param kg: only allowed for choosing 'KG1' or 'KG2' now. Will extend to BTE later
    """
    # TODO: extend this to KG2, BTE, and other KP's we know of
    size_of_total = None
    if kg not in ('KG1', 'KG2'):
        self.response.error(
            f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally"
        )
        return size_of_total
    if kg == 'KG1':
        if use_cypher_command:
            rtxConfig = RTXConfiguration()
            # Connection information for the neo4j server, populated with orangeboard
            driver = GraphDatabase.driver(rtxConfig.neo4j_bolt,
                                          auth=basic_auth(rtxConfig.neo4j_username,
                                                          rtxConfig.neo4j_password))
            try:
                # NOTE: node_type is interpolated into the cypher string; cypher labels
                # can't be parameterized, so node_type must come from trusted code.
                with driver.session() as session:
                    query = "MATCH (n:%s) return count(distinct n)" % (node_type)
                    res = session.run(query)
                    size_of_total = res.single()["count(distinct n)"]
            finally:
                # Fix: original never closed the session or driver (resource leak)
                driver.close()
            return size_of_total
        else:
            kgNodeIndex = KGNodeIndex()
            size_of_total = kgNodeIndex.get_total_entity_count(node_type, kg_name=kg)
            return size_of_total
    else:
        if use_cypher_command:
            self.response.warning(
                f"KG2 is only allowable to use kgNodeIndex to query the total number of node with query type. It was set to use kgNodeIndex"
            )
        kgNodeIndex = KGNodeIndex()
        size_of_total = kgNodeIndex.get_total_entity_count(node_type, kg_name=kg)
        return size_of_total
def test_issue_130():
    """Regression test for issue 130: push a one-edge 'targets' graph to Neo4j."""
    ob = Orangeboard(debug=True)
    ob.set_dict_reltype_dirs({'targets': True})
    drug_node = ob.add_node('drug', 'x', seed_node_bool=True)
    protein_node = ob.add_node('uniprot_protein', 'w', seed_node_bool=False)
    ob.add_rel('targets', 'ChEMBL', drug_node, protein_node, prob=0.5)
    config = RTXConfiguration()
    ob.neo4j_set_url(config.neo4j_bolt)
    ob.neo4j_set_auth(config.neo4j_username, config.neo4j_password)
    ob.neo4j_push()
    print(ob)
def __init__(self):
    """Set up the query-tracker store: MySQL on production servers, SQLite otherwise."""
    self.rtxConfig = RTXConfiguration()
    self.databaseName = "QueryTracker"
    self.engine_type = 'sqlite'
    self.session = None
    self.engine = None
    if self.rtxConfig.is_production_server:
        # NOTE(review): on production this switches databaseName to "ResponseCache",
        # not "QueryTracker" — looks copy/pasted from the ResponseCache class; confirm intended.
        self.databaseName = "ResponseCache"
        self.engine_type = 'mysql'
    self.connect()
def test_issue_120():
    """Regression test for issue 120: push a symmetric 'interacts_with' pair to Neo4j."""
    ob = Orangeboard(debug=True)
    ob.set_dict_reltype_dirs({'interacts_with': False})
    protein_node = ob.add_node('uniprot_protein', 'w', seed_node_bool=True)
    bar_node = ob.add_node('bartype', 'x', seed_node_bool=False)
    # Add the relation in both directions (undirected reltype)
    ob.add_rel('interacts_with', 'PC2', protein_node, bar_node)
    ob.add_rel('interacts_with', 'PC2', bar_node, protein_node)
    config = RTXConfiguration()
    ob.neo4j_set_url(config.neo4j_bolt)
    ob.neo4j_set_auth(config.neo4j_username, config.neo4j_password)
    ob.neo4j_push()
    print(ob)
def _answer_query_using_plover(qg: QueryGraph, log: ARAXResponse) -> Tuple[Dict[str, Dict[str, Union[set, dict]]], int]:
    """Send the query graph to Plover (with Plover-specific flags set) and return
    (answer dict, HTTP status code)."""
    rtxc = RTXConfiguration()
    rtxc.live = "Production"
    # First prep the query graph (requires some minor additions for Plover)
    dict_qg = qg.to_dict()
    dict_qg["include_metadata"] = True  # Ask plover to return node/edge objects (not just IDs)
    dict_qg["respect_predicate_symmetry"] = True  # Ignore direction for symmetric predicate, enforce for asymmetric
    # Allow subclass_of reasoning for qnodes with a small number of curies
    for qnode in dict_qg["nodes"].values():
        if qnode.get("ids") and len(qnode["ids"]) < 5 and qnode.get("allow_subclasses") is None:
            qnode["allow_subclasses"] = True
    # Then send the actual query
    response = requests.post(f"{rtxc.plover_url}/query", json=dict_qg, timeout=60,
                             headers={'accept': 'application/json'})
    if response.status_code != 200:
        log.warning(f"Plover returned a status code of {response.status_code}. Response was: {response.text}")
        return dict(), response.status_code
    log.debug(f"Got response back from Plover")
    return response.json(), response.status_code
def connect(self):
    """Open a session to the feedback MySQL database and store it on self."""
    #engine = create_engine("sqlite:///"+self.databaseName)
    rtxConfig = RTXConfiguration()
    url = ("mysql+pymysql://" + rtxConfig.mysql_feedback_username + ":" +
           rtxConfig.mysql_feedback_password + "@" + rtxConfig.mysql_feedback_host +
           "/" + self.databaseName)
    engine = create_engine(url)
    self.session = sessionmaker(bind=engine)()
    self.engine = engine
def _run_cypher_query(cypher_query: str, kg_name: str, log: ARAXResponse) -> List[Dict[str, any]]:
    """Run a cypher query against the named KG's Neo4j, logging any failure.

    NOTE(review): the annotation uses builtin `any`; `typing.Any` is presumably intended —
    left as-is to avoid a new import.

    :param cypher_query: Cypher query string to execute
    :param kg_name: Knowledge graph name; anything containing "KG2" flips the config
    :param log: ARAXResponse used for error logging
    :return: List of result-row dicts, or [] if any error occurred
    """
    rtxc = RTXConfiguration()
    if "KG2" in kg_name:  # Flip into KG2 mode if that's our KP (rtx config is set to KG1 info by default)
        rtxc.live = kg_name.upper()  # TODO: Eventually change config file to "KG2c" vs. "KG2C" (then won't need to convert case here)
    try:
        driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                      auth=(rtxc.neo4j_username, rtxc.neo4j_password))
        try:
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
        finally:
            # Fix: original left the driver open if the query raised
            driver.close()
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(
            f"Encountered an error interacting with {kg_name} neo4j. {tb}",
            error_code=error_type.__name__)
        return []
    else:
        return query_results
def __init__(self, pubmed_directory_path, is_test, live="Production"):
    """Set up paths and helpers for building the NGD curie->PMIDs databases.

    :param pubmed_directory_path: Directory containing the pubmed source files
    :param is_test: Whether this is a test run
    :param live: RTX config 'live' setting (defaults to "Production")
    """
    self.RTXConfig = RTXConfiguration()
    self.RTXConfig.live = live
    ngd_filepath = os.path.sep.join([
        *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
        'NormalizedGoogleDistance'
    ])
    self.pubmed_directory_path = pubmed_directory_path
    self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
    # Use just the filename portion of the configured curie_to_pmids path
    db_filename = self.RTXConfig.curie_to_pmids_path.split('/')[-1]
    self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{db_filename}"
    self.status = 'OK'
    self.synonymizer = NodeSynonymizer()
    self.is_test = is_test
def _connect_to_kg2c_sqlite() -> Tuple[sqlite3.Connection, sqlite3.Cursor]:
    """Open the KG2c sqlite database (located via the RTX config) and return
    (connection, cursor)."""
    path_list = os.path.realpath(__file__).split(os.path.sep)
    rtx_index = path_list.index("RTX")
    rtxc = RTXConfiguration()
    sqlite_dir_path = os.path.sep.join(
        [*path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'])
    # Only the filename portion of the configured path is used
    sqlite_file_path = sqlite_dir_path + os.path.sep + rtxc.kg2c_sqlite_path.split('/')[-1]
    connection = sqlite3.connect(sqlite_file_path)
    return connection, connection.cursor()
def createDatabase(self):
    """Create all ORM tables in the feedback MySQL database, then connect."""
    print("Creating database")
    #if os.path.exists(self.databaseName):
    #    os.remove(self.databaseName)
    #engine = create_engine("sqlite:///"+self.databaseName)
    rtxConfig = RTXConfiguration()
    url = ("mysql+pymysql://" + rtxConfig.mysql_feedback_username + ":" +
           rtxConfig.mysql_feedback_password + "@" + rtxConfig.mysql_feedback_host +
           "/" + self.databaseName)
    Base.metadata.create_all(create_engine(url))
    self.connect()
def connect(self):
    """Open a session to the feedback MySQL database, creating tables on first use."""
    rtxConfig = RTXConfiguration()
    url = ("mysql+pymysql://" + rtxConfig.mysql_feedback_username + ":" +
           rtxConfig.mysql_feedback_password + "@" + rtxConfig.mysql_feedback_host +
           "/" + self.databaseName)
    engine = create_engine(url)
    self.session = sessionmaker(bind=engine)()
    self.engine = engine
    # Bootstrap the schema if the main table doesn't exist yet
    if not engine.dialect.has_table(engine, 'arax_query'):
        self.create_tables()
def getCachedResponse(self, query):
    """Return a previously stored response matching this query (same query type,
    tool version, and terms), or None if caching is bypassed or nothing matches."""
    if query.get("bypass_cache") == "true":
        return
    session = self.session
    tool_version = RTXConfiguration().version
    #### Look for previous responses we could use
    storedResponse = (session.query(Response)
                      .filter(Response.query_type == query["known_query_type_id"])
                      .filter(Response.tool_version == tool_version)
                      .filter(Response.terms == str(query["terms"]))
                      .order_by(desc(Response.response_datetime))
                      .first())
    if storedResponse is not None:
        return pickle.loads(storedResponse.response_object)
    return
def __get_node_names(type):
    """Return the names of all nodes of the given type from the Neo4j database."""
    # # connect to Neo4j
    # f = open(os.path.join(neo4j_helper_dir, 'config.json'), 'r')
    # config_data = f.read()
    # f.close()
    # config = json.loads(config_data)
    # create the RTXConfiguration object
    rtxConfig = RTXConfiguration()
    conn = Neo4jConnection(rtxConfig.neo4j_bolt,
                           rtxConfig.neo4j_username,
                           rtxConfig.neo4j_password)
    try:
        return conn.get_node_names(type)
    finally:
        conn.close()
def __init__(self, bolt, user=None, password=None):
    """
    :param bolt: A string containing the bolt address of the neo4j instance you wish to upload to
    :param user: A string containing the username for neo4j
    :param password: A string containing the password for neo4j
    """
    # Fill in any missing credentials from the RTX configuration
    if user is None or password is None:
        rtxConfig = RTXConfiguration()
        user = rtxConfig.neo4j_username if user is None else user
        password = rtxConfig.neo4j_password if password is None else password
    # Connection information for the neo4j server, populated with orangeboard
    self.driver = GraphDatabase.driver(bolt, auth=basic_auth(user, password))