def migrate_journals(uri, database, journal_names: list, ctn, process_id=0):
    """Migrate journals to Grakn.

    journal_names -- list of journal names (strings)
    ctn -- commit once every `ctn` processed journals
    process_id -- process id while running on multiple cores, defaults to 0
    """
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            counter = 0
            transaction = session.transaction(TransactionType.WRITE)
            for journal_name in journal_names:
                # Insert only journals not already in the knowledge base
                # (EAFP: an empty match iterator raises StopIteration).
                try:
                    match_query = 'match $j isa journal, has journal-name "{}";'.format(
                        journal_name)
                    next(transaction.query().match(match_query))
                except StopIteration:
                    insert_query = 'insert $j isa journal, has journal-name "{}";'.format(
                        journal_name)
                    transaction.query().insert(insert_query)
                counter = counter + 1
                # Increment BEFORE the modulo test: the original tested the
                # pre-increment counter, so the very first iteration (counter
                # == 0) committed a 1-item batch and printed "0 journals added".
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                    print("Process {} ----- {} journals added".format(
                        process_id, counter))
            # Commit whatever remains in the final partial batch.
            transaction.commit()
            transaction.close()
def dgidbMigrator(uri, database, num_dr, num_int, num_threads, ctn):
    """Migrate DGIdb drugs and drug-gene interactions.

    num_dr -- max number of drugs to insert
    num_int -- max number of interactions to insert
    num_threads -- thread pool size passed to the insert helpers
    ctn -- commit/batch size passed to the insert helpers
    """
    # Context managers close the session and client even if an insert step
    # raises; the original leaked both on error.
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            insertDrugs(uri, database, num_dr, num_threads, ctn, session)
            insertInteractions(uri, database, num_int, num_threads, ctn, session)
def disgenetMigrator(uri, database, num, num_threads, ctn):
    """Migrate Disgenet gene-disease associations.

    num -- max number of rows to migrate (0 skips the migration entirely)
    num_threads -- size of the thread pool used for batched inserts
    ctn -- number of queries per batch
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    batches_pr = []
    if num != 0:
        print(' ')
        print('Opening Disgenet dataset...')
        print(' ')
        url = "https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz"
        get_file(url, 'Dataset/Disgenet/')
        with gzip.open('Dataset/Disgenet/all_gene_disease_associations.tsv.gz',
                       'rt') as f:
            # The file is tab-separated; the original's space delimiter
            # would split free-text disease names into the wrong columns.
            csvreader = csv.reader(f, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    raw_file.append(row)
        disgenet = []
        for i in raw_file[:num]:
            data = {}
            data['entrez-id'] = i[0].strip()
            data['gene-symbol'] = i[1]
            data['disease-id'] = i[4]
            data['disease-name'] = i[5]
            data['disgenet-score'] = float(i[9])
            disgenet.append(data)
        os.remove('Dataset/Disgenet/all_gene_disease_associations.tsv.gz')
        insertDiseases(disgenet, session, num_threads, ctn)
        counter = 0
        pool = ThreadPool(num_threads)
        batches = []
        for q in disgenet:
            counter = counter + 1
            graql = f"""match $g isa gene, has gene-symbol "{q['gene-symbol']}", has entrez-id "{q['entrez-id']}"; $d isa disease, has disease-id "{q['disease-id']}", has disease-name "{q['disease-name']}"; insert $r (associated-gene: $g, associated-disease: $d) isa gene-disease-association, has disgenet-score {q['disgenet-score']};"""
            batches.append(graql)
            del graql
            if counter % ctn == 0:
                batches_pr.append(batches)
                batches = []
        # Flush the final partial batch, then insert in parallel.
        batches_pr.append(batches)
        pool.map(partial(batch_job, session), batches_pr)
        pool.close()
        pool.join()
        print('.....')
        print('Finished migrating Disgenet.')
        print('.....')
    session.close()
    client.close()
def tissueNetMigrator(uri, keyspace, num, num_threads, ctn):
    """Open and parse the TissueNet HPA-Protein dataset.

    num -- 0 skips opening the dataset
    NOTE(review): num_threads and ctn are currently unused and the parsed
    rows are never inserted — this migrator looks unfinished; confirm.
    """
    # Use the Grakn 2.x client API for consistency with every other migrator
    # in this file; the original used the legacy 1.x `Grakn(uri=...)` /
    # `session(keyspace=...)` API, which the 2.x driver does not provide.
    client = Grakn.core_client(uri)
    session = client.session(keyspace, SessionType.DATA)
    if num != 0:
        print(' ')
        print('Opening TissueNet dataset...')
        print(' ')
        with open('Dataset/TissueNet/HPA-Protein.tsv', 'rt',
                  encoding='utf-8') as csvfile:
            # Tab-separated input; the original's space delimiter would
            # split the columns incorrectly.
            csvreader = csv.reader(csvfile, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    raw_file.append(row)
    # Close the resources the original leaked.
    session.close()
    client.close()
def reactomeMigrator(uri, database, num_path, num_threads, ctn):
    """Migrate Reactome pathways and pathway interactions.

    num_path -- max number of pathways to migrate
    num_threads -- thread pool size passed to the insert helpers
    ctn -- commit/batch size passed to the insert helpers
    """
    # Context managers close the session and client even if an insert step
    # raises; the original leaked both on error.
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            pathway_associations = filterHomoSapiens(num_path)
            insertPathways(uri, database, num_threads, ctn, session,
                           pathway_associations)
            insertPathwayInteractions(uri, database, num_threads, ctn, session,
                                      pathway_associations)
def grakn_session(self):
    """Lazily create and cache a Grakn DATA session for this object.

    Did this like this in an attempt to make it also work when using with a
    DataLoader with num_workers > 0. TODO: it does not, so look into this.
    """
    # Guard clause: reuse the cached session if one already exists.
    if self._grakn_session:
        return self._grakn_session
    print("setting up session")
    print(self)
    grakn_client = Grakn.core_client(self._uri)
    self._grakn_session = grakn_client.session(database=self._database,
                                               session_type=SessionType.DATA)
    return self._grakn_session
def migrate_publications(uri, database, publications_list: list, ctn, process_id=0):
    """Migrate publications to Grakn.

    publications_list -- list of dictionaries with publication data
    ctn -- commit once every `ctn` processed publications
    process_id -- process id while running on multiple cores, defaults to 0
    """
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            counter = 0
            transaction = session.transaction(TransactionType.WRITE)
            for publication_dict in publications_list:
                authors = publication_dict["authors"]  # list of author names (strings)
                # Insert only publications not already in the knowledge base
                # (EAFP: an empty match iterator raises StopIteration).
                try:
                    match_query = 'match $p isa publication, has paper-id "{}";'.format(
                        publication_dict["paper-id"])
                    next(transaction.query().match(match_query))
                except StopIteration:
                    # Build the authorship clauses once; the original called
                    # create_authorship_query(authors) twice for the same data.
                    authorship = create_authorship_query(authors)
                    match_query = 'match $j isa journal, has journal-name "{}"; '.format(
                        publication_dict["journal-name"])
                    match_query = match_query + authorship[0]
                    insert_query = 'insert $p isa publication, has paper-id "{}", has title "{}", has doi "{}", has publish-time "{}", has volume "{}", has issn "{}", has pmid "{}"; '.format(
                        publication_dict["paper-id"], publication_dict["title"],
                        publication_dict["doi"], publication_dict["publish-time"],
                        publication_dict["volume"], publication_dict["issn"],
                        publication_dict["pmid"])
                    insert_query = insert_query + authorship[1]
                    insert_query = insert_query + '(publishing-journal: $j, published-publication: $p) isa publishing;'
                    transaction.query().insert(match_query + insert_query)
                counter = counter + 1
                # Increment BEFORE the modulo test: the original tested the
                # pre-increment counter, committing a 1-item first batch and
                # printing "0 publications added".
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                    print("Process {} ----- {} publications added".format(
                        process_id, counter))
            # Commit whatever remains in the final partial batch.
            transaction.commit()
            transaction.close()
def proteinAtlasMigrator(uri, database, num, num_threads, ctn):
    """Migrate the Human Protein Atlas normal-tissue dataset.

    num -- max number of rows to migrate (0 skips the migration entirely)
    num_threads -- thread pool size passed to the insert helpers
    ctn -- commit/batch size passed to the insert helpers
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    if num != 0:
        print(' ')
        print('Opening HPA dataset...')
        print(' ')
        get_file('https://www.proteinatlas.org/download/normal_tissue.tsv.zip',
                 'Dataset/HumanProteinAtlas/')
        with ZipFile('Dataset/HumanProteinAtlas/normal_tissue.tsv.zip', 'r') as f:
            f.extractall('Dataset/HumanProteinAtlas')
        with open('Dataset/HumanProteinAtlas/normal_tissue.tsv', 'rt',
                  encoding='utf-8') as csvfile:
            # Tab-separated input; the original's space delimiter would
            # split multi-word tissue names into the wrong columns.
            csvreader = csv.reader(csvfile, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    d = {}
                    d['ensembl-gene-id'] = row[0]
                    d['gene-symbol'] = row[1]
                    d['tissue'] = row[2]
                    d['expression-value'] = row[4]
                    d['expression-value-reliability'] = row[5]
                    raw_file.append(d)
        os.remove('Dataset/HumanProteinAtlas/normal_tissue.tsv.zip')
        os.remove('Dataset/HumanProteinAtlas/normal_tissue.tsv')
        # Deduplicated tissue names from the first `num` rows.
        tissue = list({r['tissue'] for r in raw_file[:num]})
        insertTissue(tissue, session, num_threads)
        insertEnsemblId(raw_file, session, num_threads, ctn)
        insertGeneTissue(raw_file, session, num_threads, ctn)
    # Close the resources the original leaked.
    session.close()
    client.close()
def migrate_relationships(uri, database, data: list, ctn, process_id=0):
    """Migrate gene-gene relations to Grakn.

    data -- table in a form of list of lists:
            [paper-id, predicate, subject gene, object gene, sentence]
    ctn -- commit once every `ctn` processed relations
    process_id -- process id while running on multiple cores, defaults to 0
    """
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            counter = 0
            transaction = session.transaction(TransactionType.WRITE)
            for data_entity in data:
                predicate_name = data_entity[1]
                # TODO: add handler for the situation when there is no
                # relation implemented in the mapper for this predicate.
                relation = relationship_mapper(predicate_name)
                # Escape double quotes so the sentence fits in a Graql string.
                sentence_text = data_entity[4].replace('"', "'")
                match_query = 'match $p isa publication, has paper-id "{}"; $g1 isa gene, has gene-symbol "{}"; $g2 isa gene, has gene-symbol "{}"; '.format(
                    data_entity[0], data_entity[2], data_entity[3])
                insert_query = 'insert $r ({}: $g1, {}: $g2) isa {}, has sentence-text "{}"; $m (mentioned-genes-relation: $r, mentioning: $p) isa mention, has source "SemMed";'.format(
                    relation["active-role"], relation["passive-role"],
                    relation["relation-name"], sentence_text)
                transaction.query().insert(match_query + insert_query)
                print(match_query + insert_query)
                counter = counter + 1
                # Increment BEFORE the modulo test: the original tested the
                # pre-increment counter, committing a 1-item first batch and
                # printing "0 relations added".
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                    print("Process {} ----- {} relations added".format(
                        process_id, counter))
            # Commit whatever remains in the final partial batch.
            transaction.commit()
            transaction.close()
def cord_ner_migrator(uri, database, num_ner, num_threads, ctn):
    """Migrate the CORD-NER dataset (authors, journals, publications, entities).

    num_ner -- max number of NER records to migrate
    num_threads -- thread pool size passed to the insert helpers
    ctn -- commit/batch size passed to the insert helpers
    """
    print('.....')
    print('Opening CORD NER file.')
    print('.....')
    # FIRST DOWNLOAD THE CORD-NER-FULL.json FROM THIS WEBSITE:
    # https://uofi.app.box.com/s/k8pw7d5kozzpoum2jwfaqdaey1oij93x/file/651148518303
    # AND ADD IT TO THIS DIR: DATASET/CORD_NER/
    # TODO: Implement a JSON streamer
    with open('Dataset/CORD_NER/CORD-NER-full.json', "r") as f:
        # The file is a stream of concatenated JSON objects rather than a
        # JSON array, so stitch them into one parseable array first.
        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
    # The session could time out if we open it BEFORE we load the file
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    try:
        data = data[:num_ner]
        insert_authors(data, num_threads, ctn, session)
        insert_journals(data, num_threads, ctn, session)
        insert_publications_journals(data, num_threads, ctn, session)
        #>insert_publications_with_authors(data, num_threads, 1, session) ## hangs with a large author list
        insert_entities_pub(data, num_threads, ctn, session)  # fails with logic error
    finally:
        # Always release the session and client; the original leaked both.
        session.close()
        client.close()
def uniprotMigrate(uri, database, num, num_threads, ctn):
    """Migrate the Uniprot reviewed-proteome dataset.

    Inserts organisms, genes, transcripts and proteins plus the
    encoding/translation/transcription relations between them.

    num -- max number of rows to migrate (0 skips the migration entirely)
    num_threads -- size of the thread pool used for batched inserts
    ctn -- number of queries per batch
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    batches_pr = []
    if num != 0:
        print(' ')
        print('Opening Uniprot dataset...')
        print(' ')
        tx = session.transaction(TransactionType.WRITE)
        # NOTE(review): the organism name was garbled as "H**o sapiens" in the
        # original, so no Uniprot row could ever match it; restored to the
        # Uniprot spelling — confirm against the dataset's organism column.
        org = "insert $h isa organism, has organism-name 'Homo sapiens (Human)', has organism-name 'Human'; $o2 isa organism, has organism-name 'Avian';"
        tx.query().insert(org)
        tx.commit()
        tx.close()
        with open('Dataset/Uniprot/uniprot-reviewed_yes+AND+proteome.tsv', 'rt',
                  encoding='utf-8') as csvfile:
            # Tab-separated input; the original's space delimiter would shred
            # the free-text protein-name and function-description columns.
            csvreader = csv.reader(csvfile, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    raw_file.append(row)
        uniprotdb = []
        for i in raw_file[:num]:
            data = {}
            data['uniprot-id'] = i[0]
            data['uniprot-entry-name'] = i[1]
            data['protein-name'] = i[3]
            data['gene-symbol'] = i[4]
            data['organism'] = i[5]
            data['function-description'] = i[7]
            data['ensembl-transcript'] = i[11]
            data['entrez-id'] = i[12]
            uniprotdb.append(data)
        insertGenes(uniprotdb, session, num_threads, ctn)
        insertTranscripts(uniprotdb, session, num_threads, ctn)
        # Insert proteins
        counter = 0
        pool = ThreadPool(num_threads)
        batches = []
        for q in uniprotdb:
            counter = counter + 1
            transcripts = transcriptHelper(q)
            gene = geneHelper(q)[0]
            # Build the match clauses explicitly. The original assembled
            # `graql` through a try/except chain that relied on the variable
            # being undefined (NameError) to decide whether to prepend
            # "match " — fragile and hard to follow.
            variables = []
            match_clauses = []
            if transcripts is not None:
                tvariable = 1
                for t in transcripts:
                    variables.append(tvariable)
                    match_clauses.append(
                        "$" + str(tvariable) +
                        " isa transcript, has ensembl-transcript-stable-id '" +
                        t + "'; ")
                    tvariable = tvariable + 1
            if gene is not None:
                match_clauses.append("$g isa gene, has gene-symbol '" + gene + "';")
            match_clauses.append(
                "$h isa organism, has organism-name '" + q['organism'] + "';")
            graql = "match " + "".join(match_clauses)
            graql = f"""{graql} insert $a isa protein, has uniprot-id "{q['uniprot-id']}", has uniprot-name 
"{q['protein-name']}", has function-description "{q['function-description']}", has uniprot-entry-name "{q['uniprot-entry-name']}"; $r (associated-organism: $h, associating: $a) isa organism-association;"""
            if gene is not None:
                graql = graql + "$gpe (encoding-gene: $g, encoded-protein: $a) isa gene-protein-encoding;"
            if transcripts is not None:
                for v in variables:
                    graql = f"""{graql} $r{str(v)}(translating-transcript: ${str(v)}, translated-protein: $a) isa translation; """
            # Fixed: the original tested `gene and transcripts != None`, which
            # skipped the transcription links whenever `gene` was falsy (e.g.
            # an empty string) even though the $g match clause was emitted.
            if gene is not None and transcripts is not None:
                for v in variables:
                    graql = graql + "$trans" + str(v) + \
                        "(transcribing-gene: $g, encoded-transcript: $" + \
                        str(v) + ") isa transcription;"
            batches.append(graql)
            del graql
            if counter % ctn == 0:
                batches_pr.append(batches)
                batches = []
        # Flush the final partial batch, then insert in parallel.
        batches_pr.append(batches)
        pool.map(partial(batch_job, session), batches_pr)
        pool.close()
        pool.join()
        print('.....')
        print('Finished migrating Uniprot file.')
        print('.....')
    session.close()
    client.close()
def coronavirusMigrator(uri, database):
    """Migrate the curated coronavirus dataset.

    Loads 'Genome identity.csv' (viruses, hosts, discovery locations) and
    'Host proteins (potential drug targets).csv' (virus/gene/protein links).
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    tx = session.transaction(TransactionType.WRITE)
    print('.....')
    print('Starting with Coronavirus file.')
    print('.....')
    # Temporary manual ingestion of locations
    graql = f"""insert $c isa country, has country-name 'China'; $c2 isa country, has country-name 'Kingdom of Saudi Arabia'; $c3 isa country, has country-name 'USA'; $c4 isa country, has country-name 'South Korea'; $o isa organism, has organism-name 'Mouse';"""
    tx.query().insert(graql)
    tx.commit()
    tx.close()
    with open('Dataset/Coronaviruses/Genome identity.csv', 'rt',
              encoding='utf-8') as csvfile:
        tx = session.transaction(TransactionType.WRITE)
        csvreader = csv.reader(csvfile, delimiter=',')
        raw_file = []
        n = 0
        for row in csvreader:
            n = n + 1
            if n != 1:  # skip the header row
                raw_file.append(row)
        import_file = []
        for i in raw_file:
            data = {}
            data['genbank-id'] = i[0]
            data['identity%'] = i[2]
            # NOTE(review): drops the last character of the host column —
            # presumably a trailing delimiter; confirm against the CSV.
            data['host'] = i[3][0:-1].strip()
            data['location-discovered'] = i[4].strip()
            data['coronavirus-1'] = i[1].strip()
            # Columns 5 and 6 (alternative virus names) are optional;
            # narrowed from the original broad `except Exception`.
            try:
                data['coronavirus-2'] = i[5].strip()
            except IndexError:
                pass
            try:
                data['coronavirus-3'] = i[6].strip()
            except IndexError:
                pass
            import_file.append(data)
        for q in import_file:
            virus_name = ""
            # Use as many alternative virus names as the row provides;
            # missing keys raise KeyError (narrowed from `except Exception`).
            try:
                virus_name = f""" has virus-name "{q['coronavirus-1']}", has virus-name "{q['coronavirus-2']}", has virus-name "{q['coronavirus-3']}", """
            except KeyError:
                try:
                    virus_name = f""" has virus-name "{q['coronavirus-1']}", has virus-name "{q['coronavirus-2']}", """
                except KeyError:
                    virus_name = f""" has virus-name "{q['coronavirus-1']}", """
            print(virus_name)
            graql = f"""match $c isa country, has country-name "{q['location-discovered']}"; $o isa organism, has organism-name "{q['host']}"; insert $v isa virus, has genbank-id "{q['genbank-id']}", {virus_name} has identity-percentage "{q['identity%']}"; $r (discovering-location: $c, discovered-virus: $v) isa discovery; $r1 (hosting-organism: $o, hosted-virus: $v) isa 
organism-virus-hosting;"""
            print(graql)
            tx.query().insert(graql)
        tx.commit()
        tx.close()
    with open(
            'Dataset/Coronaviruses/Host proteins (potential drug targets).csv',
            'rt', encoding='utf-8') as csvfile:
        tx = session.transaction(TransactionType.WRITE)
        csvreader = csv.reader(csvfile, delimiter=',')
        raw_file = []
        n = 0
        for row in csvreader:
            n = n + 1
            if n != 1:  # skip the header row
                raw_file.append(row)
        import_file = []
        for i in raw_file:
            data = {}
            data['coronavirus'] = i[0].strip()
            data['uniprot-id'] = i[3].strip()
            data['entrez-id'] = i[4].strip()
            import_file.append(data)
        for q in import_file:
            graql = f"""match $v isa virus, has virus-name "{q['coronavirus']}"; $p isa protein, has uniprot-id "{q['uniprot-id']}"; $g isa gene, has entrez-id "{q['entrez-id']}"; insert $r2 (associated-virus-gene: $g, associated-virus: $v) isa gene-virus-association; $r3 (hosting-virus-protein: $p, associated-virus: $v) isa protein-virus-association;"""
            tx.query().insert(graql)
            print(graql)
        tx.commit()
        tx.close()
    print('.....')
    print('Finished with Coronavirus file.')
    print('.....')
    # Close the resources the original leaked.
    session.close()
    client.close()