Example #1
def migrate_journals(uri, database, journal_names: list, ctn, process_id=0):
    '''
    Insert journal entities into Grakn, skipping names already present.

    journal_names - journal names (strings) to migrate
    ctn - number of iterations per commit batch
    process_id - id of the worker when running on multiple cores (default 0)
    '''
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            transaction = session.transaction(TransactionType.WRITE)
            for counter, journal_name in enumerate(journal_names):
                # Only insert when no journal with this name exists yet:
                # an empty match iterator raises StopIteration.
                match_query = 'match $j isa journal, has journal-name "{}";'.format(
                    journal_name)
                try:
                    next(transaction.query().match(match_query))
                except StopIteration:
                    insert_query = 'insert $j isa journal, has journal-name "{}";'.format(
                        journal_name)
                    transaction.query().insert(insert_query)
                # Commit in batches of ctn and open a fresh transaction.
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                print("Process {} ----- {} journals added".format(
                    process_id, counter))
            # Flush whatever remains in the final partial batch.
            transaction.commit()
            transaction.close()
def dgidbMigrator(uri, database, num_dr, num_int, num_threads, ctn):
    """Run the DGIdb migration: insert drugs first, then drug-gene interactions.

    num_dr / num_int - row limits for drugs and interactions respectively
    num_threads - thread-pool size used by the insert helpers
    ctn - commit batch size
    """
    grakn_client = Grakn.core_client(uri)
    data_session = grakn_client.session(database, SessionType.DATA)
    insertDrugs(uri, database, num_dr, num_threads, ctn, data_session)
    insertInteractions(uri, database, num_int, num_threads, ctn, data_session)
    data_session.close()
    grakn_client.close()
Example #3
def disgenetMigrator(uri, database, num, num_threads, ctn):
    """Download the Disgenet gene-disease association TSV and migrate it to Grakn.

    uri - Grakn server address
    database - target database name
    num - number of rows to migrate; 0 skips the download and migration
    num_threads - thread-pool size for batched inserts
    ctn - number of queries per batch/commit
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    batches_pr = []

    if num != 0:
        print('  ')
        print('Opening Disgenet dataset...')
        print('  ')

        url = "https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz"
        get_file(url, 'Dataset/Disgenet/')

        with gzip.open('Dataset/Disgenet/all_gene_disease_associations.tsv.gz',
                       'rt') as f:
            csvreader = csv.reader(f, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    raw_file.append(row)

        # Project the raw columns into named fields.
        disgenet = []
        for i in raw_file[:num]:
            data = {}
            data['entrez-id'] = i[0].strip()
            data['gene-symbol'] = i[1]
            data['disease-id'] = i[4]
            data['disease-name'] = i[5]
            data['disgenet-score'] = float(i[9])
            disgenet.append(data)
        os.remove('Dataset/Disgenet/all_gene_disease_associations.tsv.gz')
        insertDiseases(disgenet, session, num_threads, ctn)

        # Build batches of ctn queries and insert them on a thread pool.
        counter = 0
        pool = ThreadPool(num_threads)
        batches = []
        for q in disgenet:
            counter = counter + 1
            graql = f"""
match $g isa gene, has gene-symbol "{q['gene-symbol']}", has entrez-id "{q['entrez-id']}";
$d isa disease, has disease-id "{q['disease-id']}", has disease-name "{q['disease-name']}";
insert $r (associated-gene: $g, associated-disease: $d) isa gene-disease-association, has disgenet-score {q['disgenet-score']};"""
            batches.append(graql)
            del graql
            if counter % ctn == 0:
                batches_pr.append(batches)
                batches = []
        batches_pr.append(batches)  # trailing partial batch
        pool.map(partial(batch_job, session), batches_pr)
        pool.close()
        pool.join()
        print('.....')
        print('Finished migrating Disgenet.')
        print('.....')
    # BUG FIX: close the session/client unconditionally — previously these
    # closes sat inside the `if num != 0` block, leaking both when num == 0.
    session.close()
    client.close()
def tissueNetMigrator(uri, keyspace, num, num_threads, ctn):
	"""Read the TissueNet HPA-Protein TSV into memory.

	NOTE(review): unlike the sibling migrators this uses the legacy client
	API (Grakn(uri=...), session(keyspace=...)) — confirm which client
	version this file targets. The session/client are never closed, and
	raw_file is built but not used in the visible code; the remainder of
	the migration may have been lost — verify against the original source.
	"""
	client = Grakn(uri=uri)
	session = client.session(keyspace=keyspace)
	batches_pr = []

	if num != 0:
		print('  ')
		print('Opening TissueNet dataset...')
		print('  ')

		with open('Dataset/TissueNet/HPA-Protein.tsv', 'rt', encoding='utf-8') as csvfile:
			csvreader = csv.reader(csvfile, delimiter='	')
			raw_file = []
			n = 0
			# Skip the header row (n == 1), keep every data row.
			for row in csvreader: 
				n = n + 1
				if n != 1:
					raw_file.append(row)
def reactomeMigrator(uri, database, num_path, num_threads, ctn):
    """Migrate Reactome pathways and pathway interactions into Grakn.

    num_path - number of pathways to keep after filtering to human data
    num_threads - thread-pool size used by the insert helpers
    ctn - commit batch size
    """
    grakn_client = Grakn.core_client(uri)
    data_session = grakn_client.session(database, SessionType.DATA)
    associations = filterHomoSapiens(num_path)
    insertPathways(uri, database, num_threads, ctn, data_session,
                   associations)
    insertPathwayInteractions(uri, database, num_threads, ctn, data_session,
                              associations)
    data_session.close()
    grakn_client.close()
Example #6
    def grakn_session(self):
        """Lazily create and cache a Grakn DATA session for this instance.

        Written this way in an attempt to also work when used from a
        DataLoader with num_workers > 0. TODO: it does not — investigate.
        """
        if self._grakn_session:
            return self._grakn_session
        print("setting up session")
        print(self)
        client = Grakn.core_client(self._uri)
        self._grakn_session = client.session(database=self._database,
                                             session_type=SessionType.DATA)
        return self._grakn_session
Example #7
def migrate_publications(uri,
                         database,
                         publications_list: list,
                         ctn,
                         process_id=0):
    '''
    Migrate publications to Grakn\n
    publications_list - list of dictionaries with publication data\n
    ctn - number of iterations per commit batch\n
    process_id - process id while running on multiple cores, by default process_id = 0
    '''
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            counter = 0
            transaction = session.transaction(TransactionType.WRITE)
            for publication_dict in publications_list:
                authors = publication_dict[
                    "authors"]  #list of authors - list of strings
                ##Check if publication already in Knowledge Base
                try:
                    match_query = 'match $p isa publication, has paper-id "{}";'.format(
                        publication_dict["paper-id"])
                    # StopIteration from next() means no such publication yet.
                    next(transaction.query().match(match_query))
                except StopIteration:
                    # Match the journal and the authorship variables
                    # (create_authorship_query returns (match-part, insert-part)).
                    match_query = 'match $j isa journal, has journal-name "{}"; '.format(
                        publication_dict["journal-name"])
                    match_query = match_query + create_authorship_query(
                        authors)[0]
                    insert_query = 'insert $p isa publication, has paper-id "{}", has title "{}", has doi "{}", has publish-time "{}", has volume "{}", has issn "{}", has pmid "{}"; '.format(
                        publication_dict["paper-id"],
                        publication_dict["title"], publication_dict["doi"],
                        publication_dict["publish-time"],
                        publication_dict["volume"], publication_dict["issn"],
                        publication_dict["pmid"])
                    insert_query = insert_query + create_authorship_query(
                        authors)[1]
                    # Link the publication to its journal.
                    insert_query = insert_query + '(publishing-journal: $j, published-publication: $p) isa publishing;'
                    # print(match_query+insert_query)
                    transaction.query().insert(match_query + insert_query)
                # Commit every ctn iterations and open a fresh transaction.
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                print("Process {} ----- {} publications added".format(
                    process_id, counter))
                counter = counter + 1
            # Flush the final partial batch.
            transaction.commit()
            transaction.close()
def proteinAtlasMigrator(uri, database, num, num_threads, ctn):
    """Download the Human Protein Atlas normal-tissue TSV and migrate it to Grakn.

    uri - Grakn server address
    database - target database name
    num - number of rows used to derive the tissue list; 0 skips everything
    num_threads - thread-pool size used by the insert helpers
    ctn - commit batch size
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)

    if num != 0:
        print('  ')
        print('Opening HPA dataset...')
        print('  ')

        get_file('https://www.proteinatlas.org/download/normal_tissue.tsv.zip',
                 'Dataset/HumanProteinAtlas/')

        with ZipFile('Dataset/HumanProteinAtlas/normal_tissue.tsv.zip', 'r') as f:
            f.extractall('Dataset/HumanProteinAtlas')

        with open('Dataset/HumanProteinAtlas/normal_tissue.tsv', 'rt',
                  encoding='utf-8') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    d = {}
                    d['ensembl-gene-id'] = row[0]
                    d['gene-symbol'] = row[1]
                    d['tissue'] = row[2]
                    d['expression-value'] = row[4]
                    d['expression-value-reliability'] = row[5]
                    raw_file.append(d)
        # BUG FIX: delete the files only after the with-block has closed the
        # TSV — removing a still-open file fails on Windows.
        os.remove('Dataset/HumanProteinAtlas/normal_tissue.tsv.zip')
        os.remove('Dataset/HumanProteinAtlas/normal_tissue.tsv')

        # Deduplicated tissue names from the first `num` rows.
        tissue = []
        for r in raw_file[:num]:
            tissue.append(r['tissue'])
        tissue = (list(set(tissue)))

        insertTissue(tissue, session, num_threads)
        insertEnsemblId(raw_file, session, num_threads, ctn)
        insertGeneTissue(raw_file, session, num_threads, ctn)
    # BUG FIX: the session and client were never closed, leaking both.
    session.close()
    client.close()
Example #9
def migrate_relationships(uri, database, data: list, ctn, process_id=0):
    '''
    Insert mined gene-gene relations (SemMed) into Grakn.

    data - table as a list of rows:
           [pmid, predicate, subject gene, object gene, sentence, ...]
    ctn - number of iterations per commit batch
    process_id - id of the worker when running on multiple cores (default 0)
    '''
    with Grakn.core_client(uri) as client:
        with client.session(database, SessionType.DATA) as session:
            transaction = session.transaction(TransactionType.WRITE)
            for counter, row in enumerate(data):
                pmid, predicate_name, subject_gene, object_gene = row[:4]
                # TODO: add handler for predicates with no relation
                # implemented in relationship_mapper.
                relation = relationship_mapper(predicate_name)
                # Double quotes would break the Graql string literal.
                sentence_text = row[4].replace('"', "'")

                match_query = 'match $p isa publication, has paper-id "{}"; $g1 isa gene, has gene-symbol "{}"; $g2 isa gene, has gene-symbol "{}"; '.format(
                    pmid, subject_gene, object_gene)
                insert_query = 'insert $r ({}: $g1, {}: $g2) isa {}, has sentence-text "{}"; $m (mentioned-genes-relation: $r, mentioning: $p) isa mention, has source "SemMed";'.format(
                    relation["active-role"], relation["passive-role"],
                    relation["relation-name"], sentence_text)
                transaction.query().insert(match_query + insert_query)
                print(match_query + insert_query)
                # Commit in batches of ctn and open a fresh transaction.
                if counter % ctn == 0:
                    transaction.commit()
                    transaction.close()
                    transaction = session.transaction(TransactionType.WRITE)
                    print("Process {} COMMITED".format(process_id))
                print("Process {} ----- {} relations added".format(
                    process_id, counter))
            # Flush the final partial batch.
            transaction.commit()
            transaction.close()
Example #10
def cord_ner_migrator(uri, database, num_ner, num_threads, ctn):
    """Load the CORD-NER dataset and insert its entities into Grakn.

    uri - Grakn server address
    database - target database name
    num_ner - number of records to migrate (data is truncated to this many)
    num_threads - thread-pool size used by the insert helpers
    ctn - commit batch size
    """
    print('.....')
    print('Opening CORD NER file.')
    print('.....')

    # FIRST DOWNLOAD THE CORD-NER-FULL.json FROM THIS WEBSITE:
    # https://uofi.app.box.com/s/k8pw7d5kozzpoum2jwfaqdaey1oij93x/file/651148518303

    # AND ADD IT TO THIS DIR: DATASET/CORD_NER/

    # TODO: Implement a JSON streamer — the file is concatenated JSON objects,
    # patched here into a single JSON array before parsing (all in memory).
    with open('Dataset/CORD_NER/CORD-NER-full.json', "r") as f:
        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")

    # The session could time out if we open it BEFORE we load the file
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    data = data[:num_ner]
    insert_authors(data, num_threads, ctn, session)
    insert_journals(data, num_threads, ctn, session)
    insert_publications_journals(data, num_threads, ctn, session)
    #>insert_publications_with_authors(data, num_threads, 1, session)  ## hangs with a large author list
    insert_entities_pub(data, num_threads, ctn,
                        session)  # fails with logic error
    # BUG FIX: the session and client were never closed, leaking both.
    session.close()
    client.close()
Example #11
def uniprotMigrate(uri, database, num, num_threads, ctn):
    """Migrate the reviewed Uniprot proteome TSV into Grakn.

    uri - Grakn server address
    database - target database name
    num - number of rows to migrate; 0 skips the migration
    num_threads - thread-pool size for batched inserts
    ctn - number of queries per batch/commit
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    batches_pr = []

    if num != 0:
        print('  ')
        print('Opening Uniprot dataset...')
        print('  ')

        # Seed the organism entities that the protein rows will match against.
        tx = session.transaction(TransactionType.WRITE)
        # BUG FIX: the organism name literal had been corrupted to
        # 'H**o sapiens (Human)'; restored to 'Homo sapiens (Human)'.
        org = "insert $h isa organism, has organism-name 'Homo sapiens (Human)', has organism-name 'Human'; $o2 isa organism, has organism-name 'Avian';"
        tx.query().insert(org)
        tx.commit()
        tx.close()

        with open('Dataset/Uniprot/uniprot-reviewed_yes+AND+proteome.tsv',
                  'rt',
                  encoding='utf-8') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')
            raw_file = []
            n = 0
            for row in csvreader:
                n = n + 1
                if n != 1:  # skip the header row
                    raw_file.append(row)

        # Project the raw TSV columns into named fields.
        uniprotdb = []
        for i in raw_file[:num]:
            data = {}
            data['uniprot-id'] = i[0]
            data['uniprot-entry-name'] = i[1]
            data['protein-name'] = i[3]
            data['gene-symbol'] = i[4]
            data['organism'] = i[5]
            data['function-description'] = i[7]
            data['ensembl-transcript'] = i[11]
            data['entrez-id'] = i[12]
            uniprotdb.append(data)

        insertGenes(uniprotdb, session, num_threads, ctn)
        insertTranscripts(uniprotdb, session, num_threads, ctn)

        # Insert proteins
        counter = 0
        pool = ThreadPool(num_threads)
        batches = []
        for q in uniprotdb:
            counter = counter + 1
            transcripts = transcriptHelper(q)
            gene = geneHelper(q)[0]
            # Start the match clause with one variable per transcript.
            if transcripts != None:
                variables = []
                tvariable = 1
                graql = "match "
                for t in transcripts:
                    variables.append(tvariable)
                    graql = graql + "$" + str(
                        tvariable
                    ) + " isa transcript, has ensembl-transcript-stable-id '" + t + "'; "
                    tvariable = tvariable + 1
            # NOTE: if no transcripts were found, `graql` is unbound here; the
            # NameError is caught and the match clause is started from scratch.
            if gene != None:
                try:
                    graql = graql + "$g isa gene, has gene-symbol '" + gene + "';"
                except Exception:
                    graql = "match $g isa gene, has gene-symbol '" + gene + "';"
            # Same unbound-name fallback for the organism clause.
            try:
                graql = graql + "$h isa organism, has organism-name '" + q[
                    'organism'] + "';"
            except Exception:
                graql = "match $h isa organism, has organism-name '" + q[
                    'organism'] + "';"

            graql = f"""{ graql } insert $a isa protein, has uniprot-id "{q['uniprot-id']}", has uniprot-name
"{q['protein-name']}", has function-description "{q['function-description']}", 
has uniprot-entry-name "{q['uniprot-entry-name']}";
$r (associated-organism: $h, associating: $a) isa organism-association;"""
            if gene != None:
                graql = graql + "$gpe (encoding-gene: $g, encoded-protein: $a) isa gene-protein-encoding;"
            if transcripts != None:
                for v in variables:
                    graql = f"""{ graql } $r{str(v)}(translating-transcript: ${str(v)}, translated-protein: $a) isa translation; """
            if gene and transcripts != None:
                for v in variables:
                    graql = graql + "$trans" + str(
                        v
                    ) + "(transcribing-gene: $g, encoded-transcript: $" + str(
                        v) + ") isa transcription;"

            batches.append(graql)
            del graql
            if counter % ctn == 0:
                batches_pr.append(batches)
                batches = []
        batches_pr.append(batches)  # trailing partial batch
        pool.map(partial(batch_job, session), batches_pr)
        pool.close()
        pool.join()
        print('.....')
        print('Finished migrating Uniprot file.')
        print('.....')
    # BUG FIX: close the session/client unconditionally — previously these
    # closes sat inside the `if num != 0` block, leaking both when num == 0.
    session.close()
    client.close()
def coronavirusMigrator(uri, database):
    """Migrate the hand-curated Coronavirus CSV files into Grakn.

    Loads 'Genome identity.csv' (viruses, hosts, discovery locations) and
    'Host proteins (potential drug targets).csv' (virus/gene/protein
    associations). NOTE(review): the session and client opened here are
    never closed.
    """
    client = Grakn.core_client(uri)
    session = client.session(database, SessionType.DATA)
    tx = session.transaction(TransactionType.WRITE)
    print('.....')
    print('Starting with Coronavirus file.')
    print('.....')

    # Temporary manual ingestion of locations
    graql = f"""insert $c isa country, has country-name 'China'; $c2 isa country, has country-name 'Kingdom of Saudi Arabia'; 
	$c3 isa country, has country-name 'USA'; $c4 isa country, has country-name 'South Korea'; $o isa organism, has organism-name 'Mouse';"""
    tx.query().insert(graql)
    tx.commit()
    tx.close()

    with open('Dataset/Coronaviruses/Genome identity.csv',
              'rt',
              encoding='utf-8') as csvfile:
        tx = session.transaction(TransactionType.WRITE)
        csvreader = csv.reader(csvfile, delimiter=',')
        raw_file = []
        n = 0
        # Skip the header row (n == 1), keep every data row.
        for row in csvreader:
            n = n + 1
            if n != 1:
                raw_file.append(row)
        import_file = []
        for i in raw_file:
            data = {}
            data['genbank-id'] = i[0]
            data['identity%'] = i[2]
            data['host'] = i[3][0:-1].strip()
            data['location-discovered'] = i[4].strip()
            data['coronavirus-1'] = i[1].strip()
            # Columns 5 and 6 (additional virus names) are optional; a short
            # row raises here and the key is simply omitted.
            try:
                data['coronavirus-2'] = i[5].strip()
            except Exception:
                pass
            try:
                data['coronavirus-3'] = i[6].strip()
            except Exception:
                pass
            import_file.append(data)
        for q in import_file:
            virus_name = ""
            # Build the virus-name attribute list; a missing 'coronavirus-2'
            # or 'coronavirus-3' key raises KeyError and falls through to the
            # next-shorter variant.
            try:
                virus_name = f""" has virus-name "{q['coronavirus-1']}", has virus-name "{q['coronavirus-2']}", has virus-name "{q['coronavirus-3']}", """
            except Exception:
                try:
                    virus_name = f""" has virus-name "{q['coronavirus-1']}", has virus-name "{q['coronavirus-2']}", """
                except Exception:
                    virus_name = f""" has virus-name "{q['coronavirus-1']}", """
            print(virus_name)
            graql = f"""match $c isa country, has country-name "{q['location-discovered']}"; 
			$o isa organism, has organism-name "{q['host']}";
			insert $v isa virus, has genbank-id "{q['genbank-id']}", {virus_name}
			has identity-percentage "{q['identity%']}";
			$r (discovering-location: $c, discovered-virus: $v) isa discovery;
			$r1 (hosting-organism: $o, hosted-virus: $v) isa organism-virus-hosting;"""
            print(graql)
            tx.query().insert(graql)
        # One commit for the whole file.
        tx.commit()
        tx.close()

    with open(
            'Dataset/Coronaviruses/Host proteins (potential drug targets).csv',
            'rt',
            encoding='utf-8') as csvfile:
        tx = session.transaction(TransactionType.WRITE)
        csvreader = csv.reader(csvfile, delimiter=',')
        raw_file = []
        n = 0
        # Same header-skipping pattern as above.
        for row in csvreader:
            n = n + 1
            if n != 1:
                raw_file.append(row)
        import_file = []
        for i in raw_file:
            data = {}
            data['coronavirus'] = i[0].strip()
            data['uniprot-id'] = i[3].strip()
            data['entrez-id'] = i[4].strip()
            import_file.append(data)
        for q in import_file:
            graql = f"""match $v isa virus, has virus-name "{q['coronavirus']}"; 
			$p isa protein, has uniprot-id "{q['uniprot-id']}";
			$g isa gene, has entrez-id "{q['entrez-id']}";
			insert $r2 (associated-virus-gene: $g, associated-virus: $v) isa gene-virus-association;
			$r3 (hosting-virus-protein: $p, associated-virus: $v) isa protein-virus-association;"""
            tx.query().insert(graql)
            print(graql)
        tx.commit()
        tx.close()
    print('.....')
    print('Finished with Coronavirus file.')
    print('.....')