def gene_item_statements():
        """
        construct list of referenced statements to past to PBB_Core Item engine
        :return:
        """
        # creates reference object for WD gene item claim
        ncbi_gene_reference = wdo.reference_store(
            source='ncbi_gene', identifier=gene_record['_id'])

        # claims for datatype string.
        WD_String_CLAIMS = {
            'P351': str(gene_record['_id']),
            'P2393': gene_record['locus_tag'],
        }
        WD_Genome_Annotation_Claims = {
            'P644': str(int(gene_record['genomic_pos']['start'])),
            'P645': str(int(gene_record['genomic_pos']['end'])),
        }
        # claims for datytpe item
        WD_Item_CLAIMS = {
            'P703': spec_strain.iloc[0]['wd_qid'],
            'P279': 'Q7187',
        }

        # convert integer representation of strand to corresponding WD item (Forward Strand/Reverse Strand)
        if gene_record['genomic_pos']['strand'] == 1:
            WD_Item_CLAIMS['P2548'] = 'Q22809680'
        elif gene_record['genomic_pos']['strand'] == -1:
            WD_Item_CLAIMS['P2548'] = 'Q22809711'
        chromosome = gene_record['genomic_pos']['chr']
        rs_chrom = PBB_Core.WDString(value=chromosome,
                                     prop_nr='P2249',
                                     is_qualifier=True)

        statements = []
        # process to pbb_Core data value object and append to statments for each valid item in each datatype dict
        # WDItemID datatype
        for k, v in WD_Item_CLAIMS.items():
            statements.append(
                PBB_Core.WDItemID(value=v,
                                  prop_nr=k,
                                  references=[ncbi_gene_reference]))
        # WDString datatype
        for k, v in WD_String_CLAIMS.items():
            statements.append(
                PBB_Core.WDString(value=v,
                                  prop_nr=k,
                                  references=[ncbi_gene_reference]))
        for k, v in WD_Genome_Annotation_Claims.items():
            statements.append(
                PBB_Core.WDString(value=v,
                                  prop_nr=k,
                                  references=[ncbi_gene_reference],
                                  qualifiers=[rs_chrom]))

        return statements
Esempio n. 2
0
    def make_reference(self,
                       stated_in,
                       source_element,
                       source_element_name,
                       source_element_prop,
                       date=time.strftime('+%Y-%m-%dT00:00:00Z'),
                       date_property='P813'):
        ref = [[
            PBB_Core.WDItemID(value=stated_in,
                              prop_nr='P248',
                              is_reference=True),  # stated in
            PBB_Core.WDString(value=source_element,
                              prop_nr=source_element_prop,
                              is_reference=True),  # source element
            PBB_Core.WDItemID(value='Q1860', prop_nr='P407',
                              is_reference=True),  # language of work
            PBB_Core.WDMonolingualText(value=source_element_name,
                                       language='en',
                                       prop_nr='P1476',
                                       is_reference=True),
            PBB_Core.WDTime(time=date,
                            prop_nr=date_property,
                            is_reference=True)  # publication date
        ]]

        # this will overwrite all existing references of a WD claim value.
        for x in ref[0]:
            x.overwrite_references = True

        return ref
Esempio n. 3
0
    def protein_item_statements():
        """
        construct list of referenced statements to pass to PBB_Core Item engine
        :return:
        """
        uniprot_ref = wdo.reference_store(source='uniprot', identifier=uniprot)

        WD_String_CLAIMS = {
            'P637':
            str(gene_record['refseq']['protein']),  # set refseq protein id
            'P352': uniprot  # Set uniprot ID
        }

        WD_Item_CLAIMS = {
            'P703': [spec_strain.iloc[0]['wd_qid']
                     ],  # get strain taxid qid from strain record
            'P279': ['Q8054'],  # subclass of protein
        }

        statements = []
        #generate go term claims
        for gt in gene_record['GOTERMS']:
            goprop = go_props[gt[1]]
            govalue = wdo.WDSparqlQueries(
                prop='P686',
                string=gt[0]).wd_prop2qid()  #  Get GeneOntology Item by GO ID
            evprop = 'P459'
            try:
                evvalue = go_evidence_codes[gt[2]]
                evstat = PBB_Core.WDItemID(value=evvalue,
                                           prop_nr=evprop,
                                           is_qualifier=True)
                statements.append(
                    PBB_Core.WDItemID(value=govalue,
                                      prop_nr=goprop,
                                      references=[uniprot_ref],
                                      qualifiers=[evstat]))
            except Exception as e:
                statements.append(
                    PBB_Core.WDItemID(value=govalue,
                                      prop_nr=goprop,
                                      references=[uniprot_ref]))

        # generate list of pbb core value objects for all valid claims
        for k, v in WD_Item_CLAIMS.items():
            if v:
                for i in v:
                    statements.append(
                        PBB_Core.WDItemID(value=i,
                                          prop_nr=k,
                                          references=[uniprot_ref]))

        for k, v in WD_String_CLAIMS.items():
            if v:
                statements.append(
                    PBB_Core.WDString(value=v,
                                      prop_nr=k,
                                      references=[uniprot_ref]))

        return statements
def reference_store(source='', identifier=''):
    """
    :param source: database source to be referenced (key name from source_qids)
    :param ref_type: type of WD reference statement (imported from, stated in) (key names from prop_ids)
    :return: PBB_Core reference object for database source
    """
    source_items = {'uniprot': 'Q905695',
                    'ncbi_gene': 'Q20641742',
                    'ncbi_taxonomy': 'Q13711410',
                    'swiss_prot': 'Q2629752',
                    'trembl': 'Q22935315'}

    prop_ids = {'uniprot': 'P352',
                'ncbi_gene': 'P351',
                'ncbi_taxonomy': 'P685',
                'ncbi_locus_tag': 'P2393'
                }
    refs = [PBB_Core.WDItemID(value=source_items[source], prop_nr='P248', is_reference=True),
            PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),
            PBB_Core.WDString(value=identifier, prop_nr=prop_ids[source], is_reference=True),
            PBB_Core.WDTime(str(strftime("+%Y-%m-%dT00:00:00Z", gmtime())), prop_nr='P813', is_reference=True)
            ]
    for ref in refs:
        ref.overwrite_references = True
    return refs
Esempio n. 5
0
    def __init__(self, object):
            self.logincreds = object["logincreds"]
            self.name = object["uberonLabel"]
            self.uberon = object["uberon"]
            self.uberon_id = self.uberon.replace("http://purl.obolibrary.org/obo/UBERON_", "")
            self.wikidata_id = object["wikidata_id"]
            self.start = object["start"]
            self.graph = object["graph"]

            subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
            id = URIRef("http://www.geneontology.org/formats/oboInOwl#id")
            hasExactSyn = URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
            print(self.uberon_id)
            print(self.name)

            refStatedIn = PBB_Core.WDItemID(21552738, prop_nr='P248', is_reference=True)
            refStatedIn.overwrite_references = True
            refImported = PBB_Core.WDItemID(value=7876491, prop_nr='P143', is_reference=True)
            refImported.overwrite_references = True
            timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
            refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
            refRetrieved.overwrite_references = True
            ub_reference = [refStatedIn, refImported, refRetrieved]


            if self.uberon_id in self.wikidata_id.keys():
                self.wdid = self.wikidata_id[self.uberon_id.replace("UBERON:", "")]
            else:
                self.wdid = None

            self.synonyms = []
            for synonym in self.graph.objects(URIRef(self.uberon), hasExactSyn):
                self.synonyms.append(str(synonym))

            prep = dict()
            prep["P279"] = [PBB_Core.WDItemID(value='Q4936952', prop_nr='P279', references=[copy.deepcopy(ub_reference)])]
            prep["P1554"] = [PBB_Core.WDString(value=self.uberon_id, prop_nr='P1554', references=[copy.deepcopy(ub_reference)])]
            print(self.uberon)
            prep["P1709"] = [PBB_Core.WDUrl(value=self.uberon, prop_nr='P1709', references=[copy.deepcopy(ub_reference)])]

            data2add = []
            for key in prep.keys():
                for statement in prep[key]:
                    data2add.append(statement)
                    print(statement.prop_nr, statement.value)

            if self.wdid is not None:
                wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add, server="www.wikidata.org", domain="anatomical_structure",append_value=['P279'])
            else:
                wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add, server="www.wikidata.org", domain="anatomical_structure", append_value=['P279'])
            if len(self.synonyms) >0:
                wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
            print(self.synonyms)
            for syn in self.synonyms:
                print(syn)
            wdPage.write(self.logincreds)
            print("======")
            sys.exit()
Esempio n. 6
0
 def create_reference(self):
     first_ref = PBB_Core.WDItemID(value='Q905695', prop_nr='P248', is_reference=True)
     first_ref.overwrite_references = True
     return [
         first_ref,
         PBB_Core.WDString(value=self.uniprot, prop_nr='P352', is_reference=True),
         PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z', time.gmtime()), prop_nr='P813',
                         is_reference=True),
         PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
     ]
Esempio n. 7
0
def generate_refs(ref_source_id):
    ref_list = [[]]

    if ref_source_id.startswith('C'):
        ref_list[0].extend([
            PBB_Core.WDItemID(value='Q6120337', prop_nr='P248', is_reference=True),  # stated in
            PBB_Core.WDString(value=ref_source_id, prop_nr='P592', is_reference=True),  # source element
        ])
    elif ref_source_id.startswith('N'):
        ref_list[0].extend([
            PBB_Core.WDItemID(value='Q21008030', prop_nr='P248', is_reference=True),  # stated in
            PBB_Core.WDString(value=ref_source_id, prop_nr='P2115', is_reference=True),  # source element
        ])

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        # PBB_Core.WDMonolingualText(value=source_element_name, language='en',
        #                            prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True)  # publication date
    ])

    return ref_list
Esempio n. 8
0
    def protein_item_statements():
        """
        construct list of referenced statements to past to PBB_Core Item engine
        :return:
        """
        uniprot_ref = wdo.reference_store(source='uniprot', identifier=uniprot)

        WD_String_CLAIMS = {
            'P637': str(gene_record['refseq']['protein']),
            #'P2393': gene_record['locus_tag'],
            'P352': uniprot
            #'P591': str(gene_record['EC number'])
        }
        WD_Item_CLAIMS = {
            'P703': [spec_strain.iloc[0]['wd_qid']],
            'P279': ['Q8054'],
            'P680': [],  # molecular function
            'P681': [],  # cellular component
            'P682': []  # biological process
        }
        for gt in gene_record['GOTERMS']:
            gtids = parse_go_terms(gt)
            WD_Item_CLAIMS[gtids[1]].append(gtids[0])

        statements = []
        # generate list of pbb core value objects for all valid claims
        for k, v in WD_Item_CLAIMS.items():
            if v:
                for i in v:
                    statements.append(
                        PBB_Core.WDItemID(value=i,
                                          prop_nr=k,
                                          references=[uniprot_ref]))

        for k, v in WD_String_CLAIMS.items():
            if v:
                statements.append(
                    PBB_Core.WDString(value=v,
                                      prop_nr=k,
                                      references=[uniprot_ref]))
        return statements
Esempio n. 9
0
def generate_refs(iuphar_ligand):
    ref_list = [[]]

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q2793172', prop_nr='P248',
                          is_reference=True),  # stated in
        PBB_Core.WDString(value=iuphar_ligand,
                          prop_nr='P595',
                          is_reference=True),  # source element
    ])

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407',
                          is_reference=True),  # language of work
        # PBB_Core.WDMonolingualText(value=source_element_name, language='en',
        #                            prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'),
                        prop_nr='P813',
                        is_reference=True)  # publication date
    ])

    return ref_list
Esempio n. 10
0
    def __init__(self, object):
        """
        constructor
        :param wd_do_content: Wikidata item id
        :param do_id: Identifier of the disease in Disease Ontology
        :param label: Primary label of the disease in Disease Ontology
        :param synonyms: All synonyms for the disease captured in the Disease Ontology
        :param xrefs: a dictionary with all external references of the Disease captured in the Disease Ontology
        """
        # Reference section
        doVersionURL = object[1]
        doClass = object[0]
        self.logincreds = object[3]
        self.wd_doMappings = object[2]
        self.start = object[4]

        self.wd_do_content = doClass
        PBB_Debug.prettyPrint(self.wd_do_content)
        self.do_id = self.getDoValue(self.wd_do_content,
                                     './/oboInOwl:id')[0].text

        print(self.do_id)
        self.name = self.getDoValue(self.wd_do_content,
                                    './/rdfs:label')[0].text
        print(self.name)
        classDescription = self.getDoValue(
            self.wd_do_content,
            './/oboInOwl:hasDefinition/oboInOwl:Definition/rdfs:label')
        if len(classDescription) > 0:
            self.description = classDescription[0].text

        if self.do_id in object[2].keys():
            self.wdid = "Q" + str(object[2][self.do_id])
        else:
            self.wdid = None
        if len(self.getDoValue(self.wd_do_content,
                               './/owl:deprecated')) > 0 and self.getDoValue(
                                   self.wd_do_content,
                                   './/owl:deprecated')[0].text == "true":
            self.rank = "deprecated"
        else:
            self.rank = "normal"

        self.synonyms = []
        for synonym in self.getDoValue(self.wd_do_content,
                                       './/oboInOwl:hasExactSynonym'):
            self.synonyms.append(synonym.text)

        self.subclasses = []
        for subclass in self.getDoValue(self.wd_do_content,
                                        './/rdfs:subClassOf'):
            parts = subclass.get(
                '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split(
                    "DOID_")
            if len(parts) > 1:
                self.subclasses.append("DOID:" + parts[1])
            if "DOID:4" in self.subclasses:
                self.subclasses.remove("DOID:4")

        self.xrefs = dict()
        for xref in self.getDoValue(self.wd_do_content,
                                    './/oboInOwl:hasDbXref'):
            if not xref.text.split(":")[0] in self.xrefs.keys():
                self.xrefs[xref.text.split(":")[0]] = []
            self.xrefs[xref.text.split(":")[0]].append(xref.text.split(":")[1])

        refStatedIn = PBB_Core.WDUrl(value=doVersionURL,
                                     prop_nr='P1065',
                                     is_reference=True)
        refStatedIn.overwrite_references = True
        refImported = PBB_Core.WDItemID(value=5282129,
                                        prop_nr='P248',
                                        is_reference=True)
        refImported.overwrite_references = True
        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow,
                                       prop_nr='P813',
                                       is_reference=True)
        refRetrieved.overwrite_references = True
        do_reference = [refImported, refRetrieved, refStatedIn]

        prep = dict()
        prep["P279"] = [
            PBB_Core.WDItemID(value='Q12136',
                              prop_nr='P279',
                              references=[copy.deepcopy(do_reference)],
                              rank=self.rank)
        ]
        # Subclass of disease
        for subclass in self.subclasses:
            if subclass in self.wd_doMappings.keys():
                prep["P279"].append(
                    PBB_Core.WDItemID(value=self.wd_doMappings[subclass],
                                      prop_nr='P279',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))

        if "Orphanet" in self.xrefs.keys():
            prep["P1550"] = []
            if isinstance(self.xrefs["Orphanet"], list):
                for id in self.xrefs["Orphanet"]:
                    prep["P1550"].append(
                        PBB_Core.WDString(
                            value=self.xrefs["Orphanet"],
                            prop_nr='P1550',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P1550"] = [
                    PBB_Core.WDString(value=self.xrefs["Orphanet"],
                                      prop_nr='P1550',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        #disease Ontology

        prep["P699"] = [
            PBB_Core.WDString(value=self.do_id,
                              prop_nr='P699',
                              references=[do_reference],
                              rank=self.rank)
        ]

        if "url" in self.xrefs.keys():
            if isinstance(self.xrefs["url"], list):
                for i in self.xrefs["url"]:
                    if "//en.wikipedia.org/wiki/" in i:
                        wikilink = self.i.replace("//en.wikipedia.org/wiki/",
                                                  "").replace("_", "")
                    else:
                        wikilink = None
            else:
                if "//en.wikipedia.org/wiki/" in xrefs["url"]:
                    wikilink = xrefs["url"].replace("//en.wikipedia.org/wiki/",
                                                    "").replace("_", "")
                else:
                    wikilink = None
        else:
            wikilink = None

        if "ICD10CM" in self.xrefs.keys():
            prep["P494"] = []
            if isinstance(self.xrefs["ICD10CM"], list):
                for id in self.xrefs["ICD10CM"]:
                    prep["P494"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P494',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P494"] = [
                    PBB_Core.WDString(value=self.xrefs["ICD10CM"],
                                      prop_nr='P494',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "ICD9CM" in self.xrefs.keys():
            prep["P493"] = []
            if isinstance(self.xrefs["ICD9CM"], list):
                for id in self.xrefs["ICD9CM"]:
                    prep["P493"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P493',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P493"] = [
                    PBB_Core.WDString(value=self.xrefs["ICD9CM"],
                                      prop_nr='P493',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "MSH" in self.xrefs.keys():
            prep["P486"] = []
            if isinstance(self.xrefs["MSH"], list):
                for id in self.xrefs["MSH"]:
                    prep["P486"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P486',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P486"] = [
                    PBB_Core.WDString(value=self.xrefs["MSH"],
                                      prop_nr='P486',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "NCI" in self.xrefs.keys():
            prep["P1748"] = []
            if isinstance(self.xrefs["NCI"], list):
                for id in self.xrefs["NCI"]:
                    prep["P1748"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P1748',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P1748"] = [
                    PBB_Core.WDString(value=self.xrefs["NCI"],
                                      prop_nr='P1748',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        if "OMIM" in self.xrefs.keys():
            prep["P492"] = []
            if isinstance(self.xrefs["OMIM"], list):
                for id in self.xrefs["OMIM"]:
                    prep["P492"].append(
                        PBB_Core.WDString(
                            value=id,
                            prop_nr='P492',
                            references=[copy.deepcopy(do_reference)],
                            rank=self.rank))
            else:
                prep["P492"] = [
                    PBB_Core.WDString(value=self.xrefs["OMIM"],
                                      prop_nr='P492',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank)
                ]

        print(self.wdid)
        data2add = []
        for key in prep.keys():
            for statement in prep[key]:
                data2add.append(statement)
                print(statement.prop_nr, statement.value)

        if self.wdid is not None:
            wdPage = PBB_Core.WDItemEngine(self.wdid,
                                           item_name=self.name,
                                           data=data2add,
                                           server="www.wikidata.org",
                                           domain="diseases",
                                           append_value=['P279'])
        else:
            wdPage = PBB_Core.WDItemEngine(item_name=self.name,
                                           data=data2add,
                                           server="www.wikidata.org",
                                           domain="diseases",
                                           append_value=['P279'])

        # wdPage.set_description(description='Human disease', lang='en')
        if wikilink is not None:
            wdPage.set_sitelink(site="enwiki", title=wikilink)
        if self.synonyms is not None:
            wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
        self.wd_json_representation = wdPage.get_wd_json_representation()
        PBB_Debug.prettyPrint(self.wd_json_representation)
        wdPage.write(self.logincreds)
        if not os.path.exists('./json_dumps'):
            os.makedirs('./json_dumps')
        f = open('./json_dumps/' + self.do_id.replace(":", "_") + '.json',
                 'w+')
        pprint.pprint(self.wd_json_representation, stream=f)
        f.close()

        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=self.do_id,
                    exception_type='',
                    message=f.name,
                    wd_id=self.wdid,
                    duration=time.time() - self.start))
Esempio n. 11
0
                strand = 'Q22809711'
            genes = line[2:]
            for gene in genes:
                for lgene in list_genes:
                    if gene == lgene['symbol']:
                        lgene['operon'] = {'operon': operon, 'strand': strand}
        f.close()
        return list_genes


ops = combine_resources()
#pprint.pprint(ops)
genestot = len(ops)
count = 0
reference = [
    PBB_Core.WDString(value='19448609', prop_nr='P698', is_reference=True),
    PBB_Core.WDTime(str(strftime("+%Y-%m-%dT00:00:00Z", gmtime())),
                    prop_nr='P813',
                    is_reference=True)
]
for ref in reference:
    ref.overwrite_references = True

login = PBB_login.WDLogin(sys.argv[1], sys.argv[2])
for gene in ops:
    statements = []
    if 'locus_tag' in gene.keys():
        item_name = '{}    {}'.format(gene['name'], gene['locus_tag'])

        if 'operon' in gene.keys():
            count += 1
Esempio n. 12
0
    def __init__(self, login):

        self.login_obj = login

        # wdq_results = PBB_Core.WDItemList('CLAIM[686]', '686').wditems
        # wd_go_terms = list(map(lambda z: z[2], wdq_results['props']['686']))
        # go_qid_list = list(map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['686']))

        query = '''
            SELECT distinct ?gene ?go WHERE {
                ?gene wdt:P686 ?go .
                FILTER(!REGEX(?go, "^GO:[0-9]", "i"))
            }
        '''
        qids_to_clean = set()
        for x in PBB_Core.WDItemEngine.execute_sparql_query(
                query=query)['results']['bindings']:
            qids_to_clean.add(x['gene']['value'].split('/')[-1])

        # print(len(wd_go_terms))
        # for count, go_term in enumerate(wd_go_terms):
        #     curr_qid = go_qid_list[wd_go_terms.index(go_term)]
        #
        #     # try:
        #     #     int(go_term)
        #     # except ValueError as e:
        #     qids_to_clean.add(curr_qid)

        for count, curr_qid in enumerate(qids_to_clean):
            start = time.time()
            clean_gos = []
            print(curr_qid)

            cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)
            for wd_value in cleanup_item.statements:
                if wd_value.get_prop_nr() == 'P686':
                    go_value = wd_value.get_value()

                    # int(go_value)

                    if not go_value.startswith('GO'):
                        clean_gos.append(
                            PBB_Core.WDString(value='GO:' + go_value,
                                              prop_nr='P686'))

            try:
                go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                data=clean_gos)
                # pprint.pprint(go_item.get_wd_json_representation())

                go_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type='',
                           message='success',
                           wd_id=curr_qid,
                           duration=time.time() - start))
                print(count, 'success', curr_qid, go_item.get_label(lang='en'))

            except Exception as e:
                print(count, 'error', curr_qid)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type=type(e),
                           message=e.__str__(),
                           wd_id=curr_qid,
                           duration=time.time() - start))
Esempio n. 13
0
    def __init__(self, object):
        # Populate variables with different values
        self.geneSymbols = object["geneSymbols"]
        self.logincreds = object["logincreds"]
        self.goTerms = object["goTerms"]
        self.version = object["results"]["bindings"][0]["upversion"]["value"]
        self.uniprot = object["results"]["bindings"][0]["uniprot"]["value"]
        self.uniprotId = object["id"]
        self.name = object["results"]["bindings"][0]["plabel"]["value"]
        self.start = object["start"]
        self.entrezWikidataIds = object["entrezWikidataIds"]

        up_in_wd = search_wd(self.name)
        self.wdid = None
        hits = []
        for result in up_in_wd["search"]:
            if result["match"]["text"] == up_in_wd["searchinfo"]["search"]:
                hits.append(result)
                print(result["match"]["text"])
        if len(hits) > 0:
            valid = []
            for hit in hits:
                hitPage = PBB_Core.WDItemEngine(item_name=hit["label"],
                                                wd_item_id=hit["id"],
                                                data=[],
                                                server="www.wikidata.org",
                                                domain="proteins")
                json_rep = hitPage.get_wd_json_representation()
                proteinClaim = False
                geneClaim = False
                speciesClaim = False
                if "P279" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P279"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8054:
                            proteinClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 7187:
                            geneClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 407355:
                            proteinClaim = True
                            break
                if "P31" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P31"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8047:
                            proteinClaim = True
                            break
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 8054:
                            proteinClaim = True
                            break
                if "P703" in json_rep["claims"].keys():
                    for it in json_rep["claims"]["P703"]:
                        if it["mainsnak"]["datavalue"]["value"][
                                "numeric-id"] == 5:
                            speciesClaim = True
                            break

                if len(json_rep["claims"]) == 0:
                    raise Exception(hit["id"] +
                                    " has an indentical label as " +
                                    self.uniprotId + ", but with no claims")
                elif ("P352" in json_rep["claims"].keys()
                      or "P705" in json_rep["claims"].keys() or proteinClaim):
                    valid.append(hit["id"])
                elif geneClaim:
                    self.wdid = None
                else:
                    raise Exception(hit["id"] + " has an identical label as " +
                                    self.uniprotId +
                                    " but with no valid protein claims")
            if len(valid) == 1:
                self.wdid = valid[0]
            elif len(valid) > 1:
                raise Exception(
                    self.uniprotId +
                    " There are multiple valid Wikidata items that might be applicable. "
                    + str(valid))

        if "gene_id" in object["results"]["bindings"][0].keys():
            self.gene_id = []
            for geneId in object["results"]["bindings"][0]["gene_id"][
                    "value"].split(";"):
                if geneId != "":
                    self.gene_id.append(geneId)

        if "ecName" in object["results"]["bindings"][0].keys():
            self.ecname = []
            self.ecname.append(
                object["results"]["bindings"][0]["ecName"]["value"])
        self.alias = []
        for syn in object["results"]["bindings"][0]["upalias"]["value"].split(
                ";"):
            if syn != "":
                self.alias.append(syn)
        if "pdbid" in object["results"]["bindings"][0].keys(
        ) and object["results"]["bindings"][0]["pdbid"]["value"] != "":
            self.pdb = []
            for pdbId in object["results"]["bindings"][0]["pdbid"][
                    "value"].split(";"):
                self.pdb.append(
                    pdbId.replace("http://rdf.wwpdb.org/pdb/",
                                  "").replace(" ", ""))
        if "refseqid" in object["results"]["bindings"][0].keys():
            self.refseq = []
            for refseqId in object["results"]["bindings"][0]["refseqid"][
                    "value"].split(";"):
                self.refseq.append(
                    refseqId.replace("http://purl.uniprot.org/refseq/",
                                     "").replace(" ", ""))
        if "ensemblp" in object["results"]["bindings"][0].keys(
        ) and object["results"]["bindings"][0]["ensemblp"]["value"] != "":
            self.ensemblp = []
            for ensP in object["results"]["bindings"][0]["ensemblp"][
                    "value"].split(";"):
                self.ensemblp.append(
                    ensP.replace("http://purl.uniprot.org/ensembl/",
                                 "").replace(" ", ""))

        # Prepare references
        refStatedIn = PBB_Core.WDItemID(value=2629752,
                                        prop_nr='P248',
                                        is_reference=True)
        refStatedIn.overwrite_references = True
        refURL = "http://www.uniprot.org/uniprot/" + self.uniprotId + ".txt?version=" + str(
            self.version)
        refReferenceURL = PBB_Core.WDUrl(value=refURL,
                                         prop_nr='P854',
                                         is_reference=True)
        refReferenceURL.overwrite_references = True
        refImported = PBB_Core.WDItemID(value=905695,
                                        prop_nr='P143',
                                        is_reference=True)
        refImported.overwrite_references = True
        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
        refRetrieved = PBB_Core.WDTime(timeStringNow,
                                       prop_nr='P813',
                                       is_reference=True)
        refRetrieved.overwrite_references = True
        protein_reference = [[
            refStatedIn, refImported, refRetrieved, refReferenceURL
        ]]

        references = dict()
        proteinPrep = dict()
        genePrep = dict()

        # P279 = subclass of
        proteinPrep['P279'] = [
            PBB_Core.WDItemID(value="Q8054",
                              prop_nr='P279',
                              references=protein_reference)
        ]

        # P703 = found in taxon
        proteinPrep['P703'] = [
            PBB_Core.WDItemID(value="Q5",
                              prop_nr='P703',
                              references=protein_reference)
        ]

        # P352 = UniprotID
        proteinPrep['P352'] = [
            PBB_Core.WDString(value=self.uniprotId,
                              prop_nr='P352',
                              references=protein_reference)
        ]

        # P591 = ec number
        if "ecname" in vars(self):
            proteinPrep['P591'] = []
            for i in range(len(self.ecname)):
                proteinPrep['P591'].append(
                    PBB_Core.WDString(value=self.ecname[i],
                                      prop_nr='P591',
                                      references=protein_reference))

        # P638 = PDBID
        if "pdb" in vars(self) and len(self.pdb) > 0:
            proteinPrep['P638'] = []
            for i in range(len(self.pdb)):
                proteinPrep['P638'].append(
                    PBB_Core.WDString(value=self.pdb[i],
                                      prop_nr='P638',
                                      references=protein_reference))

        # P637 = Refseq Protein ID
        if "refseq" in vars(self) and len(self.refseq) > 0:
            proteinPrep['P637'] = []
            for i in range(len(self.refseq)):
                proteinPrep['P637'].append(
                    PBB_Core.WDString(value=self.refseq[i],
                                      prop_nr='P637',
                                      references=protein_reference))

        # P705 = Ensembl Protein ID
        if "ensemblp" in vars(self) and len(self.ensemblp) > 0:
            proteinPrep['P705'] = []
            for i in range(len(self.ensemblp)):
                proteinPrep['P705'].append(
                    PBB_Core.WDString(value=self.ensemblp[i],
                                      prop_nr='P705',
                                      references=protein_reference))
        """
        # P686 = Gene Ontology ID
        proteinPrep["P680"] = []
        proteinPrep["P681"] = []
        proteinPrep["P682"] = []

        for result in self.goTerms["results"]["bindings"]:

            statement = [
                    PBB_Core.WDString(value=result["go"]["value"].replace("http://purl.obolibrary.org/obo/GO_", "GO:"),
                                      prop_nr='P686', references=protein_reference)]
            goWdPage = PBB_Core.WDItemEngine(item_name=result["goLabel"]["value"], data=statement,
                                                 server="www.wikidata.org", domain="proteins")
            if goWdPage.get_description() == "":
                goWdPage.set_description("Gene Ontology term")
            js = goWdPage.get_wd_json_representation()
            goWdId = goWdPage.write(self.logincreds)

            if result["parentLabel"]["value"] == "molecular_function":
                exists = False
                for i in range(len(proteinPrep["P680"])):
                    if proteinPrep["P680"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P680"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P680', references=protein_reference))
            if result["parentLabel"]["value"] == "cellular_component":
                exists = False
                for i in range(len(proteinPrep["P681"])):
                    if proteinPrep["P681"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P681"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P681', references=protein_reference))
            if result["parentLabel"]["value"] == "biological_process":
                exists = False
                for i in range(len(proteinPrep["P682"])):
                    if proteinPrep["P682"][i].value == goWdId:
                        exists = True
                if not exists:
                    proteinPrep["P682"].append(
                        PBB_Core.WDItemID(value=goWdId, prop_nr='P682', references=protein_reference))
        """

        # P702 = Encoded by
        if "gene_id" in vars(self) and len(self.gene_id) > 0:
            proteinPrep['P702'] = []
            proteinPrep['P702'].append(
                PBB_Core.WDItemID(
                    value=self.entrezWikidataIds[self.gene_id[0].replace(
                        "http://purl.uniprot.org/geneid/",
                        "").replace(" ", "")],
                    prop_nr='P702',
                    references=protein_reference))

        proteinData2Add = []
        for key in proteinPrep.keys():
            for statement in proteinPrep[key]:
                proteinData2Add.append(statement)
                print(statement.prop_nr, statement.value)
        if self.wdid is None:
            wdProteinpage = PBB_Core.WDItemEngine(item_name=self.name,
                                                  data=proteinData2Add,
                                                  server="www.wikidata.org",
                                                  domain="proteins",
                                                  append_value=['P279'])
        else:
            wdProteinpage = PBB_Core.WDItemEngine(wd_item_id=self.wdid,
                                                  item_name=self.name,
                                                  data=proteinData2Add,
                                                  server="www.wikidata.org",
                                                  domain="proteins",
                                                  append_value=['P279'])

        if len(self.alias) > 0:
            wdProteinpage.set_aliases(aliases=self.alias,
                                      lang='en',
                                      append=True)
        if wdProteinpage.get_description() == "":
            wdProteinpage.set_description(description='human protein',
                                          lang='en')
        if wdProteinpage.get_description(lang="de") == "":
            wdProteinpage.set_description(description='humanes Protein',
                                          lang='de')
        if wdProteinpage.get_description(lang="nl") == "":
            wdProteinpage.set_description(description='menselijk eiwit',
                                          lang='nl')
        if wdProteinpage.get_description(
                lang="fr") == "" or wdProteinpage.get_description(
                    lang="fr") == "protéine":
            wdProteinpage.set_description(description='protéine humaine',
                                          lang='fr')

        self.wd_json_representation = wdProteinpage.get_wd_json_representation(
        )
        PBB_Debug.prettyPrint(self.wd_json_representation)
        wdProteinpage.write(self.logincreds)
        print(wdProteinpage.wd_item_id)
        if not os.path.exists('./json_dumps'):
            os.makedirs('./json_dumps')
        f = open('./json_dumps/' + self.uniprotId + '.json', 'w+')
        pprint.pprint(self.wd_json_representation, stream=f)
        f.close()
        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
            .format(main_data_id=self.uniprotId,
                    exception_type='',
                    message=f.name,
                    wd_id=self.wdid,
                    duration=time.time() - self.start))
        print("===============")
Esempio n. 14
0
    def __init__(self, login, prop_nr, prefix_str, separator=':'):
        """
        A class to take care of fixing certain identifer prefixes
        :param login: The Wikidata login object instance of PBB_login.WDLogin()
        :param prop_nr: the property number of the identifier the prefix should be fixed for
        :param prefix_str: the prefix string. e.g. 'GO', 'DOID'
        :param separator: the separator character between prefix and string
        """

        self.login_obj = login

        query = '''
            SELECT distinct ?s ?id WHERE {{
                ?s wdt:{0} ?id .
                FILTER(!REGEX(?id, "^{1}{2}[0-9]", "i"))
            }}
        '''.format(prop_nr, prefix_str, separator)

        qids_to_clean = set()
        for x in PBB_Core.WDItemEngine.execute_sparql_query(
                query=query)['results']['bindings']:
            qids_to_clean.add(x['s']['value'].split('/')[-1])

        print('Cleaning up', len(qids_to_clean), 'items.')

        for count, curr_qid in enumerate(qids_to_clean):
            start = time.time()
            clean_gos = []
            print(curr_qid)

            cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)
            for wd_value in cleanup_item.statements:
                if wd_value.get_prop_nr() == prop_nr:
                    go_value = wd_value.get_value()

                    if not go_value.startswith(prefix_str):
                        clean_gos.append(
                            PBB_Core.WDString(value=prefix_str + separator +
                                              go_value,
                                              prop_nr=prop_nr))

            try:
                go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                data=clean_gos)
                go_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type='',
                           message='success',
                           wd_id=curr_qid,
                           duration=time.time() - start))
                print(count, 'success', curr_qid, go_item.get_label(lang='en'))

            except Exception as e:
                print(count, 'error', curr_qid)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '"{exception_type}", "{message}", {wd_id}, {duration}'.
                    format(exception_type=type(e),
                           message=e.__str__(),
                           wd_id=curr_qid,
                           duration=time.time() - start))
Esempio n. 15
0
    def cleanup_obsolete_edges(ontology_id,
                               core_property_nr,
                               login,
                               current_node_qids=(),
                               obsolete_term=False):
        filter_props_string = ''
        if not obsolete_term:
            for x in OBOImporter.obo_wd_map.values():
                prop_nr = list(x.keys())[0]
                filter_props_string += 'Filter (?p = wdt:{})\n'.format(prop_nr)

        query = '''
        SELECT DISTINCT ?qid ?p ?onto_qid WHERE {{
            {{
                SELECT DISTINCT ?onto_qid WHERE {{
                    ?onto_qid wdt:{2} '{0}' .
                }}
            }}
            ?qid ?p [wdt:{2} '{0}'].
            {1}
        }}
        ORDER BY ?qid
        '''.format(ontology_id, filter_props_string, core_property_nr)
        print(query)

        sr = PBB_Core.WDItemEngine.execute_sparql_query(query=query)

        for occurrence in sr['results']['bindings']:
            if 'statement' in occurrence['qid']['value']:
                continue

            start = time.time()

            qid = occurrence['qid']['value'].split('/')[-1]
            if qid in current_node_qids:
                continue

            prop_nr = occurrence['p']['value'].split('/')[-1]
            wd_onto_qid = occurrence['onto_qid']['value'].split('/')[-1]
            wd_item_id = PBB_Core.WDItemID(value=wd_onto_qid, prop_nr=prop_nr)
            setattr(wd_item_id, 'remove', '')
            try:
                wd_item = PBB_Core.WDItemEngine(wd_item_id=qid,
                                                data=[wd_item_id])
                wd_item.write(login=login)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type='',
                            message='successfully removed obsolete edges',
                            wd_id=qid,
                            duration=time.time() - start))
            except Exception as e:
                print(e)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id=qid,
                            duration=time.time() - start))

        if obsolete_term:
            data = [
                PBB_Core.WDString(value=ontology_id,
                                  prop_nr=core_property_nr,
                                  rank='deprecated'),
            ]

            start = time.time()
            try:
                wd_item = PBB_Core.WDItemEngine(item_name='obo',
                                                domain='obo',
                                                data=data,
                                                use_sparql=True)
                if wd_item.create_new_item:
                    return
                qid = wd_item.write(login=login)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type='',
                            message='successfully obsoleted the ',
                            wd_id=qid,
                            duration=time.time() - start))
            except Exception as e:
                print(e)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}'.format(ontology_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
Esempio n. 16
0
        def get_item_qid(go_id, data=()):
            start = time.time()

            # for efficiency reasons, skip if item already had a root write performed
            if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                    and 'qid' in self.local_qid_onto_map[go_id]:
                return self.local_qid_onto_map[go_id]['qid']

            try:
                data = list(data)

                r = requests.get(url=self.base_url +
                                 '{}_{}'.format(self.ontology, go_id),
                                 headers=self.headers)
                go_term_data = r.json()
                label = go_term_data['label']
                description = go_term_data['description'][0]

                if go_term_data['is_obsolete']:
                    OBOImporter.cleanup_obsolete_edges(
                        ontology_id='{}:{}'.format(self.ontology, go_id),
                        login=self.login_obj,
                        core_property_nr=self.core_property_nr,
                        obsolete_term=True)
                    return None

                # get parent ontology term info so item can be populated with description, etc.
                data.append(
                    PBB_Core.WDString(value='GO:{}'.format(go_id),
                                      prop_nr=self.core_property_nr,
                                      references=[self.create_reference()]))
                print(data)
                if go_id in self.local_qid_onto_map:
                    wd_item = PBB_Core.WDItemEngine(
                        wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                        domain='obo',
                        data=data,
                        use_sparql=True)
                else:
                    wd_item = PBB_Core.WDItemEngine(item_name='test',
                                                    domain='obo',
                                                    data=data,
                                                    use_sparql=True)
                wd_item.set_label(label=label)
                if len(description) <= 250:
                    wd_item.set_description(description=description)
                else:
                    wd_item.set_description(description='Gene Ontology term')
                if go_term_data['synonyms'] is not None and len(
                        go_term_data['synonyms']) > 0:
                    aliases = []
                    for alias in go_term_data['synonyms']:
                        if len(alias) <= 250:
                            aliases.append(alias)

                    wd_item.set_aliases(aliases=aliases)

                new_msg = ''
                if wd_item.create_new_item:
                    new_msg = ': created new GO term'

                qid = wd_item.write(login=self.login_obj)

                if go_id not in self.local_qid_onto_map:
                    self.local_qid_onto_map[go_id] = {
                        'qid': qid,
                        'had_root_write': False,
                    }

                if go_id == current_root_id:
                    self.local_qid_onto_map[go_id]['had_root_write'] = True
                    self.local_qid_onto_map[go_id]['parents'] = list(parents)
                    self.local_qid_onto_map[go_id]['children'] = list(children)

                current_node_qids.append(qid)
                print('QID created or retrieved', qid)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type='',
                            message='success{}'.format(new_msg),
                            wd_id=qid,
                            duration=time.time() - start))
                return qid

            except Exception as e:
                print(e)

                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
                return None
Esempio n. 17
0
        def get_item_qid(go_id, data=()):
            start = time.time()

            if self.use_prefix:
                id_string = '{}:{}'.format(self.ontology, go_id)
            else:
                id_string = go_id

            # for efficiency reasons, skip if item already had a root write performed
            if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \
                    and 'qid' in self.local_qid_onto_map[go_id]:
                return self.local_qid_onto_map[go_id]['qid'], False, False

            try:
                data = list(data)

                r = OBOImporter.ols_session.get(
                    url=self.base_url + '{}_{}'.format(self.ontology, go_id),
                    headers=self.headers)
                go_term_data = r.json()
                label = go_term_data['label'].replace('_', ' ')

                description = go_term_data['description'][0]

                if go_term_data['is_obsolete']:
                    OBOImporter.cleanup_obsolete_edges(
                        ontology_id=id_string,
                        login=self.login_obj,
                        core_property_nr=self.core_property_nr,
                        obsolete_term=True)
                    return None, None, None

                # get parent ontology term info so item can be populated with description, etc.
                data.append(
                    PBB_Core.WDString(value=id_string,
                                      prop_nr=self.core_property_nr,
                                      references=[self.create_reference()]))

                exact_match_string = 'http://purl.obolibrary.org/obo/{}_{}'.format(
                    self.ontology, go_id)
                data.append(
                    PBB_Core.WDUrl(value=exact_match_string, prop_nr='P2888'))

                # add xrefs
                if go_term_data['obo_xref']:
                    for xref in go_term_data['obo_xref']:
                        if xref['database'] in OBOImporter.xref_props:
                            wd_prop = OBOImporter.xref_props[xref['database']]
                        else:
                            continue
                        xref_value = xref['id']
                        data.append(
                            PBB_Core.WDExternalID(
                                value=xref_value,
                                prop_nr=wd_prop,
                                references=[self.create_reference()]))

                if go_term_data['obo_synonym']:
                    for syn in go_term_data['obo_synonym']:
                        if syn['type'] in OBOImporter.obo_synonyms:
                            wd_prop = OBOImporter.obo_synonyms[syn['type']]
                        else:
                            continue
                        syn_value = syn['name']
                        data.append(
                            PBB_Core.WDExternalID(
                                value=syn_value,
                                prop_nr=wd_prop,
                                references=[self.create_reference()]))

                if go_id in self.local_qid_onto_map:
                    wd_item = PBB_Core.WDItemEngine(
                        wd_item_id=self.local_qid_onto_map[go_id]['qid'],
                        domain='obo',
                        data=data,
                        fast_run=self.fast_run,
                        fast_run_base_filter=self.fast_run_base_filter)
                else:
                    wd_item = PBB_Core.WDItemEngine(
                        item_name='test',
                        domain='obo',
                        data=data,
                        fast_run=self.fast_run,
                        fast_run_base_filter=self.fast_run_base_filter)
                wd_item.set_label(label=label)
                wd_item.set_description(description=description[0:250])
                # if len(description) <= 250:
                #     wd_item.set_description(description=description)
                # else:
                #     wd_item.set_description(description='Gene Ontology term')
                if go_term_data['synonyms'] is not None and len(
                        go_term_data['synonyms']) > 0:
                    aliases = []
                    for alias in go_term_data['synonyms']:
                        if len(alias) <= 250:
                            aliases.append(alias)

                    wd_item.set_aliases(aliases=aliases)

                new_msg = ''
                if wd_item.create_new_item:
                    new_msg = ': created new {} term'.format(self.ontology)

                qid = wd_item.write(login=self.login_obj)

                if go_id not in self.local_qid_onto_map:
                    self.local_qid_onto_map[go_id] = {
                        'qid': qid,
                        'had_root_write': False,
                    }

                if go_id == current_root_id:
                    self.local_qid_onto_map[go_id]['had_root_write'] = True
                    self.local_qid_onto_map[go_id]['parents'] = list(parents)
                    self.local_qid_onto_map[go_id]['children'] = list(children)

                current_node_qids.append(qid)
                print('QID created or retrieved', qid)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type='',
                            message='success{}'.format(new_msg),
                            wd_id=qid,
                            duration=time.time() - start))
                return qid, go_term_data['obo_xref'], wd_item.require_write

            except Exception as e:
                print(e)
                # traceback.print_exc(e)

                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id='{}:{}'.format(self.ontology, go_id),
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))
                return None, None, None
Esempio n. 18
0
    def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress, fast_run=True):
        self.uniprot = uniprot
        self.uniprot_qid = base_map[uniprot]['qid']
        self.ensp = set()
        self.ncbip = set()
        self.go_terms = set()
        self.login = login
        self.go_prop_map = go_prop_map
        self.entrez = base_map[uniprot]['entrez']['id']
        self.entrez_quid = base_map[uniprot]['entrez']['qid']
        self.res_id = base_map[uniprot]['entrez']['res_id']

        self.label = ''
        self.description = ''
        self.aliases = set()
        self.tax_id = ''
        self.annotation_type = ''

        self.statements = []

        self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}

        start = time.time()

        if not os.path.exists('./data/uniprot_raw'):
            os.makedirs('./data/uniprot_raw')

        # check if Uniprot xml exists and its age?
        r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot))

        f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w')
        f.write(r.text)
        f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r')

        # check if XML can be properly parsed, log obsolete items for permanent removal.
        try:
            for event, e in Et.iterparse(f, events=('start', 'end')):

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                    if 'dataset' in e.attrib:
                        self.annotation_type = e.attrib['dataset']

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                    tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/'
                                 '{http://uniprot.org/uniprot}fullName')
                    if tmp is not None:
                        self.label = tmp.text
                    elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                                '{http://uniprot.org/uniprot}fullName') is not None:
                        self.label = e.find('./{http://uniprot.org/uniprot}submittedName/'
                                            '{http://uniprot.org/uniprot}fullName').text

                    for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'):
                        self.aliases.add(prop.text)

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                    for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'):
                        if prop.attrib['type'] == 'NCBI Taxonomy':
                            self.tax_id = prop.attrib['id']

                # print(e)
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':

                    for prop in e.findall('./{http://uniprot.org/uniprot}property'):
                        if prop.attrib['type'] == 'protein sequence ID':
                            self.ncbip.add(prop.attrib['value'])
                            self.statements.append(PBB_Core.WDString(value=prop.attrib['value'], prop_nr='P705',
                                                                     references=[self.create_reference()]))

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                    self.ncbip.add(e.attrib['id'])
                    self.statements.append(PBB_Core.WDString(value=e.attrib['id'], prop_nr='P637',
                                                             references=[self.create_reference()]))

                # get alternative identifiers for gene to protein mapping
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                    res_id = e.attrib['id']
                    if res_id in res_id_to_entrez_qid:
                        self.entrez_quid = res_id_to_entrez_qid[res_id][0]

        except Et.ParseError as e:
            print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'.format(self.uniprot,
                                                                                                self.uniprot_qid))
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
            return

        # get GO annotations from QuickGO
        params = {
            'format': 'tsv',
            'limit': '1000',
            'protein': self.uniprot
        }

        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'

        try:
            itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n'))
            next(itrt)  # skip header line

            for line in itrt:
                cols = line.split('\t')
                go_id = cols[6]
                evidence_code = cols[9]
                go_aspect = cols[11][0]

                if self.uniprot not in pdb_to_go:
                    pdb_to_go[self.uniprot] = {
                        'go_terms': list(),
                        'evidence': list(),
                        'pdb': set()
                    }

                pdb_to_go[self.uniprot]['go_terms'].append(go_id)
                pdb_to_go[self.uniprot]['evidence'].append(evidence_code)

                if go_id in go_prop_map:
                    go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect)
        except requests.HTTPError as e:
            print(e.__str__())
            print('Quick GO service not available, exiting!')
            sys.exit(1)
        except IndexError:
            print(e.__str__())
            print('Quick GO data error, service likely not available, exiting!')
            sys.exit(1)

        # set description according to the annotation the Uniprot entry is coming from
        self.description = self.descr_map[self.tax_id]['en']

        if self.annotation_type == 'TrEMBL':
            self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot)
        elif self.annotation_type == 'Swiss-Prot':
            self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot)

        # assign a GO term a GO subontology/OBO namespace
        if self.uniprot in pdb_to_go:
            for go in set(pdb_to_go[self.uniprot]['go_terms']):
                # check if a GO term is not yet in Wikidata
                # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
                if go not in go_prop_map:
                    PBB_Core.WDItemEngine.log(
                        'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                            .format(
                                main_data_id='{}'.format(self.uniprot),
                                exception_type='GO term not in Wikidata exception',
                                message='GO term {} not found in Wikidata, skipping this one'.format(go),
                                wd_id=self.uniprot_qid,
                                duration=time.time() - start
                            ))
                    print('GO term {} not found in Wikidata, skipping this one'.format(go))
                    continue

                # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class
                if not go_prop_map[go]['go_class_prop']:
                    go_class_prop = ProteinBot.get_go_class(go)
                    if not go_class_prop:
                        continue

                    go_prop_map[go]['go_class_prop'] = go_class_prop
                    print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go))

                # create a set of WD QIDs representing GO evidence code items in WD
                evidence = list()
                for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence']):
                    if pdb_to_go[self.uniprot]['go_terms'][count] == go and self.go_evidence_codes[ev] not in evidence:
                        evidence.append(self.go_evidence_codes[ev])

                # iterate though the evidence code set and create a new qualifier for each one
                qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True) for ev in evidence if ev]

                # Create Wikidata GO term value
                prop_nr = self.go_prop_map[go]['go_class_prop']
                qid = self.go_prop_map[go]['qid']
                self.statements.append(PBB_Core.WDItemID(value=qid, prop_nr=prop_nr, qualifiers=qualifiers,
                                                         references=[self.create_reference()]))

            for pdb in pdb_to_go[self.uniprot]['pdb']:
                self.statements.append(PBB_Core.WDString(value=pdb.upper(), prop_nr='P638',
                                                         references=[self.create_reference()]))

        self.statements.append(PBB_Core.WDItemID(value='Q8054', prop_nr='P279', references=[self.create_reference()]))

        if self.entrez_quid != '':
            self.statements.append(PBB_Core.WDItemID(value=self.entrez_quid, prop_nr='P702',
                                                     references=[self.create_reference()]))

        current_taxonomy_id = self.taxon_map[self.tax_id]
        self.statements.append(PBB_Core.WDItemID(value=current_taxonomy_id, prop_nr='P703',
                                                 references=[self.create_reference()]))
        self.statements.append(PBB_Core.WDString(value=self.uniprot, prop_nr='P352',
                                                 references=[self.create_reference()]))

        # remove all Wikidata properties where no data has been provided, but are handled by the bot
        all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
        for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P638', 'P692', 'P702']:
            if pr not in all_stmnt_props:
                self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

        try:
            taxon_qid = self.taxon_map[self.tax_id]
            new_msg = ''
            if self.uniprot_qid:
                wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid, domain='proteins', data=self.statements,
                                                append_value=['P279'], fast_run=fast_run,
                                                fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q8054'})
            else:
                wd_item = PBB_Core.WDItemEngine(item_name=self.label, domain='proteins', data=self.statements)
                new_msg = 'new protein created'

            wd_item.set_label(self.label)
            wd_item.set_description(self.description)
            wd_item.set_aliases(aliases=self.aliases, append=False)

            self.uniprot_qid = wd_item.write(self.login)

            if self.entrez_quid != '':
                encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688',
                                            references=[self.create_reference()])
                gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid, data=[encodes], append_value=['P688'],
                                                  fast_run=fast_run,
                                                  fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q7187'})
                gene_item.write(login)

            progress[self.uniprot] = self.uniprot_qid

            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type='',
                        message='success{}'.format(new_msg),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))

            # pprint.pprint(wd_item.get_wd_json_representation())
        except Exception as e:
            print(e)

            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start
                    ))
            traceback.print_exc()

        print(self.label)
        print(self.aliases)
        print(self.tax_id)
Esempio n. 19
0
    def __init__(self, login):

        self.login_obj = login

        image_data = pd.read_csv(
            './image_data/gene_wiki_images_with_preferred.txt',
            encoding='utf-8',
            sep='\t',
            dtype={'entrez': np.str})

        wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]',
                                          '351').wditems
        wd_entrez_ids = list(map(lambda z: z[2], wdq_results['props']['351']))
        entrez_qid_list = list(
            map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['351']))

        print(len(wd_entrez_ids))

        for index in image_data.index:
            start = time.time()
            # print(image_data.loc[index, 'other_images'])
            image_names = image_data.loc[index, 'other_images']

            preferred_image = image_data.loc[index, 'primary_image']

            image_file_extension = ['.png', '.jpg', '.jpeg', '.pdf']
            if pd.notnull(preferred_image) and '|' in preferred_image:
                for splt in preferred_image.split('|'):
                    for ending in image_file_extension:
                        if ending in splt:
                            preferred_image = splt
                            break

            entrez = image_data.loc[index, 'entrez']
            # print(entrez)

            protein_images = []
            protein_image_value_store = []
            genex_images = []
            genex_value_store = []

            if entrez not in wd_entrez_ids:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='Entrez ID not yet in Wikidata!!',
                            wd_id='',
                            duration=time.time() - start))
                continue
            else:
                curr_qid = entrez_qid_list[wd_entrez_ids.index(entrez)]

            if pd.isnull(image_names):
                PBB_Core.WDItemEngine.log(
                    'WARNING',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='No images available for this Entrez ID',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                continue

            for sub_string in image_names.split('|'):
                if 'PBB GE ' in sub_string:
                    value = sub_string[5:]

                    # if value[-6:-4] == 'tn':
                    #     value = value[:-6] + 'fs' + value[-4:]

                    # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956

                    genex_images.append(value)
                    genex_value_store.append(
                        PBB_Core.WDCommonsMedia(value=value, prop_nr='P692'))
                elif 'PDB ' in sub_string:
                    value = sub_string[5:]
                    protein_images.append(value)

                    protein_image_value_store.append(
                        PBB_Core.WDCommonsMedia(value, prop_nr=''))

            entrez_id_value = PBB_Core.WDString(value=entrez, prop_nr='P351')

            data = [entrez_id_value]
            data.extend(genex_value_store)

            if pd.notnull(preferred_image):
                data.append(
                    PBB_Core.WDCommonsMedia(value=preferred_image,
                                            prop_nr='P18'))

            try:
                gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                  domain='genes',
                                                  data=data)
                # pprint.pprint(gene_item.get_wd_json_representation())

                gene_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='success',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                print(index, 'success', curr_qid, entrez,
                      gene_item.get_label(lang='en'))

            except Exception as e:
                print(index, 'error', curr_qid, entrez)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id=curr_qid,
                            duration=time.time() - start))
Esempio n. 20
0
    def __init__(self, user, pwd):
        properties = [
            'P279', 'P769', 'P31', 'P636', 'P267', 'P231', 'P486', 'P672',
            'P662', 'P661', 'P652', 'P665', 'P683', 'P274', 'P715', 'P646',
            'P592', 'P233', 'P234', 'P235', 'P18', 'P373', 'P2275', 'P657',
            'P595', 'P2115'
        ]
        # these property names do not match those in Wikidata!!
        property_names = [
            'subclass of', 'significant drug interaction', 'instance of',
            'route of administration', 'ATC code', 'CAS number', 'MeSH ID',
            'MeSH Code', 'PubChem ID (CID)', 'ChemSpider', 'UNII', 'KEGG Drug',
            'ChEBI', 'Molecular Formula', 'Drugbank ID', 'Freebase identifier',
            'ChEMBL', 'SMILES', 'InChI', 'InChIKey', 'image',
            'Commons category', 'WHO INN', 'RTECS Number',
            'Guide to Pharmacology', 'NDF-RT NUI'
        ]

        prop_to_name = dict(zip(properties, property_names))
        name_to_prop = dict(zip(property_names, properties))

        login_obj = WDLogin(user=user, pwd=pwd, server='www.wikidata.org')

        drug_data = pd.read_csv('./drugbank_data/drugbank.csv',
                                index_col=0,
                                engine='c',
                                encoding='utf-8',
                                dtype={
                                    'PubChem ID (CID)': np.str,
                                    'ChEBI': np.str,
                                    'ChEMBL': np.str,
                                    'ChemSpider': np.str,
                                    'Guide to Pharmacology': np.str
                                })

        # extract creation date of Drugbank file from Drugbank zip file
        drugbank_zip = zipfile.ZipFile('./drugbank_data/drugbank.xml.zip')
        self.drugbank_date = datetime.datetime(
            *[x for x in drugbank_zip.infolist()[0].date_time]).strftime(
                '+%Y-%m-%dT00:00:00Z')

        print(drug_data.dtypes)

        base_ref = {'ref_properties': ['P248'], 'ref_values': ['Q1122544']}

        # remove potential 'InChI=' and 'InChIKey=' prefixes
        for i in drug_data['InChI'].index:
            if pd.notnull(drug_data['InChI'].at[i]):
                if 'InChI=' in drug_data['InChI'].at[i]:
                    drug_data['InChI'].at[i] = drug_data['InChI'].at[i][6:]
                if 'InChIKey=' in drug_data['InChIKey'].at[i]:
                    drug_data['InChIKey'].at[i] = drug_data['InChIKey'].at[i][
                        9:]

        # remove DB prefix from Drugbank ID (should be corrected in the Wikidata property)
        for i in drug_data['Drugbank ID'].index:
            if pd.notnull(drug_data['Drugbank ID'].at[i]):
                drug_data['Drugbank ID'].at[i] = drug_data['Drugbank ID'].at[
                    i][2:]

        # Iterate though all drugbank compounds and add those to Wikidata which are either FDA-approved or have been
        # withdrawn from the market. Add all non-missing values for each drug to Wikidata.
        for count in drug_data.index:
            print('Count is:', count)

            if drug_data.loc[count, 'Status'] == 'approved' or drug_data.loc[
                    count, 'Status'] == 'withdrawn':
                data = []
                special_cases = ['WHO INN', 'ATC code']
                for col in drug_data.columns.values:
                    data_value = drug_data.loc[count, col]

                    # no values and values greater than 400 chars should not be added to wikidata.
                    if pd.isnull(data_value) or col not in name_to_prop:
                        continue
                    elif len(data_value) > 400:
                        continue

                    if col in property_names and col not in special_cases:
                        data.append(
                            PBB_Core.WDString(value=str(data_value).strip(),
                                              prop_nr=name_to_prop[col]))

                # add instances of (P31) of chemical compound (Q11173), pharmaceutical drug (Q12140),
                # Biologic medical product (Q679692) and  monoclonal antibodies (Q422248)
                data.append(PBB_Core.WDItemID(value='Q11173', prop_nr='P31'))
                data.append(PBB_Core.WDItemID(value='Q12140', prop_nr='P31'))

                if drug_data.loc[count, 'Drug type'] == 'biotech':
                    data.append(
                        PBB_Core.WDItemID(value='Q679692', prop_nr='P31'))

                if drug_data.loc[count, 'Name'][-3:] == 'mab':
                    data.append(
                        PBB_Core.WDItemID(value='Q422248', prop_nr='P31'))

                # for instance of, do not overwrite what other users have put there
                append_value = ['P31', 'P2275']

                # Monolingual value WHO INN requires special treatment
                if pd.notnull(drug_data.loc[count, 'WHO INN']):
                    data.append(
                        PBB_Core.WDMonolingualText(
                            value=drug_data.loc[count, 'WHO INN'],
                            prop_nr='P2275',
                            language='en'))

                # split the ATC code values present as one string in the csv file
                if pd.notnull(drug_data.loc[count, 'ATC code']):
                    for atc in drug_data.loc[count, 'ATC code'].split(';'):
                        data.append(
                            PBB_Core.WDString(value=atc, prop_nr='P267'))

                drugbank_source = [
                    'instance of', 'ATC code', 'CAS number', 'Drugbank ID',
                    'Molecular Formula', 'InChI', 'InChIKey'
                ]
                chembl_source = [
                    'ChEMBL', 'ChemSpider', 'KEGG Drug', 'ChEBI', 'SMILES',
                    'WHO INN', 'Guide to Pharmacology'
                ]
                pubchem_source = ['MeSH ID', 'PubChem ID (CID)']
                ndfrt_source = ['NDF-RT NUI', 'UNII']

                for i in data:
                    if i.get_prop_nr() in [
                            name_to_prop[x] for x in chembl_source
                    ]:
                        # if no ChEMBL ID exists, data is from Drugbank, therefore add Drugbank as ref
                        if pd.isnull(drug_data.loc[count, 'ChEMBL']):
                            drugbank_source.append(
                                prop_to_name[i.get_prop_nr()])
                            continue
                        i.set_references(
                            self.make_reference(
                                stated_in='Q6120337',
                                source_element=drug_data.loc[count, 'ChEMBL'],
                                source_element_name=drug_data.loc[count,
                                                                  'Name'],
                                source_element_prop=name_to_prop['ChEMBL']))

                for i in data:
                    if i.get_prop_nr() in [
                            name_to_prop[x] for x in drugbank_source
                    ]:
                        i.set_references(
                            self.make_reference(
                                stated_in='Q1122544',
                                source_element=drug_data.loc[count,
                                                             'Drugbank ID'],
                                source_element_name=drug_data.loc[count,
                                                                  'Name'],
                                source_element_prop=name_to_prop[
                                    'Drugbank ID'],
                                date=self.drugbank_date,
                                date_property='P577'))

                for i in data:
                    if i.get_prop_nr() in [
                            name_to_prop[x] for x in pubchem_source
                    ] and pd.notnull(drug_data.loc[count, 'PubChem ID (CID)']):
                        i.set_references(
                            self.make_reference(
                                stated_in='Q278487',
                                source_element=drug_data.loc[
                                    count, 'PubChem ID (CID)'],
                                source_element_name=drug_data.loc[count,
                                                                  'Name'],
                                source_element_prop=name_to_prop[
                                    'PubChem ID (CID)']))

                for i in data:
                    if i.get_prop_nr() in [
                            name_to_prop[x] for x in ndfrt_source
                    ] and pd.notnull(drug_data.loc[count, 'NDF-RT NUI']):
                        i.set_references(
                            self.make_reference(
                                stated_in='Q21008030',
                                source_element=drug_data.loc[count,
                                                             'NDF-RT NUI'],
                                source_element_name=drug_data.loc[
                                    count, 'Name'].upper(),
                                source_element_prop=name_to_prop['NDF-RT NUI'])
                        )

                label = drug_data.loc[count, 'Name']
                domain = 'drugs'

                # If label in aliases list, remove the label from it. If an alias is longer than 250 chars, also remove
                # Aliases longer than 250 characters will trigger an WD API error.
                if pd.notnull(drug_data.loc[count, 'Aliases']):
                    aliases = drug_data.loc[count, 'Aliases'].split(';')
                    for i in aliases:
                        if i == label or i == label.lower(
                        ) or len(i) > 250 or len(i) == 0:
                            aliases.remove(i)

                start = time.time()

                # pprint.pprint(data)
                # pprint.pprint(references)
                print('Drug name:', label)
                try:

                    wd_item = PBB_Core.WDItemEngine(item_name=label,
                                                    domain=domain,
                                                    data=data,
                                                    use_sparql=True,
                                                    append_value=append_value)

                    # overwrite only certain descriptions
                    descriptions_to_overwrite = {
                        'chemical compound', 'chemical substance', ''
                    }
                    if wd_item.get_description() in descriptions_to_overwrite:
                        wd_item.set_description(
                            description='pharmaceutical drug', lang='en')

                    wd_item.set_label(label=label, lang='en')

                    if pd.notnull(drug_data.loc[count, 'Aliases']):
                        wd_item.set_aliases(aliases=aliases,
                                            lang='en',
                                            append=True)

                    # pprint.pprint(wd_item.get_wd_json_representation())

                    wd_item.write(login_obj)

                    new_mgs = ''
                    if wd_item.create_new_item:
                        new_mgs = ': New item'

                    PBB_Core.WDItemEngine.log(
                        'INFO',
                        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                        .format(
                            main_data_id=drug_data['Drugbank ID'].at[count],
                            exception_type='',
                            message='success{}'.format(new_mgs),
                            wd_id=wd_item.wd_item_id,
                            duration=time.time() - start))
                    print('success')

                except Exception as e:
                    print(e)

                    PBB_Core.WDItemEngine.log(
                        'ERROR',
                        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                        .format(
                            main_data_id=drug_data['Drugbank ID'].at[count],
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='',
                            duration=time.time() - start))

                end = time.time()
                print('Time elapsed:', end - start)