Esempio n. 1
0
def searchAlias(key, propertyID, value):
    '''
    Look up a Wikidata item through its "Also known as" (alias) field.

    key is the alias text handed to wikidata.search_Item, for example
    "entrez:1234", "uniprot:P1234" or "go:1234". propertyID and value are
    forwarded to wikidata.search_claim to pick the item out of the hits.

    Returns the result of wikidata.search_claim, or an empty string when
    the alias search itself finds nothing.
    '''
    hits = wikidata.search_Item(key)
    if not hits:
        return ""
    return wikidata.search_claim(hits, propertyID, value)
Esempio n. 2
0
    def run_MouseProtein(self, MouseProtein):
        ''' Run the bot for a Mouse Protein item.

        Arguments:
        MouseProtein : MouseProtein item constructed from mygeneinfo.api

        The already created Mouse Protein item is searched by Wikidata label
        (the 'Name' field) and confirmed through its Uniprot ID claim (P352).
        If that does not yield an item, a backup search is made on the
        "Also known as" (alias) field, which has values such as "uniprot:1234".
        The backup search runs even when the label search returns nothing.

        Raises:
        wikidata.WikidataSearchError   - neither search yields a matching item
        wikidata.WikidataConstructItem - re-raised with a 'WikidataParseFailure'
                                         message when constructing or updating
                                         the item raises it
        '''
        key = MouseProtein.fieldsdict['Uniprot ID']
        title = MouseProtein.fieldsdict['Name']
        uniprot = 'P352'

        # Primary lookup: search by label, then match the Uniprot claim.
        res = wikidata.search_Item(title)
        label_hit = bool(res)
        ID = wikidata.search_claim(res, uniprot, key) if res else None

        if not ID:
            # Backup lookup through the alias field. search_claim is only
            # called when the alias search actually returned something.
            res = wikidata.search_Item('uniprot:' + str(key))
            ID = wikidata.search_claim(res, uniprot, key) if res else None

        if not ID:
            if label_hit or res:
                # Some search produced hits, but none carried the claim.
                message = 'Failed to retreive MouseProtein item with uniprot:{UP} from search result:{RES}'.format(
                    RES=res, UP=key)
            else:
                # Both searches came back empty.
                message = 'Failed to search already created MouseProtein Wikidata item with UniprotID={val}'.format(
                    val=key)
            raise wikidata.WikidataSearchError(message)
        Item = self.genewikidata.get_item(ID)

        try:
            CurMProtein = wikidata.construct_from_item(Item,
                                                       WItem.MouseProtein())
            updatedMProtein, _summary, updatedClaims = CurMProtein.updateWith(
                MouseProtein)
        except wikidata.WikidataConstructItem as err:
            # Only this error is rewrapped; any other exception propagates
            # untouched, keeping its original traceback.
            message = 'WikidataParseFailure. ErrorCause:{e} '.format(e=err)
            raise wikidata.WikidataConstructItem(message)

        message = self.write(Item, updatedMProtein, updatedClaims)
        self.logger(0, Item.getID(), msg=message)
Esempio n. 3
0
    def run_HumanGene(self, HumanGene):
        '''  Run the bot for the Human Gene item.

        Arguments:
        HumanGene : HumanGene object constructed from mygeneinfo.api

        The already created Human Gene item is searched by Wikidata label
        (the 'Name' field) and confirmed through its Entrez Gene ID claim
        (P351). If that does not yield an item, a backup search is made on
        the "Also known as" (alias) field, which has values such as
        "entrez:1234". The backup search runs even when the label search
        returns nothing.

        Raises:
        wikidata.WikidataSearchError   - neither search yields a matching item
        wikidata.WikidataConstructItem - re-raised with a 'WikidataParseFailure'
                                         message when constructing or updating
                                         the item raises it
        '''
        key = HumanGene.fieldsdict['Entrez Gene ID']
        title = HumanGene.fieldsdict['Name']
        entrez = 'P351'

        # Primary lookup: search by label, then match the Entrez claim.
        res = wikidata.search_Item(title)
        label_hit = bool(res)
        ID = wikidata.search_claim(res, entrez, key) if res else None

        if not ID:
            # Backup lookup through the alias field. search_claim is only
            # called when the alias search actually returned something.
            res = wikidata.search_Item('entrez:' + str(key))
            ID = wikidata.search_claim(res, entrez, key) if res else None

        if not ID:
            if label_hit or res:
                # Some search produced hits, but none carried the claim.
                message = 'Failed to retreive HumanGene item with entrez:{EZ} from search result:{RES}'.format(
                    RES=res, EZ=key)
            else:
                # Both searches came back empty.
                message = 'Failed to search by label already created HumanGene Wikidata item with EntrezID={val}'.format(
                    val=key)
            raise wikidata.WikidataSearchError(message)
        Item = self.genewikidata.get_item(ID)

        try:
            CurHGene = wikidata.construct_from_item(Item, WItem.HumanGene())
            updatedHGene, _summary, updatedClaims = CurHGene.updateWith(
                HumanGene)
        except wikidata.WikidataConstructItem as err:
            # Only this error is rewrapped; any other exception propagates
            # untouched, keeping its original traceback.
            message = 'WikidataParseFailure. ErrorCause:{e} '.format(e=err)
            raise wikidata.WikidataConstructItem(message)

        message = self.write(Item, updatedHGene, updatedClaims)
        self.logger(0, Item.getID(), msg=message)
Esempio n. 4
0
def parse_HumanProtein_json(gene_json, label):
    '''Construct the Human Protein item from gene_json. The entire fields are specified in WItem
    Arguments:
    gene_json - mygeneinfo json document for given gene
    label - passed through to wikidata.setHumanProtein together with the
            protein name and the Uniprot ID

    Returns the populated HumanProtein item.
    Raises UniProtError when findReviewedUniprotEntry returns nothing.

    Side effects on Wikidata: calls wikidata.setHumanProtein, relabels GO
    items that are matched by label, and creates (and records through
    CreatedItemlogger) every GO term item or human gene item that the
    searches below do not find.
    '''
    root = gene_json
    HPItem = HumanProtein()

    entrez = get(root, 'entrezgene')
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    if not uniprot:
        raise UniProtError('Could not find uniprot ID')
    HPItem.setField("Uniprot ID", uniprot)
    HPItem.setField("aliases", 'uniprot:' + str(uniprot))
    #First setup Human protein Item with label = HGNC name  and uniprot ID
    name = get(root, 'name')
    wikidata.setHumanProtein(name, label, uniprot)

    #found in taxon wikidata item human=Q5 , protein=Q8054
    #for proteins label= HGNC fullname
    HPItem.setField("Name", name)
    HPItem.setField("description", "human protein")
    HPItem.setField("found in taxon", "Q5")
    HPItem.setField("subclass of", "Q8054")
    #NOTE(review): "Name" was already set above from the same lookup; these
    #two lines look redundant
    name = get(root, 'name')
    HPItem.setField("Name", name)

    #EC number is currently disabled:
    # HPItem.setField("EC number", get(root, 'ec'))
    #adding refseq id's based on valid accession prefixes
    initial = get(get(root, 'refseq'), 'protein')
    HPItem.setField("RefSeq Protein ID",
                    parse_accession(initial, "RefSeq Protein ID"))
    HPItem.setField("Ensembl Protein ID", get(get(root, 'ensembl'), 'protein'))

    #Wikidata items for GO terms; P686 is the property used for the GO ID claim
    GO_ID = 'P686'
    if get(root, 'go'):
        GO_DICT = get(root, 'go')
        #keys handled below are 'CC', 'MF' and 'BP'; any other key is resolved
        #but never stored on the item
        for key in GO_DICT:
            res_list = GO_DICT[key]
            #wikidata identifiers collected for this GO category
            ID = []
            #single term: res_list is one record with 'term' and 'id' entries
            #instead of a list of such records
            if 'term' in res_list:
                #empty list serves as the "not found yet" marker
                GID = []
                title = res_list['term']
                res = wikidata.search_Item(title)
                if res:
                    #accept only a search hit whose label equals the term exactly
                    for val in res:
                        if val['label'] == title:
                            GID = val['id']
                            wikidata.setLabel(GID, res_list['id'])
                            #[3:] drops the first three characters of the id
                            #(presumably the "GO:" prefix - TODO confirm)
                            wikidata.set_GO_Terms(str(GID), res_list['id'][3:])
                            break

                #backup: search the alias field with the full GO id
                if not GID:
                    GID = searchAlias(res_list['id'], GO_ID,
                                      res_list['id'][3:])
                #Create GO Item if it does not exist
                if not GID:
                    GID = wikidata.create_Item(title)
                    print "created GO item ", GO_ID, GID
                    wikidata.setLabel(GID, res_list['id'])
                    #add created id's for the go terms
                    wikidata.addClaim(GID, GO_ID, res_list['id'][3:],
                                      'Gene Ontology ID')
                    #NOTE(review): this branch logs the full id and the GO
                    #category key, while the multi-term branch logs id[3:] and
                    #the term title - confirm which is intended
                    CreatedItemlogger(Item=GID,
                                      Type='GO TERM',
                                      field='Gene Ontology ID',
                                      value=res_list['id'],
                                      name=str(key))
                #store each identifier once, in title-cased form
                if not GID.title() in ID:
                    ID.append(GID.title())

            else:
                #multiple terms in go field
                for val in res_list:
                    #search for the item

                    title = val['term']
                    #title contains a '/': keep only the run of word
                    #characters and spaces directly in front of a '/'
                    if title.find('/') != -1:
                        match = re.search(r'([\w ]*)\/.*', title)
                        title = match.group(1)
                    res = wikidata.search_Item(title)
                    #empty list serves as the "not found yet" marker
                    GID = []
                    #search for the corresponding go term
                    if res:
                        for each_val in res:
                            if each_val['label'] == title:
                                GID = each_val['id']
                                wikidata.setLabel(GID, val['id'])
                                wikidata.set_GO_Terms(str(GID), val['id'][3:])
                                break
                    #backup: search the alias field with the full GO id
                    if not GID:
                        GID = searchAlias(val['id'], GO_ID, val['id'][3:])
                    #Create GO Item if it does not exist
                    if not GID:
                        GID = wikidata.create_Item(title)
                        print "created GO item ", GO_ID, GID
                        wikidata.setLabel(GID, val['id'])
                        wikidata.addClaim(GID, GO_ID, val['id'][3:],
                                          'Gene Ontology ID')
                        CreatedItemlogger(Item=GID,
                                          Type='GO TERM',
                                          field='Gene Ontology ID',
                                          value=val['id'][3:],
                                          name=str(title))
                    #store each identifier once, in title-cased form
                    if not GID.title() in ID:
                        ID.append(GID.title())

            #map the GO category key onto the item field
            if key == 'CC':
                HPItem.setField("cell component", ID)
            elif key == 'MF':
                HPItem.setField("molecular function", ID)
            elif key == 'BP':
                HPItem.setField("biological process", ID)

    #PDB: prefer the rcsb lookup, fall back to the 'pdb' entry of the json
    pdbs = rcsb.pdbs_for_uniprot(uniprot)
    if not pdbs:
        pdbs = get(root, 'pdb')
    HPItem.setField("PDB", pdbs)

    #For "encoded by" property search for corresponding gene item. If not present ,create it and obtain wikidata identifier
    #search_title = HGNC symbol
    key = get(root, 'symbol')
    ID = []
    res = wikidata.search_Item(key)
    #search for human gene, property = entrez
    entrezID = 'P351'
    if res:
        ID = wikidata.search_claim(res, entrezID, entrez)
    #backup: search the alias field ("entrez:<id>")
    if not ID:
        ID = searchAlias("entrez:" + str(entrez), entrezID, entrez)

    #search result is null or corresponding human gene does not exist
    if not ID:
        #create human gene item
        ID = wikidata.create_Item(key)
        print "created human gene", entrez
        elabel = "entrez:" + str(entrez)
        wikidata.setLabel(ID, elabel)

        #add entrez claim to human gene item
        wikidata.addClaim(ID, 'P351', str(entrez), 'Entrez Gene ID')
        CreatedItemlogger(Item=ID,
                          Type='Human Gene',
                          field='Entrez Gene ID',
                          value=entrez,
                          name=str(key))
    HPItem.setField("encoded by", ID.title())

    return HPItem
Esempio n. 5
0
def parse_MouseProtein_json(Homolog_json):
    '''Construct the Mouse Protein item from Homolog_json. The entire fields are specified in WItem
    Arguments:
    Homolog_json - mygeneinfo json document for the mouse gene

    Returns the populated MouseProtein item.
    Raises UniProtError when findReviewedUniprotEntry returns nothing, and
    wikidata.WikidataSearchError when no mouse gene item carrying the Entrez
    ID can be found for the "encoded by" field.

    Side effects on Wikidata: relabels GO items that are matched by label and
    creates (and records through CreatedItemlogger) every GO term item that
    the searches below do not find.
    '''
    root = Homolog_json
    MPItem = MouseProtein()

    #found in taxon wikidata item mouse=Q83310 , protein=Q8054
    MPItem.setField("Name", get(root, 'name'))
    MPItem.setField("description", "mouse protein")
    MPItem.setField("found in taxon", "Q83310")
    MPItem.setField("subclass of", "Q8054")
    #NOTE(review): "Name" was already set above from the same lookup; these
    #two lines look redundant
    name = get(root, 'name')
    MPItem.setField("Name", name)

    entrez = get(root, 'entrezgene')
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    if not uniprot:
        raise UniProtError('Could not find uniprot ID')
    MPItem.setField("Uniprot ID", uniprot)
    MPItem.setField("aliases", 'uniprot:' + str(uniprot))

    #EC number is currently disabled:
    #MPItem.setField("EC number", get(root, 'ec'))
    initial = get(get(root, 'refseq'), 'protein')
    MPItem.setField("RefSeq Protein ID",
                    parse_accession(initial, "RefSeq Protein ID"))
    MPItem.setField("Ensembl Protein ID", get(get(root, 'ensembl'), 'protein'))

    #GO TERMS
    #Wikidata items for GO terms; P686 is the property used for the GO ID claim
    GO_ID = 'P686'

    if get(root, 'go'):
        GO_DICT = get(root, 'go')
        #keys handled below are 'CC', 'MF' and 'BP'; any other key is resolved
        #but never stored on the item
        for key in GO_DICT:
            res_list = GO_DICT[key]
            #wikidata identifiers collected for this GO category
            ID = []
            #single term: res_list is one record with 'term' and 'id' entries
            #instead of a list of such records
            if 'term' in res_list:
                #empty list serves as the "not found yet" marker
                GID = []
                title = res_list['term']
                res = wikidata.search_Item(title)
                if res:
                    #accept only a search hit whose label equals the term exactly
                    for val in res:
                        if val['label'] == title:
                            GID = val['id']
                            wikidata.setLabel(GID, res_list['id'])
                            #[3:] drops the first three characters of the id
                            #(presumably the "GO:" prefix - TODO confirm)
                            wikidata.set_GO_Terms(str(GID), res_list['id'][3:])
                            break
                #backup: search the alias field with the full GO id
                if not GID:
                    GID = searchAlias(res_list['id'], GO_ID,
                                      res_list['id'][3:])
                #Create GO Item if it does not exist
                if not GID:
                    GID = wikidata.create_Item(title)
                    print "created GO item ", GO_ID, GID
                    #add created id's for the go terms
                    wikidata.setLabel(GID, str(res_list['id']))
                    wikidata.addClaim(GID, GO_ID, res_list['id'][3:],
                                      'Gene Ontology ID')
                    #NOTE(review): this branch logs the full id and the GO
                    #category key, while the multi-term branch logs id[3:] and
                    #the term title - confirm which is intended
                    CreatedItemlogger(Item=GID,
                                      Type='GO TERM',
                                      field='Gene Ontology ID',
                                      value=res_list['id'],
                                      name=str(key))

                #store each identifier once, in title-cased form
                if not GID.title() in ID:
                    ID.append(GID.title())

            else:
                #multiple terms in go field
                for val in res_list:
                    #search for the item

                    title = val['term']
                    #title contains a '/': keep only the run of word
                    #characters and spaces directly in front of a '/'
                    if title.find('/') != -1:
                        match = re.search(r'([\w ]*)\/.*', title)
                        title = match.group(1)
                    res = wikidata.search_Item(title)
                    #empty list serves as the "not found yet" marker
                    GID = []
                    #search for the corresponding go term
                    if res:
                        for each_val in res:
                            if each_val['label'] == title:
                                GID = each_val['id']
                                wikidata.setLabel(GID, str(val['id']))
                                wikidata.set_GO_Terms(str(GID), val['id'][3:])
                                break

                    #backup: search the alias field with the full GO id
                    if not GID:
                        GID = searchAlias(val['id'], GO_ID, val['id'][3:])
                    #Create GO Item if it does not exist
                    if not GID:
                        GID = wikidata.create_Item(title)
                        print "created GO item ", GO_ID, GID
                        #add created id's for the go terms
                        wikidata.setLabel(GID, str(val['id']))
                        wikidata.addClaim(GID, GO_ID, val['id'][3:],
                                          'Gene Ontology ID')
                        CreatedItemlogger(Item=GID,
                                          Type='GO TERM',
                                          field='Gene Ontology ID',
                                          value=val['id'][3:],
                                          name=str(title))
                    #store each identifier once, in title-cased form
                    if not GID.title() in ID:
                        ID.append(GID.title())
            #map the GO category key onto the item field
            if key == 'CC':
                MPItem.setField("cell component", ID)
            elif key == 'MF':
                MPItem.setField("molecular function", ID)
            elif key == 'BP':
                MPItem.setField("biological process", ID)

    #PDB: prefer the rcsb lookup, fall back to the 'pdb' entry of the json
    #(open question from the original author: what if there is no pdb id?)
    pdbs = rcsb.pdbs_for_uniprot(uniprot)
    if not pdbs:
        pdbs = get(root, 'pdb')
    MPItem.setField("PDB", pdbs)

    #For "encoded by" property search for corresponding gene item. Unlike the
    #GO terms above, a missing gene item is not created here but reported as an error
    #search_title = gene symbol
    key = get(root, 'symbol')
    ID = []
    res = wikidata.search_Item(key)
    entrezID = 'P351'
    #search for mouse gene, property = entrez
    if res:
        ID = wikidata.search_claim(res, entrezID, entrez)
    #backup: search the alias field ("entrez:<id>")
    if not ID:
        ID = searchAlias("entrez:" + str(entrez), entrezID, entrez)
    #search result is null or corresponding mouse gene does not exist
    if not ID:
        message = "Failed to retreive Mouse Gene wikidata item with entrez:{ez}".format(
            ez=entrez)
        raise wikidata.WikidataSearchError(message)
    MPItem.setField("encoded by", ID.title())
    return MPItem
Esempio n. 6
0
def parse_MouseGene_json(homolog_json, gene_json):
    '''Construct the Mouse Gene item from homolog_json. The entire fields are specified in WItem
    Arguments:
    homolog_json - mygeneinfo json document for the mouse gene
    gene_json - mygeneinfo json document for the corresponding human gene
                (only its symbol and Entrez ID are read, for the ortholog link)

    Returns the populated MouseGene item.
    Raises wikidata.WikidataSearchError when no human gene item carrying the
    human Entrez ID can be found for the "ortholog" field.

    Side effects on Wikidata: when no mouse protein item is found for the
    "encodes" field, one is created, labelled "uniprot:<id>", given a Uniprot
    claim and recorded through CreatedItemlogger.
    '''
    MGItem = MouseGene()
    root = homolog_json

    MGItem.setField("found in taxon", "Q83310")
    MGItem.setField("subclass of", "Q7187")
    MGItem.setField("description", "mouse gene")

    #for mouse gene label = MGI symbol
    MGItem.setField("Name", get(root, 'symbol'))
    entrez = get(root, 'entrezgene')
    MGItem.setField("Entrez Gene ID", entrez)
    MGItem.setField("aliases", 'entrez:' + str(entrez))
    MGItem.setField("Homologene ID", get(get(root, 'homologene'), 'id'))
    MGItem.setField("gene symbol", get(root, 'symbol'))
    MGItem.setField("Ensembl Gene ID", get(get(root, 'ensembl'), 'gene'))
    MGItem.setField("Ensembl Transcript ID",
                    get(get(root, 'ensembl'), 'transcript'))
    MGItem.setField("GenLoc_chr", get(get(root, 'genomic_pos'), 'chr'))
    MGItem.setField("GenLoc_start", get(get(root, 'genomic_pos'), 'start'))
    MGItem.setField("GenLoc_end", get(get(root, 'genomic_pos'), 'end'))
    #alternative symbols are currently disabled:
    #MGItem.setField("AltSymbols", get(root, 'alias'))

    #adding id's based on valid refseq prefixes
    initial = get(get(root, 'refseq'), 'rna')
    MGItem.setField("RefSeq", parse_accession(initial, "RefSeq"))
    initial = get(get(root, 'accession'), 'rna')
    MGItem.setField("RefSeq RNA ID", parse_accession(initial, "RefSeq RNA ID"))

    #encodes  -- search for mouse protein
    key = get(root, 'name')
    ID = []
    res = wikidata.search_Item(key)
    #search for mouse protein, property = uniprot ID
    #most surely mouse protein is present, but still....
    #NOTE(review): unlike the protein parsers, a missing uniprot ID is not
    #rejected here - confirm whether genes without one should be skipped
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    uniprotID = 'P352'
    if res:
        ID = wikidata.search_claim(res, uniprotID, uniprot)
    #backup: search the alias field ("uniprot:<id>")
    if not ID:
        ID = searchAlias("uniprot:" + str(uniprot), uniprotID, uniprot)
    #search result is null or corresponding mouse protein does not exist
    if not ID:
        #create mouse protein item
        #single pre-formatted argument: same output as the old print statement
        print("creating mouse protein with uniprot ID %s" % (uniprot,))
        ID = wikidata.create_Item(key)
        ulabel = "uniprot:" + str(uniprot)
        wikidata.setLabel(ID, ulabel)

        #add uniprot claim to mouse protein item
        wikidata.addClaim(ID, 'P352', uniprot, 'Uniprot ID')
        #log the actual Uniprot ID (the literal string 'uniprot' was logged before)
        CreatedItemlogger(Item=ID,
                          Type='Mouse Protein',
                          field='Uniprot',
                          value=uniprot,
                          name=str(key))
    #following convention of having capitalised wikidata identifiers
    MGItem.setField("encodes", ID.title())

    #ortholog
    key = get(gene_json, 'symbol')
    human_entrez = get(gene_json, 'entrezgene')
    ID = []
    res = wikidata.search_Item(key)
    #search for human gene, property = entrez ID
    entrezID = 'P351'
    if res:
        ID = wikidata.search_claim(res, entrezID, human_entrez)
    #backup: search the alias field ("entrez:<id>")
    if not ID:
        ID = searchAlias("entrez:" + str(human_entrez), entrezID,
                         human_entrez)
    #search result is null or corresponding human gene does not exist
    if not ID:
        message = "Failed to retreive Human Gene wikidata item with entrez:{ez}".format(
            ez=human_entrez)
        raise wikidata.WikidataSearchError(message)
    MGItem.setField("ortholog", ID.title())
    return MGItem
Esempio n. 7
0
def parse_HumanGene_json(gene_json, homolog_json):
    '''Construct the Human Gene from gene_json. The entire fields are specified in WItem
    Arguments:
    gene_json - mygeneinfo json document for given gene
    homolog_json - mygeneinfo json_documnet for corresponding mouse gene 
    '''
    #ipdb.set_trace()
    HGItem = HumanGene()
    root = gene_json

    HGItem.setField("found in taxon", "Q5")
    HGItem.setField("description", "human gene")
    HGItem.setField("subclass of", "Q7187")

    #for genes label = HGNC symbol
    HGItem.setField("Name", get(root, 'symbol'))
    entrez = get(root, 'entrezgene')
    HGItem.setField("Entrez Gene ID", entrez)
    HGItem.setField("aliases", 'entrez:' + str(entrez))
    HGItem.setField("Homologene ID", get(get(root, 'homologene'), 'id'))
    HGItem.setField("gene symbol", get(root, 'symbol'))
    HGItem.setField("Ensembl Gene ID", get(get(root, 'ensembl'), 'gene'))

    HGItem.setField("Ensembl Transcript ID",
                    get(get(root, 'ensembl'), 'transcript'))
    HGItem.setField("GenLoc_chr", get(get(root, 'genomic_pos'), 'chr'))
    HGItem.setField("GenLoc_start", get(get(root, 'genomic_pos'), 'start'))
    HGItem.setField("GenLoc_end", get(get(root, 'genomic_pos'), 'end'))
    #HGItem.setField("AltSymbols", get(root, 'alias'))

    #adding id's based on valid refseq prefixes
    initial = get(get(root, 'refseq'), 'rna')
    HGItem.setField("RefSeq", parse_accession(initial, "RefSeq"))
    initial = get(get(root, 'accession'), 'rna')
    HGItem.setField("RefSeq RNA ID", parse_accession(initial, "RefSeq RNA ID"))

    HGItem.setField("HGNC ID", get(root, 'HGNC'))
    HGItem.setField("OMIM ID", get(root, 'MIM'))

    #encodes  -- search for human protein
    key = get(root, 'name')
    ID = []
    res = wikidata.search_Item(key)
    #search for human protein, property = uniprot ID
    #ipdb.set_trace()
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    uniprotID = 'P352'
    if res:
        ID = wikidata.search_claim(res, uniprotID, uniprot)
    if not ID:
        ID = searchAlias("uniprot:" + str(uniprot), uniprotID, uniprot)
    #search result is null or corresponding human protein doesnot exist
    if not ID:
        message = "Failed to retreive HumanProteinItem with Uniprot:{up}".format(
            up=uniprot)
        raise wikidata.WikidataSearchError(message)
    #following convention of having capitalised wikidata identifiers
    HGItem.setField("encodes", ID.title())

    if not homolog_json:
        return HGItem
    #ortholog
    key = get(homolog_json, 'symbol')
    ID = []
    res = wikidata.search_Item(key)
    #search for mouse gene, property = entrez ID
    entrezID = 'P351'
    mouse_entrez = get(homolog_json, 'entrezgene')
    if res:
        ID = wikidata.search_claim(res, entrezID, mouse_entrez)
    #backup search
    if not ID:
        ID = searchAlias("entrez:" + str(mouse_entrez), entrezID, mouse_entrez)
    #search result is null or corresponding mouse gene doesnot exist
    if not ID:
        #create mouse gene item
        ID = wikidata.create_Item(key)
        #add entrez claim to mouse gene item
        mouse_entrez = get(homolog_json, 'entrezgene')
        elabel = "entrez:" + str(mouse_entrez)
        wikidata.setLabel(ID, elabel)
        wikidata.addClaim(ID, 'P351', str(mouse_entrez), 'Entrez Gene ID')
        CreatedItemlogger(Item=ID,
                          Type='Mouse Gene',
                          field='Entrez',
                          value=mouse_entrez,
                          name=str(key))
        print "created mouse gene item -- with entrez", mouse_entrez
    #following convention of having capitalised wikidata identifiers
    HGItem.setField("ortholog", ID.title())

    #ipdb.set_trace()
    return HGItem