def main():
    """
    Undo gene-to-protein merges.

    A query searches for WD items which carry both an Entrez gene ID (P351) and a UniProt ID (P352)
    on one item. For every such item a MergeDefender instance is created with the two conflicting
    property sets; MergeDefender (defined outside this function) is what undoes the merge.

    The bot password is taken from the first command line argument and is not echoed to stdout.

    :return: None
    """
    if len(sys.argv) < 2:
        sys.exit('Please provide the bot password as the first command line argument')

    # NOTE(review): the login statement had been fused into a comment, which left `login` undefined
    # and made every MergeDefender call below fail with a NameError. The user name and the `pwd`
    # keyword come from that line; PBB_login.WDLogin is presumably the intended constructor and is
    # assumed to be imported by this file -- TODO confirm.
    login = PBB_login.WDLogin('ProteinBoxBot', pwd=sys.argv[1])

    conflict_set_1 = {'P351'}
    conflict_set_2 = {'P352'}

    likely_merged_ids = PBB_Core.WDItemList(wdquery='CLAIM[351] AND CLAIM[352]')
    print(likely_merged_ids.wditems['items'])

    for count, x in enumerate(likely_merged_ids.wditems['items']):
        print('\n', count)
        print('Q{}'.format(x))

        try:
            MergeDefender(login, merge_target='Q{}'.format(x), conflict_set_1=conflict_set_1,
                          conflict_set_2=conflict_set_2)
        except Exception as e:
            # A failure on one item must not stop the run: show the traceback and log the error.
            traceback.print_exc()
            PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}"'.format(
                main_data_id=x,
                exception_type=type(e),
                message=str(e),
            ))
    def __init__(self):
        """
        Download the Disease Ontology and build a ``disease`` object for each of its OWL classes.

        Steps:
          1. parse the document returned by ``self.download_disease_ontology()``,
          2. log in with the credentials from ``PBB_settings``,
          3. fetch all Wikidata items that have a Disease Ontology ID (P699),
          4. ask the GitHub API for the first git ref of the HumanDiseaseOntology repository and
             build the raw URL of ``doid.owl`` at that commit,
          5. construct ``disease`` for every ``owl:Class`` element.

        A failure on one class is logged through ``PBB_Core.WDItemEngine.log`` and appended, with
        its traceback, to ``/tmp/Diseaseexceptions.txt``; the loop then continues.
        """
        self.start = time.time()
        self.content = ET.fromstring(self.download_disease_ontology())
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        # self.updateDiseaseOntologyVersion()

        # Get all WikiData entries that contain a Disease Ontology ID
        print("Getting all terms with a Disease Ontology ID in WikiData")
        DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

        print("Getting latest version of Disease Ontology from Github")
        r = requests.get(
            "https://api.github.com/repos/DiseaseOntology/HumanDiseaseOntology/git/refs"
        )
        refs = r.json()
        sha = refs[0]["object"]["sha"]
        githubReferenceUrl = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/" + sha + "/src/ontology/doid.owl"

        # diseaseItem[2] = DO identifier, diseaseItem[0] = WD identifier
        doWikiData_id = dict()
        for diseaseItem in DoInWikiData.wditems["props"]["699"]:
            doWikiData_id[str(diseaseItem[2])] = diseaseItem[0]

        for doClass in self.content.findall(
                './/owl:Class', DiseaseOntology_settings.getDoNameSpaces()):
            # Reset per iteration so the error handler never reports the id of the previous class
            # (or hits an unbound name when the very first construction fails).
            diseaseClass = None
            try:
                disVars = [
                    doClass,
                    githubReferenceUrl,
                    doWikiData_id,
                    self.logincreds,
                    self.start,
                ]

                diseaseClass = disease(disVars)

                print("do_id: " + diseaseClass.do_id)
                print(diseaseClass.wdid)
                print(diseaseClass.name)
                print(diseaseClass.synonyms)
                print(diseaseClass.xrefs)
            except Exception as e:
                # Prefer the id parsed by the disease object; when construction itself failed fall
                # back to the element's rdf:about attribute (assumes standard RDF/XML -- '-' if absent).
                failed_id = getattr(diseaseClass, 'do_id', None)
                if failed_id is None:
                    failed_id = doClass.get(
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about', '-')

                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=failed_id,
                            exception_type=type(e),
                            message=str(e),
                            wd_id='-',
                            duration=time.time() - self.start))
                # `with` guarantees the log file is closed even if writing fails.
                with open('/tmp/Diseaseexceptions.txt', 'a') as f:
                    f.write(str(failed_id) + "\n")
                    traceback.print_exc(file=f)
Example #3
0
def main():
    """
    Run LabelReplacement on all human gene items for the French label fragment 'gène'.

    The items are those returned by the WDQ query 'CLAIM[351] and CLAIM[703:5]'; the mapping
    {'gène': 'gène humain'}, the language code 'fr' and the login object are passed through to
    LabelReplacement, which is defined outside this function.

    :return: None
    """
    # NOTE(review): the prompt and login statements had been fused into one unparsable line
    # (redaction artefact). The prompt text, the user name and the `pwd` keyword come from that
    # line; PBB_login.WDLogin is presumably the intended constructor -- TODO confirm.
    pwd = input('Password:')
    login = PBB_login.WDLogin('ProteinBoxBot', pwd=pwd)

    # The mouse gene variant uses the query 'CLAIM[351] and CLAIM[703:83310]' together with the
    # mapping {'gène': 'gène de souris'}; it is not run here.

    # for human genes
    LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]').wditems['items'], {'gène': 'gène humain'},
                     'fr', login)
Example #4
0
    def __init__(self):
        """
        Load the Gene Ontology and hand every GO class to ``goTerm``.

        Steps:
          1. record the start time and log in with the credentials from ``PBB_settings``,
          2. map each Gene Ontology ID (P686) returned by WDQ to its Wikidata item,
          3. download ``go.owl`` and parse it with rdflib,
          4. construct ``goTerm`` for every ``owl:Class`` subject whose URI contains "GO".

        A failure on one class is printed and logged and does not stop the loop.

        NOTE(review): an unconditional ``sys.exit()`` directly after step 2 made steps 3 and 4
        unreachable. It looked like a debugging leftover and is no longer present; confirm that
        the full run is intended before deploying.
        """
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())

        # Get all WikiData entries that contain a Gene Ontology ID
        print("Getting all terms with a Gene Ontology ID in WikiData")
        goInWikiData = PBB_Core.WDItemList("CLAIM[686]", "686")
        # goItem[2] = GO identifier, goItem[0] = presumably the Wikidata item id
        goWikiData_id = {
            str(goItem[2]): goItem[0]
            for goItem in goInWikiData.wditems["props"]["686"]
        }
        print(len(goWikiData_id))

        graph = rdflib.Graph()
        goUrl = requests.get("http://purl.obolibrary.org/obo/go.owl")

        print("ja")
        graph.parse(data=goUrl.text, format="application/rdf+xml")

        cls = URIRef("http://www.w3.org/2002/07/owl#Class")
        # The running number counts every owl:Class subject, not only the GO terms.
        for counter, gouri in enumerate(graph.subjects(RDF.type, cls), 1):
            try:
                print(counter)
                # Classes imported from other ontologies are skipped before any work is done.
                if "GO" not in gouri:
                    continue
                goVars = {
                    "uri": gouri,
                    "label": graph.label(URIRef(gouri)),
                    "wikidata_id": goWikiData_id,
                    "logincreds": self.logincreds,
                    "start": self.start,
                    "graph": graph,
                }
                goTerm(goVars)

            except Exception as e:
                print(traceback.format_exc())
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=gouri,
                            exception_type=type(e),
                            message=str(e),
                            wd_id='-',
                            duration=time.time() - self.start))
Example #5
0
    def __init__(self):
        self.content = json.loads(self.download_mouse_proteins())
        # print self.content["results"]["bindings"]
        self.protein_count = len(self.content["results"]["bindings"])
        self.proteins = self.content["results"]["bindings"]
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        uniprotWikidataIds = dict()
        print "Getting all proteins with a uniprot ID in Wikidata"
        InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[352]",
                                         "352")

        r0 = requests.get(
            "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a10090+.%0d%0a%7d&format=srj"
        )

        for proteinItem in InWikiData.wditems["props"]["352"]:
            try:
                uniprotWikidataIds[str(proteinItem[2])] = proteinItem[0]
                r = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(proteinItem[2]) +
                    "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName+%3fupversion&format=srj"
                )
                # r = requests.get("http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"+str(proteinItem[2])+"%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName&format=srj")
                print r.text
                protein = json.loads(r.text)
                protein["logincreds"] = self.logincreds
                protein["wdid"] = 'Q' + str(proteinItem[0])
                print protein
                proteinClass = mouse_protein(protein)

            except:
                # client = Client('http://*****:*****@sentry.sulab.org/9')
                # client.captureException()
                print "There has been an except"
                print "Unexpected error:", sys.exc_info()[0]

                f = open('/tmp/exceptions.txt', 'a')
                # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
                f.write(
                    str(protein["results"]["bindings"][0]["uniprot"]["value"])
                    + "\n")
                traceback.print_exc(file=f)
                f.close()
Example #6
0
    def __init__(self):
        """
        Load the Uberon ontology and hand every Uberon class to ``uberonTerm``.

        After logging in, all Wikidata items carrying property 1554 are fetched and indexed by
        the property value. ``uberon.owl`` is then downloaded and parsed with rdflib, and for
        each ``owl:Class`` subject whose URI contains "UBERON" an ``uberonTerm`` is constructed.
        Errors on a single class are printed and logged; the loop continues.
        """
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

        # Index the WDQ result: entry[2] (Uberon identifier) -> entry[0]
        print("Getting all terms with a Uberon ID in WikiData")
        ubInWikiData = PBB_Core.WDItemList("CLAIM[1554]", "1554")
        ubWikiData_id = {
            str(entry[2]): entry[0]
            for entry in ubInWikiData.wditems["props"]["1554"]
        }

        graph = rdflib.Graph()
        ubUrl = requests.get("http://purl.obolibrary.org/obo/uberon.owl")

        print("ja")
        graph.parse(data=ubUrl.text, format="application/rdf+xml")

        owl_class = URIRef("http://www.w3.org/2002/07/owl#Class")
        for uberonuri in graph.subjects(RDF.type, owl_class):
            try:
                # The argument dict is built for every class; only Uberon URIs are processed.
                uberonVars = {
                    "uberon": uberonuri,
                    "uberonLabel": graph.label(URIRef(uberonuri)),
                    "wikidata_id": ubWikiData_id,
                    "logincreds": self.logincreds,
                    "start": self.start,
                    "graph": graph,
                }
                if "UBERON" in uberonuri:
                    uberonTerm(uberonVars)

            except Exception as e:
                print(traceback.format_exc())
                elapsed = time.time() - self.start
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id=uberonuri,
                        exception_type=type(e),
                        message=str(e),
                        wd_id='-',
                        duration=elapsed,
                    ))
Example #7
0
    def __init__(self, login):
        """
        Write gene images from a tab-separated file to the matching Wikidata gene items.

        For every row of ``./image_data/gene_wiki_images_with_preferred.txt`` the Wikidata item
        with the row's Entrez ID (P351) is looked up among the items returned by the WDQ query
        'CLAIM[351] and CLAIM[703:5]'. Each 'PBB GE ' entry of the ``other_images`` column is added
        as a P692 statement and the ``primary_image`` value, when present, as P18. The item is then
        written with the given login. Every outcome is logged through ``PBB_Core.WDItemEngine.log``.

        :param login: login object passed to ``WDItemEngine.write``
        """
        self.login_obj = login

        image_data = pd.read_csv(
            './image_data/gene_wiki_images_with_preferred.txt',
            encoding='utf-8',
            sep='\t',
            # builtin str: the `np.str` alias was removed in NumPy 1.24
            dtype={'entrez': str})

        wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]',
                                          '351').wditems

        # Entrez ID -> 'Q…' lookup built once, replacing a list scan per row.
        # setdefault keeps the first item reported for an ID.
        entrez_to_qid = {}
        for z in wdq_results['props']['351']:
            entrez_to_qid.setdefault(z[2], 'Q{}'.format(z[0]))

        print(len(wdq_results['props']['351']))

        image_file_extension = ('.png', '.jpg', '.jpeg', '.pdf')

        for index in image_data.index:
            start = time.time()
            image_names = image_data.loc[index, 'other_images']
            preferred_image = image_data.loc[index, 'primary_image']

            # A '|' separated value is reduced to its last part containing a known file extension;
            # when no part matches, the value is left as it is.
            if pd.notnull(preferred_image) and '|' in preferred_image:
                candidates = [
                    splt for splt in preferred_image.split('|')
                    if any(ending in splt for ending in image_file_extension)
                ]
                if candidates:
                    preferred_image = candidates[-1]

            entrez = image_data.loc[index, 'entrez']

            curr_qid = entrez_to_qid.get(entrez)
            if curr_qid is None:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='Entrez ID not yet in Wikidata!!',
                            wd_id='',
                            duration=time.time() - start))
                continue

            if pd.isnull(image_names):
                PBB_Core.WDItemEngine.log(
                    'WARNING',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='No images available for this Entrez ID',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                continue

            protein_image_value_store = []
            genex_value_store = []

            for sub_string in image_names.split('|'):
                # The first five characters of an entry are dropped to obtain the file name.
                if 'PBB GE ' in sub_string:
                    # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956
                    value = sub_string[5:]
                    genex_value_store.append(
                        PBB_Core.WDCommonsMedia(value=value, prop_nr='P692'))
                elif 'PDB ' in sub_string:
                    value = sub_string[5:]
                    # NOTE(review): these statements have an empty property number and are never
                    # added to `data` below, so PDB images are not written -- confirm intent.
                    protein_image_value_store.append(
                        PBB_Core.WDCommonsMedia(value, prop_nr=''))

            data = [PBB_Core.WDString(value=entrez, prop_nr='P351')]
            data.extend(genex_value_store)

            if pd.notnull(preferred_image):
                data.append(
                    PBB_Core.WDCommonsMedia(value=preferred_image,
                                            prop_nr='P18'))

            try:
                gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid,
                                                  domain='genes',
                                                  data=data)

                gene_item.write(self.login_obj)

                PBB_Core.WDItemEngine.log(
                    'INFO',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type='',
                            message='success',
                            wd_id=curr_qid,
                            duration=time.time() - start))
                print(index, 'success', curr_qid, entrez,
                      gene_item.get_label(lang='en'))

            except Exception as e:
                # A failed write is reported and logged; the remaining rows are still processed.
                print(index, 'error', curr_qid, entrez)
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=entrez,
                            exception_type=type(e),
                            message=str(e),
                            wd_id=curr_qid,
                            duration=time.time() - start))
# Search Wikidata (wbsearchentities) for `searchTerm`; `url` and `searchTerm` are defined earlier
# in the script. The id of the first hit is kept as doVersionID -- presumably the item of the
# current Disease Ontology release; verify against the code that builds `searchTerm`.
params = {
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
    'search': searchTerm
}
data = requests.get(url, params=params)
# json.loads() accepts only the document positionally on Python 3.6+; a second positional
# argument raises TypeError. data.text is already decoded, so no encoding is needed.
reply = json.loads(data.text)
if not reply['search']:
    # No matching item: the script stops with this message.
    sys.exit("A new version of DO has been release, a full update is required")
else:
    doVersionID = reply['search'][0]['id']

doWikiData_id = dict()
DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

# Walk over every owl:Class element of the parsed ontology (`content` is defined outside this
# excerpt) and process only the class whose id equals the first command line argument.
for doClass in content.findall('.//owl:Class',
                               DiseaseOntology_settings.getDoNameSpaces()):
    try:
        # Text of the first oboInOwl:id element found below the class element.
        do_id = doClass.findall(
            './/oboInOwl:id',
            DiseaseOntology_settings.getDoNameSpaces())[0].text
        if do_id == sys.argv[1]:
            # Positional argument list handed to DiseaseOntology.disease.
            disVars = []
            disVars.append(doClass)
            disVars.append(doVersionID)
            disVars.append(doWikiData_id)
            disVars.append(logincreds)
            disVars.append(start)
            diseaseClass = DiseaseOntology.disease(disVars)
# Settings for the rat entry of speciesInfo (the dict itself is created outside this excerpt).
speciesInfo["rat"]["taxid"] = "10114"
speciesInfo["rat"]["wdid"] = "Q36396"
speciesInfo["rat"]["name"] = "rat"
speciesInfo["rat"]["release"] = "Q19296606"

# The script needs one command line argument; without it usage help is printed and it exits.
if len(sys.argv) == 1:
    print("Please provide an ncbi gene ID")
    print("Example: python singleGeneBot.py 628")
    sys.exit()


logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
entrezWikidataIds = dict()
wdqQuery = "CLAIM[351]"
InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
'''
Below a mapping is created between entrez gene ids and wikidata identifiers.
'''
# geneItem[2] is used as key (Entrez gene ID as string), geneItem[0] as value.
for geneItem in InWikiData.wditems["props"]["351"]:
    entrezWikidataIds[str(geneItem[2])] = geneItem[0]

# Same kind of mapping for UniProt IDs (P352): proteinItem[2] -> proteinItem[0].
uniprotwikidataids = dict()
print('Getting all proteins with a uniprot ID in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[352]", "352")
for proteinItem in inwikidata.wditems["props"]["352"]:
    uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]
try:

    # NOTE(review): the name `object` shadows the builtin of the same name.
    object=dict()
    # The gene to process is the first command line argument.
    object["entrezgene"] = str(sys.argv[1])
Example #10
0
# Read the remaining lines of `added` (opened outside this excerpt) into alreadyAdded, stripped
# of surrounding whitespace; readline() returns "" at end of file, which ends the loop.
while (line != ""):
    alreadyAdded.append(line.strip())
    line = added.readline()

# Opened for appending: every processed Entrez gene ID is written to it below.
# NOTE(review): no close() for f1 is visible in this excerpt.
f1 = open('alreadyAdded.txt', 'a+')

# Ask mygene.info for up to 15000 genes that have a `wikipedia` field.
mygeneinfo_url = "http://mygene.info/v2/query?q=_exists_:wikipedia&fields=wikipedia,entrezgene&size=15000"
r = requests.get(mygeneinfo_url)

mappings = r.json()
PBB_Debug.prettyPrint(mappings)

# Get entrezgene - Wikidata mapping
entrezWikidataIds = dict()
wdqQuery = "CLAIM[351]"
InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())

# Keys are converted to int here; they are compared with hit["entrezgene"] below.
for geneItem in InWikiData.wditems["props"]["351"]:
    entrezWikidataIds[int(geneItem[2])] = geneItem[0]

for hit in mappings["hits"]:
    print(hit["entrezgene"])
    # The ID is recorded before processing, i.e. regardless of the outcome.
    f1.write(str(hit["entrezgene"]) + "\n")
    data2add = []
    try:
        # Process only genes that are known to Wikidata, whose url_stub contains no '?' and
        # that are not listed in alreadyAdded.
        if hit["entrezgene"] in entrezWikidataIds.keys(
        ) and hit["wikipedia"]["url_stub"].count("?") == 0 and str(
                hit["entrezgene"]) not in alreadyAdded:
            print(entrezWikidataIds[hit["entrezgene"]])
Example #11
0
]

# Read every column as object, except 'has interwiki link', which is read as bool.
# `col_names` is the list that ends just above this excerpt.
data_types = {x: object for x in col_names}
data_types.update({'has interwiki link': bool})

# When True, an existing CSV is loaded and items already present in it are skipped.
append = True

if os.path.isfile('./WD_to_WP_disease_map.csv') and append:
    wd_to_wp_map = pd.read_csv('./WD_to_WP_disease_map.csv',
                               index_col=0,
                               dtype=data_types)
else:
    wd_to_wp_map = pd.DataFrame(columns=col_names)

# All items matched by any of the listed WDQ claims.
wd_disease_items = PBB_Core.WDItemList(
    'CLAIM[279:12136] or CLAIM[279:929833] or CLAIM[31:12136] '
    'or CLAIM[31:929833] or CLAIM[557] or CLAIM[699] or claim[493] '
    'or claim[494] or claim[1995]').wditems['items']

print(wd_to_wp_map.dtypes)

print('Total number of items to match:', len(wd_disease_items))

for count, item in enumerate(wd_disease_items):
    print(item)
    # Skip items whose id is already in the 'QID' column of the loaded map.
    if str(item) in wd_to_wp_map['QID'].values and append:
        print('skipping', item)
        # NOTE(review): this increment has no lasting effect; enumerate() rebinds `count`
        # on the next iteration.
        count += 1
        continue

    # Load the item and keep its JSON representation for the matching that follows.
    wd_object = PBB_Core.WDItemEngine(wd_item_id='Q{}'.format(item))
    wd_json = wd_object.wd_json_representation
Example #12
0
    def __init__(self):
        """
        Collect the annotations of all reviewed human UniProt proteins and hand them to ``HumanProtein``.

        The constructor
          * logs in and records the start time,
          * indexes the Wikidata items with a UniProt ID (P352) and those with an Entrez gene
            ID (P351), both restricted to 'CLAIM[703:5]',
          * asks the UniProt SPARQL endpoint for the reviewed proteins of taxon 9606,
          * runs two further SPARQL queries per protein (protein annotations, GO terms) and
            passes the combined result to ``HumanProtein``.

        An empty annotation result raises an Exception for that protein. Every per-protein failure
        is printed and logged, and the loop continues.
        """
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        # Passed to every HumanProtein below; nothing in this constructor fills it.
        genesymbolwdmapping = dict()

        print('Getting all proteins with a uniprot ID in Wikidata...')
        inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352")
        # UniProt ID -> Wikidata id; built but not read again inside this constructor.
        uniprotwikidataids = {
            str(row[2]): row[0]
            for row in inwikidata.wditems["props"]["352"]
        }

        print('Getting all human genes with a ncbi gene ID in Wikidata...')
        print("wdq 1")
        wdqQuery = "CLAIM[703:5] AND CLAIM[351]"
        InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
        # Mapping between entrez gene ids and wikidata identifiers.
        entrezWikidataIds = {
            str(row[2]): row[0]
            for row in InWikiData.wditems["props"]["351"]
        }

        print("Getting all human proteins from Uniprot...")
        r0 = requests.get(
            'http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj'
        )
        prot_results = r0.json()
        # One {"id", "label"} record per binding; the id is the URI without its UniProt prefix.
        uniprot_ids = [
            {
                "id": binding["protein"]["value"].replace(
                    "http://purl.uniprot.org/uniprot/", ""),
                "label": binding["protein_label"]["value"],
            }
            for binding in prot_results["results"]["bindings"]
        ]

        for up in uniprot_ids:
            try:
                # Get protein annotations from Uniprot
                annotation_response = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3fncbiGene%3b+separator%3d%22%3b+%22)+as+%3fgene_id)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(up["id"]) +
                    "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09++++%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++optional%7b%3funiprot+rdfs%3aseeAlso+%3fncbiGene+.%0d%0a++++++++%3fncbiGene+up%3adatabase+database%3aGeneID+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj"
                )

                protein = annotation_response.json()
                if len(protein["results"]["bindings"]) == 0:
                    raise Exception("Communication error on " + up["id"])

                # Get go annotations from Uniprot
                go_response = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e+%0d%0aSELECT+DISTINCT+%3fprotein+%3fgo+%3fgoLabel+%3fparentLabel%0d%0aWHERE%0d%0a%7b%0d%0a++%09%09VALUES+%3fprotein+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(up["id"]) +
                    "%3e%7d%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+up%3aclassifiedWith+%3fgo+.+++%0d%0a++++++++%3fgo+rdfs%3alabel+%3fgoLabel+.%0d%0a++++++++%3fgo+rdfs%3asubClassOf*+%3fparent+.%0d%0a++++++++%3fparent+rdfs%3alabel+%3fparentLabel+.%0d%0a++++++++optional+%7b%3fparent+rdfs%3asubClassOf+%3fgrandParent+.%7d%0d%0a++++++++FILTER+(!bound(%3fgrandParent))%0d%0a%7d&format=srj"
                )

                # Extend the annotation result with everything HumanProtein receives.
                protein["goTerms"] = go_response.json()
                protein["logincreds"] = self.logincreds
                protein["id"] = up["id"]
                protein["start"] = self.start
                protein["geneSymbols"] = genesymbolwdmapping
                protein["entrezWikidataIds"] = entrezWikidataIds
                HumanProtein(protein)

            except Exception as e:
                print(traceback.format_exc())
                elapsed = time.time() - self.start
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=up["id"],
                            exception_type=type(e),
                            message=str(e),
                            wd_id='-',
                            duration=elapsed))
Example #13
0
    import json
import humanprotein

# The UniProt accession to process is taken from the first command-line
# argument; without it there is nothing to do.
if len(sys.argv) < 2:
    print("Please provide an Uniprot ID")
    print("Example: python singleGeneBot.py P12345")
    # A missing argument is a usage error: exit with a non-zero status so
    # calling shells/schedulers do not mistake it for a successful run.
    sys.exit(1)

# Wall-clock reference for the run, captured before any network access.
start = time.time()
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())

# Both lookup tables are keyed by str(row[2]) and store row[0] for every row
# returned under the queried property.
# NOTE(review): presumably row[2] is the property value and row[0] the
# Wikidata item id - confirm against PBB_Core.WDItemList.
uniprotwikidataids = dict()
genesymbolwdmapping = dict()

print('Getting all proteins with a uniprot ID in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352")
for proteinItem in inwikidata.wditems["props"]["352"]:
    uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]

# This query selects on property 353 (gene symbol), not on proteins, so the
# progress message names what is actually being fetched.
print("Getting all human gene symbols in Wikidata...")
gene_symbol_mapping = PBB_Core.WDItemList("CLAIM[353] AND CLAIM[703:5]", "353")
for genesymbol in gene_symbol_mapping.wditems["props"]["353"]:
    genesymbolwdmapping[str(genesymbol[2])] = genesymbol[0]

try:
    up = str(sys.argv[1])
    '''
    Get protein annotations from Uniprot
    '''
    r = requests.get(
        "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
        wdPage = PBB_Core.WDItemEngine(wd_item_id=self.source,
                                       data=[orthologValue],
                                       server="www.wikidata.org",
                                       domain="genes")
        print(wdPage.wd_json_representation)
        wdPage.write(self.logincreds)


# Log in to Wikidata with the credentials supplied by PBB_settings.
logincreds = PBB_login.WDLogin(
    PBB_settings.getWikiDataUser(),
    PBB_settings.getWikiDataPassword(),
)

# Build one lookup table per species: str(row[2]) -> row[0] for every row
# returned under property 351.
print("Getting all human genes in Wikidata")
InWikiData = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[351]", "351")
humanEntrezWikidataIds = {
    str(geneItem[2]): geneItem[0]
    for geneItem in InWikiData.wditems["props"]["351"]
}

print("Getting all mouse genes in Wikidata")
InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[351]", "351")
mouseEntrezWikidataIds = {
    str(geneItem[2]): geneItem[0]
    for geneItem in InWikiData.wditems["props"]["351"]
}

# The HomoloGene dump is expected to already be present at this path.
# The handle is consumed by the code that follows this setup section.
homologene = open("/tmp/homologene.data", "r")
humanOrthologs, mouseOrthologs = dict(), dict()

for line in homologene:
    for line in homologene:
        fields = line.split('\t')
import PBB_Core
import requests
import copy
import pprint
from SPARQLWrapper import SPARQLWrapper, JSON
from time import gmtime, strftime

# This bot extends gene items in Wikidata with gene disease relationship information from the OMIM-sourced
# downloadable dump in Phenocarta. Currently, the bot writes to the genes specified in the source code,
# and uses the genetic association property with references.

# Get Wikidata Ids for all entrez genes in Wikidata.
# The table is keyed by str(row[2]) and stores row[0] for each row returned
# under property 351.
# NOTE(review): presumably row[2] is the Entrez Gene ID and row[0] the
# Wikidata item id - confirm against PBB_Core.WDItemList.
ncbi_gene_wikidata_ids = dict()
# The query is CLAIM[351] (Entrez Gene ID); the progress message previously
# announced Disease Ontology IDs, a copy-paste leftover.
print("Getting all genes with an Entrez Gene ID in WikiData (WDQ)")
wdqQuery = "CLAIM[351]"
ncbi_gene_in_wikidata = PBB_Core.WDItemList(wdqQuery, wdprop="351")
for geneItem in ncbi_gene_in_wikidata.wditems["props"]["351"]:
    ncbi_gene_wikidata_ids[str(geneItem[2])] = geneItem[0]

# maps gene symbols to Gemma/Phenocarta-specific gene IDs, for reference URLs
gnsym_gemma_ids = dict()

# Retrieve gene-disease relationships from Phenocarta.
# stream=True defers downloading the body so it can be consumed line by line.
source = "http://www.chibi.ubc.ca/Gemma/phenocarta/LatestEvidenceExport/AnnotationsByDataset/OMIM.tsv"
result = requests.get(source, stream=True)
for line in result.iter_lines():
    # First separate each tuple into distinct fields.
    values = dict()
    s = str(line)
    fields = s.split("\\t")
    if "#" not in fields[0]:
import sys
import os

sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../../ProteinBoxBot_Core")
import PBB_Core
import PBB_Debug
import PBB_login
import PBB_settings
import pprint
import sys

# Collect items whose property-686 value does not contain the "GO:" prefix.
# The table maps that value (as a string) to row[0] of the query result.
# NOTE(review): presumably row[0] is the Wikidata item id; it is later
# prefixed with 'Q' - confirm against PBB_Core.WDItemList.
gene_ontologydataids = dict()
# The query selects on properties 686 and 351, not on UniProt IDs; the
# progress message previously described a different query.
print('Getting all items with a Gene Ontology ID and an Entrez Gene ID in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[686] and CLAIM[351]", "686")
for goItem in inwikidata.wditems["props"]["686"]:
    go_value = str(goItem[2])
    if "GO:" not in go_value:
        gene_ontologydataids[go_value] = goItem[0]

# Dump the collected items and their count for manual inspection.
pprint.pprint(gene_ontologydataids)
print(len(gene_ontologydataids))
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())
# Per-property statement lists; filled in by the loop that follows.
prep = dict()
for id in gene_ontologydataids.keys():
    print(id)
    wdid = 'Q' + str(gene_ontologydataids[str(id)])
    prep["P686"] = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P686')]
    data2Add = []
    for key in prep.keys():