Example 1
def interwiki_link(entrez, name):
    # Query wikidata for Q-item id (cid)

    cid_query = """
        SELECT ?cid  WHERE {
        ?cid wdt:P351 ?entrez_id  .
        FILTER(?entrez_id ='""" + str(entrez) + """') .
    }
    """

    wikidata_results = PBB_Core.WDItemEngine.execute_sparql_query(
        prefix=settings.PREFIX, query=cid_query)['results']['bindings']
    cid = ''
    for x in wikidata_results:
        cid = x['cid']['value'].split('/')[-1]

    # create interwiki link
    # placeholder credentials: replace with a real Wikidata username and password
    username = 'your_username'
    password = 'your_password'
    # create your login object with your user and password (or the ProteinBoxBot account?)
    login_obj = PBB_login.WDLogin(user=username, pwd=password)
    # load the gene Wikidata object
    wd_gene_item = PBB_Core.WDItemEngine(wd_item_id=cid)
    # set the interwiki link to the correct Wikipedia page
    wd_gene_item.set_sitelink(site='enwiki', title=name)
    # write the changes to the item
    wd_gene_item.write(login_obj)
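
A minimal usage sketch (not part of the original): with the imports and settings above in place, the call below would link the Wikidata item carrying Entrez gene ID 1017 (the human gene CDK2) to the English Wikipedia article of the same name.

interwiki_link(entrez=1017, name='CDK2')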
Example 2
def main():
    """
    This function undo gene to protein merges. For that, a query searches for WD items which have the
    Entrez gene ID (P351) and Uniprot ID (P352) on one item. Bases on that, it generates instances of MergeDefender
    and undoes the merges. 
    :return: None
    """
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    conflict_set_1 = {'P351'}
    conflict_set_2 = {'P352'}

    likely_merged_ids = PBB_Core.WDItemList(wdquery='CLAIM[351] AND CLAIM[352]')
    print(likely_merged_ids.wditems['items'])

    for count, x in enumerate(likely_merged_ids.wditems['items']):
        print('\n', count)
        print('Q{}'.format(x))

        try:

            MergeDefender(login, merge_target='Q{}'.format(x), conflict_set_1=conflict_set_1, conflict_set_2=conflict_set_2)

        except Exception as e:
            traceback.print_exc()
            PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}"'.format(
                        main_data_id=x,
                        exception_type=type(e),
                        message=e.__str__(),
                    ))
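
A hypothetical invocation (the script name is not given in the source); the bot password is expected as the first command-line argument:

python undo_merges.py <bot-password>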
Example 3
def main():
    parser = argparse.ArgumentParser(
        description='Gene Ontology prefix cleaner')
    parser.add_argument('--user',
                        action='store',
                        help='Username on Wikidata',
                        required=True)
    parser.add_argument('--pwd',
                        action='store',
                        help='Password on Wikidata',
                        required=True)
    parser.add_argument('--prefix',
                        action='store',
                        help='The prefix which should be added',
                        required=True)
    parser.add_argument('--prop-nr',
                        action='store',
                        help='The Wikidata property number where the '
                        'prefixes need to be checked and fixed',
                        required=True)
    parser.add_argument('--separator',
                        action='store',
                        help='The separator character between prefix '
                        'and actual identifier. ":" as default.',
                        required=False,
                        default=':')

    args = parser.parse_args()
    print(args.user, args.pwd, args.prefix, args.prop_nr, args.separator)
    login = PBB_login.WDLogin(user=args.user, pwd=args.pwd)

    GOCleaner(login,
              prop_nr=args.prop_nr,
              prefix_str=args.prefix,
              separator=args.separator)
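
A hypothetical command line, assuming the snippet is saved as go_cleaner.py; P686 is the Wikidata property for Gene Ontology IDs, as used elsewhere in these examples:

python go_cleaner.py --user ProteinBoxBot --pwd <password> --prefix GO --prop-nr P686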
Example 4
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    # biological process (GO:0008150), molecular function (GO:0003674), cellular component (GO:0005575) (Q5058355)
    root_objects = ['0008150', '0003674', '0005575']

    # continue_at = ''
    # stop_at = ''

    file_name = 'temp_GO_onto_map.json'
    if os.path.exists(file_name):
        f = open(file_name, 'r')
        local_qid_onto_map = json.loads(f.read())
        f.close()
    else:
        local_qid_onto_map = {}

    # Ontology ref item is the Wikidata 'Gene Ontology' item
    OBOImporter(root_objects=root_objects,
                ontology='GO',
                core_property_nr='P686',
                ontology_ref_item='Q135085',
                login=login,
                local_qid_onto_map=local_qid_onto_map,
                use_prefix=True,
                fast_run=True,
                fast_run_base_filter={'P686': ''})
Example 5

    def __init__(self):
        self.start = time.time()
        self.content = ET.fromstring(self.download_disease_ontology())
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        # self.updateDiseaseOntologyVersion()

        # Get all WikiData entries that contain a WikiData ID
        print("Getting all terms with a Disease Ontology ID in WikiData")
        doWikiData_id = dict()
        DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

        print("Getting latest version of Disease Ontology from Github")
        r = requests.get(
            "https://api.github.com/repos/DiseaseOntology/HumanDiseaseOntology/git/refs"
        )
        test = r.json()
        sha = test[0]["object"]["sha"]
        githubReferenceUrl = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/" + sha + "/src/ontology/doid.owl"

        for diseaseItem in DoInWikiData.wditems["props"]["699"]:
            # diseaseItem[2] = DO identifier, diseaseItem[0] = WD item ID
            doWikiData_id[str(diseaseItem[2])] = diseaseItem[0]

        for doClass in self.content.findall(
                './/owl:Class', DiseaseOntology_settings.getDoNameSpaces()):
            try:
                disVars = []
                disVars.append(doClass)
                disVars.append(githubReferenceUrl)
                disVars.append(doWikiData_id)
                disVars.append(self.logincreds)
                disVars.append(self.start)

                diseaseClass = disease(disVars)

                print("do_id: " + diseaseClass.do_id)
                print(diseaseClass.wdid)
                print(diseaseClass.name)
                print(diseaseClass.synonyms)
                print(diseaseClass.xrefs)
            except Exception as e:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=diseaseClass.do_id,
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='-',
                            duration=time.time() - self.start))
                f = open('/tmp/Diseaseexceptions.txt', 'a')
                # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
                f.write(diseaseClass.do_id + "\n")
                #f.write(diseaseClass.wd_json_representation)
                traceback.print_exc(file=f)
                f.close()
Example 6
def main():
    pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=pwd)

    # for mouse genes
    # LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:83310]').wditems['items'], {'gène': 'gène de souris'},
    #                  'fr', login)

    # for human genes
    LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]').wditems['items'], {'gène': 'gène humain'},
                     'fr', login)
Example 7
    def __init__(self):
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        # Get all WikiData entries that contain a WikiData ID
        print("Getting all terms with a Gene Ontology ID in WikiData")
        goWikiData_id = dict()
        goInWikiData = PBB_Core.WDItemList("CLAIM[686]", "686")
        for goItem in goInWikiData.wditems["props"]["686"]:
            # goItem[2] = GO identifier, goItem[0] = WD item ID
            goWikiData_id[str(goItem[2])] = goItem[0]
        print(len(goWikiData_id.keys()))
        # sys.exit()  # debug stop; if executed, the code below never runs
        graph = rdflib.Graph()

        goUrl = requests.get("http://purl.obolibrary.org/obo/go.owl")

        print("ja")
        graph.parse(data=goUrl.text, format="application/rdf+xml")

        cls = URIRef("http://www.w3.org/2002/07/owl#Class")
        subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
        counter = 0
        for gouri in graph.subjects(RDF.type, cls):
            try:
                counter = counter + 1
                print(counter)
                goVars = dict()
                goVars["uri"] = gouri
                goVars["label"] = graph.label(URIRef(gouri))
                goVars["wikidata_id"] = goWikiData_id
                goVars["logincreds"] = self.logincreds
                goVars["start"] = self.start
                goVars["graph"] = graph
                if "GO" in gouri:
                    goClass = goTerm(goVars)

            except Exception as e:
                print(traceback.format_exc())
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=gouri,
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='-',
                            duration=time.time() - self.start))
Example 8
    def __init__(self):
        self.content = json.loads(self.download_mouse_proteins())
        # print self.content["results"]["bindings"]
        self.protein_count = len(self.content["results"]["bindings"])
        self.proteins = self.content["results"]["bindings"]
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        uniprotWikidataIds = dict()
        print "Getting all proteins with a uniprot ID in Wikidata"
        InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[352]",
                                         "352")

        r0 = requests.get(
            "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a10090+.%0d%0a%7d&format=srj"
        )

        for proteinItem in InWikiData.wditems["props"]["352"]:
            try:
                uniprotWikidataIds[str(proteinItem[2])] = proteinItem[0]
                r = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(proteinItem[2]) +
                    "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName+%3fupversion&format=srj"
                )
                # r = requests.get("http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"+str(proteinItem[2])+"%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName&format=srj")
                print(r.text)
                protein = json.loads(r.text)
                protein["logincreds"] = self.logincreds
                protein["wdid"] = 'Q' + str(proteinItem[0])
                print(protein)
                proteinClass = mouse_protein(protein)

            except Exception:
                # client = Client('http://*****:*****@sentry.sulab.org/9')
                # client.captureException()
                print("There has been an exception")
                print("Unexpected error:", sys.exc_info()[0])

                f = open('/tmp/exceptions.txt', 'a')
                # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
                f.write(
                    str(protein["results"]["bindings"][0]["uniprot"]["value"])
                    + "\n")
                traceback.print_exc(file=f)
                f.close()
Example 9
    def __init__(self):
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
        # Get all WikiData entries that contain a WikiData ID
        print("Getting all terms with a Uberon ID in WikiData")
        ubWikiData_id = dict()
        ubInWikiData = PBB_Core.WDItemList("CLAIM[1554]", "1554")
        for uberonItem in ubInWikiData.wditems["props"]["1554"]:
            # uberonItem[2] = Uberon identifier, uberonItem[0] = WD item ID
            ubWikiData_id[str(uberonItem[2])] = uberonItem[0]
        graph = rdflib.Graph()

        ubUrl = requests.get("http://purl.obolibrary.org/obo/uberon.owl")

        print("ja")
        graph.parse(data=ubUrl.text, format="application/rdf+xml")

        cls = URIRef("http://www.w3.org/2002/07/owl#Class")
        subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
        for uberonuri in graph.subjects(RDF.type, cls):
            try:
                uberonVars = dict()
                uberonVars["uberon"] = uberonuri
                uberonVars["uberonLabel"] = graph.label(URIRef(uberonuri))
                uberonVars["wikidata_id"] = ubWikiData_id
                uberonVars["logincreds"] = self.logincreds
                uberonVars["start"] = self.start
                uberonVars["graph"] = graph
                if "UBERON" in uberonuri:
                    uberonClass = uberonTerm(uberonVars)

            except Exception as e:
                print(traceback.format_exc())
                PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id=uberonuri,
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id='-',
                        duration=time.time() - self.start
                    ))
Example 10
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    root_objects = ['11946']

    OBOImporter.obo_synonyms = {
        'SMILES': 'P233',
        'InChIKey': 'P235',
        'FORMULA': 'P274'
    }

    file_name = 'temp_GO_onto_map.json'
    if os.path.exists(file_name):
        f = open(file_name, 'r')
        local_qid_onto_map = json.loads(f.read())
        f.close()
    else:
        local_qid_onto_map = {}

    # Ontology ref item is the Wikidata 'ChEBI' item (Q902623)
    OBOImporter(root_objects=root_objects, ontology='CHEBI', core_property_nr='P683',
                ontology_ref_item='Q902623', login=login, local_qid_onto_map=local_qid_onto_map)
Example 11
    def __init__(self):
        self.start = time.time()
        self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                            PBB_settings.getWikiDataPassword())
        uniprotwikidataids = dict()
        genesymbolwdmapping = dict()

        print('Getting all proteins with a uniprot ID in Wikidata...')
        inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352")
        for proteinItem in inwikidata.wditems["props"]["352"]:
            uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]

        print('Getting all human genes with a ncbi gene ID in Wikidata...')
        entrezWikidataIds = dict()
        print("wdq 1")
        wdqQuery = "CLAIM[703:5] AND CLAIM[351]"

        InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
        '''
        Below a mapping is created between entrez gene ids and wikidata identifiers.
        '''
        for geneItem in InWikiData.wditems["props"]["351"]:
            entrezWikidataIds[str(geneItem[2])] = geneItem[0]

        print("Getting all human proteins from Uniprot...")
        # r0 = requests.get("http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj")
        r0 = requests.get(
            'http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj'
        )
        prot_results = r0.json()
        uniprot_ids = []
        for protein in prot_results["results"]["bindings"]:
            item = dict()
            item["id"] = protein["protein"]["value"].replace(
                "http://purl.uniprot.org/uniprot/", "")
            item["label"] = protein["protein_label"]["value"]
            uniprot_ids.append(item)

        for up in uniprot_ids:
            try:
                #if up["id"] not in uniprotwikidataids:
                '''
                    Get protein annotations from Uniprot
                    '''
                #r = requests.get(
                #    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f" +
                #    str(up["id"]) +
                #    "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09%09%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj")

                r = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3fncbiGene%3b+separator%3d%22%3b+%22)+as+%3fgene_id)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(up["id"]) +
                    "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09++++%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++optional%7b%3funiprot+rdfs%3aseeAlso+%3fncbiGene+.%0d%0a++++++++%3fncbiGene+up%3adatabase+database%3aGeneID+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj"
                )

                protein = r.json()
                if len(protein["results"]["bindings"]) == 0:
                    raise Exception("Communication error on " + up["id"])
                #if "results" not in protein.keys():
                '''
                    Get go annotations from Uniprot
                    '''
                r2 = requests.get(
                    "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e+%0d%0aSELECT+DISTINCT+%3fprotein+%3fgo+%3fgoLabel+%3fparentLabel%0d%0aWHERE%0d%0a%7b%0d%0a++%09%09VALUES+%3fprotein+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                    + str(up["id"]) +
                    "%3e%7d%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+up%3aclassifiedWith+%3fgo+.+++%0d%0a++++++++%3fgo+rdfs%3alabel+%3fgoLabel+.%0d%0a++++++++%3fgo+rdfs%3asubClassOf*+%3fparent+.%0d%0a++++++++%3fparent+rdfs%3alabel+%3fparentLabel+.%0d%0a++++++++optional+%7b%3fparent+rdfs%3asubClassOf+%3fgrandParent+.%7d%0d%0a++++++++FILTER+(!bound(%3fgrandParent))%0d%0a%7d&format=srj"
                )
                go_terms = r2.json()

                protein["goTerms"] = go_terms
                protein["logincreds"] = self.logincreds
                # protein["label"] = up["label"]
                protein["id"] = up["id"]
                protein["start"] = self.start
                protein["geneSymbols"] = genesymbolwdmapping
                protein["entrezWikidataIds"] = entrezWikidataIds
                protein_class = HumanProtein(protein)
            # else:
            #     print(up["id"] + " already covered in wikidata")

            except Exception as e:
                print(traceback.format_exc())
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                    .format(main_data_id=up["id"],
                            exception_type=type(e),
                            message=e.__str__(),
                            wd_id='-',
                            duration=time.time() - self.start))
Example 12

import sys
import time
import pprint
import traceback
import requests
import xml.etree.ElementTree as ET

try:
    import simplejson as json
except ImportError:
    import json

import PBB_login
import PBB_settings
import DiseaseOntology_settings

start = time.time()
if len(sys.argv) == 1:
    print("Please provide an Disease Ontology ID")
    print("Example: python single_disease_bot.py 628")
    sys.exit()

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                               PBB_settings.getWikiDataPassword())
content = ET.fromstring(requests.get(DiseaseOntology_settings.getdoUrl()).text)
doDate = content.findall('.//oboInOwl:date',
                         DiseaseOntology_settings.getDoNameSpaces())
doversion = content.findall('.//owl:versionIRI',
                            DiseaseOntology_settings.getDoNameSpaces())
dateList = doDate[0].text.split(' ')[0].split(":")
searchTerm = "Disease ontology release " + dateList[2] + "-" + dateList[
    1] + "-" + dateList[0]

url = 'https://www.wikidata.org/w/api.php'
params = {
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
Example 13
def main():
    def read_file(url):
        if not os.path.exists('./data'):
            os.makedirs('./data')

        file_name = url.split('/')[-1]
        file_path = './data/{}'.format(file_name)

        if not read_local or not os.path.isfile(file_path):
            requests_ftp.monkeypatch_session()
            s = requests.Session()

            if url.startswith('ftp://'):
                reply = s.retr(url, stream=True)
            else:
                reply = s.get(url, stream=True)

            with open(file_path, 'wb') as f:
                for chunk in reply.iter_content(chunk_size=2048):
                    if chunk:
                        f.write(chunk)
                        f.flush()

        if file_name.endswith('.gz'):
            f = gzip.open(file_path, 'rt')
        else:
            f = open(file_path, 'rt')

        cnt = 0
        for line in f:
            cnt += 1
            if cnt % 100000 == 0:
                print('count: ', cnt)

            yield line

    def get_uniprot_for_entrez():
        # query WD for all entrez IDs (eukaryotic)
        # query Uniprot for all high quality annotated Uniprots based on the entrez id.

        query = '''
        SELECT * WHERE {
            ?gene wdt:P351 ?entrez .
            {?gene wdt:P703 wd:Q5}
            UNION {?gene wdt:P703 wd:Q83310} .
            {?gene wdt:P354 ?res_id}
            UNION {?gene wdt:P671 ?res_id} .
        }
        '''
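        # Annotation: in the query above, P351 = Entrez Gene ID, P703 = found in
        # taxon (wd:Q5 human, wd:Q83310 house mouse), P354 = HGNC ID and
        # P671 = Mouse Genome Informatics ID.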

        results = PBB_Core.WDItemEngine.execute_sparql_query(
            query=query)['results']['bindings']

        entrez_to_qid = dict()
        global res_id_to_entrez_qid
        res_id_to_entrez_qid = dict()

        for z in results:
            # ensure that the correct prefix exists so the identifier can be found in the Uniprot XML file
            res_id = z['res_id']['value']
            entrez_qid = z['gene']['value'].split('/')[-1]
            entrez_id = z['entrez']['value']
            if len(res_id.split(':')) <= 1:
                res_id = 'HGNC:' + z['res_id']['value']

            entrez_to_qid[entrez_id] = (entrez_qid, res_id)
            res_id_to_entrez_qid.update({res_id: (entrez_qid, entrez_id)})

        print('Wikidata Entrez query complete')

        uniprot_to_qid = get_all_wd_uniprots()
        print('Wikidata Uniprot query complete')

        up_prefix = '''
        PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
        PREFIX up:<http://purl.uniprot.org/core/>
        PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        '''

        headers = {
            'content-type': 'application/sparql-results+json',
            'charset': 'utf-8'
        }

        up_query = '''
        SELECT DISTINCT * WHERE {
            ?uniprot rdfs:seeAlso ?gene .
            ?uniprot up:reviewed ?reviewed .
            {?uniprot up:organism taxon:9606}
            UNION {?uniprot up:organism taxon:10090} .

            FILTER regex(?gene, "^http://purl.uniprot.org/geneid/")
        }
        GROUP BY ?uniprot ?gene ?reviewed
        '''

        query_string = up_prefix + up_query

        data = {'format': 'srj', 'query': query_string}

        if read_local and os.path.isfile('uniprot_entrez_map.json'):
            with open('uniprot_entrez_map.json', 'r') as f:
                results = json.load(f)
        else:
            reply = requests.post(url='http://sparql.uniprot.org/sparql/',
                                  params=data,
                                  headers=headers)
            results = reply.json()

            with open('uniprot_entrez_map.json', 'w') as of:
                json.dump(results, of)

        print('Uniprot query complete')
        uniprot_map = dict()

        for ids in results['results']['bindings']:
            entrez_id = ids['gene']['value'].split('/')[-1]
            uniprot_id = ids['uniprot']['value'].split('/')[-1]

            reviewed = False
            if ids['reviewed']['value'] == 'true':
                reviewed = True

            if reviewed or (not reviewed and uniprot_id in uniprot_to_qid):
                if entrez_id not in entrez_to_qid:
                    print('Entrez ID {} not in Wikidata'.format(entrez_id))
                    continue

                if uniprot_id not in uniprot_to_qid:
                    protein_qid = ''
                else:
                    protein_qid = uniprot_to_qid[uniprot_id]

                uniprot_map[uniprot_id] = {
                    'entrez': {
                        'id': entrez_id,
                        'qid': entrez_to_qid[entrez_id][0],
                        'res_id': entrez_to_qid[entrez_id][1]
                    },
                    'qid': protein_qid
                }

        # Uniprot items in WD without a link to a gene should also be updated, therefore add them to uniprot_map,
        # keep entrez empty.
        for wd_protein_item in uniprot_to_qid:
            if wd_protein_item not in uniprot_map:
                uniprot_map[wd_protein_item] = {
                    'entrez': {
                        'id': '',
                        'qid': '',
                        'res_id': ''
                    },
                    'qid': uniprot_to_qid[wd_protein_item]
                }

        return uniprot_map

    def get_all_wd_uniprots():
        query = '''
        SELECT * WHERE {
            ?protein wdt:P352 ?uniprot .
            {?protein wdt:P703 wd:Q5}
            UNION {?protein wdt:P703 wd:Q83310} .
        }
        '''

        results = PBB_Core.WDItemEngine.execute_sparql_query(
            query=query)['results']['bindings']

        return {
            z['uniprot']['value']: z['protein']['value'].split('/')[-1]
            for z in results
        }

    def get_go_map():
        query = '''
        SELECT * WHERE {
            ?qid wdt:P686 ?go .
        }
        '''

        results = PBB_Core.WDItemEngine.execute_sparql_query(
            query=query)['results']['bindings']

        go_to_qid = dict()
        for z in results:
            go_to_qid[z['go']['value']] = {
                'qid': z['qid']['value'].split('/')[-1],
                'go_class_prop': ''
            }

        return go_to_qid

    def get_pdb_to_uniprot():
        file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz'

        pdb_uniprot_map = dict()

        for c, line in enumerate(read_file(file)):
            if c < 2:
                print(line)
                continue

            dt = line.strip('\n').split(',')

            if dt[0] not in pdb_uniprot_map:
                pdb_uniprot_map[dt[0]] = dt[1].split(';')

        return pdb_uniprot_map

    def const_go_map():
        base_dict = {'go_terms': list(), 'evidence': list(), 'pdb': set()}

        file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_go.csv.gz'

        pdb_go_map = dict()

        for c, line in enumerate(read_file(file)):
            if c < 2:
                print(line)
                continue

            dt = line.strip('\n').split(',')
            uniprot = copy.copy(dt[2])

            if uniprot not in base_map:
                continue

            if uniprot not in pdb_go_map:
                pdb_go_map[uniprot] = copy.deepcopy(base_dict)

            pdb_go_map[uniprot]['go_terms'].append(dt[-1])
            pdb_go_map[uniprot]['evidence'].append(dt[-2])

            pdb_go_map[uniprot]['pdb'].add(dt[0])

        print('total number of PDBs', len(pdb_go_map))

        pdb_to_uniprot = get_pdb_to_uniprot()
        for uniprot in pdb_to_uniprot:
            if uniprot in pdb_go_map:
                pdb_go_map[uniprot]['pdb'].update(pdb_to_uniprot[uniprot])
            else:
                pdb_go_map[uniprot] = copy.deepcopy(base_dict)
                pdb_go_map[uniprot]['pdb'] = set(pdb_to_uniprot[uniprot])

        entrez_to_uniprot = {
            base_map[z]['entrez']['id']: z
            for z in base_map if base_map[z]['entrez']['id']
        }

        # Download and process latest human and mouse GO term annotation files
        files = [
            'http://geneontology.org/gene-associations/gene_association.goa_human.gz',
            'http://geneontology.org/gene-associations/gene_association.mgi.gz'
        ]

        for file in files:
            for line in read_file(file):
                if line.startswith('!'):
                    continue

                cols = line.split('\t')
                uniprot = cols[1]
                go_id = cols[4]
                evidence = cols[6]
                go_class = cols[8]
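                # Annotation: the GAF association files are tab-separated;
                # column 0 is the source DB (e.g. 'MGI'), column 1 the object ID
                # (UniProt accession or MGI ID), column 4 the GO ID, column 6
                # the evidence code and column 8 the GO aspect (P/F/C).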

                if cols[0] == 'MGI':
                    try:
                        mgi = cols[1]
                        entrez = res_id_to_entrez_qid[mgi][1]
                        uniprot = entrez_to_uniprot[entrez]
                    except KeyError:
                        continue

                if uniprot not in pdb_go_map:
                    pdb_go_map[uniprot] = copy.deepcopy(base_dict)

                pdb_go_map[uniprot]['go_terms'].append(go_id)
                pdb_go_map[uniprot]['evidence'].append(evidence)

                try:
                    go_prop_map[go_id][
                        'go_class_prop'] = ProteinBot.get_go_class(
                            go_id, go_class)
                except KeyError:
                    print('GO term {} not yet in Wikidata'.format(go_id))
                    continue

        return pdb_go_map

    parser = argparse.ArgumentParser(description='ProteinBot parameters')
    parser.add_argument('--run-locally',
                        action='store_true',
                        help='Locally stored data files and run progress '
                        'will be used; also acts as if continuing a previous run.')
    parser.add_argument('--user',
                        action='store',
                        help='Username on Wikidata',
                        required=True)
    parser.add_argument('--pwd',
                        action='store',
                        help='Password on Wikidata',
                        required=True)
    args = parser.parse_args()

    read_local = args.run_locally

    login = PBB_login.WDLogin(user=args.user, pwd=args.pwd)

    # generate a basic mapping of Uniprot to Entrez and Wikidata genes and proteins
    base_map = get_uniprot_for_entrez()

    # generate mappings of GO terms to their Wikidata QIDs
    go_prop_map = get_go_map()

    # generate a map of Uniprot IDs with the matching PDB IDs, GO terms and GO evidence codes
    pdb_to_go = const_go_map()

    if read_local and os.path.isfile('uniprot_progress.json'):
        with open('uniprot_progress.json', 'r') as infile:
            progress = json.load(infile)
    else:
        progress = dict()

    for count, x in enumerate(base_map):
        if x in progress:
            continue

        pprint.pprint(x)
        pprint.pprint(base_map[x])
        ProteinBot(uniprot=x,
                   base_map=base_map,
                   pdb_to_go=pdb_to_go,
                   go_prop_map=go_prop_map,
                   login=login,
                   progress=progress)

        with open('uniprot_progress.json', 'w') as outfile:
            json.dump(progress, outfile)
Example 14
        "   Usage: MicrobeBotModularPackage.py <Wikidata user name> <Wikidata Password> <run number> <domain "
        "i.e. genes/proteins/encode_genes/encode_proteins>, <number of genomes to process> "
    )
    sys.exit()
else:
    pass


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for c in range(0, len(l), n):
        yield l[c:c + n]
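
# Illustration (not part of the original script):
#   list(chunks([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]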


# Login to Wikidata with bot credentials
login = PBB_login.WDLogin(sys.argv[1], sys.argv[2])

# Retrieve Current Bacterial Reference Genomes from NCBI
print('Retrieving current list of NCBI Bacterial Reference Genomes')
print('Standby...')

genome_records = MBR.get_ref_microbe_taxids()
ref_taxids = genome_records['taxid'].tolist()
# break up list of taxids into chunks of 5 for subruns
count = 0
runs_list = chunks(ref_taxids, int(sys.argv[5]))

taxids = {}

for i in runs_list:
    count += 1
Example 15

def main():
    prefix = '''
    PREFIX schema: <http://schema.org/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    '''

    query = '''
    SELECT ?entrez_id ?cid ?article ?label WHERE {
        ?cid wdt:P351 ?entrez_id .
        ?cid wdt:P703 wd:Q5 .
        OPTIONAL {
            ?cid rdfs:label ?label filter (lang(?label) = "en") .
        }
        ?article schema:about ?cid .
        ?article schema:inLanguage "en" .
        FILTER (SUBSTR(str(?article), 1, 25) = "https://en.wikipedia.org/") .
        FILTER (SUBSTR(str(?article), 1, 38) != "https://en.wikipedia.org/wiki/Template")
    }
    '''

    print(sys.argv[1])

    sparql_results = PBB_Core.WDItemEngine.execute_sparql_query(query=query, prefix=prefix)

    curr_date = datetime.datetime.now()
    end_date = datetime.date(year=curr_date.year, month=curr_date.month, day=1) - datetime.timedelta(days=1)
    start_date = datetime.date(year=end_date.year, month=end_date.month, day=1)

    total_views = 0
    from_timestamp = '{}00'.format(start_date.strftime('%Y%m%d'))
    to_timestamp = '{}00'.format(end_date.strftime('%Y%m%d'))
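    # Note: the Wikimedia per-article pageviews REST endpoint used below expects
    # timestamps in YYYYMMDDHH format, hence the appended '00'.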

    all_items = list()
    url = 'https://en.wikipedia.org/w/api.php'

    for count, i in enumerate(sparql_results['results']['bindings']):
        article = i['article']['value'].split('/')[-1]
        print(article)

        r = requests.get(url='https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
                             'en.wikipedia/all-access/user/{}/daily/{}/{}'.format(article, from_timestamp,
                                                                                  to_timestamp))
        article_views = 0

        if 'items' in r.json():

            for day in r.json()['items']:
                article_views += day['views']

            total_views += article_views

            params = {
                'action': 'query',
                'prop': 'pageprops|info',
                'titles': urllib.parse.unquote(article),
                'format': 'json'
            }

            page_size = 0
            size_results = requests.get(url=url, params=params).json()['query']['pages']
            for x in size_results.values():
                page_size = x['length']

            all_items.append((urllib.parse.unquote(article), article_views, page_size))

            # do some printing for the user
            print(count, 'article views: ', article_views, 'total views: ', total_views,
                  'mean views: ', total_views/(count + 1), 'page size:', page_size)

            if count % 100 == 0:
                # print top accessed pages
                all_items.sort(key=lambda z: z[1], reverse=True)
                pprint.pprint(all_items[0:10])

                # print largest pages
                all_items.sort(key=lambda z: z[2], reverse=True)
                pprint.pprint(all_items[0:10])

        else:
            pprint.pprint(r.text)

    # final sort and print top accessed pages
    all_items.sort(key=lambda z: z[1], reverse=True)
    pprint.pprint(all_items[0:10])
    table_data = [all_items[0:10]]

    # print largest pages
    all_items.sort(key=lambda z: z[2], reverse=True)
    pprint.pprint(all_items[0:10])
    table_data.append(all_items[0:10])

    login = PBB_login.WDLogin(user='******', pwd=sys.argv[1], server='en.wikipedia.org')

    # get page text
    params = {
        'action': 'query',
        'titles': 'Portal:Gene_Wiki/Quick_Links',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }

    page_text = [x['revisions'][0]['*']
                 for x in requests.get(url=url, params=params).json()['query']['pages'].values()][0]

    re_pattern = re.match(re.compile('^{.*?}', re.DOTALL), page_text)

    wp_string = \
        '''{{| align="right" border="1" style="text-align:center" cellpadding="0" cellspacing="0" class="wikitable"
        |+ Top Gene Wiki articles (as of {}. 1, {})
        ! Rank !! by size (word count) !! by page views in {}., {}{}
        |}}'''

    wp_table_row = '''
    |-
    |{0}
    | [[{1}]]
    | [[{2}]]'''

    tmp_string = ''
    for i in range(1, 11):
        tmp_string += wp_table_row.format(i, table_data[1][i - 1][0], table_data[0][i - 1][0])

    table_string = wp_string.format(curr_date.strftime("%B")[0:3], curr_date.year, end_date.strftime("%B")[0:3],
                                    end_date.year, tmp_string)
    print(table_string + page_text[re_pattern.end():])

    params = {
        'action': 'edit',
        'title': 'Portal:Gene_Wiki/Quick_Links',
        'section': '0',
        'text': table_string + page_text[re_pattern.end():],
        'token': login.get_edit_token(),
        'format': 'json'
    }

    r = requests.post(url=url, data=params, cookies=login.get_edit_cookie())
    pprint.pprint(r.json())
Example 16
def main():
    # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication/?molecule_chembl_id=CHEMBL1637&limit=100&format=json'
    # params = {
    #     'molecule_chembl_id': 'CHEMBL1637',
    #     'limit': '1000',
    #     'format': 'json'
    # }
    #
    # url = 'https://www.ebi.ac.uk/chembl/api/data/drug_indication'
    #
    # r = requests.get(url, params=params)
    # pprint.pprint(r.json())
    #
    # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication.json?limit=1000&offset=0'

    # get_parent_molecule('CHEMBL2364968')

    chembl_wd_map = get_id_wd_map('P592')
    mesh_wd_map = get_id_wd_map('P486')
    ndfrt_wd_map = get_id_wd_map('P2115')
    wd_ndfrt_map = {ndfrt_wd_map[x]: x for x in ndfrt_wd_map}

    # drug_disease_map maps a drug QID to a dict with the keys 'disease_qid' and
    # 'source_id'; the values are lists of disease item QIDs and of the NDF-RT or
    # ChEMBL identifiers used as references.

    drug_disease_map = dict()
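    # Hypothetical shape, for illustration only:
    #   {'<drug QID>': {'disease_qid': ['<disease QID>', ...],
    #                   'source_id': ['<NDF-RT NUI or ChEMBL ID>', ...]}}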

    if os.path.isfile('drug_disease.json'):
        with open('drug_disease.json', 'r') as infile:
            drug_disease_map = json.load(infile)

    for nui in ndfrt_wd_map:
        diseases = get_ndfrt_drug_links(nui)
        drug_qid = ndfrt_wd_map[nui]
        for disease_mesh in diseases:
            if not disease_mesh:
                continue
            elif disease_mesh in mesh_wd_map:
                disease_qid = mesh_wd_map[disease_mesh]
            else:
                print('Disease not found in Wikidata:', disease_mesh, diseases[disease_mesh])
                continue

            if drug_qid in drug_disease_map:
                drug_disease_map[drug_qid]['disease_qid'].append(disease_qid)
                drug_disease_map[drug_qid]['source_id'].append(nui)
            else:
                drug_disease_map.update({
                    drug_qid: {
                        'disease_qid': [disease_qid],
                        'source_id': [nui]
                    }
                })

    # pprint.pprint(drug_disease_map)

    if os.path.isfile('full_drug_disease_map.json'):
        with open('full_drug_disease_map.json', 'r') as infile:
            drug_disease_map = json.load(infile)
    else:
        all_indications = get_all_chembl_indications()
        all_indications.to_csv('all_chembl_indications.csv', index=False)

        unique_chembl_ids = all_indications['molecule_chembl_id'].unique()
        chembl_to_parent = dict()
        unique_mesh_ids = all_indications['mesh_id'].unique()

        for chembl_id in unique_chembl_ids:
            # print('chembl id:', chembl_id)
            if chembl_id in chembl_wd_map:
                curr_chembl = chembl_id
            else:
                parent_chembl = get_parent_molecule(chembl_id)
                chembl_to_parent.update({chembl_id: parent_chembl})
                curr_chembl = parent_chembl

            if curr_chembl not in chembl_wd_map:
                print(curr_chembl, 'not found in Wikidata')
                continue

            curr_drug_qid = chembl_wd_map[curr_chembl]

            chembl_id_df = all_indications[all_indications['molecule_chembl_id'] == curr_chembl]
            # pprint.pprint(chembl_id_df)

            for x in chembl_id_df.index:
                curr_mesh = chembl_id_df.loc[x, 'mesh_id']
                # print('this current mesh', curr_mesh)
                if pd.notnull(curr_mesh) and curr_mesh in mesh_wd_map:
                    print(curr_chembl, curr_mesh, 'pair found', 'index', x)

                    disease_qid = mesh_wd_map[curr_mesh]
                    if curr_drug_qid in drug_disease_map:
                        if disease_qid not in drug_disease_map[curr_drug_qid]['disease_qid']:
                            drug_disease_map[curr_drug_qid]['disease_qid'].append(disease_qid)
                            drug_disease_map[curr_drug_qid]['source_id'].append(chembl_id)
                    else:
                        drug_disease_map.update({
                            curr_drug_qid: {
                                'disease_qid': [disease_qid],
                                'source_id': [chembl_id]
                            }
                        })

    with open('full_drug_disease_map.json', 'w') as outfile:
        json.dump(drug_disease_map, outfile)

    print(sys.argv[1])
    login = PBB_login.WDLogin(user='******', pwd=sys.argv[1])

    for count, drug in enumerate(drug_disease_map):
        statements = list()

        for c, disease in enumerate(drug_disease_map[drug]['disease_qid']):
            ref_source_id = drug_disease_map[drug]['source_id'][c]
            references = generate_refs(ref_source_id)

            statements.append(PBB_Core.WDItemID(value=disease, prop_nr='P2175', references=references))

        try:
            item = PBB_Core.WDItemEngine(wd_item_id=drug, data=statements)
            item_qid = item.write(login)
            print('successfully written to', item_qid, item.get_label())
        except Exception as e:
            print('write failed to drug item:', drug)
            print(e)

        # if count > 2:
        #     break

    disease_drug_map = {z: {'drug_qid': list(), 'source_id': list()} for x in drug_disease_map
                        for z in drug_disease_map[x]['disease_qid']}

    for count, drug in enumerate(drug_disease_map):
        for c, disease in enumerate(drug_disease_map[drug]['disease_qid']):
            source = drug_disease_map[drug]['source_id'][c]
            disease_drug_map[disease]['drug_qid'].append(drug)
            disease_drug_map[disease]['source_id'].append(source)

    for count, disease in enumerate(disease_drug_map):
        statements = list()

        for c, drug in enumerate(disease_drug_map[disease]['drug_qid']):
            ref_source_id = disease_drug_map[disease]['source_id'][c]
            references = generate_refs(ref_source_id)

            statements.append(PBB_Core.WDItemID(value=drug, prop_nr='P2176', references=references))

        try:
            item = PBB_Core.WDItemEngine(wd_item_id=disease, data=statements)
            item_qid = item.write(login)
            print('successfully written to', item_qid, item.get_label())
        except Exception as e:
            print('write failed to disease item:', disease)
            print(e)
Example 17
def main():

    print(sys.argv[1], sys.argv[2])
    # pwd = input('Password:')
    # assumption: the two command-line arguments printed above are the Wikidata
    # username and password
    login_obj = PBB_login.WDLogin(user=sys.argv[1], pwd=sys.argv[2])

    prefix = '''
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX schema: <http://schema.org/>
    '''

    missing_go_query = '''
        SELECT distinct ?protein ?label WHERE {
          ?protein wdt:P279 wd:Q8054 .
          ?protein wdt:P703 wd:Q5 .
          OPTIONAL {
              ?protein rdfs:label ?label filter (lang(?label) = "en") .
              #?article schema:about ?protein .
          }
          FILTER NOT EXISTS {?protein wdt:P351 ?m} .
          FILTER NOT EXISTS {?protein wdt:P352 ?n} .
          FILTER NOT EXISTS {?protein wdt:P31 wd:Q21996465} .
          FILTER NOT EXISTS {?protein wdt:P31 wd:Q14633939} .
        }
        #GROUP BY ?protein
    '''

    results = PBB_Core.WDItemEngine.execute_sparql_query(prefix=prefix, query=missing_go_query)['results']['bindings']
    start_time = time.time()

    for count, x in enumerate(results):
        protein_qid = x['protein']['value'].split('/')[-1]
        # pprint.pprint(x)
        if 'label' in x:
            label = x['label']['value']
        else:
            print('No label found for', protein_qid)
            continue  # avoid reusing the previous iteration's label


        print_item(protein_qid)

        gene_qid = lookup_symbol(symbol=label)
        print('count:', count, 'Gene QID:', gene_qid)
        if gene_qid is not None:
            decision = input('Merge? (y):')

            if decision == 'y':
                merge(merge_from=protein_qid, merge_to=gene_qid, login_obj=login_obj)

        else:
            # Protein class/family Q417841
            # protein complex Q14633939
            decision = input('Protein class? (p):\nProtein complex? (c)\nSearch (s):')

            if decision == 's':
                s_qids, s_labels, s_descr, s_aliases = get_wd_search_results(search_string=label)

                for s_count, s in enumerate(s_qids):
                    print(s_count, s_qids[s_count], s_labels[s_count], s_descr[s_count], s_aliases[s_count])

                decision = input('Select by number:')
                try:
                    number = int(decision)
                    merge_to_qid = s_qids[number]

                    merge(merge_to=merge_to_qid, merge_from=protein_qid, login_obj=login_obj)
                    continue
                except ValueError:
                    decision = input('\n\nProtein class? (p):\nProtein complex? (c):')

            try:
                if decision == 'p':
                    data = [PBB_Core.WDItemID(value='Q417841', prop_nr='P31')]
                elif decision == 'c':
                    data = [PBB_Core.WDItemID(value='Q14633939', prop_nr='P31')]
                else:
                    continue

                wd_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=data)

                wd_item.write(login=login_obj)

                print('added protein class')
            except Exception as e:
                pprint.pprint(e)
                continue

            pass
Example 18
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    PDBImageFix(login)
Example 19
def main():
    cid_wd_map = get_id_wd_map('P662')
    uniprot_wd_map = get_id_wd_map('P352')
    # pprint.pprint(cid_wd_map)

    interaction_types = {
        'Agonist': 'Q389934',
        'Inhibitor': 'Q427492',
        'Allosteric modulator': 'Q2649417',
        'Antagonist': 'Q410943',
        'Channel blocker': 'Q5072487'
    }

    all_ligands = pd.read_csv('./iuphar_data/ligands.csv',
                              header=0,
                              sep=',',
                              dtype={
                                  'PubChem CID': str,
                                  'PubChem SID': str,
                                  'Ligand id': str
                              },
                              low_memory=False)
    all_interactions = pd.read_csv('./iuphar_data/interactions.csv',
                                   header=0,
                                   sep=',',
                                   dtype={
                                       'ligand_id': str,
                                       'ligand_pubchem_sid': str
                                   },
                                   low_memory=False)

    print(sys.argv[1])
    login = PBB_login.WDLogin(user='******', pwd=sys.argv[1])

    for count, uniprot_id in enumerate(
            all_interactions['target_uniprot'].unique()):
        if uniprot_id in uniprot_wd_map:
            uniprot_id_df = all_interactions[all_interactions['target_uniprot']
                                             == uniprot_id]

            statements = list()
            for sid in uniprot_id_df['ligand_pubchem_sid']:
                try:
                    cid = all_ligands.loc[all_ligands['PubChem SID'] == sid,
                                          'PubChem CID'].iloc[0]
                    iuphar_ligand = all_ligands.loc[
                        all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0]
                    itype = uniprot_id_df.loc[
                        uniprot_id_df['ligand_pubchem_sid'] == sid,
                        'type'].iloc[0]

                    qualifier = []
                    if itype in interaction_types:
                        qualifier.append(
                            PBB_Core.WDItemID(value=interaction_types[itype],
                                              prop_nr='P366',
                                              is_qualifier=True))

                    if cid in cid_wd_map:
                        # print(cid, 'will be added to', uniprot_id)
                        compound_qid = cid_wd_map[cid]
                        statements.append(
                            PBB_Core.WDItemID(
                                value=compound_qid,
                                prop_nr='P129',
                                references=generate_refs(iuphar_ligand),
                                qualifiers=qualifier))
                except IndexError as e:
                    print('No CID found for:', sid, uniprot_id)
                    continue

            if len(statements) == 0:
                continue
            try:
                print(len(statements))
                item = PBB_Core.WDItemEngine(
                    wd_item_id=uniprot_wd_map[uniprot_id], data=statements)
                item_qid = item.write(login)
                # pprint.pprint(item.get_wd_json_representation())
                print('successfully written to', item_qid, item.get_label())

            except Exception as e:
                print(e)

    for count, sid in enumerate(
            all_interactions['ligand_pubchem_sid'].unique()):
        try:
            cid = all_ligands.loc[all_ligands['PubChem SID'] == sid,
                                  'PubChem CID'].iloc[0]
        except IndexError:
            continue
        if cid in cid_wd_map:
            sid_df = all_interactions[all_interactions['ligand_pubchem_sid'] ==
                                      sid]

            statements = list()
            for uniprot in sid_df['target_uniprot']:
                try:
                    # cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0]
                    iuphar_ligand = all_ligands.loc[
                        all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0]
                    # interaction type recorded for this particular target
                    itype = sid_df.loc[sid_df['target_uniprot'] == uniprot,
                                       'type'].iloc[0]

                    qualifier = []
                    if itype in interaction_types:
                        qualifier.append(
                            PBB_Core.WDItemID(value=interaction_types[itype],
                                              prop_nr='P794',
                                              is_qualifier=True))

                    if uniprot in uniprot_wd_map:
                        # print(cid, 'will be added to', uniprot_id)
                        uniprot_qid = uniprot_wd_map[uniprot]
                        statements.append(
                            PBB_Core.WDItemID(
                                value=uniprot_qid,
                                prop_nr='P129',
                                references=generate_refs(iuphar_ligand),
                                qualifiers=qualifier))
                except IndexError as e:
                    print('No Uniprot found for:', uniprot)
                    continue

            if len(statements) == 0:
                continue
            try:
                print(len(statements))
                item = PBB_Core.WDItemEngine(wd_item_id=cid_wd_map[cid],
                                             data=statements)
                item_qid = item.write(login)
                # pprint.pprint(item.get_wd_json_representation())
                print('successfully written to', item_qid, item.get_label())

            except Exception as e:
                print(e)
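A minimal sketch of the generate_refs helper used above, assuming the reference conventions seen elsewhere in these examples (reference URL P854, retrieved P813); the Guide to PHARMACOLOGY URL pattern is an assumption, not taken from the original code:

from time import gmtime, strftime

def generate_refs(iuphar_ligand_id):
    # One reference group per statement: source URL plus retrieval date.
    ref_url = PBB_Core.WDUrl(
        value='http://www.guidetopharmacology.org/GRAC/LigandDisplayForward'
              '?ligandId={}'.format(iuphar_ligand_id),
        prop_nr='P854',
        is_reference=True)
    ref_retrieved = PBB_Core.WDTime(
        strftime("+%Y-%m-%dT00:00:00Z", gmtime()),
        prop_nr='P813',
        is_reference=True)
    return [[ref_url, ref_retrieved]]
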
Esempio n. 20
0
def main():
    current_taxon_id = ''
    current_taxon_qid = ''

    def read_file(url):
        if not os.path.exists('./data'):
            os.makedirs('./data')

        file_name = url.split('/')[-1]
        file_path = './data/{}'.format(file_name)

        if not read_local or not os.path.isfile(file_path):
            requests_ftp.monkeypatch_session()
            s = requests.Session()

            if url.startswith('ftp://'):
                reply = s.retr(url, stream=True)
            else:
                reply = s.get(url, stream=True)

            with open(file_path, 'wb') as f:
                for chunk in reply.iter_content(chunk_size=2048):
                    if chunk:
                        f.write(chunk)
                        f.flush()

        if file_name.endswith('.gz'):
            f = gzip.open(file_path, 'rt')
        else:
            f = open(file_path, 'rt')

        cnt = 0
        for line in f:
            cnt += 1
            if cnt % 100000 == 0:
                print('count: ', cnt)

            yield line

        f.close()
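
    # Usage note (not in the original code): read_file is a generator that
    # downloads the file to ./data/ once (unless a local copy is reused) and
    # then yields it line by line, so callers can stream it lazily, e.g.:
    #
    #     for line in read_file('ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz'):
    #         handle(line)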

    def get_uniprot_for_entrez():
        # query WD for all entrez IDs (eukaryotic)
        # query Uniprot for all high quality annotated Uniprots based on the entrez id.

        query = '''
        SELECT * WHERE {{
            ?gene wdt:P351 ?entrez .
            ?gene wdt:P703 wd:{} .
            OPTIONAL {{
                {{?gene wdt:P354 ?hgnc_id .}} UNION
                {{?gene wdt:P671 ?mgi_id .}}
            }}
        }}
        '''.format(current_taxon_qid)

        results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']

        entrez_to_qid = dict()
        global res_id_to_entrez_qid
        res_id_to_entrez_qid = dict()

        for z in results:
            # ensure that the correct prefix exists so the identifier can be found in the Uniprot XML file
            entrez_qid = z['gene']['value'].split('/')[-1]
            entrez_id = z['entrez']['value']

            res_id = ''
            if 'hgnc_id' in z:
                res_id = z['hgnc_id']['value']
                if len(res_id.split(':')) <= 1:
                    res_id = 'HGNC:' + res_id
            elif 'mgi_id' in z:
                res_id = z['mgi_id']['value']
                if len(res_id.split(':')) <= 1:
                    res_id = 'MGI:' + res_id

            entrez_to_qid[entrez_id] = (entrez_qid, res_id)
            res_id_to_entrez_qid.update({res_id: (entrez_qid, entrez_id)})

        print('Wikidata Entrez query complete')

        uniprot_to_qid = get_all_wd_uniprots()
        print('Wikidata Uniprot query complete')

        up_prefix = '''
        PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
        PREFIX up:<http://purl.uniprot.org/core/>
        PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        '''

        headers = {
            'content-type': 'application/sparql-results+json',
            'charset': 'utf-8'
        }

        up_query = '''
        SELECT DISTINCT * WHERE {{
            ?uniprot rdfs:seeAlso ?gene .
            ?uniprot up:reviewed ?reviewed .
            ?uniprot up:organism taxon:{} .

            FILTER regex(?gene, "^http://purl.uniprot.org/geneid/")
        }}
        GROUP BY ?uniprot ?gene ?reviewed
        '''.format(current_taxon_id)

        query_string = up_prefix + up_query

        data = {
            'format': 'srj',
            'query': query_string
        }

        uniprot_entrez_map_filename = 'uniprot_entrez_map_{}.json'.format(current_taxon_id)
        if read_local and os.path.isfile(uniprot_entrez_map_filename):
            with open(uniprot_entrez_map_filename, 'r') as f:
                results = json.load(f)
        else:
            reply = requests.post(url='http://sparql.uniprot.org/sparql/', params=data, headers=headers)
            results = reply.json()

            with open(uniprot_entrez_map_filename, 'w') as of:
                json.dump(results, of)

        print('Uniprot query complete')
        uniprot_map = dict()

        for ids in results['results']['bindings']:
            entrez_id = ids['gene']['value'].split('/')[-1]
            uniprot_id = ids['uniprot']['value'].split('/')[-1]

            reviewed = False
            if ids['reviewed']['value'] == 'true':
                reviewed = True

            if reviewed or uniprot_id in uniprot_to_qid:
                if entrez_id not in entrez_to_qid:
                    print('Entrez ID {} not in Wikidata'.format(entrez_id))
                    continue

                if uniprot_id not in uniprot_to_qid:
                    protein_qid = ''
                else:
                    protein_qid = uniprot_to_qid[uniprot_id]

                uniprot_map[uniprot_id] = {
                    'entrez': {
                        'id': entrez_id,
                        'qid': entrez_to_qid[entrez_id][0],
                        'res_id': entrez_to_qid[entrez_id][1]
                    },
                    'qid': protein_qid
                }

        # Uniprot items in WD without a link to a gene should also be updated, therefore add them to uniprot_map,
        # keep entrez empty.
        for wd_protein_item in uniprot_to_qid:
            if wd_protein_item not in uniprot_map:
                uniprot_map[wd_protein_item] = {
                    'entrez': {
                        'id': '',
                        'qid': '',
                        'res_id': ''
                    },
                    'qid': uniprot_to_qid[wd_protein_item]
                }

        return uniprot_map

    def get_all_wd_uniprots():
        query = '''
        SELECT * WHERE {{
            ?protein wdt:P352 ?uniprot .
            ?protein wdt:P703 wd:{} .
        }}
        '''.format(current_taxon_qid)

        results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']

        wd_up_map = dict()
        for z in results:
            up = z['uniprot']['value']
            qid = z['protein']['value'].split('/')[-1]

            # Make sure to reliably detect duplicate Uniprot IDs in Wikidata.
            # For performance reasons, this is done here and not by using PBB_core.
            if up in wd_up_map:
                PBB_Core.WDItemEngine.log(
                    'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id='{}'.format(up),
                        exception_type='Duplicate Uniprot ID error.',
                        message='Duplicate Uniprot IDs in Wikidata. Cleanup required!!',
                        wd_id=qid,
                        duration=time.time()
                    ))
            else:
                wd_up_map.update({up: qid})

        return wd_up_map

    def get_go_map():
        query = '''
        SELECT * WHERE {
            ?qid wdt:P686 ?go .
        }
        '''

        results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']

        go_to_qid = dict()
        for z in results:
            go_to_qid[z['go']['value']] = {
                'qid': z['qid']['value'].split('/')[-1],
                'go_class_prop': ''

            }

        return go_to_qid
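
    # Note (assumption, not from the original code): the empty 'go_class_prop'
    # slot is presumably filled in later with the Wikidata property matching
    # the GO term's aspect, i.e. P680 (molecular function), P681 (cell
    # component) or P682 (biological process), as hinted by the commented-out
    # ProteinBot.get_go_class(go_id, go_class) call further below.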

    def get_pdb_to_uniprot():
        file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz'

        pdb_uniprot_map = dict()

        for c, line in enumerate(read_file(file)):
            if c < 2:
                print(line)
                continue

            dt = line.strip('\n').split(',')

            if dt[0] not in pdb_uniprot_map:
                pdb_uniprot_map[dt[0]] = dt[1].split(';')

        return pdb_uniprot_map

    def const_go_map():
        base_dict = {
            'go_terms': list(),
            'evidence': list(),
            'pdb': set()
        }

        file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_go.csv.gz'

        pdb_go_map = dict()

        for c, line in enumerate(read_file(file)):
            if c < 2:
                print(line)
                continue

            dt = line.strip('\n').split(',')
            uniprot = copy.copy(dt[2])

            if uniprot not in base_map:
                continue

            if uniprot not in pdb_go_map:
                pdb_go_map[uniprot] = copy.deepcopy(base_dict)

            pdb_go_map[uniprot]['go_terms'].append(dt[-1])
            pdb_go_map[uniprot]['evidence'].append(dt[-2])

            pdb_go_map[uniprot]['pdb'].add(dt[0])

        print('total number of PDBs', len(pdb_go_map))

        pdb_to_uniprot = get_pdb_to_uniprot()
        for uniprot in pdb_to_uniprot:
            if uniprot in pdb_go_map:
                pdb_go_map[uniprot]['pdb'].update(pdb_to_uniprot[uniprot])
            else:
                pdb_go_map[uniprot] = copy.deepcopy(base_dict)
                pdb_go_map[uniprot]['pdb'] = set(pdb_to_uniprot[uniprot])

        entrez_to_uniprot = {base_map[z]['entrez']['id']: z for z in base_map if base_map[z]['entrez']['id']}

        # Download and process latest human and mouse GO term annotation files
        # files = [
        #     'http://geneontology.org/gene-associations/gene_association.goa_human.gz',
        #     'http://geneontology.org/gene-associations/gene_association.mgi.gz'
        # ]
        #
        # for file in files:
        #     for line in read_file(file):
        #         if line.startswith('!'):
        #             continue
        #
        #         cols = line.split('\t')
        #         uniprot = cols[1]
        #         go_id = cols[4]
        #         evidence = cols[6]
        #         go_class = cols[8]
        #
        #         if cols[0] == 'MGI':
        #             try:
        #                 mgi = cols[1]
        #                 entrez = res_id_to_entrez_qid[mgi][1]
        #                 uniprot = entrez_to_uniprot[entrez]
        #             except KeyError:
        #                 continue
        #
        #         if uniprot not in pdb_go_map:
        #             pdb_go_map[uniprot] = copy.deepcopy(base_dict)
        #
        #         pdb_go_map[uniprot]['go_terms'].append(go_id)
        #         pdb_go_map[uniprot]['evidence'].append(evidence)
        #
        #         try:
        #             go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_class)
        #         except KeyError:
        #             print('GO term {} not yet in Wikidata'.format(go_id))
        #             continue

        return pdb_go_map

    parser = argparse.ArgumentParser(description='ProteinBot parameters')
    parser.add_argument('--run-locally', action='store_true', help='Locally stored data files and run progress '
                                                                   'will be used. Acts also as if continuing a run.')
    parser.add_argument('--user', action='store', help='Username on Wikidata', required=True)
    parser.add_argument('--pwd', action='store', help='Password on Wikidata', required=True)
    parser.add_argument('--taxon-ids', action='store',
                        help='Taxonomy IDs of the species whose proteins should be written. Enter comma-separated, '
                             'e.g. 9606,10090 for human and mouse')
    args = parser.parse_args()

    read_local = args.run_locally

    login = PBB_login.WDLogin(user=args.user, pwd=args.pwd)

    if args.taxon_ids:
        taxon_ids = [x.strip() for x in args.taxon_ids.split(',')]
    else:
        print('No taxon IDs given, falling back to human (9606) and mouse (10090)')
        taxon_ids = ['9606', '10090']

    for ti in taxon_ids:
        current_taxon_id = ti
        current_taxon_qid = ProteinBot.taxon_map[ti]
        progress_file_name = 'uniprot_progress_taxon_{}.json'.format(ti)

        # generate a basic mapping of Uniprot to Entrez and Wikidata genes and proteins
        base_map = get_uniprot_for_entrez()

        # generate mappings of GO terms to their Wikidata QIDs
        go_prop_map = get_go_map()

        # generate a map of Uniprot IDs with the matches PDB IDs, GO term and GO evidence codes
        pdb_to_go = const_go_map()

        if read_local and os.path.isfile(progress_file_name):
            with open(progress_file_name, 'r') as infile:
                progress = json.load(infile)
        else:
            progress = dict()

        for count, x in enumerate(base_map):
            if x in progress:
                continue

            pprint.pprint(x)
            pprint.pprint(base_map[x])
            ProteinBot(uniprot=x, base_map=base_map, pdb_to_go=pdb_to_go, go_prop_map=go_prop_map, login=login,
                       progress=progress, fast_run=True)

            with open(progress_file_name, 'w') as outfile:
            json.dump(progress, outfile)
Esempio n. 21
0
                    PREFIX wd: <http://www.wikidata.org/entity/>
                    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

                    SELECT * WHERE {
                        ?diseases wdt:P699 "DOID:""" + doid + """\"
                    }

                """)
                sparql.setReturnFormat(JSON)
                results = sparql.query().convert()

                disease_wdid = results['results']['bindings'][0]['diseases'][
                    'value'].split("/")[4]
                if results['results']['bindings'][0]['diseases']['value']:
                    login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                              os.environ['wikidataApi'])
                    if not (values["Gene Symbol"] in gnsym_gemma_ids):
                        gemmaGeneIds = "http://sandbox.chibi.ubc.ca/Gemma/rest/phenotype/find-candidate-genes?phenotypeValueUris=" + values[
                            "Phenotype URIs"]
                        result = requests.get(gemmaGeneIds, stream=True).json()
                        for item in result:
                            gnsym_gemma_ids[
                                item['officialSymbol']] = item['id']

                    refURL = PBB_Core.WDUrl(
                        value=
                        'http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_'
                        + doid + '&geneId=' +
                        str(gnsym_gemma_ids[values["Gene Symbol"]]),
                        prop_nr='P854',
                        is_reference=True)
Esempio n. 22
0
                    PREFIX wd: <http://www.wikidata.org/entity/> 
                    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

                    SELECT * WHERE {
                        ?diseases wdt:P699 "DOID:""" + doid + """\"
                    }

                """)
                sparql.setReturnFormat(JSON)
                results = sparql.query().convert()
                pprint.pprint(results)
                # The current Disease Ontology term exists in Wikidata
                if len(results['results']['bindings']) != 0:
                    disease_wdid = results['results']['bindings'][0]['diseases']['value'].split("/")[4]
                    if results['results']['bindings'][0]['diseases']['value']:
                        login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) # put back in when using Jenkins: os.environ['wikidataApi']
                        # Only hit the API endpoint if we do not already have the gene symbol to Gemma ID mapping
                        if not (values["Gene Symbol"] in gnsym_gemma_ids):
                            gemmaGeneIds = "http://sandbox.chibi.ubc.ca/Gemma/rest/phenotype/find-candidate-genes?phenotypeValueUris=" + doid_url
                            result = requests.get(gemmaGeneIds, stream=True).json()
                            for item in result:
                                gnsym_gemma_ids[item['officialSymbol']] = item['id']
                        # not doing for now, until duplicate detection exists (for using qual)
                        # writing diseases to genes
                        refURL = PBB_Core.WDUrl(value='http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_'+doid+'&geneId='+str(gnsym_gemma_ids[values["Gene Symbol"]]), prop_nr='P854', is_reference=True)
                        refURL2 = PBB_Core.WDUrl(value=values["Web Link"], prop_nr='P854', is_reference=True)
                        refImported = PBB_Core.WDItemID(value='Q22330995', prop_nr='P143', is_reference=True)
                        refImported.overwrite_references = True
                        refStated = PBB_Core.WDItemID(value='Q22978334', prop_nr='P248', is_reference=True)
                        timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
                        refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
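A minimal sketch (not part of the original snippet) showing how reference objects like those built above are attached to a statement, following the pattern of the IUPHAR example above; the property number P2293 (genetic association) and the target gene item are assumptions:

statement = PBB_Core.WDItemID(
    value=disease_wdid,  # the disease item found via the DOID query
    prop_nr='P2293',     # genetic association (assumed property)
    references=[[refStated, refImported, refRetrieved, refURL, refURL2]])

wd_item = PBB_Core.WDItemEngine(wd_item_id=gene_qid, data=[statement])  # gene_qid: hypothetical target item
wd_item.write(login)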
Esempio n. 23
0
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    GOCleaner(login)
Esempio n. 24
0
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    GeneWikiStubMerger(login)