Example #1
0
def create_uniprot_relationships(login,
                                 release_wdid,
                                 collection,
                                 taxon=None,
                                 write=True,
                                 run_one=False):
    """Run ``create_for_one_protein`` for every mapped UniProt ID present in ``collection``.

    Only UniProt IDs returned by ``wdi_helpers.id_mapper`` are considered (i.e.
    proteins that already have an item); they are visited in sorted order.

    :param login: passed through to ``create_for_one_protein``
    :param release_wdid: passed through to ``create_for_one_protein``
    :param collection: container keyed by UniProt ID (supports ``in`` and ``[]``)
    :param taxon: optional QID; when given, both the ID mapping and the
        fast-run base filter are restricted to ``P703`` = ``taxon``
    :param write: passed through to ``create_for_one_protein``
    :param run_one: when truthy, stop after the first ID found in ``collection``
    :return: list of QIDs of items that were modified or skipped
        (newly created items are excluded)
    """
    fast_run_base_filter = {UNIPROT: ""}
    if taxon:
        fast_run_base_filter["P703"] = taxon
        uniprot2wd = wdi_helpers.id_mapper(UNIPROT, (("P703", taxon), ))
    else:
        uniprot2wd = wdi_helpers.id_mapper(UNIPROT)

    uniprot_ids = sorted(uniprot2wd.keys())
    touched_qids = []
    for uniprot_id in tqdm(uniprot_ids, total=len(uniprot_ids)):
        if uniprot_id not in collection:
            continue
        wd_item = create_for_one_protein(login,
                                         uniprot_id,
                                         collection[uniprot_id],
                                         release_wdid,
                                         uniprot2wd,
                                         fast_run_base_filter,
                                         write=write)
        # items flagged as newly created are deliberately left out of the result
        if wd_item and not wd_item.create_new_item:
            touched_qids.append(wd_item.wd_item_id)
        if run_one:
            break
    return touched_qids
Example #2
0
    def __init__(self,
                 catalog_tsv_path=GWAS_PATH,
                 login=None,
                 fast_run=False,
                 write=True):
        """Load the GWAS catalog and the identifier -> item-id lookup tables.

        :param catalog_tsv_path: path handed to ``GWASCatalog``
        :param login: stored as ``self.login``
        :param fast_run: stored as ``self.fast_run``
        :param write: stored as ``self.write``
        """
        self.gwas_catalog = GWASCatalog(catalog_tsv_path=catalog_tsv_path)

        entrez_pid = PROPS['Entrez Gene ID']
        taxon_pid = PROPS['found in taxon']
        doid_pid = PROPS['Disease Ontology ID']
        taxon_qid = 'Q15978631'

        self.fast_run_base_gene_filter = {entrez_pid: "", taxon_pid: taxon_qid}
        self.fast_run_base_disease_filter = {doid_pid: ""}
        self.login = login
        self.write = write
        self.fast_run = fast_run

        # doid -> wdid
        self.doid_wdid_map = id_mapper(doid_pid)

        # entrez gene -> wdid, restricted to the same taxon as the gene filter
        self.gene_wdid_map = id_mapper(entrez_pid,
                                       filters=[(taxon_pid, taxon_qid)])

        # A pmid -> wdid mapper is deliberately NOT loaded here:
        # it eats up like 8gb of RAM.

        self.wd_items = []
Example #3
0
    def __init__(self, sparql_endpoint_url, mediawiki_api_url, node_out_path, edge_out_path):
        """Collect the property/item lookups needed to dump nodes and edges.

        :param sparql_endpoint_url: endpoint passed to every ``id_mapper`` call
        :param mediawiki_api_url: API url passed to ``generate_item_instances``
        :param node_out_path: stored as ``self.node_out_path``
        :param edge_out_path: stored as ``self.edge_out_path``
        :raises KeyError: if one of the expected URIs is missing from the
            ``id_mapper("P2")`` result
        """
        self.sparql_endpoint_url = sparql_endpoint_url
        self.mediawiki_api_url = mediawiki_api_url
        self.node_out_path = node_out_path
        self.edge_out_path = edge_out_path

        # URI -> PID, plus the inverse for later lookups
        uri_pid = wdi_helpers.id_mapper("P2", endpoint=sparql_endpoint_url)
        self.pid_uri = {pid: uri for uri, pid in uri_pid.items()}

        # dbxref value -> entity id, stored inverted (entity id -> dbxref)
        dbxref_pid = uri_pid['http://www.geneontology.org/formats/oboInOwl#DbXref']
        dbxref_to_entity = wdi_helpers.id_mapper(dbxref_pid, endpoint=sparql_endpoint_url)
        self.qid_dbxref = {entity: dbxref for dbxref, entity in dbxref_to_entity.items()}

        self.ref_supp_text_pid = uri_pid["http://reference_supporting_text"]
        self.reference_uri_pid = uri_pid["http://www.wikidata.org/entity/P854"]
        self.type_pid = uri_pid["http://type"]

        # labels and descriptions of every property that carries a dbxref
        property_ids = {entity for entity in self.qid_dbxref if entity.startswith("P")}
        props = wdi_core.WDItemEngine.generate_item_instances(list(property_ids), mediawiki_api_url)
        self.pid_label = {pid: item.get_label() for pid, item in props}
        self.pid_descr = {pid: item.get_description() for pid, item in props}

        # iterate over all items (sorted by id) in chunks
        item_ids = sorted(entity for entity in self.qid_dbxref if entity.startswith("Q"))
        self.item_iter = self.item_chunker(item_ids)

        self.edge_lines = []
        self.node_lines = []
Example #4
0
 def __init__(self,
              sparql_endpoint_url='https://query.wikidata.org/sparql'):
     """Build the URI -> PID and URI -> QID maps for the given wikibase.

     URIs that map to more than one entity are dropped from both maps.

     :param sparql_endpoint_url: SPARQL endpoint of the wikibase to query
     :raises ValueError: if no property for owl#equivalentProperty or
         owl#equivalentClass can be found
     """
     self.sparql_endpoint_url = sparql_endpoint_url

     try:
         equiv_prop_pid = self.guess_equivalent_property_pid()
     except Exception:
         raise ValueError(
             "Error: No property found with URI 'http://www.w3.org/2002/07/owl#equivalentProperty'"
         )

     # property URI -> PID; keep only URIs with exactly one candidate
     pids_by_uri = id_mapper(equiv_prop_pid,
                             endpoint=self.sparql_endpoint_url,
                             return_as_set=True)
     self.URI_PID = {
         uri: next(iter(pids))
         for uri, pids in pids_by_uri.items() if len(pids) == 1
     }

     equiv_class_uri = 'http://www.w3.org/2002/07/owl#equivalentClass'
     if equiv_class_uri not in self.URI_PID:
         raise ValueError(
             "Error: No property found with URI 'http://www.w3.org/2002/07/owl#equivalentClass'"
         )
     equiv_class_pid = self.URI_PID[equiv_class_uri]

     # item URI -> QID; same de-duplication rule as above
     qids_by_uri = id_mapper(equiv_class_pid,
                             endpoint=self.sparql_endpoint_url,
                             return_as_set=True)
     self.URI_QID = {
         uri: next(iter(qids))
         for uri, qids in qids_by_uri.items() if len(qids) == 1
     }
Example #5
0
    def __init__(self,
                 node_path,
                 edge_path,
                 mediawiki_api_url,
                 sparql_endpoint_url,
                 login,
                 simulate=False):
        """Parse the node/edge files and load the URI and dbxref lookup tables.

        :param node_path: stored as ``self.node_path``
        :param edge_path: stored as ``self.edge_path``
        :param mediawiki_api_url: API url used for the item engine factory
        :param sparql_endpoint_url: SPARQL endpoint used for the item engine
            factory and for both ``id_mapper`` calls
        :param login: stored as ``self.login``
        :param simulate: when truthy, ``self.write`` is set to ``False``
        :raises KeyError: if the oboInOwl#DbXref URI is not in ``self.uri_pid``
        """
        self.node_path = node_path
        self.edge_path = edge_path
        self.nodes = None
        self.edges = None
        # called right after the paths are set and before the remaining attributes
        self.parse_nodes_edges()
        self.login = login
        self.write = not simulate
        self.mediawiki_api_url = mediawiki_api_url
        self.sparql_endpoint_url = sparql_endpoint_url

        self.item_engine = wdi_core.WDItemEngine.wikibase_item_engine_factory(
            mediawiki_api_url=mediawiki_api_url,
            sparql_endpoint_url=sparql_endpoint_url)
        # NOTE(review): the return value of this first call is discarded and the
        # method is called again on the next line -- confirm whether this call is
        # needed (e.g. for a side effect) or is redundant.
        self.get_equiv_prop_pid()
        # mapping keyed by URI (see the lookup below), values are PIDs
        self.uri_pid = wdi_helpers.id_mapper(self.get_equiv_prop_pid(),
                                             endpoint=sparql_endpoint_url)
        self.dbxref_pid = self.uri_pid[
            'http://www.geneontology.org/formats/oboInOwl#DbXref']
        # mapping loaded for the dbxref property found above
        self.dbxref_qid = wdi_helpers.id_mapper(self.dbxref_pid,
                                                endpoint=sparql_endpoint_url)
 def get_existing_items(self):
     """Populate ``self.purl_wdid`` with OBO PURL -> item id for existing items.

     Covers the items we can add relationships to: P699 (DOID) items,
     P1554 (UBERON, for "located in") and P685 (NCBI taxonomy, for
     "has_material_basis_in").
     """
     # DOID identifiers look like "DOID:1234"; the PURL form uses "_"
     purl_wdid = {}
     for doid, wdid in id_mapper('P699').items():
         purl_wdid["http://purl.obolibrary.org/obo/{}".format(doid.replace(":", "_"))] = wdid
     self.purl_wdid = purl_wdid

     # uberon items
     uberon_purls = {}
     for uberon_id, wdid in id_mapper('P1554').items():
         uberon_purls["http://purl.obolibrary.org/obo/UBERON_{}".format(uberon_id)] = wdid
     self.purl_wdid.update(uberon_purls)

     # taxonomy items
     taxon_purls = {}
     for taxid, wdid in id_mapper("P685").items():
         taxon_purls["http://purl.obolibrary.org/obo/NCBITaxon_{}".format(taxid)] = wdid
     self.purl_wdid.update(taxon_purls)
def test_id_mapper_mrt():
    """Check ``id_mapper`` with ``prefer_exact_match`` for property P486.

    With ``prefer_exact_match=True`` the key ``D000033`` must be absent
    from the mapping.
    """
    # this may break if it changes in wikidata ....
    d = id_mapper("P486", prefer_exact_match=True)
    assert d['D000998'] == 'Q40207875'
    assert d['D000037'] == 'Q388113'
    # was 'D0000333' (7 digits), which can never be a key and made the check vacuous
    assert 'D000033' not in d

    d = id_mapper("P486", prefer_exact_match=True, return_as_set=True)
    assert d['D000998'] == {'Q40207875'}
    assert d['D000037'] == {'Q388113'}
    assert 'D000033' not in d

    # NOTE(review): this result is fetched but nothing is asserted on it here
    d = id_mapper("P486", prefer_exact_match=False, return_as_set=True)
Example #8
0
def normalize_to_qids(d: dict):
    """Re-key a ChEMBL-ID -> indications mapping by drug QID.

    Each indication dict gets an ``'indication_qid'`` looked up from its
    ``'ConditionDoId'``; indications without a QID are dropped, and the
    remaining ones get a ``'drug_qid'`` entry. The indication dicts are
    modified in place.

    :param d: dict of ChEMBL ID -> list of indication dicts
    :return: dict of drug QID -> non-empty list of indication dicts
    """
    chembl_qid = wdi_helpers.id_mapper(PROPS['ChEMBL ID'])
    doid_qid = wdi_helpers.id_mapper(PROPS['Disease Ontology ID'])

    # unmapped ChEMBL IDs all land on the key None and are filtered out at the end
    by_drug = {chembl_qid.get(chembl_id): inds for chembl_id, inds in d.items()}

    for inds in by_drug.values():
        for ind in inds:
            ind['indication_qid'] = doid_qid.get(ind['ConditionDoId'])

    for drug_qid, inds in by_drug.items():
        resolved = [ind for ind in inds if ind['indication_qid']]
        by_drug[drug_qid] = resolved
        for ind in resolved:
            ind['drug_qid'] = drug_qid

    return {
        drug_qid: inds
        for drug_qid, inds in by_drug.items() if drug_qid and inds
    }
Example #9
0
    def __init__(self):
        """Log in, load the mixtures, and build the unambiguous P3345 -> QID map."""
        self.login = wdi_login.WDLogin(WDUSER, WDPASS)
        self._get_mixtures_in_wd()

        # keep only identifiers that map to exactly one item
        qids_by_rxnorm = wdi_helpers.id_mapper("P3345", return_as_set=True)
        self.rxnorm_qid = {
            rxcui: next(iter(qids))
            for rxcui, qids in qids_by_rxnorm.items() if len(qids) == 1
        }
def main(json_path='doid.json', log_dir="./logs", fast_run=True, write=True):
    """Create/update an item for every DO node, then remove deprecated statements.

    :param json_path: path to the JSON graph file; it must contain the
        ``doid.owl`` and ``doid/obo/ext.owl`` graphs
    :param log_dir: directory passed to ``WDItemEngine.setup_logging``
    :param fast_run: passed to ``DOGraph``
    :param write: passed to each node's ``create``
    :return: ``None``
    """
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                                        header=json.dumps(__metadata__))

    with open(json_path) as f:
        d = json.load(f)
    graphs = {g['id']: g for g in d['graphs']}
    graph = graphs['http://purl.obolibrary.org/obo/doid.owl']
    # get the has phenotype, has_material_basis_in, and transmitted by edges from another graph
    graph['edges'].extend(graphs['http://purl.obolibrary.org/obo/doid/obo/ext.owl']['edges'])
    do = DOGraph(graph, login, fast_run)
    nodes = sorted(do.nodes.values(), key=lambda x: x.doid)
    items = []
    for node in tqdm(nodes, total=len(nodes)):
        item = node.create(write=write)
        if item:
            items.append(item)

    # NOTE(review): presumably waits for the query service to catch up with
    # the writes above before re-reading the mapping -- confirm.
    sleep(10 * 60)
    doid_wdid = id_mapper('P699')
    # Guard against an empty run: items[0] used to raise IndexError when no
    # node produced an item.
    frc = items[0].fast_run_container if items else None
    if not frc:
        print("fastrun container not found. not removing deprecated statements")
        return None
    frc.clear()
    for doid in tqdm(doid_wdid.values()):
        remove_deprecated_statements(doid, frc, do.release, list(PROPS.values()), login)

    print("You have to remove these deprecated diseases manually: ")
    print(get_deprecated_nodes(graph))
 def cleanup(self, releases, last_updated):
     """Remove deprecated statements from this organism's UniProt items.

     Items whose UniProt ID is in ``self.failed`` are skipped.

     :param releases: passed through to ``remove_deprecated_statements``
     :param last_updated: passed through to ``remove_deprecated_statements``
     """
     print(self.failed)
     all_uniprot_wdid = wdi_helpers.id_mapper(
         PROPS['UniProt ID'],
         ((PROPS['found in taxon'], self.organism_info['wdid']), ))
     print(len(all_uniprot_wdid))

     uniprot_wdid = {}
     for uniprot, qid in all_uniprot_wdid.items():
         if uniprot not in self.failed:
             uniprot_wdid[uniprot] = qid
     print(len(uniprot_wdid))

     base_filter = {
         PROPS['UniProt ID']: '',
         PROPS['found in taxon']: self.organism_info['wdid']
     }
     frc = FastRunContainer(wdi_core.WDBaseDataType,
                            wdi_core.WDItemEngine,
                            base_filter=base_filter,
                            use_refs=True)
     frc.clear()

     props = [PROPS[name] for name in FASTRUN_PROPS]
     for qid in tqdm(uniprot_wdid.values()):
         remove_deprecated_statements(qid, frc, releases, last_updated,
                                      props, self.login)
def test_id_mapper():
    """Exercise ``id_mapper`` with a filter, with set values, and with duplicate detection."""
    # get all uniprot to wdid, where taxon is human
    human_uniprot = id_mapper("P352", (("P703", "Q15978631"), ))
    assert 20000 < len(human_uniprot) < 100000

    as_sets = id_mapper("P683", raise_on_duplicate=False, return_as_set=True)
    assert '3978' in as_sets
    assert type(as_sets['3978']) == set

    # should raise error
    saw_value_error = False
    try:
        id_mapper("P492", raise_on_duplicate=True)
    except ValueError:
        saw_value_error = True
    assert saw_value_error
def get_microbe_taxids(force_download=False):
    """
    Return the bacterial reference/representative genome table as a DataFrame.

    When ``reference_genomes.csv`` is missing (or ``force_download`` is set),
    the assembly summary is downloaded from the NCBI genome ftp site, reduced
    to reference and representative genomes, annotated with a ``wdid`` column
    looked up from the taxid, and written to ``reference_genomes.csv``.
    Otherwise the previously written file is read back.

    :param force_download: download even if the cached file exists
    :return: pandas dataframe of bacteria reference genome data, with
        ``taxid`` converted to int
    """
    cache_file = "reference_genomes.csv"

    if not force_download and os.path.exists(cache_file):
        # use predownloaded and parsed flatfile
        df = pd.read_csv(cache_file, sep="\t", dtype=object, index_col=0)
    else:
        summary_path, _headers = urllib.request.urlretrieve(
            "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt"
        )
        df = pd.read_csv(summary_path,
                         sep="\t",
                         dtype=object,
                         skiprows=1,
                         header=0)
        wanted = ['reference genome', 'representative genome']
        df = df[df['refseq_category'].isin(wanted)]

        all_tax_wdid = id_mapper('P685')

        df['wdid'] = df['taxid'].apply(lambda taxid: all_tax_wdid.get(taxid))
        df = df.rename(columns={'# assembly_accession': 'assembly_accession'})
        # the cache is written before taxid is converted, i.e. with string taxids
        df.to_csv(cache_file, sep="\t")

    df.taxid = df.taxid.astype(int)
    return df
Example #14
0
    def cleanup(self, releases, last_updated):
        """
        Remove deprecated statements from this organism's gene items.

        Items whose Entrez ID is in ``self.failed`` are skipped.

        :param releases: passed through to ``remove_deprecated_statements``
        :param last_updated: passed through to ``remove_deprecated_statements``
        :return: None
        """
        print(self.failed)
        all_entrez_qid = wdi_helpers.id_mapper(
            'P351', ((PROPS['found in taxon'], self.organism_info['wdid']), ))
        print(len(all_entrez_qid))

        entrez_qid = {}
        for entrez, qid in all_entrez_qid.items():
            if entrez not in self.failed:
                entrez_qid[entrez] = qid
        print(len(entrez_qid))

        base_filter = {
            PROPS['Entrez Gene ID']: '',
            PROPS['found in taxon']: self.organism_info['wdid']
        }
        frc = FastRunContainer(wdi_core.WDBaseDataType,
                               wdi_core.WDItemEngine,
                               base_filter=base_filter,
                               use_refs=True)
        frc.clear()

        props = [PROPS[name] for name in FASTRUN_PROPS]
        for qid in tqdm(entrez_qid.values()):
            remove_deprecated_statements(qid, frc, releases, last_updated,
                                         props, self.login)
Example #15
0
def normalize_to_qids(df: pd.DataFrame):
    """Map drugs (RxNorm CUI) and their indications (Mondo IDs) to QIDs.

    ``df`` is modified in place: ``drug_qid`` and ``indications_qid`` columns
    are added, and rows without a drug QID or without any indication QID
    are dropped.

    :param df: frame with ``drug_rxcui`` and ``indications_mondo`` columns
    :return: dict of drug QID -> set of indication QIDs
    """
    rxnorm_qid = wdi_helpers.id_mapper(PROPS['RxNorm CUI'])
    mondo_qid = wdi_helpers.id_mapper(PROPS['Mondo ID'])

    def mondo_ids_to_qids(mondo_ids):
        return set(mondo_qid[str(m)] for m in mondo_ids if str(m) in mondo_qid)

    def none_if_empty(qids):
        return qids if qids else None

    df['drug_qid'] = df.drug_rxcui.astype(str).map(rxnorm_qid.get)
    df.dropna(subset=['drug_qid'], inplace=True)
    # every remaining row must refer to a distinct drug item
    assert len(set(df.drug_qid)) == len(df)

    df['indications_qid'] = df.indications_mondo.apply(mondo_ids_to_qids)
    # turn empty sets into missing values so dropna removes those rows
    df.indications_qid = df.indications_qid.map(none_if_empty)
    df.dropna(subset=['indications_qid'], inplace=True)

    return dict(zip(df.drug_qid, df.indications_qid))
def get_deprecated_nodes(graph):
    """Return ``{doid: qid}`` for deprecated DOID classes that have a mapped item.

    A node counts when its id contains the DOID PURL prefix, its ``meta``
    has a truthy ``deprecated`` entry, and its ``type`` is ``"CLASS"``.

    :param graph: dict with a ``'nodes'`` list of node dicts
    :return: dict of DOID (``DOID:nnn`` form) -> item id from ``id_mapper('P699')``
    """
    doid_qid = id_mapper('P699')
    dep_nodes = {}
    for node in graph['nodes']:
        if "http://purl.obolibrary.org/obo/DOID_" not in node['id']:
            continue
        if 'meta' not in node or not node['meta'].get("deprecated", False):
            continue
        if node.get('type', None) != "CLASS":
            continue
        # ".../DOID_1234" -> "DOID:1234"
        doid = node['id'].split("/")[-1].replace("_", ":")
        if doid in doid_qid:
            dep_nodes[doid] = doid_qid[doid]
    return dep_nodes
Example #17
0
def get_interpro_releases():
    """
    Return the ``id_mapper`` result for P393, filtered on P629 = Q3047275.

    The result looks like:
    {'58.0': 'Q27877335',
     '59.0': 'Q27135875',
     ...
     }
    """
    release_filter = (("P629", "Q3047275"), )
    releases = wdi_helpers.id_mapper("P393", release_filter)
    return releases
Example #18
0
def test_id_mapper_mrt():
    """Check ``id_mapper`` on P486 with and without ``prefer_exact_match``."""
    # this may break if it changes in wikidata ....
    exact = id_mapper("P486", prefer_exact_match=True)
    assert exact['D000998'] == 'Q40207875'
    assert exact['D000037'] == 'Q388113'
    assert 'D000033' not in exact

    exact_sets = id_mapper("P486", prefer_exact_match=True, return_as_set=True)
    assert exact_sets['D000998'] == {'Q40207875'}
    assert exact_sets['D000037'] == {'Q388113'}
    assert 'D000033' not in exact_sets

    all_sets = id_mapper("P486", prefer_exact_match=False, return_as_set=True)
    # unique value constraint
    assert all_sets['D000998'] == {'Q40207875', 'Q846227'}
    # single value constraint
    assert all_sets['D000037'] == all_sets['D000033'] == {'Q388113'}
Example #19
0
def _scratch():
    """Scratch code: annotate the mesh tree frame with QIDs and top-level counts."""
    qids_by_mesh = wdi_helpers.id_mapper("P486", return_as_set=True)
    # keep only identifiers that map to exactly one item
    mesh_qid = {
        mesh: next(iter(qids))
        for mesh, qids in qids_by_mesh.items() if len(qids) == 1
    }
    df = get_mesh_tree()
    df['qid'] = df.mesh.map(mesh_qid.get)
    top_level_counts = df.topLevel.value_counts()
    df['vc'] = df.topLevel.map(top_level_counts)
    # the selection below is computed and discarded
    df[df.treeNum.isin(df.topLevel[df.vc > 5])]
Example #20
0
 def get_all_releases(self):
     """Return the ``id_mapper`` result for all releases of ``self.edition_of_qid``.

     Property and item ids are translated through ``self.helper`` before the
     query is run against ``self.sparql_endpoint_url``.
     """
     helper = self.helper
     key_pid = helper.get_pid("P393")
     release_filters = (
         (helper.get_pid("P629"), self.edition_of_qid),
         (helper.get_pid("P31"), helper.get_qid("Q3331189")),
     )
     return id_mapper(key_pid,
                      release_filters,
                      endpoint=self.sparql_endpoint_url)
Example #21
0
def main(write=True, run_one=None):
    """Read ``mitodb.csv``, resolve diseases and phenotypes to QIDs, and run ``MitoBot``.

    Rows whose disease or phenotype cannot be resolved to exactly one item
    are dropped.

    :param write: passed to ``MitoBot``
    :param run_one: passed to ``MitoBot``
    """

    def unambiguous_mapping(pid):
        # identifier -> QID, keeping only identifiers with exactly one item
        qid_sets = wdi_helpers.id_mapper(pid,
                                         prefer_exact_match=True,
                                         return_as_set=True)
        return {k: list(v)[0] for k, v in qid_sets.items() if len(v) == 1}

    omim_qid = unambiguous_mapping(PROPS['OMIM ID'])
    hpo_qid = unambiguous_mapping(PROPS['Human Phenotype Ontology ID'])

    df = pd.read_csv("mitodb.csv", dtype=str)
    df['disease_qid'] = df.disease.map(omim_qid.get)
    df['phenotype_qid'] = df.hpo.map(hpo_qid.get)
    df.dropna(subset=['disease_qid', 'phenotype_qid'], inplace=True)

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    MitoBot(df.to_dict("records"), login, write=write, run_one=run_one).run()
def retrieve_qid_from_curie(curie):
    """Resolve a CURIE to an item id, caching one id map per property in ``EXT_ID_MAP``.

    :param curie: CURIE string understood by ``cu.parse_curie``
    :return: the mapped id, or ``None`` (after printing a message) when the
        external id is not in the map
    """
    prefix, ext_id_value = cu.parse_curie(curie)
    pid = h.get_pid(prefix)

    # load the mapping for this property only once
    if pid not in EXT_ID_MAP:
        EXT_ID_MAP[pid] = wdi_helpers.id_mapper(pid, endpoint=h.sparql_endpoint_url)

    if ext_id_value in EXT_ID_MAP[pid]:
        return EXT_ID_MAP[pid][ext_id_value]

    print("Curie not found: {}".format(curie))
    return None
Example #23
0
def get_companies():
    """Find wikidata elements where
    Business Registry code is present

    When a registry code maps to several items, only the first element of
    its set is used.

    Returns:
        <class 'dict'> {qid: reg_code}
    """
    qids_by_reg_code = wdi_helpers.id_mapper(
        props['Business Registry code'], return_as_set=True)

    companies = {}
    for reg_code, qids in qids_by_reg_code.items():
        companies[next(iter(qids))] = reg_code
    return companies
Example #24
0
def main(df, log_dir="./logs", fast_run=False):
    """Create missing variant items, then write one annotation per row.

    First pass: for every row whose ``gDNA`` has no item yet, create a
    missense variant item and remember its id. Second pass: for every row
    with a known variant item and a known association, create the variant
    annotation. Failures and missing data are logged and the row is skipped.

    :param df: frame of variant rows; filtered with
        ``filter_df_clinical_missense`` before use
    :param log_dir: unused in this function; kept for interface compatibility
    :param fast_run: passed to ``create_missense_variant_item``
    """
    df = filter_df_clinical_missense(df)

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # make sure we have all the variant items we need
    hgvs_qid = id_mapper(PROPS['HGVS nomenclature'])
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Only variants WITHOUT an item need one created. The previous
        # "not in ... continue" skipped exactly those rows and left the
        # creation code below unreachable.
        if row.gDNA in hgvs_qid:
            continue
        label = "{} ({})".format(row.gDNA, row['individual_mutation'])
        print("creating {}".format(label))
        try:
            item = create_missense_variant_item(row.gDNA,
                                                label,
                                                login,
                                                fast_run=fast_run)
        except Exception as e:
            print(e)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(row.gDNA, "gDNA", None, str(e),
                                       type(e)))
            continue
        hgvs_qid[row.gDNA] = item.wd_item_id

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # still missing here means creation failed in the first pass
        if row.gDNA not in hgvs_qid:
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(
                    row.gDNA, "gDNA", None,
                    "variant not found: {}".format(row.gDNA),
                    "variant not found"))
            continue
        if row.Association not in association_map:
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(
                    row.gDNA, "gDNA", None,
                    "Association not found: {}".format(row.Association),
                    "association not found"))
            continue
        qid = hgvs_qid[row.gDNA]
        association = association_map[row.Association]
        drug_qid = row.Drug_qid
        prim_tt_qid = row.prim_tt_qid
        source = row.Source
        evidence_level = row['Evidence level']

        create_variant_annotation(qid, association, drug_qid,
                                  prim_tt_qid, source, evidence_level,
                                  login)
Example #25
0
def get_qids(props=props):
    """Find wikidata elements where
    Inforegister ID prop is present.

    NOTE(review): this returns the KEYS of the ``id_mapper`` result. Elsewhere
    in this codebase those keys look like external identifier values rather
    than QIDs -- confirm that this really yields QIDs.

    Returns:
        <class 'set'> {qid1, qid2, qid3}
    """
    qid_map = wdi_helpers.id_mapper(
        props['Inforegister ID'], return_as_set=True)

    return set(qid_map.keys())
Example #26
0
def __test_one_protein():
    """Manual smoke test: run ``create_for_one_protein`` on one hard-coded document."""
    uniprot_pid = "P352"
    taxon = "Q15978631"
    doc = {
        "_id": "Q03135",
        "has_part": ["IPR018361"],
        "subclass": ["IPR001612"]
    }
    uniprot2wd = wdi_helpers.id_mapper(uniprot_pid, (("P703", taxon), ))
    fast_run_base_filter = {uniprot_pid: "", "P703": taxon}
    # ``login`` is expected to exist at module level
    create_for_one_protein(login, doc, "Q29947749", uniprot2wd,
                           fast_run_base_filter)
Example #27
0
    def __init__(self,
                 node_path,
                 edge_path,
                 mediawiki_api_url,
                 sparql_endpoint_url,
                 login,
                 simulate=False):
        """Parse the node/edge files, create the initial properties and load lookups.

        :param node_path: stored as ``self.node_path``
        :param edge_path: stored as ``self.edge_path``
        :param mediawiki_api_url: API url used for the item engine factory
        :param sparql_endpoint_url: SPARQL endpoint used for the item engine
            factory, the ``id_mapper`` calls and the wait for the endpoint
        :param login: stored as ``self.login``
        :param simulate: when truthy, ``self.write`` is set to ``False``
        :raises KeyError: if the oboInOwl#DbXref URI is not in ``self.uri_pid``
        """
        self.node_path = node_path
        self.edge_path = edge_path
        self.nodes = None
        self.edges = None
        # called right after the paths are set and before the remaining attributes
        self.parse_nodes_edges()
        self.login = login
        self.write = not simulate
        self.mediawiki_api_url = mediawiki_api_url
        self.sparql_endpoint_url = sparql_endpoint_url
        self.dbxref_pid = None

        self.item_engine = wdi_core.WDItemEngine.wikibase_item_engine_factory(
            mediawiki_api_url=mediawiki_api_url,
            sparql_endpoint_url=sparql_endpoint_url)
        # NOTE(review): the return value of this first call is discarded and the
        # method is called again on the next line -- confirm whether this call is
        # needed (e.g. for a side effect) or is redundant.
        self.get_equiv_prop_pid()
        # mapping keyed by URI (see the lookup below), values are PIDs
        self.uri_pid = wdi_helpers.id_mapper(self.get_equiv_prop_pid(),
                                             endpoint=sparql_endpoint_url)

        ####
        # these lines have to be done in this order because we need dbxref first before the others can be created
        # timestamp taken before any property is created; used for the wait below
        now = datetime.utcnow().replace(microsecond=0)
        # NOTE(review): the returned ``dbxref_pid`` is not used; ``self.dbxref_pid``
        # is read from ``self.uri_pid``, which was loaded BEFORE this call. Unless
        # ``create_dbxref_prop`` updates ``self.uri_pid``, a freshly created
        # property would be missing from it -- confirm.
        dbxref_pid, created = self.create_dbxref_prop()
        self.dbxref_pid = self.uri_pid[
            'http://www.geneontology.org/formats/oboInOwl#DbXref']
        self.create_initial_props()
        # this fails on first run because the sparql endpoint has not yet been updated. so we need to wait
        if created:
            wdi_helpers.wait_for_last_modified(now,
                                               entity="http://wikibase.svc",
                                               delay=20,
                                               endpoint=sparql_endpoint_url)
        # mapping loaded for the dbxref property, after the optional wait above
        self.dbxref_qid = wdi_helpers.id_mapper(self.dbxref_pid,
                                                endpoint=sparql_endpoint_url)
Example #28
0
def get_1to1_uberon_to_wikidata_id_mappings(
        uberon_wiki_id_map: dict = None) -> dict:
    """
    Map each UBERON (and Cell Ontology) term id to exactly one wikidata id.

    Called without an argument, the mappings are loaded via ``id_mapper``;
    ids with several candidate entries are resolved by a recursive call,
    ids with a single entry are taken as they are. Called with a mapping,
    each id is resolved to the candidate entry with the shortest label, so
    the result is always 1 term id to 1 wikidata entry id.

    :param uberon_wiki_id_map: A dictionary where keys are UBERON ids that maps to a list of wikidata entry ids.
    :return: 1-to-1 mappings between UBERON ids and wikidata entry ids
    """
    if uberon_wiki_id_map is not None:
        # explicit candidates given: pick the entry with the shortest label
        resolved = {}
        for uberon_id, wikidata_ids in uberon_wiki_id_map.items():
            labels = []
            label_to_wiki_item = {}
            for wiki_id in wikidata_ids:
                item = wdi_core.WDItemEngine(wd_item_id=wiki_id,
                                             search_only=True)
                labels.append(item.get_label())
                label_to_wiki_item[item.get_label()] = wiki_id
            shortest_label = shortest_string_in_list(labels)
            resolved[uberon_id] = label_to_wiki_item[shortest_label]
        return resolved

    candidates = wdi_helpers.id_mapper(PROPS['UBERON ID'],
                                       return_as_set=True)
    candidates.update(
        wdi_helpers.id_mapper(PROPS['Cell Ontology ID'],
                              return_as_set=True))

    # ids with several candidate entries are resolved by label length
    ambiguous = {}
    for term_id, wiki_ids in candidates.items():
        if len(wiki_ids) > 1:
            ambiguous[term_id] = list(wiki_ids)
    one_to_one = get_1to1_uberon_to_wikidata_id_mappings(ambiguous)

    # ids with a single candidate are used directly (the set is emptied by pop)
    one_to_one.update({
        term_id: wiki_ids.pop()
        for term_id, wiki_ids in candidates.items() if len(wiki_ids) == 1
    })
    return one_to_one
Example #29
0
class Product:
    """A pharmaceutical product item identified by a QID and/or a P3345 identifier.

    The login and both lookup tables are created once, when the class body
    is executed, and are shared by all instances.
    """
    # shared login used for every write
    login = wdi_login.WDLogin(WDUSER, WDPASS)
    # identifier -> QID, restricted to identifiers with exactly one item
    rxnorm_qid = wdi_helpers.id_mapper("P3345", return_as_set=True)
    rxnorm_qid = {
        rxcui: next(iter(qids))
        for rxcui, qids in rxnorm_qid.items() if len(qids) == 1
    }
    # inverse lookup: QID -> identifier
    qid_rxnorm = {qid: rxcui for rxcui, qid in rxnorm_qid.items()}

    def __init__(self, qid=None, rxcui=None, label=None):
        """Store the given fields; when ``qid`` is given, take ``rxcui`` from the lookup.

        :raises ValueError: if both ``qid`` and ``rxcui`` are given but disagree
        :raises KeyError: if ``qid`` is given but is not in ``qid_rxnorm``
        """
        self.qid = qid
        self.rxcui = rxcui
        self.label = label
        if not self.qid:
            return
        # get the rxnorm id for this brand
        if rxcui and (self.qid_rxnorm[self.qid] != rxcui):
            raise ValueError("something is wrong: {}".format((self.qid, self.rxcui, rxcui)))
        self.rxcui = self.qid_rxnorm[self.qid]

    def add_active_ingredient(self, ingredient_qid):
        """Write P3781 on this product and the inverse P3780 on the ingredient."""
        assert self.qid

        # product -> ingredient; purposely overwriting this
        forward = [wdi_core.WDItemID(ingredient_qid, 'P3781', references=make_ref(self.rxcui))]
        product_item = wdi_core.WDItemEngine(wd_item_id=self.qid, data=forward, domain="drugs",
                                             fast_run=True, fast_run_use_refs=True,
                                             fast_run_base_filter={"P3345": ""},
                                             ref_handler=ref_handlers.update_retrieved_if_new)
        product_item.write(self.login)

        # ingredient -> product (the inverse); do not overwrite, hence append_value
        inverse = [wdi_core.WDItemID(self.qid, 'P3780', references=make_ref(self.rxcui))]
        ingredient_item = wdi_core.WDItemEngine(wd_item_id=ingredient_qid, data=inverse, domain="drugs",
                                                fast_run=True, fast_run_use_refs=True,
                                                fast_run_base_filter={"P3345": ""},
                                                ref_handler=ref_handlers.update_retrieved_if_new,
                                                append_value=['P3780'])
        ingredient_item.write(self.login)

    def get_or_create(self):
        """Return the QID for ``self.rxcui``, creating the item when it is unknown.

        NOTE(review): ``self.qid`` is only set on the creation path, not when
        an existing QID is returned -- confirm this is intended.
        """
        assert self.rxcui
        if self.rxcui in self.rxnorm_qid:
            return self.rxnorm_qid[self.rxcui]
        assert self.label

        statements = [
            wdi_core.WDItemID('Q28885102', 'P31', references=make_ref(self.rxcui)),  # pharma product
            wdi_core.WDExternalID(self.rxcui, "P3345", references=make_ref(self.rxcui)),
        ]
        item = wdi_core.WDItemEngine(item_name=self.label, data=statements, domain="drugs")
        item.set_label(self.label)
        item.set_description("pharmaceutical product")
        item.write(self.login)

        self.qid = item.wd_item_id
        return self.qid
Example #30
0
    def get_or_create(self, organism_info, retrieved=None, login=None):
        """
        Make sure an item exists for every assembled chromosome of the organism.

        Returns a map of chr num to wdid. looks like:
        {'1': 'Q28114580',  '2': 'Q28114581', ..., 'MT': 'Q28114585'}

        :param organism_info: {'name': name, 'taxid': taxid, 'wdid': wdid, 'type': type}
        :type organism_info: dict
        :param retrieved: for reference statement; defaults to the current time
        :type retrieved: datetime
        :param login: stored as ``self.login`` and passed to ``create_chrom``
        :return: dict of chromosome number -> wdid
        """
        self.login = login
        if retrieved is None:
            self.retrieved = datetime.now()
        else:
            self.retrieved = retrieved

        taxid = int(organism_info['taxid'])
        if taxid not in self.chr_df:
            self.get_assembly_report(taxid)

        # only assembled chromosomes get items
        report = self.chr_df[taxid]
        assembled = report[report['SequenceRole'] == 'assembled-molecule']

        # existing chromosome items, keyed by the identifier up to its first "."
        existing_chr = {
            accession.split(".")[0]: wdid
            for accession, wdid in wdi_helpers.id_mapper("P2249").items()
        }

        # map of chr_num to wdid for this taxon ("1" -> "Q1234")
        chr_num_wdid = {}
        for record in assembled.to_dict("records"):
            chrom_num = record['SequenceName']
            # if a field has "chr" in it, remove it
            for prefix in ("chr", "Chr", "CHR"):
                chrom_num = chrom_num.replace(prefix, "")
            genome_id = record['RefSeqAccn'].split(".")[0]
            # {'Chromosome','Mitochondrion'}
            chr_type = record['AssignedMoleculeLocationType']

            if genome_id not in existing_chr:
                # chromosome doesn't exist in wikidata. create it
                print("chromosome being created: {}, {}".format(
                    chrom_num, genome_id))
                chr_num_wdid[chrom_num] = self.create_chrom(
                    organism_info, chrom_num, genome_id, chr_type, login)
            else:
                chr_num_wdid[chrom_num] = existing_chr[genome_id]

        return chr_num_wdid