Example #1
0
def test_make_gene_class():
    """
    Smoke test: build one gene item from a single mygene record and write it.

    Reads the document with ``_id`` '100861512' from the local
    ``wikidata_src.mygene`` mongo collection, validates and tags it, then
    constructs a ``Gene`` and calls ``create_item`` with ``write=True``
    followed by ``remove_deprecated_statements``.
    """
    # a single client serves both collections
    db = MongoClient().wikidata_src
    coll = db.mygene
    metadata = db.mygene_sources.find_one()
    doc_filter = {'_id': '100861512'}
    docs = coll.find(doc_filter)
    # count on the cursor we already have instead of issuing a second find()
    print("total number of records: {}".format(docs.count()))

    validate_type = 'eukaryotic'
    docs = HelperBot.validate_docs(docs, validate_type, 'P351')
    records = HelperBot.tag_mygene_docs(docs, metadata)
    record = next(records)

    organism_info = {
        # was the garbled literal "H**o sapiens"
        "name": "Homo sapiens",
        "type": "mammalian",
        "wdid": "Q15978631",
        'taxid': 9606
    }

    login = wdi_login.WDLogin(WDUSER, WDPASS)

    gene = Gene(record, organism_info, login)
    gene.create_item(fast_run=False, write=True)
    gene.remove_deprecated_statements()
Example #2
0
def validate_all_human_genes():
    """
    Run every human (taxid 9606) gene record that has an entrezgene id
    through the validator and the tagger.

    Per the original author's note, this generates a log file; the records
    themselves are discarded.
    """
    # a single client serves both collections
    db = MongoClient().wikidata_src
    coll = db.mygene
    metadata = db.mygene_sources.find_one()
    doc_filter = {'taxid': 9606, 'entrezgene': {'$exists': True}}
    docs = coll.find(doc_filter)
    # count on the cursor we already have instead of issuing a second find()
    print("total number of records: {}".format(docs.count()))

    validate_type = 'eukaryotic'
    docs = HelperBot.validate_docs(docs, validate_type, 'P351')
    records = HelperBot.tag_mygene_docs(docs, metadata)

    # exhaust the iterator for its side effects only; do not hold every
    # record in memory just to throw the list away
    for _ in records:
        pass
Example #3
0
def genes():
    """
    Disambiguate gene items of taxid 7955 that share a symbol.

    Every record whose symbol occurs more than once gets its symbol rewritten
    to "symbol (entrezgene)". Only records whose Entrez id is already mapped
    in Wikidata are passed to the bot, so no new items are created.
    """
    entrez_wd = id_mapper("P351")

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    coll = MongoClient().wikidata_src.mygene
    metadata = MongoClient().wikidata_src.mygene_sources.find_one()
    organism_info = organisms_info[7955]

    cursor = coll.find({'taxid': 7955, 'entrezgene': {'$exists': True}}).batch_size(20)
    total = cursor.count()
    print("total number of records: {}".format(total))
    validated = HelperBot.validate_docs(cursor, 'eukaryotic', PROPS['Entrez Gene ID'])
    all_records = list(HelperBot.tag_mygene_docs(validated, metadata))

    # symbols that are used by more than one record
    symbol_counts = Counter([rec['symbol']['@value'] for rec in all_records])
    dupe_names = {symbol for symbol, seen in symbol_counts.items() if seen > 1}

    # keep only the records carrying a duplicated symbol ...
    dupes = [rec for rec in all_records if rec['symbol']['@value'] in dupe_names]
    # ... and rename each of them to "symbol (entrezgene)"
    for rec in dupes:
        suffix = " (" + str(rec['entrezgene']['@value']) + ")"
        rec['symbol']['@value'] = rec['symbol']['@value'] + suffix

    # skip items that aren't already in wikidata (DONT CREATE NEW ITEMS!)
    records = []
    for rec in dupes:
        if str(rec['entrezgene']['@value']) in entrez_wd:
            records.append(rec)

    print("len records: {}".format(len(records)))

    chr_num_wdid = ChromosomeBot().get_or_create(organism_info, login=login)
    bot = GeneBot.ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    # replace the bot's filter with a pass-through iterator
    bot.filter = lambda x: iter(x)
    bot.run(records, total=total, fast_run=True, write=True)
Example #4
0
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating genes

    Looks up the organism, sets up logging, picks a bot for the organism,
    downloads the mygene records, runs the bot over them, then (after a
    10 minute wait) calls the bot's cleanup with the releases to remove and
    the per-source last-updated dates.

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: suffix used in the log file name; defaults to the current time formatted as '%Y%m%d_%H:%M'
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # reset the logger before setting it up again for this run
    # NOTE(review): `handles` looks like a misspelling of `handlers`; only the
    # second assignment appears to affect a logging.Logger -- confirm
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    # the taxid is recorded in the module-level metadata, which becomes the log header
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    # TODO: this can be pulled from wd
    if taxid in organisms_info and organisms_info[taxid]['type'] != "microbial":
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        # upper-case the keys of the chromosome -> wdid mapping
        chr_num_wdid = {k.upper(): v for k, v in chr_num_wdid.items()}
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the reference microbial genomes
        # raises valueerror if not...
        organism_info = mcb.get_organism_info(taxid)
        refseq_qid_chrom = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, refseq_qid_chrom, login)
        validate_type = "microbial"

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        # single-gene mode: wrap the one document in an iterator
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        # skip "biological-region" records and records without an entrezgene id
        doc_filter = lambda x: (x.get("type_of_gene") != "biological-region") and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    # empty every fast-run container held by the item engine
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()
    print("done updating, waiting 10 min")
    # NOTE(review): the reason for the wait is not stated here; presumably it
    # lets the edits become visible to later queries -- confirm
    time.sleep(10 * 60)
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    # only these three sources are considered for cleanup
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            # release-versioned source: every known release except the current one is removed
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(releases[k].values())
            # NOTE(review): raises KeyError when the current release is not in the mapping
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(releases[k]) - {v['release']}),
                                                                        v['release']))
        else:
            # timestamp-versioned source: remember the date parsed from its "timestamp"
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
Example #5
0
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating genes

    Selects the protein-coding records with a genomic position for the taxon
    from ``coll``, validates and tags them, and runs the matching gene bot
    over them. The mongo cursor is always closed and the logging handlers are
    always detached, even when the run fails.

    :param coll: mongo collection containing gene data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        logger_name='WD_logger',
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    cursor = None
    try:
        # get organism metadata (name, organism type, wdid)
        if taxid in organisms_info:
            # its one of fungal, mammalian, plant (not microbe)
            validate_type = 'gene'
            organism_info = organisms_info[taxid]
            # make sure all chromosome items are found in wikidata
            cb = ChromosomeBot()
            chr_num_wdid = cb.get_or_create(organism_info, login=login)
            if int(organism_info['taxid']) == 9606:
                bot = HumanGeneBot(organism_info, chr_num_wdid, login)
            else:
                bot = MammalianGeneBot(organism_info, chr_num_wdid, login)
        else:
            # check if its one of the microbe refs
            # raises valueerror if not...
            organism_info = get_organism_info(taxid)
            print(organism_info)
            bot = MicrobeGeneBot(organism_info, login)
            validate_type = "microbial"

        # only do certain records
        doc_filter = {
            'taxid': taxid,
            'type_of_gene': 'protein-coding',
            'genomic_pos': {
                '$exists': True
            }
        }
        # keep the cursor under its own name: it is opened with
        # no_cursor_timeout, so it must be closed explicitly, and the
        # validated iterator below would otherwise shadow it
        cursor = coll.find(doc_filter, no_cursor_timeout=True)
        total = cursor.count()
        print("total number of records: {}".format(total))
        docs = HelperBot.validate_docs(cursor, validate_type,
                                       PROPS['Entrez Gene ID'])
        records = HelperBot.tag_mygene_docs(docs, metadata)

        bot.run(records, total=total, fast_run=fast_run, write=write)
    finally:
        if cursor is not None:
            cursor.close()

        # after the run is done, disconnect the logging handlers
        # so that if we start another, it doesn't write twice
        # (the attribute is `handlers`; assigning `handles` had no effect)
        if wdi_core.WDItemEngine.logger is not None:
            wdi_core.WDItemEngine.logger.handlers = []
Example #6
0
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    Looks up the organism, sets up logging, downloads the mygene records,
    runs ``ProteinBot`` over them, then (after a 10 minute wait) calls the
    bot's cleanup with the releases to remove and the per-source
    last-updated dates.

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: suffix used in the log file name; defaults to the current time formatted as '%Y%m%d_%H:%M'
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # drop handlers left over from a previous run before setting up logging again
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    # the taxid is recorded in the module-level metadata, which becomes the log header
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
        print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))

    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        # single-record mode: wrap the one document in an iterator
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    # empty every fast-run container held by the item engine
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    time.sleep(10 * 60)
    releases_to_remove = set()
    last_updated = dict()
    # only these three sources are considered for cleanup
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            # release-versioned source: every known release except the current one is removed
            known_releases = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(known_releases.values())
            to_remove.discard(known_releases[v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(known_releases) - {v['release']}),
                                                                        v['release']))
        else:
            # timestamp-versioned source: remember the date parsed from its "timestamp"
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (the attribute is `handlers`; assigning `handles` had no effect)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
Example #7
0
def main(metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for processing the orthologs of genes

    Downloads every protein-coding mygene record that has a homologene
    entry, groups the Entrez ids of each gene's orthologs (other taxa only,
    and only genes already mapped in Wikidata), and hands each gene together
    with its ortholog set to ``do_item``. Failures of individual genes are
    logged and counted; they do not stop the run.

    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
                     must contain the key "homologene"
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode (accepted but not used in this function's body)
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        logger_name='WD_logger',
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get all ids mappings
    entrez_wdid = wdi_helpers.id_mapper(PROPS['Entrez Gene ID'])
    wdid_entrez = {v: k for k, v in entrez_wdid.items()}
    homo_wdid = wdi_helpers.id_mapper(PROPS['HomoloGene ID'],
                                      return_as_set=True)
    # invert homologene id -> {wdids} into wdid -> homologene id
    wdid_homo = dict()
    for homo, wdids in homo_wdid.items():
        for wdid in wdids:
            wdid_homo[wdid] = homo
    entrez_homo = {
        wdid_entrez[wdid]: homo
        for wdid, homo in wdid_homo.items() if wdid in wdid_entrez
    }
    taxon_wdid = wdi_helpers.id_mapper(PROPS['NCBI Taxonomy ID'])

    # only do certain records
    mgd = MyGeneDownloader(
        q="_exists_:homologene AND type_of_gene:protein-coding",
        fields=','.join(['taxid', 'homologene', 'entrezgene']))
    docs, total = mgd.query()
    docs = list(tqdm(docs, total=total))
    records = HelperBot.tag_mygene_docs(docs, metadata)

    # group together all orthologs
    # d[taxid][entrezgene] = { set of entrezgene ids for orthologs }
    d = defaultdict(lambda: defaultdict(set))
    entrez_taxon = dict()  # keep this for the qualifier on the statements
    for doc in records:
        this_taxid = doc['taxid']['@value']
        this_entrez = doc['entrezgene']['@value']
        entrez_taxon[str(this_entrez)] = str(this_taxid)
        if str(this_entrez) not in entrez_wdid:
            continue
        for taxid, entrez in doc['homologene']['@value']['genes']:
            if taxid == 4932 and this_taxid == 559292:
                # ridiculous workaround because entrez has the taxid for the strain and homologene has it for the species
                # TODO: This needs to be fixed if you want to use other things that may have species/strains .. ?`
                continue
            if taxid != this_taxid and str(entrez) in entrez_wdid:
                d[str(this_taxid)][str(this_entrez)].add(str(entrez))

    print("taxid: # of genes  : {}".format({k: len(v) for k, v in d.items()}))

    homogene_ver = metadata['homologene']
    release = wdi_helpers.Release(
        "HomoloGene build{}".format(homogene_ver),
        "Version of HomoloGene",
        homogene_ver,
        edition_of_wdid='Q468215',
        archive_url='ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build{}/'.format(
            homogene_ver)).get_or_create(login)

    def reference(homogeneid):
        """Build the reference block (stated in release + HomoloGene ID) for one homologene id."""
        return [
            wdi_core.WDItemID(release, PROPS['stated in'], is_reference=True),
            wdi_core.WDExternalID(
                homogeneid, PROPS['HomoloGene ID'], is_reference=True)
        ]

    ec = 0
    for taxid, subd in tqdm(d.items()):
        for entrezgene, orthologs in tqdm(subd.items(), leave=False):
            try:
                do_item(entrezgene, orthologs, reference, entrez_homo,
                        entrez_taxon, taxon_wdid, entrez_wdid, login, write)
            except Exception as e:
                # format_msg only builds the message; it has to be passed to
                # the logger or the failure disappears without a trace
                msg = wdi_helpers.format_msg(entrezgene, PROPS['Entrez Gene ID'],
                                             None, str(e), type(e))
                wdi_core.WDItemEngine.log("ERROR", msg)
                ec += 1
        # clear the fast run store once we move on to the next taxon
        wdi_core.WDItemEngine.fast_run_store = []
        wdi_core.WDItemEngine.fast_run_container = None

    print("Completed succesfully with {} exceptions".format(ec))
Example #8
0
def proteins():
    """
    Disambiguate protein items of taxid 7955 that share a name.

    Each record gets a ``uniprot_id`` taken from its Swiss-Prot entry, or
    from TrEMBL when there is no Swiss-Prot entry; records without a single
    string id are dropped. Records whose name occurs more than once are
    renamed to "name (uniprot_id)". Only records whose id is already mapped
    in Wikidata are passed to the bot, so no new items are created.
    """
    uni_wd = id_mapper("P352")

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    coll = MongoClient().wikidata_src.mygene
    metadata = MongoClient().wikidata_src.mygene_sources.find_one()
    organism_info = organisms_info[7955]

    must_exist = {'$exists': True}
    doc_filter = {'taxid': 7955, 'uniprot': must_exist, 'entrezgene': must_exist}
    cursor = coll.find(doc_filter).batch_size(20)
    total = cursor.count()
    print("total number of records: {}".format(total))
    validated = HelperBot.validate_docs(cursor, 'eukaryotic', PROPS['Entrez Gene ID'])
    tagged = list(HelperBot.tag_mygene_docs(validated, metadata))

    # pick the uniprot id: Swiss-Prot wins over TrEMBL
    for rec in tagged:
        uniprot = rec['uniprot']['@value']
        if 'Swiss-Prot' in uniprot:
            rec['uniprot_id'] = rec['uniprot']['@value']['Swiss-Prot']
        elif 'TrEMBL' in uniprot:
            rec['uniprot_id'] = rec['uniprot']['@value']['TrEMBL']
    # only records with exactly one (string) id are usable
    usable = []
    for rec in tagged:
        if 'uniprot_id' in rec and isinstance(rec['uniprot_id'], str):
            usable.append(rec)

    # names that are used by more than one record
    name_counts = Counter([rec['name']['@value'] for rec in usable])
    dupe_names = {name for name, seen in name_counts.items() if seen > 1}

    # keep only the records carrying a duplicated name ...
    dupes = [rec for rec in usable if rec['name']['@value'] in dupe_names]
    print("len dupe records: {}".format(len(dupes)))

    # ... and rename each of them to "name (uniprot)"
    for rec in dupes:
        suffix = " (" + rec['uniprot_id'] + ")"
        rec['name']['@value'] = rec['name']['@value'] + suffix

    # skip items that aren't already in wikidata (DONT CREATE NEW ITEMS!)
    records = [rec for rec in dupes if rec['uniprot_id'] in uni_wd]

    print("len records: {}".format(len(records)))

    chr_num_wdid = ChromosomeBot().get_or_create(organism_info, login=login)
    bot = ProteinBot.ProteinBot(organism_info, chr_num_wdid, login)
    # replace the bot's filter with a pass-through iterator
    bot.filter = lambda x: iter(x)
    bot.run(records, total=total, fast_run=False, write=True)
Example #9
0
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating proteins

    Selects the protein-coding records for the taxon from ``coll``, validates
    and tags them, and runs ``ProteinBot`` over them. The mongo cursor is
    always closed and the logging handlers are always detached, even when
    the run fails.

    :param coll: mongo collection containing protein data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    cursor = None
    try:
        # get organism metadata (name, organism type, wdid)
        validate_type = 'protein'
        if taxid in organisms_info:
            organism_info = organisms_info[taxid]
        else:
            # check if its one of the microbe refs
            # raises valueerror if not...
            organism_info = get_organism_info(taxid)
            print(organism_info)

        # get all entrez gene id -> wdid mappings, where found in taxon is this strain
        gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']), ))

        bot = ProteinBot(organism_info, gene_wdid_mapping, login)

        # only do certain records
        doc_filter = {'taxid': taxid, 'type_of_gene': 'protein-coding'}
        # keep the cursor under its own name: it is opened with
        # no_cursor_timeout, so it must be closed explicitly, and the
        # validated iterator below would otherwise shadow it
        cursor = coll.find(doc_filter, no_cursor_timeout=True)
        total = cursor.count()
        print("total number of records: {}".format(total))
        docs = HelperBot.validate_docs(cursor, validate_type,
                                       PROPS['Entrez Gene ID'])
        records = HelperBot.tag_mygene_docs(docs, metadata)

        bot.run(records, total=total, fast_run=fast_run, write=write)
    finally:
        if cursor is not None:
            cursor.close()

        # after the run is done, disconnect the logging handlers
        # so that if we start another, it doesn't write twice
        # (the attribute is `handlers`; assigning `handles` had no effect)
        if wdi_core.WDItemEngine.logger is not None:
            wdi_core.WDItemEngine.logger.handlers = []