def get_or_create_chromosomes(self, taxid, login=None):
    """Ensure Wikidata items exist for all chromosomes/plasmids of a bacterial organism.

    Main entry point for chromosome handling of a microbial reference genome.

    :param taxid: NCBI taxonomy id of the organism
    :param login: wikidata login object, used when items must be created
    :return: dict with key = refseq ID, value = QID of the chromosome item
    """
    # lazily populate the reference genome table on first use
    if self.df.empty:
        self.get_microbial_ref_genome_table()
    tax_str = str(taxid)
    record = self.df[self.df.TaxID == tax_str].to_dict("records")[0]
    org_name = record['Organism/Name']
    org_qid = prop2qid(PROPS['NCBI Taxonomy ID'], tax_str)

    # recognized chromosome names mapped to their chromosome-type label;
    # anything containing "plasmid" is typed as a plasmid
    name_to_type = {
        'chromosome circular': 'circular',
        'chromosome linear': 'chromosome',
        'chromosome': 'chromosome',
    }

    refseq_to_qid = {}
    for info in self.get_chromosome_info(tax_str):
        lowered = info['name'].lower()
        refseq_id = info['refseq']
        if lowered in name_to_type:
            kind = name_to_type[lowered]
        elif "plasmid" in lowered:
            kind = 'plasmid'
        else:
            raise ValueError("unknown chromosome type: {}".format(info['name']))
        refseq_to_qid[refseq_id] = self.create_chrom(
            org_name, org_qid, lowered, refseq_id, kind, login=login)
    return refseq_to_qid
def get_organism_info(self, taxid):
    """Return basic organism metadata for a microbial reference-genome taxid.

    :param taxid: NCBI taxonomy id
    :raises ValueError: if the taxid is not one of the microbial reference genomes
    :return: dict with keys 'name', 'type', 'wdid', 'qid', 'taxid'
    """
    tax_str = str(taxid)
    if tax_str not in self.get_all_taxids():
        raise ValueError("taxid {} not found in microbe ref genomes".format(tax_str))
    record = self.df[self.df.TaxID == tax_str].to_dict("records")[0]
    organism_qid = prop2qid(PROPS['NCBI Taxonomy ID'], tax_str)
    # 'wdid' and 'qid' intentionally carry the same value for downstream callers
    return {
        'name': record['Organism/Name'],
        'type': "microbial",
        'wdid': organism_qid,
        'qid': organism_qid,
        'taxid': tax_str,
    }
def published_in_isbn(self, value):
    """Set the venue ISBN-13(s) and resolve them to a Wikidata item.

    Accepts a single ISBN string or a list of them. If all ISBNs resolve to the
    same QID, sets ``self.published_in_qid``; otherwise a warning is recorded.
    A warning is also recorded when no item was found at all.

    :param value: one ISBN-13 string, or a list of them
    """
    # a publication venue is identified by either an ISSN or an ISBN, never both
    assert self.published_in_issn is None, "Can't give both ISSN and ISBN"
    if not isinstance(value, list):
        value = [value]
    self._published_in_isbn = value
    # resolve each ISBN to a QID; a set collapses duplicates (and Nones)
    qids = {prop2qid(PROPS['ISBN-13'], v) for v in value}
    if len(qids) == 1:
        self.published_in_qid = next(iter(qids))
    else:
        # fixed typo in warning message: "conflictings" -> "conflicting"
        self.warnings.append("conflicting ISBN qids: {}".format(qids))
    if not self.published_in_qid:
        self.warnings.append("ISBN:{} not found".format(value))
def main(taxon, file, retrieved, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for annotating GO terms on proteins

    :param taxon: taxon to use (ncbi tax id)
    :type taxon: str
    :param file: path to gaf_file to use. See below for format
    :type file: str
    :param retrieved: date that the GO annotations were retrieved
    :type retrieved: datetime
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None

    The following columns are expected in the gaf file
    ['db','id','go_id','reference','evidence','aspect','taxon','source']
    This can be created by selecting columns $1,$2,$5,$6,$7,$9,$13,$15 from
    goa_uniprot_all.gaf.gz downloaded from
    ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz
    """
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # reset handlers left from a previous run so log lines aren't duplicated
    # (fixed: logging.Logger exposes `handlers`; assigning `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    organism_wdid = wdi_helpers.prop2qid("P685", taxon)
    if not organism_wdid:
        raise ValueError("organism {} not found".format(taxon))
    print("Running organism: {} {}".format(taxon, organism_wdid))

    # get all uniprot id -> wdid mappings, where found in taxon is this organism
    prot_wdid_mapping = wdi_helpers.id_mapper(UNIPROT, (("P703", organism_wdid),))
    # get all goID to wdid mappings
    go_map = wdi_helpers.id_mapper("P686")

    # Get GO terms from our local store for this taxon
    colnames = ['db', 'id', 'go_id', 'reference', 'evidence', 'aspect', 'taxon', 'source']
    # NOTE(review): file is read space-separated (not tab); assumes the gaf extract
    # was written with spaces -- confirm against the upstream extraction step
    df = pd.read_csv(file, sep=' ', names=colnames, index_col=False)
    df = df[df.taxon == "taxon:" + str(taxon)]
    if len(df) == 0:
        print("No GO annotations found for taxid: {}".format(taxon))
        return None
    print("Found {} GO annotations".format(len(df)))

    # get all pmids and make items for them
    pmids = {x[5:] for x in df['reference'] if x.startswith("PMID:")}
    print("Need {} pmids".format(len(pmids)))
    pmid_map = get_values("P698", pmids)
    print("Found {} pmids".format(len(pmid_map)))
    pmids_todo = pmids - set(pmid_map.keys())
    print("Creating {} pmid items".format(len(pmids_todo)))
    new_pmids = create_articles(pmids_todo, login, write)
    pmid_map.update(new_pmids)
    print("Done creating pmid items")

    # get all external IDs we may need by uniprot id
    external_ids = defaultdict(dict)
    external_ids_info = {
        'Saccharomyces Genome Database ID': 'P3406',
        'Mouse Genome Informatics ID': 'P671',
        'UniProt ID': 'P352'
    }
    for external_id_name, prop in external_ids_info.items():
        id_map = wdi_helpers.id_mapper(prop, (("P703", organism_wdid),))
        if not id_map:
            continue
        for id, wdid in id_map.items():
            external_ids[wdid][external_id_name] = id

    # groupby ID, GOID & evidence, then make references a list
    go_annotations = df.groupby(
        ['id', 'go_id', 'evidence', 'source', 'db', 'aspect'])['reference'].apply(list)

    # iterate through all proteins & write
    failed_items = []
    for uniprot_id, item_wdid in tqdm(prot_wdid_mapping.items()):
        if uniprot_id not in go_annotations:
            continue
        this_go = go_annotations[uniprot_id]
        external_id = external_ids[item_wdid]
        try:
            statements = make_go_statements(uniprot_id, this_go, go_map, pmid_map,
                                            external_id, retrieved)
            wditem = wdi_core.WDItemEngine(
                wd_item_id=item_wdid,
                domain='protein',
                data=statements,
                fast_run=fast_run,
                fast_run_base_filter={UNIPROT: "", "P703": organism_wdid},
                fast_run_use_refs=True,
                ref_handler=update_retrieved_if_new,
                append_value=['P680', 'P681', 'P682'],
                core_props=core_props)
            wdi_helpers.try_write(wditem, record_id=uniprot_id, record_prop=UNIPROT,
                                  edit_summary="update GO terms", login=login, write=write)
        except Exception as e:
            print(e)
            traceback.print_exc()
            failed_items.append(uniprot_id)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(uniprot_id, UNIPROT, item_wdid, str(e),
                                       msg_type=type(e)))

    print("{} items failed: {}".format(len(failed_items), failed_items))

    # disconnect logging handlers so a subsequent run doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(coll, taxon, retrieved, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for annotating GO terms on proteins

    :param coll: mongo collection containing GO annotations
    :type coll: pymongo.collection.Collection
    :param taxon: taxon to use (ncbi tax id)
    :type taxon: str
    :param retrieved: date that the GO annotations were retrieved
    :type retrieved: datetime
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # reset handlers left from a previous run so log lines aren't duplicated
    # (fixed: logging.Logger exposes `handlers`; assigning `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    organism_wdid = wdi_helpers.prop2qid("P685", taxon)
    if not organism_wdid:
        raise ValueError("organism {} not found".format(taxon))
    print("Running organism: {} {}".format(taxon, organism_wdid))

    # get all uniprot id -> wdid mappings, where found in taxon is this organism
    prot_wdid_mapping = wdi_helpers.id_mapper(UNIPROT, (("P703", organism_wdid),))
    # get all goID to wdid mappings
    go_map = wdi_helpers.id_mapper("P686")

    # Get GO terms from our local store for this taxon
    df = pd.DataFrame(list(coll.find({'Taxon': int(taxon)})))
    if len(df) == 0:
        print("No GO annotations found for taxid: {}".format(taxon))
        return None

    # get all pmids and make items for them
    pmids = {x[5:] for x in df['Reference'] if x.startswith("PMID:")}
    pmid_map = wdi_helpers.id_mapper("P698")
    print("Total number of pmids: {}".format(len(pmids)))
    pmids_todo = pmids - set(pmid_map.keys())
    print("Creating {} pmid items".format(len(pmids_todo)))
    new_pmids = create_articles(pmids_todo, login, write)
    pmid_map.update(new_pmids)
    print("Done creating pmid items")

    # get all external IDs we may need by uniprot id
    external_ids = defaultdict(dict)
    external_ids_info = {
        'Saccharomyces Genome Database ID': 'P3406',
        'Mouse Genome Informatics ID': 'P671',
        'UniProt ID': 'P352'
    }
    for external_id_name, prop in external_ids_info.items():
        id_map = wdi_helpers.id_mapper(prop, (("P703", organism_wdid),))
        if not id_map:
            continue
        for id, wdid in id_map.items():
            external_ids[wdid][external_id_name] = id

    # groupby ID, GOID & evidence, then make references a list
    go_annotations = df.groupby(
        ['ID', 'GO ID', 'Evidence', 'Source', 'DB', 'Aspect'])['Reference'].apply(list)

    # iterate through all proteins & write
    failed_items = []
    for uniprot_id, item_wdid in tqdm(prot_wdid_mapping.items()):
        if uniprot_id not in go_annotations:
            continue
        this_go = go_annotations[uniprot_id]
        external_id = external_ids[item_wdid]
        try:
            statements = make_go_statements(item_wdid, uniprot_id, this_go, retrieved,
                                            go_map, pmid_map, external_id, login)
            wditem = wdi_core.WDItemEngine(
                wd_item_id=item_wdid,
                domain='protein',
                data=statements,
                fast_run=fast_run,
                fast_run_base_filter={UNIPROT: "", "P703": organism_wdid})
            wdi_helpers.try_write(wditem, record_id=uniprot_id, record_prop=UNIPROT,
                                  edit_summary="update GO terms", login=login, write=write)
        except Exception as e:
            print(e)
            failed_items.append(uniprot_id)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(uniprot_id, UNIPROT, item_wdid, str(e),
                                       msg_type=type(e)))

    print("{} items failed: {}".format(len(failed_items), failed_items))

    # disconnect logging handlers so a subsequent run doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating genes

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier appended to the log file name (defaults to a timestamp)
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    if wdi_core.WDItemEngine.logger is not None:
        # removed dead `.handles = []` assignment -- logging.Logger only has `handlers`
        wdi_core.WDItemEngine.logger.handlers = []
    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger',
                                        log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    # TODO: this can be pulled from wd
    if taxid in organisms_info and organisms_info[taxid]['type'] != "microbial":
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        chr_num_wdid = {k.upper(): v for k, v in chr_num_wdid.items()}
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the reference microbial genomes
        # raises ValueError if not
        organism_info = mcb.get_organism_info(taxid)
        refseq_qid_chrom = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, refseq_qid_chrom, login)
        validate_type = "microbial"

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") != "biological-region") and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    print("done updating, waiting 10 min")
    time.sleep(10 * 60)

    # remove deprecated release statements and record last-updated dates
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print("{}: Removing releases: {}, keeping release: {}".format(
                k, ", ".join(set(releases[k]) - {v['release']}), v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating genes

    :param coll: mongo collection containing gene data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger',
                                        log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        # its one of fungal, mammalian, plant (not microbe)
        validate_type = 'gene'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = MammalianGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, login)
        validate_type = "microbial"

    # only do certain records
    doc_filter = {
        'taxid': taxid,
        'type_of_gene': 'protein-coding',
        'genomic_pos': {'$exists': True}
    }
    docs = coll.find(doc_filter, no_cursor_timeout=True)
    total = docs.count()
    print("total number of records: {}".format(total))
    # NOTE(review): `docs` is rebound below, so `docs.close()` at the end targets
    # validate_docs' return value, not this cursor -- confirm validate_docs
    # passes the cursor through
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    docs.close()

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: logging.Logger exposes `handlers`; `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier appended to the log file name (defaults to a timestamp)
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    if wdi_core.WDItemEngine.logger is not None:
        # removed dead `.handles = []` assignment -- logging.Logger only has `handlers`
        wdi_core.WDItemEngine.logger.handlers = []
    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
    print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))
    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()
    time.sleep(10 * 60)

    # remove deprecated release statements and record last-updated dates
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print("{}: Removing releases: {}, keeping release: {}".format(
                k, ", ".join(set(releases[k]) - {v['release']}), v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def lookup_author(orcid_id):
    """Resolve an ORCID iD to the QID of the matching Wikidata author item."""
    orcid_prop = PROPS['orcid id']
    return prop2qid(orcid_prop, orcid_id)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating proteins

    :param coll: mongo collection containing protein data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    validate_type = 'protein'
    if taxid in organisms_info:
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
    print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))
    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # only do certain records
    doc_filter = {'taxid': taxid, 'type_of_gene': 'protein-coding'}
    docs = coll.find(doc_filter, no_cursor_timeout=True)
    total = docs.count()
    print("total number of records: {}".format(total))
    # NOTE(review): `docs` is rebound below, so `docs.close()` at the end targets
    # validate_docs' return value, not this cursor -- confirm validate_docs
    # passes the cursor through
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    docs.close()

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: logging.Logger exposes `handlers`; `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []