Code Example #1
def get_ref_microbe_taxids():
    """
    Downloads the latest bacterial genome assembly summary from the NCBI genome
    ftp site and generate a list of taxids of the bacterial reference genomes.

    :return:
    """
    import urllib.request
    import csv

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    assembly = urllib.request.urlopen(urlbase + urlextension)
    datareader = csv.reader(assembly.read().decode().splitlines(), delimiter="\t")
    taxid = []

    for row in datareader:
        if len(row) == 1 and row[0].startswith("#"):
            # skip comment/header lines
            continue
        if row[4] in ['reference genome', 'representative genome']:
            # column 5 is refseq_category, column 6 is the taxid
            taxid.append(row[5])

    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))

    return taxid
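
The .pyobj file written above can be read back with loadobj, the counterpart of dump used in several of the later examples on this page. A minimal round-trip sketch, assuming both helpers live in biothings.utils.common and using an illustrative filename:

# Minimal round-trip sketch (not part of the original snippet). loadobj is
# assumed to be the counterpart of dump in biothings.utils.common, as the
# other examples on this page suggest; the filename below is illustrative.
from biothings.utils.common import loadobj

taxids = loadobj("ref_microbe_taxids_20200101.pyobj")  # hypothetical timestamp
print("loaded %d reference/representative genome taxids" % len(taxids))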
Code Example #2
File: differ.py  Project: SuLab/biothings.api
def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num,
                           diff_folder):
    new = create_backend(new_db_col_names)
    docs_common = new.mget_from_ids(id_list_old)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_old = list(set(id_list_old) - set(ids_common))
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'delete': id_in_old,
        'add': [],
        'update': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    summary = {"add": 0, "update": 0, "delete": len(id_in_old)}
    if len(id_in_old) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity after downloading
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }

    return summary
Code Example #3
    def parse_gbff(self, gbff_files, job_manager):
        out_d = {}
        jobs = []
        got_error = False
        for infile in gbff_files:
            baseinfile = os.path.basename(infile)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post-dump (gbff)"
            pinfo["description"] = baseinfile
            job = yield from job_manager.defer_to_process(
                pinfo, partial(parser_worker, infile))

            def parsed(res, fn):
                nonlocal out_d
                try:
                    out_li = res.result()
                    self.logger.info("%d records parsed from %s" %
                                     (len(out_li), fn))
                    species = os.path.basename(fn).split('.')[0]
                    out_d.setdefault(species, []).extend(out_li)
                except Exception as e:
                    self.logger.error("Failed parsing gbff file '%s': %s" %
                                      (fn, e))
                    nonlocal got_error
                    got_error = e

            job.add_done_callback(partial(parsed, fn=infile))
            jobs.append(job)
            # stop the loop asap if error
            if got_error:
                raise got_error
        if jobs:
            yield from asyncio.gather(*jobs)
            if got_error:
                raise got_error
            # if we get here, result is ready to be dumped
            outfile = os.path.join(self.new_data_folder,
                                   'rna.gbff.parsed.pyobj')
            self.logger.info("Dump gbff parsed data to '%s'" % outfile)
            dump(out_d, outfile, compress="lzma")
            # output gene2summary text file
            self.logger.info("Generate gene2summary")
            sumout = os.path.join(self.new_data_folder, 'gene2summary_all.txt')
            output_gene2summary(out_d, sumout)
            assert os.path.getsize(sumout) > 0
            # output gene2ec text file
            self.logger.info("Generate gene2ec")
            ecout = os.path.join(self.new_data_folder, 'gene2ec_all.txt')
            output_gene2ec(out_d, ecout)
            assert os.path.getsize(ecout) > 0
Code Example #4
File: differ.py  Project: SuLab/biothings.api
    def reset_synced(self, diff_folder, backend=None):
        """
        Remove "synced" flag from any pyobj file in diff_folder
        """
        diff_files = glob.glob(os.path.join(diff_folder, "*.pyobj"))
        for diff in diff_files:
            pyobj = loadobj(diff)
            try:
                if pyobj.get("synced"):
                    if backend:
                        self.logger.info(
                            "Removing synced flag from '%s' for backend '%s'" %
                            (diff, backend))
                        pyobj["synced"].pop(backend, None)
                    else:
                        self.logger.info("Removing synced flag from '%s'" %
                                         diff)
                        pyobj.pop("synced")
                    dump(pyobj, diff)
            except AttributeError:
                # pyobj not a dict
                continue
Code Example #5
File: differ.py  Project: SuLab/biothings.api
def diff_mapping(old, new, diff_folder):
    summary = {}
    old_build = get_src_build().find_one(
        {"_id": old.target_collection.name})
    new_build = get_src_build().find_one(
        {"_id": new.target_collection.name})
    if old_build and new_build:
        # mapping diff always in jsondiff
        mapping_diff = jsondiff(old_build["mapping"],
                                new_build["mapping"])
        if mapping_diff:
            file_name = os.path.join(diff_folder, "mapping.pyobj")
            dump(mapping_diff, file_name)
            md5 = md5sum(file_name)
            summary["mapping_file"] = {
                "name": os.path.basename(file_name),
                "md5sum": md5
            }
    else:
        # note: the original snippet called self.logger.info here, but `self`
        # is not defined in this standalone function; a module-level logger
        # is assumed instead
        logging.info("Neither '%s' nor '%s' has a mapping associated with it, skip" %
                     (old.target_collection.name, new.target_collection.name))
    return summary
Code Example #6
File: differ.py  Project: SuLab/biothings.api
def diff_worker_new_vs_old(id_list_new,
                           old_db_col_names,
                           new_db_col_names,
                           batch_num,
                           diff_folder,
                           diff_func,
                           exclude=[],
                           selfcontained=False):
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if len(ids_common) > 0:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity after downloading
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }

    return summary
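
Both diff workers record an md5sum in the returned summary precisely so the dumped diff file can be checked after download. A small verification sketch using only the standard library (the helper name is hypothetical; the summary layout follows the workers above):

# Hypothetical helper (not from biothings): check a downloaded diff file against
# the md5 recorded in the summary returned by the diff workers above.
import hashlib

def verify_diff_file(path, summary):
    expected = summary["diff_file"]["md5sum"]
    h = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            h.update(chunk)
    return h.hexdigest() == expected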
Code Example #7
File: backend.py  Project: SuLab/myvariant.info
    def finalize(self):
        '''dump target_dict into a file.'''
        from biothings.utils.common import dump
        dump(self.target_dict, self.target_name + '.pyobj')
Code Example #8
def get_geneid_d(data_folder,
                 species_li=None,
                 load_cache=True,
                 save_cache=True,
                 only_for={}):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful when other annotations that were mapped to geneids may
       contain retired gene ids.

       If species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            only_for and ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: only_for and ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # the includefn above makes sure the taxid belongs to species_li and keeps
    # only rows whose mapped-to geneid exists in the gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
Code Example #9
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({
                        "_id": doc["_id"],
                        "file": diff_file,
                        "error": e
                    })
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
Code Example #10
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={}):
    """Worker to sync data between a new and an old mongo collection"""
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    if not force and diff.get("synced", {}).get("mongo") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("mongo", True)
    dump(diff, diff_file)
    return res
Code Example #11
def load_all(data_folder):
    '''Load "uniprot" using yield, while building "PDB" and "PIR"
    data dict while reading data file. These dict are then dumped
    (pickled) and stored later'''
    def cvt_fn(pdb_id):
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        xli2 = []
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            except KeyError:
                xli2.append((uniprot_acc, section, ensembl_id))
        return xli2

    def transform(xli2):
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot,
                                     _dict_convert,
                                     traverse_list=False)
        gid, uniprot = list(gene2uniprot.items())[0]
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None, k=None):
        xli2 = []
        entrez_id, ensembl_id, x_value = xli

        if not x_value:
            return

        if cvt_fn:
            x_value = cvt_fn(x_value)

        if entrez_id:
            xli2.append((entrez_id, x_value))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = x_ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((_eid, x_value))
            except KeyError:
                xli2.append((ensembl_id, x_value))
        for x in xli2:
            gene2x.setdefault(x[0], []).append(x[1])

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')
    t0 = time.time()

    # cache for uniprot
    ensembl2geneid = {}
    # cache for PDB and PIR
    x_ensembl2geneid = {}

    remains = []
    pdb_remains = []
    pir_remains = []

    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}

    # store all PDB & PIR data while looping, the whole will be stored later
    for ld in tabfile_feeder(uniprot_datafile,
                             header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # Uniprot data will be stored as we read line by line
        xlis = []
        pdbxlis = []
        pirxlis = []

        # raw lines for each source
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]

        # UniProt
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=uniprotld,
                                       dup_idx=[2, 3],
                                       dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xlis.append(value)
        # PDB
        for value in dupline_seperator(dupline=pdbld, dup_sep='; '):
            pdbxlis.append(value)

        # PIR
        for value in dupline_seperator(dupline=pirld, dup_sep='; '):
            pirxlis.append(value)

        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            try:
                # postpone ensemblid->entrezid resolution while parsing uniprot as the
                # full transcodification dict is only correct at the end.
                # ex:
                #     1. UniprotID-A    EntrezID-A  EnsemblID
                #     2. UniprotID-B                EnsemblID
                #     3. UniprotID-C    EntrezID-B  EnsemblID
                #
                #     UniprotID-B should be associated with both EntrezID-A and EntrezID-B
                #     but we need to read up to line 3 to do so
                xli2 = merge(xli, transcode=False)
                if not xli2:
                    continue
                docs = transform(xli2)
                for doc in docs:
                    yield doc
            except KeyError:
                remains.append(xli)

        for xli in pdbxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pdb, transcode=False, cvt_fn=cvt_fn, k="pdb")
            except KeyError:
                pdb_remains.append(xli)

        for xli in pirxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pir, transcode=False)
            except KeyError:
                pir_remains.append(xli)

    # now transcode with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            if not xli2:
                continue
            docs = transform(xli2)
            for doc in docs:
                yield doc
        except KeyError:
            pass

    for remain in pdb_remains:
        try:
            merge_x(remain, gene2pdb, transcode=True, cvt_fn=cvt_fn)
        except KeyError:
            pass

    for remain in pir_remains:
        try:
            merge_x(remain, gene2pir, transcode=True)
        except KeyError:
            pass

    # helpers to normalize the collected PDB / PIR value lists
    def normalize(value, keyname):
        res = None
        uniq = sorted(set(value))
        if len(uniq) > 1:
            res = {keyname: uniq}
        else:
            res = {keyname: uniq[0]}
        return res

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)