コード例 #1
0
def get_ref_microbe_taxids():
    """
    Download the latest bacterial genome assembly summary from the NCBI
    genomes FTP site and collect the taxids of the reference and
    representative genomes.

    The collected list is also dumped to a timestamped
    ``ref_microbe_taxids_<timestamp>.pyobj`` file.

    :return: list of taxid strings (sixth tab-separated column), one entry
             per row whose fifth column is "reference genome" or
             "representative genome"
    """
    import urllib.request
    import csv

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    # Use a context manager so the FTP connection is released as soon as the
    # payload has been read, even if decoding fails.
    with urllib.request.urlopen(urlbase + urlextension) as assembly:
        lines = assembly.read().decode().splitlines()
    datareader = csv.reader(lines, delimiter="\t")
    wanted = ('reference genome', 'representative genome')
    taxid = []

    for row in datareader:
        # Single-column "#" comment lines, blank lines and truncated rows do
        # not carry the category/taxid columns; skip them instead of raising
        # IndexError.
        if len(row) < 6:
            continue
        if row[4] in wanted:
            taxid.append(row[5])

    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))

    return taxid
コード例 #2
0
def get_ref_microbe_taxids():
    """
    Download the latest bacterial genome assembly summary from the NCBI
    genomes FTP site and collect the taxids of the reference and
    representative genomes.

    The collected list is also dumped to a timestamped
    ``ref_microbe_taxids_<timestamp>.pyobj`` file.

    :return: list of taxid strings (sixth tab-separated column), one entry
             per row whose fifth column is "reference genome" or
             "representative genome"
    """
    import urllib.request
    import csv

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    # Use a context manager so the FTP connection is released as soon as the
    # payload has been read, even if decoding fails.
    with urllib.request.urlopen(urlbase + urlextension) as assembly:
        lines = assembly.read().decode().splitlines()
    datareader = csv.reader(lines, delimiter="\t")
    wanted = ('reference genome', 'representative genome')
    taxid = []

    for row in datareader:
        # Single-column "#" comment lines, blank lines and truncated rows do
        # not carry the category/taxid columns; skip them instead of raising
        # IndexError.
        if len(row) < 6:
            continue
        if row[4] in wanted:
            taxid.append(row[5])

    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))

    return taxid
コード例 #3
0
ファイル: differ.py プロジェクト: SuLab/biothings.api
def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num,
                           diff_folder):
    """
    Work out which ids of ``id_list_old`` are absent from the new backend and
    record them as deletions for this batch.

    A ``<batch_num>.pyobj`` diff file is written into ``diff_folder`` only
    when at least one id has to be deleted.

    :param id_list_old: ids to look up in the new backend
    :param new_db_col_names: argument handed to ``create_backend``
    :param batch_num: batch identifier, used as the diff file base name
    :param diff_folder: folder receiving the diff file
    :return: dict with "add", "update" and "delete" counts, plus a
             "diff_file" entry (file name and md5sum) when a file was written
    """
    backend = create_backend(new_db_col_names)
    found_docs = backend.mget_from_ids(id_list_old)
    found_ids = [doc['_id'] for doc in found_docs]
    removed_ids = list(set(id_list_old) - set(found_ids))
    diff_path = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))

    payload = {
        'delete': removed_ids,
        'add': [],
        'update': [],
        'source': backend.target_name,
        'timestamp': get_timestamp(),
    }
    summary = {"add": 0, "update": 0, "delete": len(removed_ids)}

    # Nothing to delete: no diff file is produced for this batch.
    if not removed_ids:
        return summary

    dump(payload, diff_path)
    # Publish the md5 so users can check the integrity of a downloaded file.
    checksum = md5sum(diff_path)
    summary["diff_file"] = {
        "name": os.path.basename(diff_path),
        "md5sum": checksum,
    }
    return summary
コード例 #4
0
ファイル: cpdb_base.py プロジェクト: SuLab/mygene.info
def _download(__metadata__):
    """
    Download the CPDB pathway/gene tab files for human, mouse and yeast.

    Each file is fetched from the ``__url_<species>__`` entry of
    ``__metadata__`` into a folder named after the current timestamp, placed
    in the parent directory of ``DATA_FOLDER``.
    """
    from utils.dataload import download as fetch

    parent_dir = os.path.dirname(DATA_FOLDER)
    target_dir = os.path.join(parent_dir, get_timestamp())
    url_key = '__url_{}__'
    file_template = 'CPDB_pathways_genes_{}.tab'
    for organism in ('human', 'mouse', 'yeast'):
        source_url = __metadata__[url_key.format(organism)]
        fetch(source_url, target_dir, file_template.format(organism))
コード例 #5
0
def _download(__metadata__):
    """
    Download the CPDB pathway/gene tab files for human, mouse and yeast.

    Each file is fetched from the ``__url_<species>__`` entry of
    ``__metadata__`` into a folder named after the current timestamp, placed
    in the parent directory of ``DATA_FOLDER``.
    """
    from utils.dataload import download as fetch

    parent_dir = os.path.dirname(DATA_FOLDER)
    target_dir = os.path.join(parent_dir, get_timestamp())
    url_key = '__url_{}__'
    file_template = 'CPDB_pathways_genes_{}.tab'
    for organism in ('human', 'mouse', 'yeast'):
        source_url = __metadata__[url_key.format(organism)]
        fetch(source_url, target_dir, file_template.format(organism))
コード例 #6
0
ファイル: __init__.py プロジェクト: SuLab/mygene.info
 def switch_collection(self):
     '''Promote the temp collection to the regular collection name.

     When the regular collection already holds documents it is first renamed
     to "<name>_archive_<timestamp>_<random>" (dropping any existing target)
     so it is kept for archiving. If there is no temp collection, or it is
     empty, an error message is printed and nothing is renamed.
     '''
     # Guard clause: nothing has been loaded into the temp collection.
     if not self.temp_collection or self.temp_collection.count() <= 0:
         print("Error: load data first.")
         return

     if self.collection.count() > 0:
         # move the current collection out of the way before the switch
         name_parts = [self.__collection__, 'archive', get_timestamp(), get_random_string()]
         self.collection.rename('_'.join(name_parts), dropTarget=True)
     self.temp_collection.rename(self.__collection__)
コード例 #7
0
 def switch_collection(self):
     '''Promote the temp collection to the regular collection name.

     When the regular collection already holds documents it is first renamed
     to "<name>_archive_<timestamp>_<random>" (dropping any existing target)
     so it is kept for archiving. Raises ResourceError when no temp
     collection name is set or the temp collection is empty.
     '''
     # Guard clause: refuse to switch when nothing was loaded.
     if not self.temp_collection_name or self.db[self.temp_collection_name].count() <= 0:
         raise ResourceError("No temp collection (or it's empty)")

     if self.collection.count() > 0:
         # move the current collection out of the way before the switch
         name_parts = [self.collection_name, 'archive', get_timestamp(), get_random_string()]
         self.collection.rename('_'.join(name_parts), dropTarget=True)
     self.db[self.temp_collection_name].rename(self.collection_name)
コード例 #8
0
 def switch_collection(self):
     '''Swap the freshly loaded temp collection in as the regular collection.

     An existing, non-empty regular collection is renamed to
     "<name>_archive_<timestamp>_<random>" (dropping any existing target)
     before the temp collection takes over its name. If there is no temp
     collection, or it is empty, an error message is printed instead.
     '''
     temp_ready = self.temp_collection and self.temp_collection.count() > 0
     if not temp_ready:
         print("Error: load data first.")
         return

     if self.collection.count() > 0:
         # archive the current collection under a unique name
         archive_name = '_'.join(
             [self.__collection__, 'archive', get_timestamp(), get_random_string()])
         self.collection.rename(archive_name, dropTarget=True)
     self.temp_collection.rename(self.__collection__)
コード例 #9
0
def backup(folder=".", archive=None):
    """
    Dump every hub_db internal collection into a single file under "folder".

    "archive" names the target file; when it is not given, a name of the form
    "backup_<timestamp>_<random>.pyobj" is generated.
    Note: source/merge data is not part of the backup, only the internal
          data used by the hub.

    :return: path of the written backup file
    """
    # name of the hub_db internal database (looked up, not used further below)
    db_name = get_src_dump().database.name
    collection_getters = (
        get_src_dump, get_src_master, get_src_build, get_src_build_config,
        get_data_plugin, get_api, get_cmd, get_event, get_hub_config,
    )
    snapshot = {}
    for getter in collection_getters:
        col = getter()
        # materialize all documents of the collection under its name
        snapshot[col.name] = list(col.find())
    archive = archive or "backup_%s_%s.pyobj" % (get_timestamp(), get_random_string())
    path = os.path.join(folder, archive)
    dumpobj(snapshot, path)
    return path
コード例 #10
0
ファイル: differ.py プロジェクト: SuLab/biothings.api
def diff_worker_new_vs_old(id_list_new,
                           old_db_col_names,
                           new_db_col_names,
                           batch_num,
                           diff_folder,
                           diff_func,
                           exclude=None,
                           selfcontained=False):
    """
    Compare one batch of ids from the new backend against the old backend.

    Ids missing from the old backend are reported as additions; ids present
    in both are passed to ``diff_func`` to compute updates. A
    ``<batch_num>.pyobj`` diff file is written into ``diff_folder`` only when
    there is at least one addition or update.

    :param id_list_new: ids to look up in the old backend
    :param old_db_col_names: argument handed to ``create_backend`` (old side)
    :param new_db_col_names: argument handed to ``create_backend`` (new side)
    :param batch_num: batch identifier, used as the diff file base name
    :param diff_folder: folder receiving the diff file
    :param diff_func: callable invoked as
        ``diff_func(old, new, ids, exclude_attrs=exclude)``
    :param exclude: passed to ``diff_func`` as ``exclude_attrs``; defaults
        to an empty list
    :param selfcontained: when true, the "add" entry of the diff file holds
        the result of ``new.mget_from_ids`` for the added ids instead of the
        bare ids
    :return: dict with "add", "update" and "delete" counts, plus a
             "diff_file" entry (file name and md5sum) when a file was written
    """
    # A fresh list per call: a shared ``[]`` default could leak mutations
    # made by diff_func from one call to the next.
    if exclude is None:
        exclude = []
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if ids_common:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        # store what the new backend returns for the added ids, not just ids
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so when downloaded, users can check integrity
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }

    return summary
コード例 #11
0
 def generate_target_name(self, build_config_name):
     """Return a lower-cased "<build_config_name>_<timestamp>_<random>" name.

     build_config_name must not be None.
     """
     assert build_config_name is not None
     name_parts = (build_config_name, get_timestamp(), get_random_string())
     target_name = '{}_{}_{}'.format(*name_parts)
     return target_name.lower()
コード例 #12
0
 def generate_target_name(self, build_config_name):
     """Return a lower-cased "genedoc_<build_config_name>_<timestamp>_<random>" name."""
     name_parts = (build_config_name, get_timestamp(), get_random_string())
     target_name = 'genedoc_{}_{}_{}'.format(*name_parts)
     return target_name.lower()
コード例 #13
0
ファイル: builder.py プロジェクト: SuLab/mygene.info
 def _get_target_name(self):
     """Return a lower-cased "genedoc_<config name>_<timestamp>_<random>" name.

     The config name is read from the 'name' key of self._build_config.
     """
     config_name = self._build_config['name']
     name_parts = (config_name, get_timestamp(), get_random_string())
     return 'genedoc_{}_{}_{}'.format(*name_parts).lower()
コード例 #14
0
ファイル: builder.py プロジェクト: raonyguimaraes/mygene.info
 def _get_target_name(self):
     """Return a lower-cased "genedoc_<config name>_<timestamp>_<random>" name.

     The config name is read from the 'name' key of self._build_config.
     """
     config_name = self._build_config['name']
     name_parts = (config_name, get_timestamp(), get_random_string())
     return 'genedoc_{}_{}_{}'.format(*name_parts).lower()