Ejemplo n.º 1
0
    def log_building_start(self):
        if self.merge_logging:
            #setup logging
            logfile = 'databuild_{}_{}.log'.format(
                'genedoc' + '_' + self._build_config['name'],
                time.strftime('%Y%m%d'))
            logfile = os.path.join(self.log_folder, logfile)
            setup_logfile(logfile)

        src_build = getattr(self, 'src_build', None)
        if src_build:
            #src_build.update({'_id': self._build_config['_id']}, {"$unset": {"build": ""}})
            d = {
                'status': 'building',
                'started_at': datetime.now(),
                'logfile': logfile,
                'target_backend': self.target.name
            }
            if self.target.name == 'mongodb':
                d['target'] = self.target.target_collection.name
            elif self.target.name == 'es':
                d['target'] = self.target.target_esidxer.ES_INDEX_NAME
            logging.info(pformat(d))
            src_build.update({'_id': self._build_config['_id']},
                             {"$push": {
                                 'build': d
                             }})
            _cfg = src_build.find_one({'_id': self._build_config['_id']})
            if len(_cfg['build']) > self.max_build_status:
                #remove the first build status record
                src_build.update({'_id': self._build_config['_id']},
                                 {"$pop": {
                                     'build': -1
                                 }})
Ejemplo n.º 2
0
def main(no_confirm=True):

    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
Ejemplo n.º 3
0
    def log_building_start(self):
        if self.merge_logging:
            #setup logging
            logfile = 'databuild_{}_{}.log'.format('genedoc' + '_' + self._build_config['name'],
                                                   time.strftime('%Y%m%d'))
            logfile = os.path.join(self.log_folder, logfile)
            setup_logfile(logfile)

        src_build = getattr(self, 'src_build', None)
        if src_build:
            #src_build.update({'_id': self._build_config['_id']}, {"$unset": {"build": ""}})
            d = {'status': 'building',
                 'started_at': datetime.now(),
                 'logfile': logfile,
                 'target_backend': self.target.name}
            if self.target.name == 'mongodb':
                d['target'] = self.target.target_collection.name
            elif self.target.name == 'es':
                d['target'] = self.target.target_esidxer.ES_INDEX_NAME
            logging.info(pformat(d))
            src_build.update({'_id': self._build_config['_id']}, {"$push": {'build': d}})
            _cfg = src_build.find_one({'_id': self._build_config['_id']})
            if len(_cfg['build']) > self.max_build_status:
                #remove the first build status record
                src_build.update({'_id': self._build_config['_id']}, {"$pop": {'build': -1}})
Ejemplo n.º 4
0
def main(no_confirm=True):

    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': latest_lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
Ejemplo n.º 5
0
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.'''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Ejemplo n.º 6
0
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    logging.info("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    logging.info(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Ejemplo n.º 7
0
def main():
    no_confirm = True  # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Ejemplo n.º 8
0
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(
        len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config,
                                                    time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Ejemplo n.º 9
0
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {
        '_id': 'exac',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'exac'}, {'$set': _updates})
Ejemplo n.º 10
0
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Ejemplo n.º 11
0
def update_from_temp_collections(config,no_confirm=False,use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Ejemplo n.º 12
0
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
Ejemplo n.º 13
0
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.'''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'],
                                 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {
        '_id': 'ensembl',
        'release': mart_version,
        'timestamp': time.strftime('%Y%m%d'),
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(
            os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})