def main(no_confirm=True):
    """Download the latest UCSC source files and record dump status in src_dump.

    Exits early (sys.exit(0)) when no newer file is available for download.

    :param no_confirm: when True, run without interactive confirmation.
    """
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)
    # BUG FIX: removed dead `doc = src_dump.find_one({'_id': 'ucsc'})` -- the
    # result was never read before `doc` was unconditionally reassigned below.
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)
    # mark the download as started
    # NOTE(review): `timestamp` and `latest_lastmodified` are module-level
    # globals not visible in this chunk -- confirm they are set before main().
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
def main(no_confirm=True):
    """Download the latest UCSC source files and record dump status in src_dump.

    Exits early (sys.exit(0)) when no newer file is available for download.

    :param no_confirm: when True, run without interactive confirmation.
    """
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)
    # BUG FIX: removed dead `doc = src_dump.find_one({'_id': 'ucsc'})` -- the
    # result was never read before `doc` was unconditionally reassigned below.
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)
    # mark the download as started
    # NOTE(review): `timestamp` and `latest_lastmodified` are module-level
    # globals not visible in this chunk -- confirm they are set before main().
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
def redo_parse_gbff(path):
    '''Manually re-run only the parsing step and update src_dump accordingly.

    Use this when main() broke during parsing: after fixing the problem,
    call this function to restart parsing without re-downloading.
    '''
    src_dump = get_src_dump()
    started = time.time()
    # nothing is downloaded in a redo, so the download time is effectively zero
    t_download = timesofar(started)
    parse_started = time.time()
    # flag the parsing step as in progress
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(parse_started)
    t_total = timesofar(started)
    # flag the run as finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def get_src_version(self):
    """Return {source _id: version} for every src_dump record that has one.

    The version is the record's 'release' field, falling back to 'timestamp'.
    Records with neither (or a falsy value) are skipped.
    """
    versions = {}
    for record in get_src_dump(self.src.client).find():
        ver = record.get('release', record.get('timestamp', None))
        if ver:
            versions[record['_id']] = ver
    return versions
def prepare_src_dump(self):
    """Load this source's src_dump record into self.src_doc, creating a stub
    record first when none exists yet (i.e. no dump has run before).

    Returns the src_dump collection.
    """
    dump_col = get_src_dump()
    self.src_doc = dump_col.find_one({'_id': self.main_source})
    if not self.src_doc:
        # initiate an empty record so downstream code always has a doc to update
        dump_col.save({"_id": self.main_source})
        self.src_doc = dump_col.find_one({'_id': self.main_source})
    return dump_col
def check_refseq_release():
    """Abort (sys.exit(0)) when the latest refseq release is not newer than the
    one already recorded in src_dump and its data file is still on disk."""
    latest = get_refseq_release()
    dump_doc = get_src_dump().find_one({'_id': 'refseq'})
    up_to_date = dump_doc and 'release' in dump_doc and latest <= dump_doc['release']
    if up_to_date:
        data_file = os.path.join(dump_doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
def main_cron(no_confirm=True):
    '''Dump the latest Ensembl BioMart tables into a versioned data folder,
    recording progress in the src_dump collection.

    set no_confirm to True for running this script automatically without intervention.
    '''
    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)
    # skip the whole dump when the recorded release is already up-to-date
    # and its main data file is still on disk
    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder already exists: ask before reusing it, unless running unattended
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)
    # mark the download as started in src_dump
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        # NOTE(review): setup_logfile presumably redirects stdout to the
        # logfile; closing stdout here flushes/releases that log -- confirm.
        sys.stdout.close()
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
def poll(self):
    """Schedule a recurring (cron) check that launches an upload for every
    source flagged 'pending_to_upload' in src_dump.

    Raises:
        ManagerError: when self.poll_schedule is not set.
    """
    if not self.poll_schedule:
        raise ManagerError("poll_schedule is not defined")
    src_dump = get_src_dump()

    # NOTE(review): @asyncio.coroutine is deprecated since Python 3.8 --
    # consider `async def` if aiocron supports it; confirm before changing.
    @asyncio.coroutine
    def check_pending_to_upload():
        # keep only user-defined string _ids (skips raw ObjectId records)
        sources = [src['_id'] for src in src_dump.find({'pending_to_upload': True}) if type(src['_id']) == str]
        logging.info("Found %d resources to upload (%s)" % (len(sources),repr(sources)))
        for src_name in sources:
            logging.info("Launch upload for '%s'" % src_name)
            try:
                self.upload_src(src_name)
            except ResourceNotFound:
                # flagged in src_dump but unknown to this manager: log and keep going
                logging.error("Resource '%s' needs upload but is not registered in manager" % src_name)

    cron = aiocron.crontab(self.poll_schedule, func=partial(check_pending_to_upload), start=True, loop=self.job_manager.loop)
def main_cron():
    """Dump the latest refseq release into a versioned data folder, recording
    progress in the src_dump collection.

    Exits early (sys.exit(0)) when the recorded release is current and its
    data file is still on disk.
    """
    no_confirm = True  # set it to True for running this script automatically without intervention.
    refseq_release = get_refseq_release()
    # BUG FIX: logging.info() does not accept print()'s `end` keyword (it
    # raises TypeError), so log the label and the release in a single call.
    logging.info("Checking latest refseq release:\t%s", refseq_release)
    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder already exists: ask before reusing it, unless running unattended
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)
    # mark the download as started
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        # NOTE(review): setup_logfile presumably redirects stdout into the
        # logfile; this closes it -- confirm.
        sys.stdout.close()
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
def main():
    """Download entrez data into DATA_FOLDER, parse it, and track progress in src_dump."""
    no_confirm = True  # set it to True for running this script automatically without intervention.
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    elif not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
              or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit()
    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)
    # record that the download has started
    src_dump = get_src_dump()
    src_dump.save({
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    })
    download_started = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(download_started)
    parsing_started = time.time()
    # record that parsing has started
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(parsing_started)
    t_total = timesofar(download_started)
    # record successful completion
    src_dump.update({'_id': 'entrez'}, {'$set': {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }})
def version(self):
    """Return the version of the data behind self.target_collection.

    For a collection in the source database: the 'release' recorded in
    src_dump for its main source. For a collection in the target database:
    the build's _meta.build_version. Returns None when the information
    cannot be resolved.
    """
    import biothings.utils.mongo as mongo
    if self.target_collection.database.name == btconfig.DATA_SRC_DATABASE:
        fulln = mongo.get_source_fullname(self.target_collection.name)
        if not fulln:
            return
        # fullname is "main_source.sub_source"; dump records are keyed on the main source
        mainsrc = fulln.split(".")[0]
        col = mongo.get_src_dump()
        src = col.find_one({"_id": mainsrc})
        # BUG FIX: guard against a missing dump record (find_one -> None),
        # mirroring the guard used in the target-DB branch below.
        if not src:
            return
        return src.get("release")
    elif self.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
        col = mongo.get_src_build()
        tgt = col.find_one({"_id": self.target_collection.name})
        if not tgt:
            return
        return tgt.get("_meta", {}).get("build_version")
    else:
        return None
def __init__(self, log_dir=None, date=None, dry_run=False):
    """Set up logging, the Wikidata login, and mondo source metadata.

    :param log_dir: directory for log files (defaults to the current working directory)
    :param date: date tag used by logging setup (defaults to today's date)
    :param dry_run: stored on the instance; presumably suppresses writes
        downstream -- TODO confirm in callers
    """
    self.log_dir = log_dir if log_dir else os.getcwd()
    d = datetime.now()
    # NOTE: components are not zero-padded, so Jan 5 2018 -> "201815"
    self.date = date if date else "".join(map(str, [d.year, d.month, d.day]))
    self.dry_run = dry_run
    self.login_instance = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # fast-run filter keyed on the Disease Ontology ID property
    self.fast_run_base_filter = {self.DOID_PROP: ''}
    self.info_log_path = None
    self.exc_log_path = None
    self.reference = None
    self.setup_logging()
    self.collection = get_src_db().mondo
    # pull dump metadata for the 'mondo' source to date/version the reference
    src_dump = get_src_dump()
    src_doc = src_dump.find_one({'_id': 'mondo'}) or {}
    # retrieval time: when the dump started, falling back to "now"
    self.retrieved = src_doc.get("download", {}).get("started_at", False) or datetime.now()
    # pin the obo URL to the dumped release; fall back to the master branch
    self.ref_url = "https://github.com/monarch-initiative/monarch-disease-ontology/raw/{}/src/mondo/mondo.obo".format(
        src_doc.get("release", "master"))
    self.create_reference()
def register_status(self, src_name, status, **extra):
    """Record the overall upload status for a resource in src_dump.

    When status is "uploading", the whole "upload" sub-document is replaced
    (and the "pending_to_upload" flag is cleared); otherwise only the given
    fields are merged in, keeping existing upload information.
    """
    src_dump = get_src_dump()
    # build the status payload; extra keys may override 'status', as before
    info = {'status': status, **extra}
    if status == "uploading":
        info["jobs"] = {}
        # clear the "need upload" flag now that the upload has begun
        src_dump.update_one({"_id": src_name}, {"$unset": {"pending_to_upload": None}})
        src_dump.update_one({"_id": src_name}, {"$set": {"upload": info}})
    else:
        # set dotted paths instead of replacing "upload", to keep information
        dotted = {"upload.%s" % field: value for field, value in info.items()}
        src_dump.update_one({"_id": src_name}, {"$set": dotted})
def main(no_confirm=True):
    """Download the latest exac data file, tracking progress in src_dump.

    Aborts (sys.exit(0)) when the remote file is not newer than the one
    already recorded and still present on disk.
    """
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    existing = src_dump.find_one({'_id': 'exac'})
    if existing and 'lastmodified' in existing and lastmodified <= existing['lastmodified']:
        filename = os.path.basename(DATAFILES_PATH[0])
        if os.path.exists(os.path.join(existing['data_folder'], filename)):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    elif not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
              or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)
    # record the start of the download
    src_dump.save({
        '_id': 'exac',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    })
    started = time.time()
    download(no_confirm)
    # record successful completion
    src_dump.update({'_id': 'exac'}, {'$set': {
        'status': 'success',
        'time': timesofar(started),
        'pending_to_upload': True  # a flag to trigger data uploading
    }})
def main():
    '''Dump entrez data, then parse it; progress is recorded in src_dump.'''
    no_confirm = True  # set it to True for running this script automatically without intervention.
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder already exists: reuse only if empty, unattended, or confirmed
        proceed = (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                   or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y')
        if not proceed:
            sys.exit()
    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)
    # mark the download as started
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t_start = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t_start)
    t_parse_start = time.time()
    # mark parsing as started
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t_parse_start)
    t_total = timesofar(t_start)
    # mark the whole run as finished successfully
    _updates = {
        'status': 'success',
        'time': {'download': t_download, 'parsing': t_parsing, 'total': t_total},
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def main(no_confirm=True):
    """Download the latest uniprot data file, tracking progress in src_dump.

    Aborts (sys.exit(0)) when the remote file is not newer than the one
    already recorded and still present on disk.
    """
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    prev = src_dump.find_one({'_id': 'uniprot'})
    if prev and 'lastmodified' in prev and lastmodified <= prev['lastmodified']:
        filename = os.path.split(DATAFILE_PATH)[1]
        if os.path.exists(os.path.join(prev['data_folder'], filename)):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    elif not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
              or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)
    # record the start of the download
    src_dump.save({
        '_id': 'uniprot',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    })
    started = time.time()
    download(no_confirm)
    # record successful completion
    src_dump.update({'_id': 'uniprot'}, {'$set': {
        'status': 'success',
        'time': timesofar(started),
        'pending_to_upload': True  # a flag to trigger data uploading
    }})
def prepare_src_dump(self):
    """Load this source's record from the src_dump collection into
    self.src_doc (None when absent) and return the collection."""
    dump_col = get_src_dump()
    self.src_doc = dump_col.find_one({'_id': self.main_source})
    return dump_col
* with "-d" parameter, it will continue monitoring, without "-d", it will quit after all running jobs are done. ''' from subprocess import Popen, STDOUT, check_output import time from datetime import datetime import sys import os.path from biothings.utils.mongo import get_src_dump from biothings.utils.common import safewfile, timesofar src_path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0] sys.path.append(src_path) src_dump = get_src_dump() def check_mongo(): '''Check for "pending_to_upload" flag in src_dump collection. And return a list of sources should be uploaded. ''' # filter some more: _id is supposed to be a user-defined string, not an ObjectId() return [src['_id'] for src in src_dump.find({'pending_to_upload': True}) if type(src['_id']) == str] def dispatch(src): src_doc = src_dump.find_one({'_id': src}) datadump_logfile = src_doc.get('logfile', '') if datadump_logfile: upload_logfile = os.path.join(os.path.split(datadump_logfile)[0], '{}_upload.log'.format(src))
def prepare_src_dump(self):
    """Attach the src_dump collection (Mongo side) and this source's record
    to the instance; src_doc defaults to an empty dict when absent."""
    dump_col = get_src_dump()
    self.src_dump = dump_col
    self.src_doc = dump_col.find_one({'_id': self.src_name}) or {}
* with "-d" parameter, it will continue monitoring, without "-d", it will quit after all running jobs are done. ''' from subprocess import Popen, STDOUT, check_output import time from datetime import datetime import sys import os.path from biothings.utils.mongo import get_src_dump from biothings.utils.common import safewfile, timesofar src_path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0] sys.path.append(src_path) src_dump = get_src_dump() def check_mongo(): '''Check for "pending_to_upload" flag in src_dump collection. And return a list of sources should be uploaded. ''' # filter some more: _id is supposed to be a user-defined string, not an ObjectId() return [ src['_id'] for src in src_dump.find({'pending_to_upload': True}) if type(src['_id']) == str ] def dispatch(src): src_doc = src_dump.find_one({'_id': src})
def main_cron(no_confirm=True):
    '''Dump the latest Ensembl BioMart tables into a versioned data folder,
    recording progress in the src_dump collection.

    set no_confirm to True for running this script automatically without intervention.
    '''
    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)
    # skip the whole dump when the recorded release is already up-to-date
    # and its main data file is still on disk
    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder already exists: ask before reusing it, unless running unattended
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)
    # mark the download as started in src_dump
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        # NOTE(review): setup_logfile presumably redirects stdout to the
        # logfile; closing stdout here flushes/releases that log -- confirm.
        sys.stdout.close()
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})