def __init__(self, shockurl, arasturl, config, threads, queue,
             kill_queue, job_list, ctrl_conf, datapath, binpath):
    self.parser = SafeConfigParser()
    self.parser.read(config)
    self.job_list = job_list

    # Load plugins
    self.pmanager = ModuleManager(threads, kill_queue, job_list, binpath)

    # Set up environment
    self.shockurl = shockurl
    self.arasturl = arasturl
    self.datapath = datapath
    if queue:
        self.queue = queue
        logging.info('Using queue:{}'.format(self.queue))
    else:
        self.queue = self.parser.get('rabbitmq', 'default_routing_key')
    self.min_free_space = float(self.parser.get('compute', 'min_free_space'))
    m = ctrl_conf['meta']
    a = ctrl_conf['assembly']

    ###### TODO Use REST API
    self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']),
                                            m['mongo.db'],
                                            m['mongo.collection'],
                                            m['mongo.collection.auth'],
                                            m['mongo.collection.data'])
    self.gc_lock = multiprocessing.Lock()
def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port,
             config, threads, queue, kill_list, kill_list_lock,
             job_list, job_list_lock, ctrl_conf, datapath, binpath):
    self.parser = SafeConfigParser()
    self.parser.read(config)
    self.kill_list = kill_list
    self.kill_list_lock = kill_list_lock
    self.job_list = job_list
    self.job_list_lock = job_list_lock

    # Load plugins
    self.pmanager = ModuleManager(threads, kill_list, kill_list_lock,
                                  job_list, binpath)

    # Set up environment
    self.shockurl = shockurl
    self.datapath = datapath
    self.rmq_host = rmq_host
    self.rmq_port = rmq_port
    self.mongo_host = mongo_host
    self.mongo_port = mongo_port
    self.queue = queue
    self.min_free_space = float(self.parser.get('compute', 'min_free_space'))
    self.data_expiration_days = float(self.parser.get('compute', 'data_expiration_days'))
    m = ctrl_conf['meta']
    a = ctrl_conf['assembly']
    collections = {'jobs': m.get('mongo.collection', 'jobs'),
                   'auth': m.get('mongo.collection.auth', 'auth'),
                   'data': m.get('mongo.collection.data', 'data'),
                   'running': m.get('mongo.collection.running', 'running_jobs')}

    ###### TODO Use REST API
    self.metadata = meta.MetadataConnection(self.mongo_host, self.mongo_port,
                                            m['mongo.db'], collections)
    self.gc_lock = multiprocessing.Lock()
class ArastStandalone:
    def __init__(self, threads, datapath, binpath, modulebin):
        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, None, None, None, binpath, modulebin)
        self.datapath = datapath

    def compute(self, jobpath, input_description):
        try:
            os.makedirs(jobpath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        pipelines = input_description['pipelines']
        recipe = input_description['recipe']
        wasp_in = input_description['wasp_in']

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(input_description['job_id']))
        self.out_report = open(self.out_report_name, 'w')

        job_id = input_description['job_id']

        # Create job data (ArastJob object).
        #
        # input_description is a dictionary containing three input sets:
        # reads, reference, and contigs. Each contains a list of fileinfo
        # objects. It also contains the fields 'user' (the end system's
        # username) and 'job_id' (a job ID allocated by the end system).
        uid = str(uuid.uuid4())

        # Populate the files list in each of the filesets.
        print input_description
        for sub in ['reads', 'reference', 'contigs']:
            for fs in input_description[sub]:
                print sub, fs
                fs['files'] = [x['local_file'] for x in fs['fileinfos']]
        print input_description

        job_data = ArastJob({'job_id': job_id,
                             'uid': uid,
                             'user': input_description['user'],
                             'reads': input_description['reads'],
                             'logfiles': [],
                             'reference': input_description['reference'],
                             'contigs': input_description['contigs'],
                             'initial_reads': list(input_description['reads']),
                             'raw_reads': copy.deepcopy(input_description['reads']),
                             'params': [],
                             'exceptions': [],
                             'pipeline_data': {},
                             'out_report': self.out_report,
                             'datapath': self.datapath})

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        self.start_time = time.time()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try:
                wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError:
                raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            else:
                if type(pipelines[0]) is not list:  # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                # No 'params' dict exists in standalone mode; use the local job_id
                wasp_exp = wasp.pipelines_to_exp(all_pipes, job_id)
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)

            ###### Upload all result files and place them into appropriate tags
            print "Done - job data: ", pprint.pformat(job_data)
            # uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i, e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))

            try:  ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module(
                    'quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')

            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '=' * 20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i, e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            # res = self.upload(url, user, token, self.out_report_name)
            print "Would upload ", self.out_report_name
            # report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url,
            #                                 shock_id=res['data']['id'])
            status = 'Complete with errors' if job_data.get('errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            # TODO: persist this somewhere, e.g.
            # self.metadata.update_job(uid, 'data', job_data)
            # self.metadata.update_job(uid, 'result_data', uploaded_fsets)
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')
        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')
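# A minimal sketch (not part of the service) of how ArastStandalone might be
# driven, assuming the read files already exist locally and that the 'auto'
# recipe is defined in recipes. All paths and values below are hypothetical
# placeholders.
#
#   standalone = ArastStandalone(threads=4,
#                                datapath='/tmp/arast/data',
#                                binpath='/tmp/arast/bin',
#                                modulebin='/tmp/arast/module_bin')
#   input_description = {
#       'job_id': 1,
#       'user': 'demo',
#       'recipe': ['auto'],
#       'pipelines': [],
#       'wasp_in': [],
#       'reads': [{'type': 'paired',
#                  'fileinfos': [{'local_file': '/tmp/arast/data/r1.fq'},
#                                {'local_file': '/tmp/arast/data/r2.fq'}]}],
#       'reference': [],
#       'contigs': [],
#   }
#   standalone.compute('/tmp/arast/jobs/1', input_description)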
class ArastConsumer:
    def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port,
                 config, threads, queues, kill_list, kill_list_lock,
                 job_list, job_list_lock, ctrl_conf, datapath, binpath, modulebin):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.kill_list = kill_list
        self.kill_list_lock = kill_list_lock
        self.job_list = job_list
        self.job_list_lock = job_list_lock

        # Load plugins
        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, kill_list, kill_list_lock,
                                      job_list, binpath, modulebin)

        # Set up environment
        self.shockurl = shockurl
        self.datapath = datapath
        self.rmq_host = rmq_host
        self.rmq_port = rmq_port
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.queues = queues
        self.min_free_space = float(self.parser.get('compute', 'min_free_space'))
        self.data_expiration_days = float(self.parser.get('compute', 'data_expiration_days'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']
        collections = {'jobs': m.get('mongo.collection', 'jobs'),
                       'auth': m.get('mongo.collection.auth', 'auth'),
                       'data': m.get('mongo.collection.data', 'data'),
                       'running': m.get('mongo.collection.running', 'running_jobs')}

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(self.mongo_host, self.mongo_port,
                                                m['mongo.db'], collections)
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, required_space, user, job_id, data_id):
        """Monitor space of disk containing DATAPATH and delete files if necessary."""
        datapath = self.datapath
        required_space = self.min_free_space
        expiration = self.data_expiration_days

        ### Remove expired directories
        def can_remove(d, user, job_id, data_id):
            u, data, j = d.split('/')[-4:-1]
            if u == user and j == str(job_id):
                return False
            if data == str(data_id) and j == 'raw':
                return False
            if os.path.isdir(d):
                return True
            return False

        dir_depth = 3
        dirs = filter(lambda f: can_remove(f, user, job_id, data_id),
                      glob.glob(datapath + '/' + '*/' * dir_depth))
        removed = []
        logger.info('Searching for directories older than {} days'.format(expiration))
        for d in dirs:
            file_modified = None
            try:
                file_modified = datetime.datetime.fromtimestamp(os.path.getmtime(d))
            except os.error as e:
                logger.warning('GC ignored "{}": could not get timestamp: {}'.format(d, e))
                continue
            tdiff = datetime.datetime.now() - file_modified
            if tdiff > datetime.timedelta(days=expiration):
                logger.info('GC: removing expired directory: {} (modified {} ago)'.format(d, tdiff))
                removed.append(d)
                shutil.rmtree(d, ignore_errors=True)
            else:
                logger.debug('GC: not removing: {} (modified {} ago)'.format(d, tdiff))
        for r in removed:
            dirs.remove(r)

        ### Check free space and remove old directories
        free_space = free_space_in_path(datapath)
        logger.info("Required space in GB: {} (free = {})".format(required_space, free_space))
        times = []
        for d in dirs:
            try:
                t = os.path.getmtime(d)
                times.append([t, d])
            except:
                pass
        times.sort()
        logger.debug("Directories sorted by time: {}".format(times))
        dirs = [x[1] for x in times]

        busy_dirs = []
        while free_space < self.min_free_space and len(dirs) > 0:
            d = dirs.pop(0)
            if is_dir_busy(d):
                busy_dirs.append(d)
            else:
                free_space = self.remove_dir(d)

        while free_space < self.min_free_space:
            if len(busy_dirs) == 0:
                logger.error("GC: free space {} < {} GB; waiting for system space to be available..."
                             .format(free_space, self.min_free_space))
                time.sleep(60)
            else:
                logger.warning("GC: free space {} < {} GB; waiting for jobs to complete to reclaim space: {} busy directories..."
                               .format(free_space, self.min_free_space, len(busy_dirs)))
                checked_dirs = []
                while free_space < self.min_free_space and len(busy_dirs) > 0:
                    bd = busy_dirs.pop(0)
                    if is_dir_busy(bd):
                        checked_dirs.append(bd)
                        continue
                    free_space = self.remove_dir(bd)
                    # self.remove_empty_dirs()
                if free_space < self.min_free_space:
                    busy_dirs = checked_dirs
                    time.sleep(20)
            free_space = free_space_in_path(self.datapath)

        self.remove_empty_dirs()

    def remove_dir(self, d):
        shutil.rmtree(d, ignore_errors=True)
        logger.info("GC: space required; %s removed." % d)
        return free_space_in_path(self.datapath)

    def remove_empty_dirs(self):
        data_dirs = filter(lambda f: os.path.isdir(f),
                           glob.glob(self.datapath + '/' + '*/' * 2))
        for dd in data_dirs:
            if not os.listdir(dd):
                logger.info('GC: removing empty directory: {}'.format(dd))
                try:
                    os.rmdir(dd)
                except os.error as e:
                    logger.warning('GC: could not remove empty dir "{}": {}'.format(dd, e))

    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        logger.debug('New Data Format')
        return self._get_data(body)

    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        job_id = params['job_id']
        data_id = params['data_id']
        token = params['oauth_token']
        uid = params['_id']

        self.gc_lock.acquire()
        try:
            self.garbage_collect(self.datapath, self.min_free_space, user, job_id, data_id)
        except:
            logger.error('Unexpected error in GC.')
            raise
        finally:
            self.gc_lock.release()

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        logger.debug('data_doc = {}'.format(data_doc))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        with ignored(OSError):
            os.makedirs(filepath)
        touch(filepath)

        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = []  # legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_shock(file_info['shock_url'], user, token,
                                                         file_info['shock_id'], filepath)
                elif file_info['direct_url']:
                    local_file = os.path.join(filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'], filepath, token=token)
                file_info['local_file'] = local_file
                if file_set['type'] == 'single' and asm.is_long_read_file(local_file):
                    if not 'tags' in file_set:
                        file_set['tags'] = []
                    if not 'long_read' in file_set['tags']:
                        file_set['tags'].append('long_read')  # pacbio or nanopore reads
                file_set['files'].append(local_file)  # legacy
            all_files.append(file_set)
        return datapath, all_files

    def prepare_job_data(self, body):
        params = json.loads(body)
        job_id = params['job_id']

        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))
        try:
            os.makedirs(jobpath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        ### Protect data directory from GC before any job starts
        touch(os.path.join(rawpath, "_READY_"))

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        contigs = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single' or fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                elif fileset['type'] == 'contigs':
                    contigs.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({'job_id': params['job_id'],
                             'uid': params['_id'],
                             'user': params['ARASTUSER'],
                             'reads': reads,
                             'logfiles': [],
                             'reference': reference,
                             'contigs': contigs,
                             'initial_reads': list(reads),
                             'raw_reads': copy.deepcopy(reads),
                             'params': [],
                             'exceptions': [],
                             'pipeline_data': {},
                             'datapath': datapath,
                             'out_report': self.out_report})

        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))
        return job_data

    def compute(self, body):
        self.job_list_lock.acquire()
        try:
            job_data = self.prepare_job_data(body)
            self.job_list.append(job_data)
        except:
            logger.error("Error in adding new job to job_list")
            raise
        finally:
            self.job_list_lock.release()

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        params = json.loads(body)
        job_id = params['job_id']
        data_id = params['data_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params.get('pipeline')
        recipe = params.get('recipe')
        wasp_in = params.get('wasp')
        jobpath = os.path.join(self.datapath, user, str(data_id), str(job_id))

        url = shock.verify_shock_url(self.shockurl)

        self.start_time = time.time()
        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag)
        timer_thread.start()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try:
                wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError:
                raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            else:
                if type(pipelines[0]) is not list:  # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)

            ###### Upload all result files and place them into appropriate tags
            uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i, e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))

            try:  ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module(
                    'quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')

            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '=' * 20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i, e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            res = self.upload(url, user, token, self.out_report_name)
            report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url,
                                            shock_id=res['data']['id'])
            self.metadata.update_job(uid, 'report', [asmtypes.set_factory('report', [report_info])])
            status = 'Complete with errors' if job_data.get('errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            self.metadata.update_job(uid, 'data', job_data)
            self.metadata.update_job(uid, 'result_data', uploaded_fsets)

            ###### Legacy Support #######
            filesets = uploaded_fsets.append(asmtypes.set_factory('report', [report_info]))
            contigsets = [fset for fset in uploaded_fsets
                          if fset.type == 'contigs' or fset.type == 'scaffolds']
            download_ids = {fi['filename']: fi['shock_id'] for fset in uploaded_fsets
                            for fi in fset['file_infos']}
            contig_ids = {fi['filename']: fi['shock_id'] for fset in contigsets
                          for fi in fset['file_infos']}
            self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
            self.metadata.update_job(uid, 'contig_ids', [contig_ids])
            ###################

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')
        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')
        finally:
            self.remove_job_from_lists(job_data)
            logger.debug('Reinitialize plugin manager...')
            # Reinitialize to get live changes
            self.pmanager = ModuleManager(self.threads, self.kill_list, self.kill_list_lock,
                                          self.job_list, self.binpath, self.modulebin)
            self.metadata.update_job(uid, 'status', status)

    def remove_job_from_lists(self, job_data):
        self.job_list_lock.acquire()
        try:
            for i, job in enumerate(self.job_list):
                if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']:
                    self.job_list.pop(i)
        except:
            logger.error("Unexpected error in removing executed jobs from job_list")
            raise
        finally:
            self.job_list_lock.release()

        # kill_list cleanup for cases where a kill request is enqueued
        # right before the corresponding job gets popped
        self.kill_list_lock.acquire()
        try:
            for i, kill_request in enumerate(self.kill_list):
                if kill_request['user'] == job_data['user'] and kill_request['job_id'] == job_data['job_id']:
                    self.kill_list.pop(i)
        except:
            logger.error("Unexpected error in removing executed jobs from kill_list")
            raise
        finally:
            self.kill_list_lock.release()

    def upload(self, url, user, token, file, filetype='default'):
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logger.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'contigs' or filetype == 'scaffolds':
            res = sclient.upload_contigs(file)
        else:
            res = sclient.upload_file(file, filetype, curl=True)
        return res

    def download_shock(self, url, user, token, node_id, outdir):
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return self.extract_file(downloaded)

    def download_url(self, url, outdir, token=None):
        downloaded = shock.curl_download_url(url, outdir=outdir, token=token)
        return self.extract_file(downloaded)

    def fetch_job(self):
        connection = pika.BlockingConnection(pika.ConnectionParameters(
            host=self.rmq_host, port=self.rmq_port))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(exclusive=False, auto_delete=False, durable=True)
        logger.info('Fetching job...')
        channel.basic_qos(prefetch_count=1)
        for queue in self.queues:
            print 'Using queue: {}'.format(queue)
            channel.basic_consume(self.callback, queue=queue)
        channel.start_consuming()

    def callback(self, ch, method, properties, body):
        params = json.loads(body)
        display = ['ARASTUSER', 'job_id', 'message', 'recipe', 'pipeline', 'wasp']
        logger.info('Incoming job: ' + ', '.join(
            ['{}: {}'.format(k, params[k]) for k in display if params[k]]))
        logger.debug(params)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])

        ## Check if job was not killed
        if job_doc is None:
            logger.error('Error: no job_doc found for {}'.format(params.get('job_id')))
            return
        if job_doc.get('status') == 'Terminated by user':
            logger.warn('Job {} was killed, skipping'.format(params.get('job_id')))
        else:
            self.done_flag = threading.Event()
            uid = None
            try:
                uid = job_doc['_id']
                self.compute(body)
            except Exception as e:
                tb = format_exc()
                status = "[FAIL] {}".format(e)
                logger.error("{}\n{}".format(status, tb))
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        self.done_flag.set()

    def start(self):
        self.fetch_job()

    def extract_file(self, filename):
        """Decompress files if necessary."""
        unp_bin = os.path.join(self.modulebin, 'unp')
        filepath = os.path.dirname(filename)
        uncompressed = ['fasta', 'fa', 'fastq', 'fq', 'fna', 'h5']
        supported = ['tar.gz', 'tar.bz2', 'bz2', 'gz', 'lz', 'rar', 'tar', 'tgz', 'zip']
        for ext in uncompressed:
            if filename.endswith('.' + ext):
                return filename
        for ext in supported:
            if filename.endswith('.' + ext):
                extracted_file = filename[:filename.index(ext) - 1]
                if os.path.exists(extracted_file):  # Check extracted already
                    return extracted_file
                logger.info("Extracting {}...".format(filename))
                # p = subprocess.Popen([unp_bin, filename],
                #                      cwd=filepath, stderr=subprocess.STDOUT)
                # p.wait()
                # Hide the "broken pipe" message from unp
                out = subprocess.Popen([unp_bin, filename], cwd=filepath,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT).communicate()[0]
                if os.path.exists(extracted_file):
                    return extracted_file
                else:
                    logger.error("Extraction of {} failed: {}".format(filename, out))
                    raise Exception('Archive structure error')
        logger.error("Could not extract {}".format(filename))
        return filename
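# A minimal sketch (hypothetical values) of the RabbitMQ job message that
# callback()/compute() read above. The data_id must reference a data document
# already registered in MongoDB (whose assembly_data.file_sets drive
# _get_data()); all other values here are placeholders, not real identifiers.
#
#   job_message = json.dumps({
#       'ARASTUSER': 'demo',
#       'job_id': 1,
#       'data_id': 42,
#       '_id': 'mongo-object-id',
#       'oauth_token': 'token-string',
#       'message': 'example run',
#       'recipe': ['auto'],   # alternatively 'pipeline': [['spades']] or 'wasp': [expr]
#       'pipeline': None,
#       'wasp': None,
#   })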
class ArastConsumer: def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf): self.parser = SafeConfigParser() self.parser.read(config) self.job_list = job_list # Load plugins self.pmanager = ModuleManager(threads, kill_queue, job_list) # Set up environment self.shockurl = shockurl self.arasturl = arasturl self.datapath = self.parser.get('compute','datapath') if queue: self.queue = queue print('Using queue:{}'.format(self.queue)) else: self.queue = self.parser.get('rabbitmq','default_routing_key') self.min_free_space = float(self.parser.get('compute','min_free_space')) m = ctrl_conf['meta'] a = ctrl_conf['assembly'] self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'], m['mongo.collection'], m['mongo.collection.auth']) self.gc_lock = multiprocessing.Lock() def garbage_collect(self, datapath, user, required_space): """ Monitor space of disk containing DATAPATH and delete files if necessary.""" self.gc_lock.acquire() s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) logging.debug("Required space in bytes: %s" % required_space) while ((free_space - self.min_free_space) < required_space): #Delete old data dirs = os.listdir(os.path.join(datapath, user)) times = [] for dir in dirs: times.append(os.path.getmtime(os.path.join(datapath, user, dir))) if len(dirs) > 0: old_dir = os.path.join(datapath, user, dirs[times.index(min(times))]) shutil.rmtree(old_dir, ignore_errors=True) else: logging.error("No more directories to remove") break logging.info("Space required. %s removed." % old_dir) s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) self.gc_lock.release() def get_data(self, body): """Get data from cache or Shock server.""" params = json.loads(body) if 'assembly_data' in params: logging.info('New Data Format') return self._get_data(body) else: return self._get_data_old(body) def _get_data(self, body): params = json.loads(body) filepath = os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] user = params['ARASTUSER'] token = params['oauth_token'] uid = params['_id'] ##### Get data from ID ##### data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER']) if not data_doc: raise Exception('Invalid Data ID: {}'.format(params['data_id'])) if 'kbase_assembly_input' in data_doc: params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input']) elif 'assembly_data' in data_doc: params['assembly_data'] = data_doc['assembly_data'] ##### Get data from assembly_data ##### self.metadata.update_job(uid, 'status', 'Data transfer') try:os.makedirs(filepath) except:pass ### TODO Garbage collect ### download_url = 'http://{}'.format(self.shockurl) file_sets = params['assembly_data']['file_sets'] for file_set in file_sets: file_set['files'] = [] #legacy for file_info in file_set['file_infos']: local_file = os.path.join(filepath, file_info['filename']) if os.path.exists(local_file): logging.info("Requested data exists on node: {}".format(local_file)) else: local_file = self.download(download_url, user, token, file_info['shock_id'], filepath) file_info['local_file'] = local_file file_set['files'].append(local_file) #legacy all_files.append(file_set) return datapath, all_files def _get_data_old(self, body): params = json.loads(body) #filepath = self.datapath + str(params['data_id']) filepath = 
os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] uid = params['_id'] job_id = params['job_id'] user = params['ARASTUSER'] data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER']) if data_doc: paired = data_doc['pair'] single = data_doc['single'] files = data_doc['filename'] ids = data_doc['ids'] token = params['oauth_token'] try: ref = data_doc['reference'] except: pass else: self.metadata.update_job(uid, 'status', 'Invalid Data ID') raise Exception('Data {} does not exist on Shock Server'.format( params['data_id'])) all_files = [] if os.path.isdir(filepath): logging.info("Requested data exists on node") try: for l in paired: filedict = {'type':'paired', 'files':[]} for word in l: if is_filename(word): baseword = os.path.basename(word) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type':'single', 'files':[]} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No single files submitted!') try: for r in ref: for wordpath in r: filedict = {'type':'reference', 'files':[]} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No reference files submitted!') touch(datapath) ## Data does not exist on current compute node else: self.metadata.update_job(uid, 'status', 'Data transfer') os.makedirs(filepath) # Get required space and garbage collect try: req_space = 0 for file_size in data_doc['file_sizes']: req_space += file_size self.garbage_collect(self.datapath, user, req_space) except: pass url = "http://%s" % (self.shockurl) try: for l in paired: #FILEDICT contains a single read library's info filedict = {'type':'paired', 'files':[]} for word in l: if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): #Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download(s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type':'single', 'files':[]} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): #Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download(s_url, 
user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') try: for r in ref: for wordpath in r: filedict = {'type':'reference', 'files':[]} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): #Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download(s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: #logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') print all_files return datapath, all_files def compute(self, body): error = False params = json.loads(body) job_id = params['job_id'] uid = params['_id'] user = params['ARASTUSER'] token = params['oauth_token'] pipelines = params['pipeline'] #support legacy arast client if len(pipelines) > 0: if type(pipelines[0]) is not list: pipelines = [pipelines] ### Download files (if necessary) datapath, all_files = self.get_data(body) rawpath = datapath + '/raw/' jobpath = os.path.join(datapath, str(job_id)) try: os.makedirs(jobpath) except: raise Exception ('Data Error') ### Create job log self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id)) self.out_report = open(self.out_report_name, 'w') ### Create data to pass to pipeline reads = [] reference = [] for fileset in all_files: if len(fileset['files']) != 0: if (fileset['type'] == 'single' or fileset['type'] == 'paired'): reads.append(fileset) elif fileset['type'] == 'reference': reference.append(fileset) else: raise Exception('fileset error') job_data = ArastJob({'job_id' : params['job_id'], 'uid' : params['_id'], 'user' : params['ARASTUSER'], 'reads': reads, 'reference': reference, 'initial_reads': list(reads), 'raw_reads': copy.deepcopy(reads), 'processed_reads': list(reads), 'pipeline_data': {}, 'datapath': datapath, 'out_report' : self.out_report, 'logfiles': []}) self.out_report.write("Arast Pipeline: Job {}\n".format(job_id)) self.job_list.append(job_data) self.start_time = time.time() self.done_flag = threading.Event() timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag) timer_thread.start() download_ids = {} contig_ids = {} url = "http://%s" % (self.shockurl) # url += '/node' try: include_all_data = params['all_data'] except: include_all_data = False contigs = not include_all_data status = '' ## TODO CHANGE: default pipeline default_pipe = ['velvet'] exceptions = [] if pipelines: try: if pipelines == ['auto']: pipelines = [default_pipe,] for p in pipelines: self.pmanager.validate_pipe(p) result_files, summary, contig_files, exceptions = self.run_pipeline(pipelines, job_data, contigs_only=contigs) for i, f in enumerate(result_files): #fname = os.path.basename(f).split('.')[0] fname = str(i) res = self.upload(url, user, token, f) download_ids[fname] = res['data']['id'] for c in contig_files: fname = os.path.basename(c).split('.')[0] res = self.upload(url, user, token, c, 
filetype='contigs') contig_ids[fname] = res['data']['id'] # Check if job completed with no errors if exceptions: status = 'Complete with errors' elif not summary: status = 'Complete: No valid contigs' else: status += "Complete" self.out_report.write("Pipeline completed successfully\n") except: traceback = format_exc(sys.exc_info()) status = "[FAIL] {}".format(sys.exc_info()[1]) print traceback self.out_report.write("ERROR TRACE:\n{}\n". format(format_tb(sys.exc_info()[2]))) # Format report for i, job in enumerate(self.job_list): if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']: self.job_list.pop(i) self.done_flag.set() new_report = open('{}.tmp'.format(self.out_report_name), 'w') ### Log exceptions if len(exceptions) > 0: new_report.write('PIPELINE ERRORS') for i,e in enumerate(exceptions): new_report.write('{}: {}\n'.format(i, e)) try: for sum in summary: with open(sum) as s: new_report.write(s.read()) except: new_report.write('No Summary File Generated!\n\n\n') self.out_report.close() with open(self.out_report_name) as old: new_report.write(old.read()) new_report.close() os.remove(self.out_report_name) shutil.move(new_report.name, self.out_report_name) res = self.upload(url, user, token, self.out_report_name) download_ids['report'] = res['data']['id'] # Get location self.metadata.update_job(uid, 'result_data', download_ids) self.metadata.update_job(uid, 'contig_ids', contig_ids) self.metadata.update_job(uid, 'status', status) print '=========== JOB COMPLETE ============' def update_time_record(self): elapsed_time = time.time() - self.start_time ftime = str(datetime.timedelta(seconds=int(elapsed_time))) self.metadata.update_job(uid, 'computation_time', ftime) def run_pipeline(self, pipes, job_data, contigs_only=True): """ Runs all pipelines in list PIPES """ all_pipes = [] for p in pipes: all_pipes += self.pmanager.parse_input(p) logging.info('{} pipelines:'.format(len(all_pipes))) for p in all_pipes: print '->'.join(p) #include_reads = self.pmanager.output_type(pipeline[-1]) == 'reads' include_reads = False pipeline_num = 1 all_files = [] pipe_outputs = [] logfiles = [] ale_reports = {} final_contigs = [] final_scaffolds = [] output_types = [] exceptions = [] num_pipes = len(all_pipes) for pipe in all_pipes: try: #job_data = copy.deepcopy(job_data_global) #job_data['out_report'] = job_data_global['out_report'] pipeline, overrides = self.pmanager.parse_pipe(pipe) job_data.add_pipeline(pipeline_num, pipeline) num_stages = len(pipeline) pipeline_stage = 1 pipeline_results = [] cur_outputs = [] # Reset job data job_data['reads'] = copy.deepcopy(job_data['raw_reads']) job_data['processed_reads'] = [] print job_data self.out_report.write('\n{0} Pipeline {1}: {2} {0}\n'.format('='*15, pipeline_num, pipe)) pipe_suffix = '' # filename code for indiv pipes pipe_start_time = time.time() pipe_alive = True # Store data record for pipeline for module_name in pipeline: if not pipe_alive: self.out_report.write('\n{0} Module Failure, Killing Pipe {0}'.format( 'X'*10)) break module_code = '' # unique code for data reuse print '\n\n{0} Running module: {1} {2}'.format( '='*20, module_name, '='*(35-len(module_name))) self.garbage_collect(self.datapath, job_data['user'], 2147483648) # 2GB ## PROGRESS CALCULATION pipes_complete = (pipeline_num - 1) / float(num_pipes) stage_complete = (pipeline_stage - 1) / float(num_stages) pct_segment = 1.0 / num_pipes stage_complete *= pct_segment total_complete = pipes_complete + stage_complete cur_state = 'Running:[{}%|P:{}/{}|S:{}/{}|{}]'.format( 
int(total_complete * 100), pipeline_num, num_pipes, pipeline_stage, num_stages, module_name) self.metadata.update_job(job_data['uid'], 'status', cur_state) ## LOG REPORT For now, module code is 1st and last letter short_name = self.pmanager.get_short_name(module_name) if short_name: #pipe_suffix += short_name.capitalize() module_code += short_name.capitalize() else: #pipe_suffix += module_name[0].upper() + module_name[-1] module_code += module_name[0].upper() + module_name[-1] mod_overrides = overrides[pipeline_stage - 1] for k in mod_overrides.keys(): #pipe_suffix += '_{}{}'.format(k[0], par[k]) module_code += '_{}{}'.format(k[0], mod_overrides[k]) pipe_suffix += module_code self.out_report.write('PIPELINE {} -- STAGE {}: {}\n'.format( pipeline_num, pipeline_stage, module_name)) logging.debug('New job_data for stage {}: {}'.format( pipeline_stage, job_data)) job_data['params'] = overrides[pipeline_stage-1].items() module_start_time = time.time() ## RUN MODULE # Check if output data exists reuse_data = False enable_reuse = True # KILL SWITCH if enable_reuse: for k, pipe in enumerate(pipe_outputs): if reuse_data: break if not pipe: continue # Check that all previous pipes match for i in range(pipeline_stage): try: if not pipe[i][0] == cur_outputs[i][0]: break except: pass try: if (pipe[i][0] == module_code and i == pipeline_stage - 1): #and overrides[i].items() == job_data['params']): #copy! print('Found previously computed data, reusing {}.'.format( module_code)) output = [] + pipe[i][1] pfix = (k+1, i+1) alldata = [] + pipe[i][2] reuse_data = True job_data.get_pipeline(pipeline_num).get_module( pipeline_stage)['elapsed_time'] = time.time( job_data.get_pipeline(i).get_module( pipeline_stage)['elapsed_time']) break except: # Previous pipes may be shorter pass output_type = self.pmanager.output_type(module_name) if not reuse_data: output, alldata, mod_log = self.pmanager.run_module( module_name, job_data, all_data=True, reads=include_reads) ##### Module produced no output, attach log and proceed to next ##### if not output: pipe_alive = False try: print mod_log logfiles.append(mod_log) except: print 'error attaching ', mod_log break ##### Prefix outfiles with pipe stage (only assembler modules) ##### alldata = [asm.prefix_file_move( file, "P{}_S{}_{}".format(pipeline_num, pipeline_stage, module_name)) for file in alldata] module_elapsed_time = time.time() - module_start_time job_data.get_pipeline(pipeline_num).get_module( pipeline_stage)['elapsed_time'] = module_elapsed_time if alldata: #If log was renamed mod_log = asm.prefix_file(mod_log, "P{}_S{}_{}".format( pipeline_num, pipeline_stage, module_name)) if output_type == 'contigs' or output_type == 'scaffolds': #Assume assembly contigs if reuse_data: p_num, p_stage = pfix else: p_num, p_stage = pipeline_num, pipeline_stage # If plugin returned scaffolds if type(output) is tuple and len(output) == 2: out_contigs = output[0] out_scaffolds = output[1] cur_scaffolds = [asm.prefix_file( file, "P{}_S{}_{}".format(p_num, p_stage, module_name)) for file in out_scaffolds] else: out_contigs = output cur_contigs = [asm.prefix_file( file, "P{}_S{}_{}".format(p_num, p_stage, module_name)) for file in out_contigs] #job_data['reads'] = asm.arast_reads(alldata) job_data['contigs'] = cur_contigs elif output_type == 'reads': #Assume preprocessing if include_reads and reuse_data: # data was prefixed and moved for d in output: files = [asm.prefix_file(f, "P{}_S{}_{}".format( pipeline_num, pipeline_stage, module_name)) for f in d['files']] d['files'] = files 
d['short_reads'] = [] + files job_data['reads'] = output job_data['processed_reads'] = list(job_data['reads']) else: # Generic return, don't use in further stages pipeline_results += output logging.info('Generic plugin output: {}'.format(output)) if pipeline_stage == num_stages: # Last stage, add contig for assessment if output and (output_type == 'contigs' or output_type == 'scaffolds'): #If a contig was produced fcontigs = cur_contigs rcontigs = [asm.rename_file_symlink(f, 'P{}_{}'.format( pipeline_num, pipe_suffix)) for f in fcontigs] try: rscaffolds = [asm.rename_file_symlink(f, 'P{}_{}_{}'.format( pipeline_num, pipe_suffix, 'scaff')) for f in cur_scaffolds] if rscaffolds: scaffold_data = {'files': rscaffolds, 'name': pipe_suffix} final_scaffolds.append(scaffold_data) output_types.append(output_type) except: pass if rcontigs: contig_data = {'files': rcontigs, 'name': pipe_suffix, 'alignment_bam': []} final_contigs.append(contig_data) output_types.append(output_type) try: logfiles.append(mod_log) except: print 'error attaching ', mod_log pipeline_stage += 1 cur_contigs = [] cur_scaffolds = [] cur_outputs.append([module_code, output, alldata]) pipe_elapsed_time = time.time() - pipe_start_time pipe_ftime = str(datetime.timedelta(seconds=int(pipe_elapsed_time))) job_data.get_pipeline(pipeline_num)['elapsed_time'] = pipe_elapsed_time if not output: self.out_report.write('ERROR: No contigs produced. See module log\n') else: ## Assessment #self.pmanager.run_module('reapr', job_data) #print job_data # TODO reapr break may be diff from final reapr align! # ale_out, _, _ = self.pmanager.run_module('ale', job_data) # if ale_out: # job_data.get_pipeline(pipeline_num).import_ale(ale_out) # ale_reports[pipe_suffix] = ale_out pipeline_datapath = '{}/{}/pipeline{}/'.format(job_data['datapath'], job_data['job_id'], pipeline_num) try: os.makedirs(pipeline_datapath) except: logging.info("{} exists, skipping mkdir".format(pipeline_datapath)) # all_files.append(asm.tar_list(pipeline_datapath, pipeline_results, # 'pipe{}_{}.tar.gz'.format(pipeline_num, pipe_suffix))) all_files += pipeline_results self.out_report.write('Pipeline {} total time: {}\n\n'.format(pipeline_num, pipe_ftime)) job_data.get_pipeline(pipeline_num)['name'] = pipe_suffix pipe_outputs.append(cur_outputs) pipeline_num += 1 except: print "ERROR: Pipeline #{} Failed".format(pipeline_num) print format_exc(sys.exc_info()) e = str(sys.exc_info()[1]) if e.find('Terminated') != -1: raise Exception(e) exceptions.append(module_name + ':\n' + str(sys.exc_info()[1])) pipeline_num += 1 ## ANALYSIS: Quast job_data['final_contigs'] = final_contigs job_data['final_scaffolds'] = final_scaffolds job_data['params'] = [] #clear overrides from last stage summary = [] # Quast reports for contigs and scaffolds try: #Try to assess, otherwise report pipeline errors if job_data['final_contigs']: job_data['contig_type'] = 'contigs' quast_report, quast_tar, z1, q_log = self.pmanager.run_module('quast', job_data, tar=True, meta=True) if quast_report: summary.append(quast_report[0]) with open(q_log) as infile: self.out_report.write(infile.read()) else: quast_report, quast_tar = '','' if job_data['final_scaffolds']: scaff_data = dict(job_data) scaff_data['final_contigs'] = job_data['final_scaffolds'] scaff_data['contig_type'] = 'scaffolds' scaff_report, scaff_tar, _, scaff_log = self.pmanager.run_module('quast', scaff_data, tar=True, meta=True) scaffold_quast = True if scaff_report: summary.append(scaff_report[0]) with open(scaff_log) as infile: 
self.out_report.write('\n Quast Report - Scaffold Mode \n') self.out_report.write(infile.read()) else: scaffold_quast = False except: if exceptions: if len(exceptions) > 1: raise Exception('Multiple Errors') else: raise Exception(exceptions[0]) else: raise Exception(str(sys.exc_info()[1])) ## CONCAT MODULE LOG FILES self.out_report.write("\n\n{0} Begin Module Logs {0}\n".format("="*10)) for log in logfiles: self.out_report.write("\n\n{0} Begin Module {0}\n".format("="*10)) try: with open(log) as infile: self.out_report.write(infile.read()) except: self.out_report.write("Error writing log file") ## Format Returns ctg_analysis = quast_tar.rsplit('/', 1)[0] + '/{}_ctg_qst.tar.gz'.format(job_data['job_id']) try: os.rename(quast_tar, ctg_analysis) return_files = [ctg_analysis] except: #summary = '' return_files = [] if scaffold_quast: scf_analysis = scaff_tar.rsplit('/', 1)[0] + '/{}_scf_qst.tar.gz'.format(job_data['job_id']) #summary = quast_report[0] os.rename(scaff_tar, scf_analysis) return_files.append(scf_analysis) contig_files = [] for data in final_contigs + final_scaffolds: for f in data['files']: contig_files.append(os.path.realpath(f)) return_files += all_files ## Deduplicate seen = set() for f in return_files: seen.add(f) return_files = [f for f in seen] #if exceptions: # if len(exceptions) > 1: # raise Exception('Multiple Errors') # else: # raise Exception(exceptions[0]) if contig_files: return_files.append(asm.tar_list('{}/{}'.format(job_data['datapath'], job_data['job_id']), contig_files, '{}_assemblies.tar.gz'.format( job_data['job_id']))) print "return files: {}".format(return_files) return return_files, summary, contig_files, exceptions def upload(self, url, user, token, file, filetype='default'): files = {} files["file"] = (os.path.basename(file), open(file, 'rb')) logging.debug("Message sent to shock on upload: %s" % files) sclient = shock.Shock(url, user, token) if filetype == 'default': res = sclient.upload_misc(file, 'default') elif filetype == 'contigs': res = sclient.upload_contigs(file) return res def download(self, url, user, token, node_id, outdir): sclient = shock.Shock(url, user, token) downloaded = sclient.curl_download_file(node_id, outdir=outdir) return extract_file(downloaded) def fetch_job(self): connection = pika.BlockingConnection(pika.ConnectionParameters( host = self.arasturl)) channel = connection.channel() channel.basic_qos(prefetch_count=1) result = channel.queue_declare(queue=self.queue, exclusive=False, auto_delete=False, durable=True) logging.basicConfig(format=("%(asctime)s %s %(levelname)-8s %(message)s",proc().name)) print proc().name, ' [*] Fetching job...' channel.basic_qos(prefetch_count=1) channel.basic_consume(self.callback, queue=self.queue) channel.start_consuming() def callback(self, ch, method, properties, body): print " [*] %r:%r" % (method.routing_key, body) params = json.loads(body) job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id']) uid = job_doc['_id'] ## Check if job was not killed if job_doc['status'] == 'Terminated': print 'Job {} was killed, skipping'.format(params['job_id']) else: try: self.compute(body) except: print sys.exc_info() status = "[FAIL] {}".format(format_tb(sys.exc_info()[2])) print logging.error(status) self.metadata.update_job(uid, 'status', status) ch.basic_ack(delivery_tag=method.delivery_tag) def start(self): self.fetch_job()
job_data['wasp_chain'].find_module( 'quast')['data'].find_type('report')[0].files[0] with open(quast_report) as q: new_report.write(q.read()) except: new_report.write('No Summary File Generated!\n\n\n') self.out_report.close() with open(self.out_report_name) as old: new_report.write(old.read()) for log in job_data['logfiles']: new_report.write('\n{1} {0} {1}\n'.format( os.path.basename(log), '=' * 20)) with open(log) as l: new_report.write(l.read()) ### Log tracebacks if len(job_data['tracebacks']) > 0: new_report.write('EXCEPTION TRACEBACKS\n') for i, e in enumerate(job_data['tracebacks']): new_report.write('{}: {}\n'.format(i, e)) new_report.close() os.remove(self.out_report_name) shutil.move(new_report.name, self.out_report_name) res = self.upload(url, user, token, self.out_report_name) report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id']) self.metadata.update_job( uid, 'report', [asmtypes.set_factory('report', [report_info])]) status = 'Complete with errors' if job_data.get( 'errors') else 'Complete' ## Make compatible with JSON dumps() del job_data['out_report'] del job_data['initial_reads'] del job_data['raw_reads'] self.metadata.update_job(uid, 'data', job_data) self.metadata.update_job(uid, 'result_data', uploaded_fsets) ###### Legacy Support ####### filesets = uploaded_fsets.append( asmtypes.set_factory('report', [report_info])) contigsets = [ fset for fset in uploaded_fsets if fset.type == 'contigs' or fset.type == 'scaffolds' ] download_ids = { fi['filename']: fi['shock_id'] for fset in uploaded_fsets for fi in fset['file_infos'] } contig_ids = { fi['filename']: fi['shock_id'] for fset in contigsets for fi in fset['file_infos'] } self.metadata.update_job(uid, 'result_data_legacy', [download_ids]) self.metadata.update_job(uid, 'contig_ids', [contig_ids]) ################### sys.stdout.flush() touch(os.path.join(jobpath, "_DONE_")) logger.info('============== JOB COMPLETE ===============') except asmtypes.ArastUserInterrupt: status = 'Terminated by user' sys.stdout.flush() touch(os.path.join(jobpath, "_CANCELLED__")) logger.info('============== JOB KILLED ===============') finally: self.remove_job_from_lists(job_data) logger.debug('Reinitialize plugin manager...' 
) # Reinitialize to get live changes self.pmanager = ModuleManager(self.threads, self.kill_list, self.kill_list_lock, self.job_list, self.binpath, self.modulebin) self.metadata.update_job(uid, 'status', status) def remove_job_from_lists(self, job_data): self.job_list_lock.acquire() try: for i, job in enumerate(self.job_list): if job['user'] == job_data['user'] and job[ 'job_id'] == job_data['job_id']: self.job_list.pop(i) except: logger.error( "Unexpected error in removing executed jobs from job_list") raise finally: self.job_list_lock.release() # kill_list cleanup for cases where a kill request is enqueued right before the corresponding job gets popped self.kill_list_lock.acquire() try: for i, kill_request in enumerate(self.kill_list): if kill_request['user'] == job_data['user'] and kill_request[ 'job_id'] == job_data['job_id']: self.kill_list.pop(i) except: logger.error( "Unexpected error in removing executed jobs from kill_list") raise finally: self.kill_list_lock.release() def upload(self, url, user, token, file, filetype='default'): files = {} files["file"] = (os.path.basename(file), open(file, 'rb')) logger.debug("Message sent to shock on upload: %s" % files) sclient = shock.Shock(url, user, token) if filetype == 'contigs' or filetype == 'scaffolds': res = sclient.upload_contigs(file) else: res = sclient.upload_file(file, filetype, curl=True) return res def download_shock(self, url, user, token, node_id, outdir): sclient = shock.Shock(url, user, token) downloaded = sclient.curl_download_file(node_id, outdir=outdir) return self.extract_file(downloaded) def download_url(self, url, outdir, token=None): downloaded = shock.curl_download_url(url, outdir=outdir, token=token) return self.extract_file(downloaded) def fetch_job(self): connection = pika.BlockingConnection( pika.ConnectionParameters(host=self.rmq_host, port=self.rmq_port)) channel = connection.channel() channel.basic_qos(prefetch_count=1) result = channel.queue_declare(exclusive=False, auto_delete=False, durable=True) logger.info('Fetching job...') channel.basic_qos(prefetch_count=1) for queue in self.queues: print 'Using queue: {}'.format(queue) channel.basic_consume(self.callback, queue=queue) channel.start_consuming() def callback(self, ch, method, properties, body): params = json.loads(body) display = [ 'ARASTUSER', 'job_id', 'message', 'recipe', 'pipeline', 'wasp' ] logger.info('Incoming job: ' + ', '.join( ['{}: {}'.format(k, params[k]) for k in display if params[k]])) logger.debug(params) job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id']) ## Check if job was not killed if job_doc is None: logger.error('Error: no job_doc found for {}'.format( params.get('job_id'))) return if job_doc.get('status') == 'Terminated by user': logger.warn('Job {} was killed, skipping'.format( params.get('job_id'))) else: self.done_flag = threading.Event() uid = None try: uid = job_doc['_id'] self.compute(body) except Exception as e: tb = format_exc() status = "[FAIL] {}".format(e) logger.error("{}\n{}".format(status, tb)) self.metadata.update_job(uid, 'status', status) ch.basic_ack(delivery_tag=method.delivery_tag) self.done_flag.set() def start(self): self.fetch_job() def extract_file(self, filename): """ Decompress files if necessary """ unp_bin = os.path.join(self.modulebin, 'unp') filepath = os.path.dirname(filename) uncompressed = ['fasta', 'fa', 'fastq', 'fq', 'fna', 'h5'] supported = [ 'tar.gz', 'tar.bz2', 'bz2', 'gz', 'lz', 'rar', 'tar', 'tgz', 'zip' ] for ext in uncompressed: if filename.endswith('.' 
+ ext): return filename for ext in supported: if filename.endswith('.' + ext): extracted_file = filename[:filename.index(ext) - 1] if os.path.exists(extracted_file): # Check extracted already return extracted_file logger.info("Extracting {}...".format(filename)) # p = subprocess.Popen([unp_bin, filename], # cwd=filepath, stderr=subprocess.STDOUT) # p.wait() # Hide the "broken pipe" message from unp out = subprocess.Popen( [unp_bin, filename], cwd=filepath, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0] if os.path.exists(extracted_file): return extracted_file else: logger.error("Extraction of {} failed: {}".format( filename, out)) raise Exception('Archive structure error') logger.error("Could not extract {}".format(filename)) return filename
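# ---------------------------------------------------------------------------
# Hedged sketch (assumptions, not the project's actual helpers): the garbage
# collector above compares free_space_in_path() against min_free_space, which
# its log messages treat as gigabytes, and uses touch() to refresh mtimes so
# active data is not expired.  The older class versions below do the same
# arithmetic inline with os.statvfs, so plausible stand-ins look like this
# (named example_* here to avoid shadowing the real imported helpers):
# ---------------------------------------------------------------------------
import os


def example_free_space_in_gb(path):
    """Free space of the filesystem containing PATH, in gigabytes."""
    s = os.statvfs(path)
    return float(s.f_bsize * s.f_bavail) / (1024 ** 3)


def example_touch(path):
    """Create PATH if needed and bump its mtime; the GC expires data by mtime."""
    if os.path.isdir(path):
        os.utime(path, None)
    else:
        open(path, 'a').close()
        os.utime(path, None)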
class ArastConsumer: def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf, datapath, binpath): self.parser = SafeConfigParser() self.parser.read(config) self.job_list = job_list # Load plugins self.pmanager = ModuleManager(threads, kill_queue, job_list, binpath) # Set up environment self.shockurl = shockurl self.arasturl = arasturl self.datapath = datapath if queue: self.queue = queue logging.info('Using queue:{}'.format(self.queue)) else: self.queue = self.parser.get('rabbitmq','default_routing_key') self.min_free_space = float(self.parser.get('compute','min_free_space')) m = ctrl_conf['meta'] a = ctrl_conf['assembly'] ###### TODO Use REST API self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'], m['mongo.collection'], m['mongo.collection.auth'], m['mongo.collection.data'] ) self.gc_lock = multiprocessing.Lock() def garbage_collect(self, datapath, user, required_space): """ Monitor space of disk containing DATAPATH and delete files if necessary.""" self.gc_lock.acquire() s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) logging.debug("Required space in bytes: %s" % required_space) while ((free_space - self.min_free_space) < required_space): #Delete old data dirs = os.listdir(os.path.join(datapath, user)) times = [] for dir in dirs: times.append(os.path.getmtime(os.path.join(datapath, user, dir))) if len(dirs) > 0: old_dir = os.path.join(datapath, user, dirs[times.index(min(times))]) shutil.rmtree(old_dir, ignore_errors=True) else: logging.error("No more directories to remove") break logging.info("Space required. %s removed." % old_dir) s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) self.gc_lock.release() def get_data(self, body): """Get data from cache or Shock server.""" params = json.loads(body) if ('assembly_data' in params or params['version'] == 'widget'): logging.info('New Data Format') return self._get_data(body) else: return self._get_data_old(body) def _get_data(self, body): params = json.loads(body) filepath = os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] user = params['ARASTUSER'] token = params['oauth_token'] uid = params['_id'] ##### Get data from ID ##### data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id']) if not data_doc: raise Exception('Invalid Data ID: {}'.format(params['data_id'])) if 'kbase_assembly_input' in data_doc: params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input']) elif 'assembly_data' in data_doc: params['assembly_data'] = data_doc['assembly_data'] ##### Get data from assembly_data ##### self.metadata.update_job(uid, 'status', 'Data transfer') try:os.makedirs(filepath) except:pass ### TODO Garbage collect ### download_url = 'http://{}'.format(self.shockurl) file_sets = params['assembly_data']['file_sets'] for file_set in file_sets: if file_set['type'] == 'paired_url': file_set['type'] = 'paired' elif file_set['type'] == 'single_url': file_set['type'] = 'single' elif file_set['type'] == 'reference_url': file_set['type'] = 'reference' file_set['files'] = [] #legacy for file_info in file_set['file_infos']: #### File is stored on Shock if file_info['filename']: local_file = os.path.join(filepath, file_info['filename']) if os.path.exists(local_file): logging.info("Requested data exists on node: {}".format(local_file)) else: 
local_file = self.download_shock(download_url, user, token, file_info['shock_id'], filepath) elif file_info['direct_url']: local_file = os.path.join(filepath, os.path.basename(file_info['direct_url'])) if os.path.exists(local_file): logging.info("Requested data exists on node: {}".format(local_file)) else: local_file = self.download_url(file_info['direct_url'], filepath) file_info['local_file'] = local_file file_set['files'].append(local_file) #legacy all_files.append(file_set) return datapath, all_files def compute(self, body): error = False params = json.loads(body) job_id = params['job_id'] uid = params['_id'] user = params['ARASTUSER'] token = params['oauth_token'] pipelines = params['pipeline'] recipe = None wasp_in = None try: ## In case legacy recipe = params['recipe'] wasp_in = params['wasp'] except:pass #support legacy arast client if len(pipelines) > 0: if type(pipelines[0]) is not list: pipelines = [pipelines] ### Download files (if necessary) datapath, all_files = self.get_data(body) rawpath = datapath + '/raw/' jobpath = os.path.join(datapath, str(job_id)) try: os.makedirs(jobpath) except Exception as e: print e raise Exception ('Data Error') ### Create job log self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id)) self.out_report = open(self.out_report_name, 'w') ### Create data to pass to pipeline reads = [] reference = [] for fileset in all_files: if len(fileset['files']) != 0: if (fileset['type'] == 'single' or fileset['type'] == 'paired'): reads.append(fileset) elif fileset['type'] == 'reference': reference.append(fileset) else: raise Exception('fileset error') job_data = ArastJob({'job_id' : params['job_id'], 'uid' : params['_id'], 'user' : params['ARASTUSER'], 'reads': reads, 'logfiles': [], 'reference': reference, 'initial_reads': list(reads), 'raw_reads': copy.deepcopy(reads), 'params': [], 'exceptions': [], 'pipeline_data': {}, 'datapath': datapath, 'out_report' : self.out_report}) self.out_report.write("Arast Pipeline: Job {}\n".format(job_id)) self.job_list.append(job_data) self.start_time = time.time() timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag) timer_thread.start() url = "http://%s" % (self.shockurl) status = '' #### Parse pipeline to wasp exp wasp_exp = pipelines[0][0] reload(recipes) if recipe: try: wasp_exp = recipes.get(recipe[0]) except AttributeError: raise Exception('"{}" recipe not found.'.format(recipe[0])) elif wasp_in: wasp_exp = wasp_in[0] elif pipelines[0] == 'auto': wasp_exp = recipes.get('auto') else: all_pipes = [] for p in pipelines: all_pipes += self.pmanager.parse_input(p) print all_pipes wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id']) logging.info('Wasp Expression: {}'.format(wasp_exp)) print('Wasp Expression: {}'.format(wasp_exp)) w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata) w_engine.run_expression(wasp_exp, job_data) ###### Upload all result files and place them into appropriate tags uploaded_fsets = job_data.upload_results(url, token) for i, job in enumerate(self.job_list): if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']: self.job_list.pop(i) # Format report new_report = open('{}.tmp'.format(self.out_report_name), 'w') ### Log exceptions if len(job_data['exceptions']) > 0: new_report.write('PIPELINE ERRORS\n') for i,e in enumerate(job_data['exceptions']): new_report.write('{}: {}\n'.format(i, e)) try: ## Get Quast output quast_report = job_data['wasp_chain'].find_module('quast')['data'].find_type('report')[0].files[0] with 
open(quast_report) as q: new_report.write(q.read()) except: new_report.write('No Summary File Generated!\n\n\n') self.out_report.close() with open(self.out_report_name) as old: new_report.write(old.read()) for log in job_data['logfiles']: new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '='*20)) with open(log) as l: new_report.write(l.read()) new_report.close() os.remove(self.out_report_name) shutil.move(new_report.name, self.out_report_name) res = self.upload(url, user, token, self.out_report_name) report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id']) self.metadata.update_job(uid, 'report', [asmtypes.set_factory('report', [report_info])]) status = 'Complete with errors' if job_data['exceptions'] else 'Complete' ## Make compatible with JSON dumps() del job_data['out_report'] del job_data['initial_reads'] del job_data['raw_reads'] self.metadata.update_job(uid, 'data', job_data) self.metadata.update_job(uid, 'result_data', uploaded_fsets) self.metadata.update_job(uid, 'status', status) ###### Legacy Support ####### filesets = uploaded_fsets.append(asmtypes.set_factory('report', [report_info])) contigsets = [fset for fset in uploaded_fsets if fset.type == 'contigs' or fset.type == 'scaffolds'] download_ids = {fi['filename']: fi['shock_id'] for fset in uploaded_fsets for fi in fset['file_infos']} contig_ids = {fi['filename']: fi['shock_id'] for fset in contigsets for fi in fset['file_infos']} self.metadata.update_job(uid, 'result_data_legacy', [download_ids]) self.metadata.update_job(uid, 'contig_ids', [contig_ids]) ################### print '============== JOB COMPLETE ===============' def upload(self, url, user, token, file, filetype='default'): files = {} files["file"] = (os.path.basename(file), open(file, 'rb')) logging.debug("Message sent to shock on upload: %s" % files) sclient = shock.Shock(url, user, token) if filetype == 'contigs' or filetype == 'scaffolds': res = sclient.upload_contigs(file) else: res = sclient.upload_misc(file, filetype) return res def download_shock(self, url, user, token, node_id, outdir): sclient = shock.Shock(url, user, token) downloaded = sclient.curl_download_file(node_id, outdir=outdir) return extract_file(downloaded) def download_url(self, url, outdir): downloaded = asm.curl_download_url(url, outdir=outdir) return extract_file(downloaded) def fetch_job(self): connection = pika.BlockingConnection(pika.ConnectionParameters( host = self.arasturl)) channel = connection.channel() channel.basic_qos(prefetch_count=1) result = channel.queue_declare(queue=self.queue, exclusive=False, auto_delete=False, durable=True) logging.basicConfig(format=("%(asctime)s %s %(levelname)-8s %(message)s",proc().name)) print proc().name, ' [*] Fetching job...' 
channel.basic_qos(prefetch_count=1) channel.basic_consume(self.callback, queue=self.queue) channel.start_consuming() def callback(self, ch, method, properties, body): params = json.loads(body) display = ['ARASTUSER', 'job_id', 'message'] print ' [+] Incoming:', ', '.join(['{}: {}'.format(k, params[k]) for k in display]) logging.info(params) job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id']) uid = job_doc['_id'] ## Check if job was not killed if job_doc['status'] == 'Terminated': print 'Job {} was killed, skipping'.format(params['job_id']) else: self.done_flag = threading.Event() try: self.compute(body) except Exception as e: tb = format_exc() status = "[FAIL] {}".format(e) print e print logging.error(tb) self.metadata.update_job(uid, 'status', status) ch.basic_ack(delivery_tag=method.delivery_tag) self.done_flag.set() def start(self): self.fetch_job() ###### Legacy Support ###### def _get_data_old(self, body): params = json.loads(body) #filepath = self.datapath + str(params['data_id']) filepath = os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] uid = params['_id'] job_id = params['job_id'] user = params['ARASTUSER'] data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER']) if data_doc: paired = data_doc['pair'] single = data_doc['single'] files = data_doc['filename'] ids = data_doc['ids'] token = params['oauth_token'] try: ref = data_doc['reference'] except: pass else: self.metadata.update_job(uid, 'status', 'Invalid Data ID') raise Exception('Data {} does not exist on Shock Server'.format( params['data_id'])) all_files = [] if os.path.isdir(filepath): logging.info("Requested data exists on node") try: for l in paired: filedict = {'type':'paired', 'files':[]} for word in l: if is_filename(word): baseword = os.path.basename(word) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type':'single', 'files':[]} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No single files submitted!') try: for r in ref: for wordpath in r: filedict = {'type':'reference', 'files':[]} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No reference files submitted!') touch(datapath) ## Data does not exist on current compute node else: self.metadata.update_job(uid, 'status', 'Data transfer') os.makedirs(filepath) # Get required space and garbage collect try: req_space = 0 for file_size in data_doc['file_sizes']: req_space += file_size self.garbage_collect(self.datapath, user, req_space) except: pass url = "http://%s" % (self.shockurl) try: for l in paired: #FILEDICT contains a single read library's info filedict = {'type':'paired', 'files':[]} for word in l: if is_filename(word): baseword = os.path.basename(word) dl = self.download_shock(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): 
#Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download_shock(s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type':'single', 'files':[]} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download_shock(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): #Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download_shock(s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') try: for r in ref: for wordpath in r: filedict = {'type':'reference', 'files':[]} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download_shock(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle(dl): #Shock handle, get real data logging.info('Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download_shock(s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: #logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') return datapath, all_files
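# ---------------------------------------------------------------------------
# Hedged illustration (hypothetical values): the legacy _get_data_old() path
# above builds one plain dict per read library.  Filename-like words are
# resolved under the raw data directory; 'key=value' words become extra
# attributes on the same dict.  A mirror of that parsing loop, for reference:
# ---------------------------------------------------------------------------
def example_parse_library(words, filepath='/tmp/raw', libtype='paired'):
    """Illustration only; the real logic also downloads/extracts each file."""
    import os
    filedict = {'type': libtype, 'files': []}
    for word in words:
        if '=' in word:                 # attribute word, e.g. a hypothetical insert=300
            k, v = word.split('=', 1)
            filedict[k] = v
        else:                           # filename word
            filedict['files'].append(os.path.join(filepath, os.path.basename(word)))
    return filedict

# example_parse_library(['reads_1.fq', 'reads_2.fq', 'insert=300']) returns
# {'type': 'paired', 'files': ['/tmp/raw/reads_1.fq', '/tmp/raw/reads_2.fq'], 'insert': '300'}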
class ArastConsumer: def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf): self.parser = SafeConfigParser() self.parser.read(config) self.job_list = job_list # Load plugins self.pmanager = ModuleManager(threads, kill_queue, job_list) # Set up environment self.shockurl = shockurl self.arasturl = arasturl self.datapath = self.parser.get('compute', 'datapath') if queue: self.queue = queue print('Using queue:{}'.format(self.queue)) else: self.queue = self.parser.get('rabbitmq', 'default_routing_key') self.min_free_space = float( self.parser.get('compute', 'min_free_space')) m = ctrl_conf['meta'] a = ctrl_conf['assembly'] self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'], m['mongo.collection'], m['mongo.collection.auth']) self.gc_lock = multiprocessing.Lock() def garbage_collect(self, datapath, user, required_space): """ Monitor space of disk containing DATAPATH and delete files if necessary.""" self.gc_lock.acquire() s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) logging.debug("Required space in bytes: %s" % required_space) while ((free_space - self.min_free_space) < required_space): #Delete old data dirs = os.listdir(os.path.join(datapath, user)) times = [] for dir in dirs: times.append( os.path.getmtime(os.path.join(datapath, user, dir))) if len(dirs) > 0: old_dir = os.path.join(datapath, user, dirs[times.index(min(times))]) shutil.rmtree(old_dir, ignore_errors=True) else: logging.error("No more directories to remove") break logging.info("Space required. %s removed." % old_dir) s = os.statvfs(datapath) free_space = float(s.f_bsize * s.f_bavail) logging.debug("Free space in bytes: %s" % free_space) self.gc_lock.release() def get_data(self, body): """Get data from cache or Shock server.""" params = json.loads(body) if 'assembly_data' in params: logging.info('New Data Format') return self._get_data(body) else: return self._get_data_old(body) def _get_data(self, body): params = json.loads(body) filepath = os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] user = params['ARASTUSER'] token = params['oauth_token'] uid = params['_id'] ##### Get data from ID ##### data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER']) if not data_doc: raise Exception('Invalid Data ID: {}'.format(params['data_id'])) if 'kbase_assembly_input' in data_doc: params['assembly_data'] = kb_to_asm( data_doc['kbase_assembly_input']) elif 'assembly_data' in data_doc: params['assembly_data'] = data_doc['assembly_data'] ##### Get data from assembly_data ##### self.metadata.update_job(uid, 'status', 'Data transfer') try: os.makedirs(filepath) except: pass ### TODO Garbage collect ### download_url = 'http://{}'.format(self.shockurl) file_sets = params['assembly_data']['file_sets'] for file_set in file_sets: file_set['files'] = [] #legacy for file_info in file_set['file_infos']: local_file = os.path.join(filepath, file_info['filename']) if os.path.exists(local_file): logging.info( "Requested data exists on node: {}".format(local_file)) else: local_file = self.download(download_url, user, token, file_info['shock_id'], filepath) file_info['local_file'] = local_file file_set['files'].append(local_file) #legacy all_files.append(file_set) return datapath, all_files def _get_data_old(self, body): params = json.loads(body) #filepath = self.datapath + str(params['data_id']) filepath = 
os.path.join(self.datapath, params['ARASTUSER'], str(params['data_id'])) datapath = filepath filepath += "/raw/" all_files = [] uid = params['_id'] job_id = params['job_id'] user = params['ARASTUSER'] data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER']) if data_doc: paired = data_doc['pair'] single = data_doc['single'] files = data_doc['filename'] ids = data_doc['ids'] token = params['oauth_token'] try: ref = data_doc['reference'] except: pass else: self.metadata.update_job(uid, 'status', 'Invalid Data ID') raise Exception('Data {} does not exist on Shock Server'.format( params['data_id'])) all_files = [] if os.path.isdir(filepath): logging.info("Requested data exists on node") try: for l in paired: filedict = {'type': 'paired', 'files': []} for word in l: if is_filename(word): baseword = os.path.basename(word) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type': 'single', 'files': []} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No single files submitted!') try: for r in ref: for wordpath in r: filedict = {'type': 'reference', 'files': []} if is_filename(wordpath): baseword = os.path.basename(wordpath) filedict['files'].append( extract_file(os.path.join(filepath, baseword))) else: kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_tb(sys.exc_info()[2])) logging.info('No reference files submitted!') touch(datapath) ## Data does not exist on current compute node else: self.metadata.update_job(uid, 'status', 'Data transfer') os.makedirs(filepath) # Get required space and garbage collect try: req_space = 0 for file_size in data_doc['file_sizes']: req_space += file_size self.garbage_collect(self.datapath, user, req_space) except: pass url = "http://%s" % (self.shockurl) try: for l in paired: #FILEDICT contains a single read library's info filedict = {'type': 'paired', 'files': []} for word in l: if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle( dl): #Shock handle, get real data logging.info( 'Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download( s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No paired files submitted') try: for seqfiles in single: for wordpath in seqfiles: filedict = {'type': 'single', 'files': []} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle( dl): #Shock handle, get real data logging.info( 'Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = 
self.download( s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') try: for r in ref: for wordpath in r: filedict = {'type': 'reference', 'files': []} # Parse user directories try: path, word = wordpath.rsplit('/', 1) path += '/' except: word = wordpath path = '' if is_filename(word): baseword = os.path.basename(word) dl = self.download(url, user, token, ids[files.index(baseword)], filepath) if shock.parse_handle( dl): #Shock handle, get real data logging.info( 'Found shock handle, getting real data...') s_addr, s_id = shock.parse_handle(dl) s_url = 'http://{}'.format(s_addr) real_file = self.download( s_url, user, token, s_id, filepath) filedict['files'].append(real_file) else: filedict['files'].append(dl) elif re.search('=', word): kv = word.split('=') filedict[kv[0]] = kv[1] all_files.append(filedict) except: #logging.info(format_exc(sys.exc_info())) logging.info('No single end files submitted') print all_files return datapath, all_files def compute(self, body): error = False params = json.loads(body) job_id = params['job_id'] uid = params['_id'] user = params['ARASTUSER'] token = params['oauth_token'] pipelines = params['pipeline'] #support legacy arast client if len(pipelines) > 0: if type(pipelines[0]) is not list: pipelines = [pipelines] ### Download files (if necessary) datapath, all_files = self.get_data(body) rawpath = datapath + '/raw/' jobpath = os.path.join(datapath, str(job_id)) try: os.makedirs(jobpath) except: raise Exception('Data Error') ### Create job log self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id)) self.out_report = open(self.out_report_name, 'w') ### Create data to pass to pipeline reads = [] reference = [] for fileset in all_files: if len(fileset['files']) != 0: if (fileset['type'] == 'single' or fileset['type'] == 'paired'): reads.append(fileset) elif fileset['type'] == 'reference': reference.append(fileset) else: raise Exception('fileset error') job_data = ArastJob({ 'job_id': params['job_id'], 'uid': params['_id'], 'user': params['ARASTUSER'], 'reads': reads, 'reference': reference, 'initial_reads': list(reads), 'raw_reads': copy.deepcopy(reads), 'processed_reads': list(reads), 'pipeline_data': {}, 'datapath': datapath, 'out_report': self.out_report, 'logfiles': [] }) self.out_report.write("Arast Pipeline: Job {}\n".format(job_id)) self.job_list.append(job_data) self.start_time = time.time() self.done_flag = threading.Event() timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag) timer_thread.start() download_ids = {} contig_ids = {} url = "http://%s" % (self.shockurl) # url += '/node' try: include_all_data = params['all_data'] except: include_all_data = False contigs = not include_all_data status = '' ## TODO CHANGE: default pipeline default_pipe = ['velvet'] exceptions = [] if pipelines: try: if pipelines == ['auto']: pipelines = [ default_pipe, ] for p in pipelines: self.pmanager.validate_pipe(p) result_files, summary, contig_files, exceptions = self.run_pipeline( pipelines, job_data, contigs_only=contigs) for i, f in enumerate(result_files): #fname = os.path.basename(f).split('.')[0] fname = str(i) res = self.upload(url, user, token, f) download_ids[fname] = res['data']['id'] for c in contig_files: fname = os.path.basename(c).split('.')[0] res = self.upload(url, 
user, token, c, filetype='contigs') contig_ids[fname] = res['data']['id'] # Check if job completed with no errors if exceptions: status = 'Complete with errors' elif not summary: status = 'Complete: No valid contigs' else: status += "Complete" self.out_report.write("Pipeline completed successfully\n") except: traceback = format_exc(sys.exc_info()) status = "[FAIL] {}".format(sys.exc_info()[1]) print traceback self.out_report.write("ERROR TRACE:\n{}\n".format( format_tb(sys.exc_info()[2]))) # Format report for i, job in enumerate(self.job_list): if job['user'] == job_data['user'] and job['job_id'] == job_data[ 'job_id']: self.job_list.pop(i) self.done_flag.set() new_report = open('{}.tmp'.format(self.out_report_name), 'w') ### Log exceptions if len(exceptions) > 0: new_report.write('PIPELINE ERRORS') for i, e in enumerate(exceptions): new_report.write('{}: {}\n'.format(i, e)) try: for sum in summary: with open(sum) as s: new_report.write(s.read()) except: new_report.write('No Summary File Generated!\n\n\n') self.out_report.close() with open(self.out_report_name) as old: new_report.write(old.read()) new_report.close() os.remove(self.out_report_name) shutil.move(new_report.name, self.out_report_name) res = self.upload(url, user, token, self.out_report_name) download_ids['report'] = res['data']['id'] # Get location self.metadata.update_job(uid, 'result_data', download_ids) self.metadata.update_job(uid, 'contig_ids', contig_ids) self.metadata.update_job(uid, 'status', status) print '=========== JOB COMPLETE ============' def update_time_record(self): elapsed_time = time.time() - self.start_time ftime = str(datetime.timedelta(seconds=int(elapsed_time))) self.metadata.update_job(uid, 'computation_time', ftime) def run_pipeline(self, pipes, job_data, contigs_only=True): """ Runs all pipelines in list PIPES """ all_pipes = [] for p in pipes: all_pipes += self.pmanager.parse_input(p) logging.info('{} pipelines:'.format(len(all_pipes))) for p in all_pipes: print '->'.join(p) #include_reads = self.pmanager.output_type(pipeline[-1]) == 'reads' include_reads = False pipeline_num = 1 all_files = [] pipe_outputs = [] logfiles = [] ale_reports = {} final_contigs = [] final_scaffolds = [] output_types = [] exceptions = [] num_pipes = len(all_pipes) for pipe in all_pipes: try: #job_data = copy.deepcopy(job_data_global) #job_data['out_report'] = job_data_global['out_report'] pipeline, overrides = self.pmanager.parse_pipe(pipe) job_data.add_pipeline(pipeline_num, pipeline) num_stages = len(pipeline) pipeline_stage = 1 pipeline_results = [] cur_outputs = [] # Reset job data job_data['reads'] = copy.deepcopy(job_data['raw_reads']) job_data['processed_reads'] = [] print job_data self.out_report.write('\n{0} Pipeline {1}: {2} {0}\n'.format( '=' * 15, pipeline_num, pipe)) pipe_suffix = '' # filename code for indiv pipes pipe_start_time = time.time() pipe_alive = True # Store data record for pipeline for module_name in pipeline: if not pipe_alive: self.out_report.write( '\n{0} Module Failure, Killing Pipe {0}'.format( 'X' * 10)) break module_code = '' # unique code for data reuse print '\n\n{0} Running module: {1} {2}'.format( '=' * 20, module_name, '=' * (35 - len(module_name))) self.garbage_collect(self.datapath, job_data['user'], 2147483648) # 2GB ## PROGRESS CALCULATION pipes_complete = (pipeline_num - 1) / float(num_pipes) stage_complete = (pipeline_stage - 1) / float(num_stages) pct_segment = 1.0 / num_pipes stage_complete *= pct_segment total_complete = pipes_complete + stage_complete cur_state = 
'Running:[{}%|P:{}/{}|S:{}/{}|{}]'.format( int(total_complete * 100), pipeline_num, num_pipes, pipeline_stage, num_stages, module_name) self.metadata.update_job(job_data['uid'], 'status', cur_state) ## LOG REPORT For now, module code is 1st and last letter short_name = self.pmanager.get_short_name(module_name) if short_name: #pipe_suffix += short_name.capitalize() module_code += short_name.capitalize() else: #pipe_suffix += module_name[0].upper() + module_name[-1] module_code += module_name[0].upper() + module_name[-1] mod_overrides = overrides[pipeline_stage - 1] for k in mod_overrides.keys(): #pipe_suffix += '_{}{}'.format(k[0], par[k]) module_code += '_{}{}'.format(k[0], mod_overrides[k]) pipe_suffix += module_code self.out_report.write( 'PIPELINE {} -- STAGE {}: {}\n'.format( pipeline_num, pipeline_stage, module_name)) logging.debug('New job_data for stage {}: {}'.format( pipeline_stage, job_data)) job_data['params'] = overrides[pipeline_stage - 1].items() module_start_time = time.time() ## RUN MODULE # Check if output data exists reuse_data = False enable_reuse = True # KILL SWITCH if enable_reuse: for k, pipe in enumerate(pipe_outputs): if reuse_data: break if not pipe: continue # Check that all previous pipes match for i in range(pipeline_stage): try: if not pipe[i][0] == cur_outputs[i][0]: break except: pass try: if (pipe[i][0] == module_code and i == pipeline_stage - 1): #and overrides[i].items() == job_data['params']): #copy! print( 'Found previously computed data, reusing {}.' .format(module_code)) output = [] + pipe[i][1] pfix = (k + 1, i + 1) alldata = [] + pipe[i][2] reuse_data = True job_data.get_pipeline( pipeline_num).get_module( pipeline_stage )['elapsed_time'] = time.time( job_data.get_pipeline(i). get_module(pipeline_stage) ['elapsed_time']) break except: # Previous pipes may be shorter pass output_type = self.pmanager.output_type(module_name) if not reuse_data: output, alldata, mod_log = self.pmanager.run_module( module_name, job_data, all_data=True, reads=include_reads) ##### Module produced no output, attach log and proceed to next ##### if not output: pipe_alive = False try: print mod_log logfiles.append(mod_log) except: print 'error attaching ', mod_log break ##### Prefix outfiles with pipe stage (only assembler modules) ##### alldata = [ asm.prefix_file_move( file, "P{}_S{}_{}".format(pipeline_num, pipeline_stage, module_name)) for file in alldata ] module_elapsed_time = time.time() - module_start_time job_data.get_pipeline(pipeline_num).get_module( pipeline_stage )['elapsed_time'] = module_elapsed_time if alldata: #If log was renamed mod_log = asm.prefix_file( mod_log, "P{}_S{}_{}".format(pipeline_num, pipeline_stage, module_name)) if output_type == 'contigs' or output_type == 'scaffolds': #Assume assembly contigs if reuse_data: p_num, p_stage = pfix else: p_num, p_stage = pipeline_num, pipeline_stage # If plugin returned scaffolds if type(output) is tuple and len(output) == 2: out_contigs = output[0] out_scaffolds = output[1] cur_scaffolds = [ asm.prefix_file( file, "P{}_S{}_{}".format( p_num, p_stage, module_name)) for file in out_scaffolds ] else: out_contigs = output cur_contigs = [ asm.prefix_file( file, "P{}_S{}_{}".format(p_num, p_stage, module_name)) for file in out_contigs ] #job_data['reads'] = asm.arast_reads(alldata) job_data['contigs'] = cur_contigs elif output_type == 'reads': #Assume preprocessing if include_reads and reuse_data: # data was prefixed and moved for d in output: files = [ asm.prefix_file( f, "P{}_S{}_{}".format( pipeline_num, 
pipeline_stage, module_name)) for f in d['files'] ] d['files'] = files d['short_reads'] = [] + files job_data['reads'] = output job_data['processed_reads'] = list(job_data['reads']) else: # Generic return, don't use in further stages pipeline_results += output logging.info( 'Generic plugin output: {}'.format(output)) if pipeline_stage == num_stages: # Last stage, add contig for assessment if output and (output_type == 'contigs' or output_type == 'scaffolds' ): #If a contig was produced fcontigs = cur_contigs rcontigs = [ asm.rename_file_symlink( f, 'P{}_{}'.format(pipeline_num, pipe_suffix)) for f in fcontigs ] try: rscaffolds = [ asm.rename_file_symlink( f, 'P{}_{}_{}'.format( pipeline_num, pipe_suffix, 'scaff')) for f in cur_scaffolds ] if rscaffolds: scaffold_data = { 'files': rscaffolds, 'name': pipe_suffix } final_scaffolds.append(scaffold_data) output_types.append(output_type) except: pass if rcontigs: contig_data = { 'files': rcontigs, 'name': pipe_suffix, 'alignment_bam': [] } final_contigs.append(contig_data) output_types.append(output_type) try: logfiles.append(mod_log) except: print 'error attaching ', mod_log pipeline_stage += 1 cur_contigs = [] cur_scaffolds = [] cur_outputs.append([module_code, output, alldata]) pipe_elapsed_time = time.time() - pipe_start_time pipe_ftime = str( datetime.timedelta(seconds=int(pipe_elapsed_time))) job_data.get_pipeline( pipeline_num)['elapsed_time'] = pipe_elapsed_time if not output: self.out_report.write( 'ERROR: No contigs produced. See module log\n') else: ## Assessment #self.pmanager.run_module('reapr', job_data) #print job_data # TODO reapr break may be diff from final reapr align! # ale_out, _, _ = self.pmanager.run_module('ale', job_data) # if ale_out: # job_data.get_pipeline(pipeline_num).import_ale(ale_out) # ale_reports[pipe_suffix] = ale_out pipeline_datapath = '{}/{}/pipeline{}/'.format( job_data['datapath'], job_data['job_id'], pipeline_num) try: os.makedirs(pipeline_datapath) except: logging.info("{} exists, skipping mkdir".format( pipeline_datapath)) # all_files.append(asm.tar_list(pipeline_datapath, pipeline_results, # 'pipe{}_{}.tar.gz'.format(pipeline_num, pipe_suffix))) all_files += pipeline_results self.out_report.write('Pipeline {} total time: {}\n\n'.format( pipeline_num, pipe_ftime)) job_data.get_pipeline(pipeline_num)['name'] = pipe_suffix pipe_outputs.append(cur_outputs) pipeline_num += 1 except: print "ERROR: Pipeline #{} Failed".format(pipeline_num) print format_exc(sys.exc_info()) e = str(sys.exc_info()[1]) if e.find('Terminated') != -1: raise Exception(e) exceptions.append(module_name + ':\n' + str(sys.exc_info()[1])) pipeline_num += 1 ## ANALYSIS: Quast job_data['final_contigs'] = final_contigs job_data['final_scaffolds'] = final_scaffolds job_data['params'] = [] #clear overrides from last stage summary = [] # Quast reports for contigs and scaffolds try: #Try to assess, otherwise report pipeline errors if job_data['final_contigs']: job_data['contig_type'] = 'contigs' quast_report, quast_tar, z1, q_log = self.pmanager.run_module( 'quast', job_data, tar=True, meta=True) if quast_report: summary.append(quast_report[0]) with open(q_log) as infile: self.out_report.write(infile.read()) else: quast_report, quast_tar = '', '' if job_data['final_scaffolds']: scaff_data = dict(job_data) scaff_data['final_contigs'] = job_data['final_scaffolds'] scaff_data['contig_type'] = 'scaffolds' scaff_report, scaff_tar, _, scaff_log = self.pmanager.run_module( 'quast', scaff_data, tar=True, meta=True) scaffold_quast = True if 
scaff_report: summary.append(scaff_report[0]) with open(scaff_log) as infile: self.out_report.write('\n Quast Report - Scaffold Mode \n') self.out_report.write(infile.read()) else: scaffold_quast = False except: if exceptions: if len(exceptions) > 1: raise Exception('Multiple Errors') else: raise Exception(exceptions[0]) else: raise Exception(str(sys.exc_info()[1])) ## CONCAT MODULE LOG FILES self.out_report.write("\n\n{0} Begin Module Logs {0}\n".format("=" * 10)) for log in logfiles: self.out_report.write("\n\n{0} Begin Module {0}\n".format("=" * 10)) try: with open(log) as infile: self.out_report.write(infile.read()) except: self.out_report.write("Error writing log file") ## Format Returns ctg_analysis = quast_tar.rsplit( '/', 1)[0] + '/{}_ctg_qst.tar.gz'.format(job_data['job_id']) try: os.rename(quast_tar, ctg_analysis) return_files = [ctg_analysis] except: #summary = '' return_files = [] if scaffold_quast: scf_analysis = scaff_tar.rsplit( '/', 1)[0] + '/{}_scf_qst.tar.gz'.format(job_data['job_id']) #summary = quast_report[0] os.rename(scaff_tar, scf_analysis) return_files.append(scf_analysis) contig_files = [] for data in final_contigs + final_scaffolds: for f in data['files']: contig_files.append(os.path.realpath(f)) return_files += all_files ## Deduplicate seen = set() for f in return_files: seen.add(f) return_files = [f for f in seen] #if exceptions: # if len(exceptions) > 1: # raise Exception('Multiple Errors') # else: # raise Exception(exceptions[0]) if contig_files: return_files.append( asm.tar_list( '{}/{}'.format(job_data['datapath'], job_data['job_id']), contig_files, '{}_assemblies.tar.gz'.format(job_data['job_id']))) print "return files: {}".format(return_files) return return_files, summary, contig_files, exceptions def upload(self, url, user, token, file, filetype='default'): files = {} files["file"] = (os.path.basename(file), open(file, 'rb')) logging.debug("Message sent to shock on upload: %s" % files) sclient = shock.Shock(url, user, token) if filetype == 'default': res = sclient.upload_misc(file, 'default') elif filetype == 'contigs': res = sclient.upload_contigs(file) return res def download(self, url, user, token, node_id, outdir): sclient = shock.Shock(url, user, token) downloaded = sclient.curl_download_file(node_id, outdir=outdir) return extract_file(downloaded) def fetch_job(self): connection = pika.BlockingConnection( pika.ConnectionParameters(host=self.arasturl)) channel = connection.channel() channel.basic_qos(prefetch_count=1) result = channel.queue_declare(queue=self.queue, exclusive=False, auto_delete=False, durable=True) logging.basicConfig( format=("%(asctime)s %s %(levelname)-8s %(message)s", proc().name)) print proc().name, ' [*] Fetching job...' channel.basic_qos(prefetch_count=1) channel.basic_consume(self.callback, queue=self.queue) channel.start_consuming() def callback(self, ch, method, properties, body): print " [*] %r:%r" % (method.routing_key, body) params = json.loads(body) job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id']) uid = job_doc['_id'] ## Check if job was not killed if job_doc['status'] == 'Terminated': print 'Job {} was killed, skipping'.format(params['job_id']) else: try: self.compute(body) except: print sys.exc_info() status = "[FAIL] {}".format(format_tb(sys.exc_info()[2])) print logging.error(status) self.metadata.update_job(uid, 'status', status) ch.basic_ack(delivery_tag=method.delivery_tag) def start(self): self.fetch_job()
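# ---------------------------------------------------------------------------
# Hedged usage sketch (hypothetical values throughout): how a worker process
# might construct and start the consumer class defined directly above.  The
# URLs, config filename, and control-config layout are assumptions made for
# illustration; the real service wires these in from its own configuration.
# ---------------------------------------------------------------------------
import multiprocessing


def example_start_worker():
    ctrl_conf = {'meta': {'mongo.db': 'arast',
                          'mongo.collection': 'jobs',
                          'mongo.collection.auth': 'auth'},
                 'assembly': {'mongo_port': '27017'}}
    consumer = ArastConsumer(shockurl='shock.example.org',
                             arasturl='mq.example.org',
                             config='arast.conf',          # must define [compute] and [rabbitmq]
                             threads=8,
                             queue=None,                   # fall back to default_routing_key
                             kill_queue=multiprocessing.Queue(),
                             job_list=[],
                             ctrl_conf=ctrl_conf)
    consumer.start()                                       # blocks, consuming jobs from RabbitMQ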