# Standard-library and third-party imports used by this class.  Project-local
# helpers referenced below (asm, meta, shock, ModuleManager, ArastJob,
# UpdateTimer, kb_to_asm, extract_file, is_filename, touch) are expected to be
# imported elsewhere in this module; their exact import paths are not shown here.
import copy
import datetime
import json
import logging
import multiprocessing
import os
import re
import shutil
import sys
import threading
import time
from ConfigParser import SafeConfigParser
from multiprocessing import current_process as proc  # 'proc' alias assumed
from traceback import format_exc, format_tb

import pika


class ArastConsumer:
    """Consume assembly job messages, stage data from Shock, run the
    requested pipelines, and upload the results."""

    def __init__(self, shockurl, arasturl, config, threads, queue,
                 kill_queue, job_list, ctrl_conf):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list)

        # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = self.parser.get('compute', 'datapath')
        if queue:
            self.queue = queue
            print('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq', 'default_routing_key')
        self.min_free_space = float(self.parser.get('compute', 'min_free_space'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']
        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']),
                                                m['mongo.db'],
                                                m['mongo.collection'],
                                                m['mongo.collection.auth'])
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, user, required_space):
        """Monitor space of disk containing DATAPATH and delete files if necessary."""
        self.gc_lock.acquire()
        s = os.statvfs(datapath)
        free_space = float(s.f_bsize * s.f_bavail)
        logging.debug("Free space in bytes: %s" % free_space)
        logging.debug("Required space in bytes: %s" % required_space)
        while (free_space - self.min_free_space) < required_space:
            # Delete the oldest data directory belonging to this user
            dirs = os.listdir(os.path.join(datapath, user))
            times = []
            for dir in dirs:
                times.append(os.path.getmtime(os.path.join(datapath, user, dir)))
            if len(dirs) > 0:
                old_dir = os.path.join(datapath, user, dirs[times.index(min(times))])
                shutil.rmtree(old_dir, ignore_errors=True)
            else:
                logging.error("No more directories to remove")
                break
            logging.info("Space required. %s removed." % old_dir)
            s = os.statvfs(datapath)
            free_space = float(s.f_bsize * s.f_bavail)
            logging.debug("Free space in bytes: %s" % free_space)
        self.gc_lock.release()

    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        if 'assembly_data' in params:
            logging.info('New Data Format')
            return self._get_data(body)
        else:
            return self._get_data_old(body)

    def _get_data(self, body):
        """Stage data described by the new 'assembly_data' format."""
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_doc_by_data_id(params['data_id'],
                                                    params['ARASTUSER'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            file_set['files'] = []  # legacy
            for file_info in file_set['file_infos']:
                local_file = os.path.join(filepath, file_info['filename'])
                if os.path.exists(local_file):
                    logging.info("Requested data exists on node: {}".format(local_file))
                else:
                    local_file = self.download(download_url, user, token,
                                               file_info['shock_id'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file)  # legacy
            all_files.append(file_set)
        return datapath, all_files

    def _get_data_old(self, body):
        """Stage data described by the legacy paired/single/reference format."""
        params = json.loads(body)
        #filepath = self.datapath + str(params['data_id'])
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        uid = params['_id']
        job_id = params['job_id']
        user = params['ARASTUSER']
        data_doc = self.metadata.get_doc_by_data_id(params['data_id'],
                                                    params['ARASTUSER'])
        if data_doc:
            paired = data_doc['pair']
            single = data_doc['single']
            files = data_doc['filename']
            ids = data_doc['ids']
            token = params['oauth_token']
            try:
                ref = data_doc['reference']
            except:
                pass
        else:
            self.metadata.update_job(uid, 'status', 'Invalid Data ID')
            raise Exception('Data {} does not exist on Shock Server'.format(
                    params['data_id']))
        all_files = []

        ## Data exists on current compute node
        if os.path.isdir(filepath):
            logging.info("Requested data exists on node")
            try:
                for l in paired:
                    filedict = {'type': 'paired', 'files': []}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type': 'single', 'files': []}
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No single files submitted!')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type': 'reference', 'files': []}
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No reference files submitted!')
            touch(datapath)

        ## Data does not exist on current compute node
        else:
            self.metadata.update_job(uid, 'status', 'Data transfer')
            os.makedirs(filepath)

            # Get required space and garbage collect
            try:
                req_space = 0
                for file_size in data_doc['file_sizes']:
                    req_space += file_size
                self.garbage_collect(self.datapath, user, req_space)
            except:
                pass
            url = "http://%s" % (self.shockurl)
            try:
                for l in paired:
                    # FILEDICT contains a single read library's info
                    filedict = {'type': 'paired', 'files': []}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl):  # Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token,
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info(format_exc())
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type': 'single', 'files': []}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl):  # Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token,
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_exc())
                logging.info('No single end files submitted')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type': 'reference', 'files': []}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl):  # Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token,
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                #logging.info(format_exc())
                logging.info('No reference files submitted')
        print all_files
        return datapath, all_files

    def compute(self, body):
        """Run the pipelines requested by a single job message and upload results."""
        error = False
        params = json.loads(body)
        job_id = params['job_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params['pipeline']
        # Support legacy arast client
        if len(pipelines) > 0:
            if type(pipelines[0]) is not list:
                pipelines = [pipelines]

        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))
        try:
            os.makedirs(jobpath)
        except:
            raise Exception('Data Error')

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if fileset['type'] == 'single' or fileset['type'] == 'paired':
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                else:
                    raise Exception('fileset error')
        job_data = ArastJob({'job_id': params['job_id'],
                             'uid': params['_id'],
                             'user': params['ARASTUSER'],
                             'reads': reads,
                             'reference': reference,
                             'initial_reads': list(reads),
                             'raw_reads': copy.deepcopy(reads),
                             'processed_reads': list(reads),
                             'pipeline_data': {},
                             'datapath': datapath,
                             'out_report': self.out_report,
                             'logfiles': []})
        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))
        self.job_list.append(job_data)
        self.start_time = time.time()
        self.done_flag = threading.Event()
        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid,
                                   self.done_flag)
        timer_thread.start()

        download_ids = {}
        contig_ids = {}
        url = "http://%s" % (self.shockurl)
        # url += '/node'
        try:
            include_all_data = params['all_data']
        except:
            include_all_data = False
        contigs = not include_all_data
        status = ''
        ## TODO CHANGE: default pipeline
        default_pipe = ['velvet']
        exceptions = []
        if pipelines:
            try:
                if pipelines == ['auto']:
                    pipelines = [default_pipe]
                for p in pipelines:
                    self.pmanager.validate_pipe(p)
                result_files, summary, contig_files, exceptions = self.run_pipeline(
                    pipelines, job_data, contigs_only=contigs)
                for i, f in enumerate(result_files):
                    #fname = os.path.basename(f).split('.')[0]
                    fname = str(i)
                    res = self.upload(url, user, token, f)
                    download_ids[fname] = res['data']['id']
                for c in contig_files:
                    fname = os.path.basename(c).split('.')[0]
                    res = self.upload(url, user, token, c, filetype='contigs')
                    contig_ids[fname] = res['data']['id']
                # Check if job completed with no errors
                if exceptions:
                    status = 'Complete with errors'
                elif not summary:
                    status = 'Complete: No valid contigs'
                else:
                    status += "Complete"
                    self.out_report.write("Pipeline completed successfully\n")
            except:
                traceback = format_exc()
                status = "[FAIL] {}".format(sys.exc_info()[1])
                print traceback
                self.out_report.write("ERROR TRACE:\n{}\n".format(
                        format_tb(sys.exc_info()[2])))

        # Format report
        for i, job in enumerate(self.job_list):
            if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']:
                self.job_list.pop(i)
        self.done_flag.set()
        new_report = open('{}.tmp'.format(self.out_report_name), 'w')

        ### Log exceptions
        if len(exceptions) > 0:
            new_report.write('PIPELINE ERRORS\n')
            for i, e in enumerate(exceptions):
                new_report.write('{}: {}\n'.format(i, e))
        try:
            for sum in summary:
                with open(sum) as s:
                    new_report.write(s.read())
        except:
            new_report.write('No Summary File Generated!\n\n\n')
        self.out_report.close()
        with open(self.out_report_name) as old:
            new_report.write(old.read())
        new_report.close()
        os.remove(self.out_report_name)
        shutil.move(new_report.name, self.out_report_name)
        res = self.upload(url, user, token, self.out_report_name)
        download_ids['report'] = res['data']['id']

        # Get location
        self.metadata.update_job(uid, 'result_data', download_ids)
        self.metadata.update_job(uid, 'contig_ids', contig_ids)
        self.metadata.update_job(uid, 'status', status)
        print '=========== JOB COMPLETE ============'

    def update_time_record(self, uid):
        """Record the elapsed computation time for job UID."""
        elapsed_time = time.time() - self.start_time
        ftime = str(datetime.timedelta(seconds=int(elapsed_time)))
        self.metadata.update_job(uid, 'computation_time', ftime)

    def run_pipeline(self, pipes, job_data, contigs_only=True):
        """Run all pipelines in list PIPES."""
        all_pipes = []
        for p in pipes:
            all_pipes += self.pmanager.parse_input(p)
        logging.info('{} pipelines:'.format(len(all_pipes)))
        for p in all_pipes:
            print '->'.join(p)
        #include_reads = self.pmanager.output_type(pipeline[-1]) == 'reads'
        include_reads = False
        pipeline_num = 1
        all_files = []
        pipe_outputs = []
        logfiles = []
        ale_reports = {}
        final_contigs = []
        final_scaffolds = []
        output_types = []
        exceptions = []
        num_pipes = len(all_pipes)
        for pipe in all_pipes:
            try:
                #job_data = copy.deepcopy(job_data_global)
                #job_data['out_report'] = job_data_global['out_report']
                pipeline, overrides = self.pmanager.parse_pipe(pipe)
                job_data.add_pipeline(pipeline_num, pipeline)
                num_stages = len(pipeline)
                pipeline_stage = 1
                pipeline_results = []
                cur_outputs = []

                # Reset job data
                job_data['reads'] = copy.deepcopy(job_data['raw_reads'])
                job_data['processed_reads'] = []
                print job_data
                self.out_report.write('\n{0} Pipeline {1}: {2} {0}\n'.format(
                        '=' * 15, pipeline_num, pipe))
                pipe_suffix = ''  # filename code for indiv pipes
                pipe_start_time = time.time()
                pipe_alive = True

                # Store data record for pipeline
                for module_name in pipeline:
                    if not pipe_alive:
                        self.out_report.write(
                            '\n{0} Module Failure, Killing Pipe {0}'.format('X' * 10))
                        break
                    module_code = ''  # unique code for data reuse
                    print '\n\n{0} Running module: {1} {2}'.format(
                        '=' * 20, module_name, '=' * (35 - len(module_name)))
                    self.garbage_collect(self.datapath, job_data['user'],
                                         2147483648)  # 2GB

                    ## PROGRESS CALCULATION
                    pipes_complete = (pipeline_num - 1) / float(num_pipes)
                    stage_complete = (pipeline_stage - 1) / float(num_stages)
                    pct_segment = 1.0 / num_pipes
                    stage_complete *= pct_segment
                    total_complete = pipes_complete + stage_complete
                    cur_state = 'Running:[{}%|P:{}/{}|S:{}/{}|{}]'.format(
                        int(total_complete * 100), pipeline_num, num_pipes,
                        pipeline_stage, num_stages, module_name)
                    self.metadata.update_job(job_data['uid'], 'status', cur_state)

                    ## LOG REPORT
                    # For now, module code is 1st and last letter
                    short_name = self.pmanager.get_short_name(module_name)
                    if short_name:
                        #pipe_suffix += short_name.capitalize()
                        module_code += short_name.capitalize()
                    else:
                        #pipe_suffix += module_name[0].upper() + module_name[-1]
                        module_code += module_name[0].upper() + module_name[-1]
                    mod_overrides = overrides[pipeline_stage - 1]
                    for k in mod_overrides.keys():
                        #pipe_suffix += '_{}{}'.format(k[0], par[k])
                        module_code += '_{}{}'.format(k[0], mod_overrides[k])
                    pipe_suffix += module_code
                    self.out_report.write('PIPELINE {} -- STAGE {}: {}\n'.format(
                            pipeline_num, pipeline_stage, module_name))
                    logging.debug('New job_data for stage {}: {}'.format(
                            pipeline_stage, job_data))
                    job_data['params'] = overrides[pipeline_stage - 1].items()
                    module_start_time = time.time()

                    ## RUN MODULE
                    # Check if output data exists
                    reuse_data = False
                    enable_reuse = True  # KILL SWITCH
                    if enable_reuse:
                        for k, pipe in enumerate(pipe_outputs):
                            if reuse_data:
                                break
                            if not pipe:
                                continue
                            # Check that all previous pipes match
                            for i in range(pipeline_stage):
                                try:
                                    if not pipe[i][0] == cur_outputs[i][0]:
                                        break
                                except:
                                    pass
                                try:
                                    if (pipe[i][0] == module_code and
                                            i == pipeline_stage - 1):
                                        #and overrides[i].items() == job_data['params']): #copy!
                                        print('Found previously computed data, '
                                              'reusing {}.'.format(module_code))
                                        output = [] + pipe[i][1]
                                        pfix = (k + 1, i + 1)
                                        alldata = [] + pipe[i][2]
                                        reuse_data = True
                                        # Carry over the elapsed time recorded by the
                                        # pipeline whose output is being reused
                                        reused_time = job_data.get_pipeline(
                                            k + 1).get_module(
                                            pipeline_stage)['elapsed_time']
                                        job_data.get_pipeline(pipeline_num).get_module(
                                            pipeline_stage)['elapsed_time'] = reused_time
                                        break
                                except:
                                    # Previous pipes may be shorter
                                    pass

                    output_type = self.pmanager.output_type(module_name)
                    if not reuse_data:
                        output, alldata, mod_log = self.pmanager.run_module(
                            module_name, job_data, all_data=True, reads=include_reads)

                        ##### Module produced no output, attach log and proceed to next #####
                        if not output:
                            pipe_alive = False
                            try:
                                print mod_log
                                logfiles.append(mod_log)
                            except:
                                print 'error attaching ', mod_log
                            break

                        ##### Prefix outfiles with pipe stage (only assembler modules) #####
                        alldata = [asm.prefix_file_move(
                                file, "P{}_S{}_{}".format(pipeline_num, pipeline_stage,
                                                          module_name))
                                   for file in alldata]
                        module_elapsed_time = time.time() - module_start_time
                        job_data.get_pipeline(pipeline_num).get_module(
                            pipeline_stage)['elapsed_time'] = module_elapsed_time
                        if alldata:  # If log was renamed
                            mod_log = asm.prefix_file(mod_log, "P{}_S{}_{}".format(
                                    pipeline_num, pipeline_stage, module_name))

                    if output_type == 'contigs' or output_type == 'scaffolds':
                        # Assume assembly contigs
                        if reuse_data:
                            p_num, p_stage = pfix
                        else:
                            p_num, p_stage = pipeline_num, pipeline_stage
                        # If plugin returned scaffolds
                        if type(output) is tuple and len(output) == 2:
                            out_contigs = output[0]
                            out_scaffolds = output[1]
                            cur_scaffolds = [asm.prefix_file(
                                    file, "P{}_S{}_{}".format(p_num, p_stage,
                                                              module_name))
                                             for file in out_scaffolds]
                        else:
                            out_contigs = output
                        cur_contigs = [asm.prefix_file(
                                file, "P{}_S{}_{}".format(p_num, p_stage, module_name))
                                       for file in out_contigs]
                        #job_data['reads'] = asm.arast_reads(alldata)
                        job_data['contigs'] = cur_contigs

                    elif output_type == 'reads':  # Assume preprocessing
                        if include_reads and reuse_data:  # data was prefixed and moved
                            for d in output:
                                files = [asm.prefix_file(f, "P{}_S{}_{}".format(
                                            pipeline_num, pipeline_stage, module_name))
                                         for f in d['files']]
                                d['files'] = files
                                d['short_reads'] = [] + files
                        job_data['reads'] = output
                        job_data['processed_reads'] = list(job_data['reads'])
                    else:
                        # Generic return, don't use in further stages
                        pipeline_results += output
                        logging.info('Generic plugin output: {}'.format(output))

                    if pipeline_stage == num_stages:
                        # Last stage, add contig for assessment
                        if output and (output_type == 'contigs' or
                                       output_type == 'scaffolds'):
                            # If a contig was produced
                            fcontigs = cur_contigs
                            rcontigs = [asm.rename_file_symlink(f, 'P{}_{}'.format(
                                    pipeline_num, pipe_suffix))
                                        for f in fcontigs]
                            try:
                                rscaffolds = [asm.rename_file_symlink(
                                        f, 'P{}_{}_{}'.format(pipeline_num, pipe_suffix,
                                                              'scaff'))
                                              for f in cur_scaffolds]
                                if rscaffolds:
                                    scaffold_data = {'files': rscaffolds,
                                                     'name': pipe_suffix}
                                    final_scaffolds.append(scaffold_data)
                                    output_types.append(output_type)
                            except:
                                pass
                            if rcontigs:
                                contig_data = {'files': rcontigs,
                                               'name': pipe_suffix,
                                               'alignment_bam': []}
                                final_contigs.append(contig_data)
                                output_types.append(output_type)
                    try:
                        logfiles.append(mod_log)
                    except:
                        print 'error attaching ', mod_log
                    pipeline_stage += 1
                    cur_contigs = []
                    cur_scaffolds = []
                    cur_outputs.append([module_code, output, alldata])

                pipe_elapsed_time = time.time() - pipe_start_time
                pipe_ftime = str(datetime.timedelta(seconds=int(pipe_elapsed_time)))
                job_data.get_pipeline(pipeline_num)['elapsed_time'] = pipe_elapsed_time
                if not output:
                    self.out_report.write('ERROR: No contigs produced. See module log\n')
                else:
                    ## Assessment
                    #self.pmanager.run_module('reapr', job_data)
                    #print job_data
                    # TODO reapr break may be diff from final reapr align!
                    # ale_out, _, _ = self.pmanager.run_module('ale', job_data)
                    # if ale_out:
                    #     job_data.get_pipeline(pipeline_num).import_ale(ale_out)
                    #     ale_reports[pipe_suffix] = ale_out
                    pipeline_datapath = '{}/{}/pipeline{}/'.format(
                        job_data['datapath'], job_data['job_id'], pipeline_num)
                    try:
                        os.makedirs(pipeline_datapath)
                    except:
                        logging.info("{} exists, skipping mkdir".format(
                                pipeline_datapath))
                    # all_files.append(asm.tar_list(pipeline_datapath, pipeline_results,
                    #         'pipe{}_{}.tar.gz'.format(pipeline_num, pipe_suffix)))
                    all_files += pipeline_results
                self.out_report.write('Pipeline {} total time: {}\n\n'.format(
                        pipeline_num, pipe_ftime))
                job_data.get_pipeline(pipeline_num)['name'] = pipe_suffix
                pipe_outputs.append(cur_outputs)
                pipeline_num += 1
            except:
                print "ERROR: Pipeline #{} Failed".format(pipeline_num)
                print format_exc()
                e = str(sys.exc_info()[1])
                if e.find('Terminated') != -1:
                    raise Exception(e)
                exceptions.append(module_name + ':\n' + str(sys.exc_info()[1]))
                pipeline_num += 1

        ## ANALYSIS: Quast
        job_data['final_contigs'] = final_contigs
        job_data['final_scaffolds'] = final_scaffolds
        job_data['params'] = []  # clear overrides from last stage
        summary = []
        # Quast reports for contigs and scaffolds
        try:  # Try to assess, otherwise report pipeline errors
            if job_data['final_contigs']:
                job_data['contig_type'] = 'contigs'
                quast_report, quast_tar, z1, q_log = self.pmanager.run_module(
                    'quast', job_data, tar=True, meta=True)
                if quast_report:
                    summary.append(quast_report[0])
                    with open(q_log) as infile:
                        self.out_report.write(infile.read())
            else:
                quast_report, quast_tar = '', ''
            if job_data['final_scaffolds']:
                scaff_data = dict(job_data)
                scaff_data['final_contigs'] = job_data['final_scaffolds']
                scaff_data['contig_type'] = 'scaffolds'
                scaff_report, scaff_tar, _, scaff_log = self.pmanager.run_module(
                    'quast', scaff_data, tar=True, meta=True)
                scaffold_quast = True
                if scaff_report:
                    summary.append(scaff_report[0])
                    with open(scaff_log) as infile:
                        self.out_report.write('\n Quast Report - Scaffold Mode \n')
                        self.out_report.write(infile.read())
            else:
                scaffold_quast = False
        except:
            if exceptions:
                if len(exceptions) > 1:
                    raise Exception('Multiple Errors')
                else:
                    raise Exception(exceptions[0])
            else:
                raise Exception(str(sys.exc_info()[1]))

        ## CONCAT MODULE LOG FILES
        self.out_report.write("\n\n{0} Begin Module Logs {0}\n".format("=" * 10))
        for log in logfiles:
            self.out_report.write("\n\n{0} Begin Module {0}\n".format("=" * 10))
            try:
                with open(log) as infile:
                    self.out_report.write(infile.read())
            except:
                self.out_report.write("Error writing log file")

        ## Format Returns
        ctg_analysis = quast_tar.rsplit('/', 1)[0] + '/{}_ctg_qst.tar.gz'.format(
            job_data['job_id'])
        try:
            os.rename(quast_tar, ctg_analysis)
            return_files = [ctg_analysis]
        except:
            #summary = ''
            return_files = []
        if scaffold_quast:
            scf_analysis = scaff_tar.rsplit('/', 1)[0] + '/{}_scf_qst.tar.gz'.format(
                job_data['job_id'])
            #summary = quast_report[0]
            os.rename(scaff_tar, scf_analysis)
            return_files.append(scf_analysis)
        contig_files = []
        for data in final_contigs + final_scaffolds:
            for f in data['files']:
                contig_files.append(os.path.realpath(f))
        return_files += all_files

        ## Deduplicate
        seen = set()
        for f in return_files:
            seen.add(f)
        return_files = [f for f in seen]
        #if exceptions:
        #    if len(exceptions) > 1:
        #        raise Exception('Multiple Errors')
        #    else:
        #        raise Exception(exceptions[0])
        if contig_files:
            return_files.append(asm.tar_list(
                    '{}/{}'.format(job_data['datapath'], job_data['job_id']),
                    contig_files,
                    '{}_assemblies.tar.gz'.format(job_data['job_id'])))
        print "return files: {}".format(return_files)
        return return_files, summary, contig_files, exceptions

    def upload(self, url, user, token, file, filetype='default'):
        """Upload FILE to the Shock server and return the response document."""
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logging.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'default':
            res = sclient.upload_misc(file, 'default')
        elif filetype == 'contigs':
            res = sclient.upload_contigs(file)
        return res

    def download(self, url, user, token, node_id, outdir):
        """Download a Shock node to OUTDIR and return the extracted file."""
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return extract_file(downloaded)

    def fetch_job(self):
        """Connect to RabbitMQ and consume job messages from the configured queue."""
        connection = pika.BlockingConnection(pika.ConnectionParameters(
                host=self.arasturl))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(queue=self.queue,
                                       exclusive=False,
                                       auto_delete=False,
                                       durable=True)
        logging.basicConfig(
            format="%(asctime)s {} %(levelname)-8s %(message)s".format(proc().name))
        print proc().name, ' [*] Fetching job...'
        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(self.callback, queue=self.queue)
        channel.start_consuming()

    def callback(self, ch, method, properties, body):
        """Handle a single job message from the queue."""
        print " [*] %r:%r" % (method.routing_key, body)
        params = json.loads(body)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])
        uid = job_doc['_id']
        ## Check if job was not killed
        if job_doc['status'] == 'Terminated':
            print 'Job {} was killed, skipping'.format(params['job_id'])
        else:
            try:
                self.compute(body)
            except:
                print sys.exc_info()
                status = "[FAIL] {}".format(format_tb(sys.exc_info()[2]))
                logging.error(status)
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def start(self):
        self.fetch_job()
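

# Usage sketch: one way a worker process might construct and start a consumer.
# The hostnames, config filename, queue objects, and ctrl_conf values below are
# illustrative assumptions; only the constructor signature and the config and
# ctrl_conf keys ('compute' datapath/min_free_space, 'rabbitmq'
# default_routing_key, and the 'meta'/'assembly' entries) come from the class
# above.
if __name__ == '__main__':
    ctrl_conf = {
        'meta': {'mongo.db': 'arast',              # assumed database name
                 'mongo.collection': 'jobs',       # assumed collection names
                 'mongo.collection.auth': 'auth'},
        'assembly': {'mongo_port': 27017},
    }
    consumer = ArastConsumer(
        shockurl='shock.example.org',              # assumed Shock host
        arasturl='localhost',                      # MongoDB / RabbitMQ host
        config='arast.compute.conf',               # INI file with [compute] and [rabbitmq]
        threads=4,
        queue=None,                                # fall back to rabbitmq default_routing_key
        kill_queue=multiprocessing.Queue(),        # queue type is an assumption
        job_list=multiprocessing.Manager().list(), # shared job list; type is an assumption
        ctrl_conf=ctrl_conf,
    )
    consumer.start()                               # blocks and consumes jobs from the queue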