def getQuantCombinations(job_requests, fastq_ref, input_parameters, debug):
    # initialize debug
    debug_list = []
    format = '{:<15}'
    if debug: debug_list.append(['def: getQuantCombinations():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['combo', 'list'])
    if debug: debug_list.append(['-'])

    # initialize parameters
    fastq_combos = {}
    align_fq_comb = {}
    align_jobs = []
    quant_jobs = []

    # get possible alignment and quantification jobs
    for job in job_requests.keys():
        if re.search(r'_align$', job):
            align_jobs.append(re.sub(r'_align$', '', job))
        if re.search(r'_quant$', job):
            quant_jobs.append(job)

    # get possible fastq combinations
    # transforms the comb job list into ordered fastq name lists
    print("%s Status: Make fastq combinations ..." % (getTime()))
    for combi_str in filter(None, input_parameters['quant_comb_list'].split(';')):
        fq_combi = ''
        combi_list = []
        for combi in sorted(filter(None, combi_str.split(','))):
            # make fq combi key
            fq_combi = '%s_%s' % (fq_combi, combi)
            # save fastq names into ordered list (according to fq_combi)
            # from -> fastq_ref[j] = [ref_name, num_reads]
            combi_list.append(fastq_ref[combi][0])
        # get final fq_combi entry into fastq_combos
        # -> fastq_combos['_fq1_fq2_fq3'] = ['fq_name_1', 'fq_name_2', 'fq_name_3']
        if debug: debug_list.append([fq_combi, combi_list])
        fastq_combos[fq_combi] = combi_list
    print("%s Combinations: %s" % (getTime(), fastq_combos.keys()))

    # get possible aligner + fastq_combo combinations
    if debug: debug_list.append(['-'])
    print("%s Status: Make aligner-fastq combinations ..." % (getTime()))
    for aligner in align_jobs:
        for fq_combi in fastq_combos.keys():
            aln_fq = '%s%s' % (aligner, fq_combi)
            # get fastq - aligner combination
            # -> align_fq_comb['ALIGNTOOL1_fq1_fq2_fq3'] = ['fq_name_1', 'fq_name_2', 'fq_name_3']
            if debug: debug_list.append([aln_fq, fastq_combos[fq_combi]])
            align_fq_comb[aln_fq] = fastq_combos[fq_combi]
    print("%s Combinations: %s" % (getTime(), align_fq_comb.keys()))

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return
    return align_fq_comb

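# Usage sketch (hypothetical values; actual tool and fastq names depend on the run):
# with input_parameters['quant_comb_list'] = 'fq1,fq2;fq1,fq2,fq3' and a requested
# 'ALIGNTOOL1_align' job, the returned dict would look like
#   {'ALIGNTOOL1_fq1_fq2': ['fq_name_1', 'fq_name_2'],
#    'ALIGNTOOL1_fq1_fq2_fq3': ['fq_name_1', 'fq_name_2', 'fq_name_3']}
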
def makeBashConfig(job_list, input_parameters, job_config_data, debug):
    # initialize debug
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: makeBashConfig():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['job', 'script'])
    if debug: debug_list.append(['-'])

    # initialize bash config
    bash_config = {}

    # save bash config
    for job in job_list.keys():
        if job_list[job] in ['request', 'ask', 'new']:
            # get job config for requested job
            job_config = job_config_data[job]

            # save bash pre and seq config
            config = {}
            print("%s Status: Create \'bash_pre_script\' for \'%s\' ..." % (getTime(), job))
            config['bash_pre_script'] = job_config['bash_pre_script']
            print("%s Status: Create \'bash_seq_script\' for \'%s\' ..." % (getTime(), job))
            config['bash_seq_script'] = job_config['bash_seq_script']

            # go through 'script_order' list and save script command
            print("%s Status: Create \'bash_main_script\' for \'%s\' ..." % (getTime(), job))
            script = ''
            for order_item in job_config['script_order']:
                if order_item != 'options':
                    script += job_config[order_item] + ' '
                else:
                    for option in job_config['script_options']:
                        if input_parameters[option]:
                            script += job_config[option] + ' '
            print('%s Script: %s' % (getTime(), script))
            if debug: debug_list.append([job, script])

            # save bash main script
            config['bash_main_script'] = script

            # save config for job into bash config
            bash_config[job] = config

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return bash config
    return bash_config

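# Illustration (shape only; the actual script strings come from job_config_data):
# each requested job ends up with exactly the three keys assembled above, e.g.
#   bash_config['TOOL1_index'] = {'bash_pre_script': '...',
#                                 'bash_seq_script': '...',
#                                 'bash_main_script': '<command> <options> ...'}
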
def checkArgsProject(project, debug):
    # initialize parameters
    proj_dir_prefix = ''
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: checkArgsProject():'])
    if debug: debug_list.append(['-'])

    # check if project has a file path
    proj_path = os.path.dirname(project)
    proj_base = os.path.basename(project)
    if debug: debug_list.append(['proj_path', proj_path])
    if debug: debug_list.append(['proj_base', proj_base])
    if not proj_base:
        print("Error: Project name not defined properly!")
        sys.exit()
    elif proj_path and not os.path.isdir(proj_path):
        print("Error: \'%s\' has a path, but \'%s\' is not a directory!" % (proj_base, proj_path))
        sys.exit()
    # ask user if this file path should be used as prefix
    elif proj_path and os.path.isdir(proj_path):
        print("%s Warning: \'%s\' has the path \'%s\'!" % (getTime(), proj_base, proj_path))
        print("%s Warning: This will overwrite all other \'PROJ_DIR_PREFIX\' variables." % (getTime()))
        user = userCall("Continue? y[es] / n[o]", True)
        proj_dir_prefix = proj_path

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return arguments
    return proj_dir_prefix, proj_base

def getQuantDict(possible_jobs, input_parameters):
    # new dict for quantification names
    quant_names_dict = {}

    # assign quantification names
    for name_mod in filter(None, input_parameters['quant_name_list'].split(';')):
        job, name = filter(None, name_mod.split(':'))
        if job in possible_jobs.keys():
            quant_names_dict[job] = name
        else:
            print("%s Warning: \'%s\' is not a valid job, it will be ignored!" % (getTime(), job))

    # return dict
    return quant_names_dict

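# Example (hypothetical names; assumes both jobs are keys of possible_jobs):
# 'quant_name_list' is parsed as ';'-separated 'job:name' pairs, e.g.
#   'TOOL1_quant:run10M;TOOL2_quant:set2'
#   -> {'TOOL1_quant': 'run10M', 'TOOL2_quant': 'set2'}
# jobs that are not in possible_jobs are reported and dropped.
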
def main(project, config, prefix, dataset, fasta, gtf, fastq1, fastq2, debug, test, arq_path):

    ###############################################################################
    ## initialize data structures

    # parse option arguments to dictionary for improved error handling
    input_options = {'project': project, 'prefix': prefix}

    # parse file arguments to dictionary for improved error handling
    input_files = {
        'config': config,
        'fasta_gen_file': fasta,
        'gtf_index_file': gtf,
        'fastq1_links': fastq1,
        'fastq2_links': fastq2
    }

    # initialize dictionary for parameter handling
    ''' TODO: save ALL possible variables and update empty checks in code '''
    print("%s Status: Get input dictionary ..." % (getTime()))
    input_parameters = getMainDictionaries('input_parameters', debug)
    input_parameters['test_call'] = test
    input_parameters['arq_path'] = arq_path

    # tool requests with tool links
    ''' TODO: make tool requests an input parameter and save them automatically '''
    print("%s Status: Get dependency dictionary ..." % (getTime()))
    possible_jobs = getMainDictionaries('possible_jobs', debug)

    # possible alignment - quantification combinations
    input_parameters['quant_align_combi_list'] = getMainDictionaries('quant_align', debug)

    # job requests and status
    ''' TODO: options: 'request', 'enumerate', 'overwrite', 'skip', 'skipped', 'finished' '''

    # job config data
    ''' TODO: add special config data dynamically '''
    job_config_data = {}

    # set timezone
    os.environ['TZ'] = 'EST'

    ''' TODO: make sam merge folder, same as fq temp folder '''
    ''' TODO: make remove option '''
    ''' TODO: remove fastq file, etc. necessity -> so it can be used only for quantification of finished alignments '''

    ###############################################################################
    ## save input data and options to data structures

    # get and check all provided arguments
    # TODO: get and check fastq files
    print("%s Status: Get settings ..." % (getTime()))
    input_parameters, fastq1_files, fastq2_files, fastq_ref = getArguments(
        input_files, input_options, input_parameters, debug)

    # generate project directories
    print("%s Status: Make main directories ..." % (getTime()))
    input_parameters, progress_data = makeMainDirectory(input_parameters, debug)

    # add alignment free placeholder
    progress_data.append('AFREE_align')

    # save requested jobs
    print("%s Status: Get job requests ..." % (getTime()))
    job_requests = getJobRequests(possible_jobs, input_parameters, debug)

    # check for finished jobs
    ''' check and mark dependencies '''
    ''' TODO: distinguish better between fastq and quant jobs '''
    print("%s Status: Check job progress ..." % (getTime()))
    job_requests = checkJobRequests(progress_data, job_requests, input_parameters, debug)

    # get tool parameters
    ''' TODO: make it user dependent '''
    ''' TODO: check tool paths '''
    ''' TODO: optional global command '''

    # get job config data
    print("%s Status: Get index config ..." % (getTime()))
    job_config_data = getIndexConfig(job_config_data, debug)
    print("%s Status: Get alignment config ..." % (getTime()))
    job_config_data = getAlignConfig(job_config_data, debug)
    print("%s Status: Get quantification config ..." % (getTime()))
    job_config_data = getQuantConfig(job_config_data, debug)
    ''' TODO: remove individual alignment file folder '''

    # check all parameters in debug mode
    checkParametersDebug(input_parameters, job_requests, fastq1_files, fastq2_files, debug)

    ###############################################################################
    ## start tools - index
    ''' TODO: track time and memory consumption '''

    # set qsub parameters
    ''' TODO: make as input parameter -> dependent on number of threads used '''
    #input_parameters['qs_mem_free'] = '40'
    #input_parameters['qs_run_time'] = '28800'

    # make index tool directories
    print("%s Status: Make index directories ..." % (getTime()))
    index_jobs, input_parameters = makeToolDirectories(job_requests, input_parameters, 'index', debug)

    # make index bash config
    print("%s Status: Make index bash config ..." % (getTime()))
    bash_index_config = makeBashConfig(index_jobs, input_parameters, job_config_data, debug)

    # run index jobs
    print("%s Status: Run index jobs ..." % (getTime()))
    index_jobs = runQsubJobs(index_jobs, input_parameters, bash_index_config, {}, {}, {}, 'index', debug)

    ###############################################################################
    ## start tools - alignments
    ''' TODO: track time and memory consumption -> qacct -j <number> -> get job number and save it '''
    ''' TODO: get statistics out of qacct '''

    # set qsub parameters
    ''' TODO: make as input parameter -> dependent on number of threads used '''
    #input_parameters['qs_mem_free'] = '40'
    #input_parameters['qs_run_time'] = '28800'

    # make alignment tool directories
    print("%s Status: Make alignment directories ..." % (getTime()))
    align_jobs, input_parameters = makeToolDirectories(job_requests, input_parameters, 'align', debug)

    # make alignment bash config
    print("%s Status: Make alignment bash config ..." % (getTime()))
    bash_align_config = makeBashConfig(align_jobs, input_parameters, job_config_data, debug)

    # unzip fastq files
    ''' TODO: make it optional to remove fastq files at the end '''
    print("%s Status: Unzip fastq1 files ..." % (getTime()))
    fastq1_files_uz = unzipFastq(fastq1_files, input_parameters)
    print("%s Status: Unzip fastq2 files ..." % (getTime()))
    fastq2_files_uz = unzipFastq(fastq2_files, input_parameters)

    # check alignment progress
    # INPUT  -> align_jobs = {'TOOL1_align':'overwrite', 'TOOL2_align':'ask', ...}
    #           fastq1_files_uz = {'fq1':'path', 'fq2':'path', ...}
    # OUTPUT -> fastq_jobs = {'TOOL1_align':{'fq1':'overwrite', 'fq2':'request', ...},
    #                         'TOOL2_align':{'fq1':'overwrite', 'fq2':'request', ...}, ...}
    print("%s Status: Check alignment progress ..." % (getTime()))
    fastq_jobs = checkProgress(align_jobs, fastq1_files_uz, input_parameters, 'align', debug)

    # get tophat merge options
    input_parameters['tophat_merge_opt'] = tophatMergeOpt(input_parameters['tool_dir_toprec'])

    # run alignment jobs
    ''' TODO: check fastq progress when a job breaks '''
    ''' TODO: gzip sam / bam ? '''
    print("%s Status: Run alignment jobs ..." % (getTime()))
    align_jobs = runQsubJobs(fastq_jobs, input_parameters, bash_align_config, {},
                             fastq1_files_uz, fastq2_files_uz, 'align', debug)

    # remove unzipped fastq files
    ''' TODO: make definition for removeFastq and test '''
    ''' TODO: don't remove, needed for kallisto '''
    #if input_parameters['remove_fastq'].lower() in ['remove', 'true']:
    #    print("%s Status: Remove unzipped fastq1 files ..." % (getTime()))
    #    removeFastq(fastq1_files_uz)
    #if input_parameters['remove_fastq'].lower() in ['remove', 'true']:
    #    print("%s Status: Remove unzipped fastq2 files ..." % (getTime()))
    #    removeFastq(fastq2_files_uz)

    ###############################################################################
    ## start tools - quantification
    ''' TODO: quant<->align tool combination '''
    ''' TODO: get specific number of fastq files (10M, 20M, 50M reads) -> modify fastq1_files_uz and fastq2_files_uz '''
    ''' TODO: track used fastq file names? '''

    # set qsub parameters
    ''' TODO: make as input parameter -> dependent on number of threads used '''
    #input_parameters['num_processors'] = '4'
    #input_parameters['qs_mem_free'] = '10'
    #input_parameters['qs_run_time'] = '28800'

    # make quantification tool directories
    ''' TODO: make better options -> new option to only make new jobs ('skip' will always skip completely atm '''
    ''' TODO: and 'overwrite' will always overwrite all, or you have to decide for each) '''
    print("%s Status: Make quantification directories ..." % (getTime()))
    quant_jobs, input_parameters = makeToolDirectories(job_requests, input_parameters, 'quant', debug)

    # make quantification bash config
    print("%s Status: Make quantification bash config ..." % (getTime()))
    bash_quant_config = makeBashConfig(quant_jobs, input_parameters, job_config_data, debug)

    # make quantification job combinations
    ''' TODO: get specific number of fastq files (10M, 20M, 50M reads) -> modify fastq1_files_uz and fastq2_files_uz '''
    print("%s Status: Count fastq reads ..." % (getTime()))
    fastq_ref = countFastqReads(fastq_ref, fastq1_files_uz, input_parameters['seq_style_fq'])

    # INPUT -> fastq_ref = {'fq1':['fq_name_1','num_reads'], 'fq2':['fq_name_2','num_reads'], ...}
    #          job_requests = {'TOOL1_quant':'skip', 'TOOL2_quant':'request', 'TOOL3_align':'skip', ...}
    print("%s Status: Make alignment combinations ..." % (getTime()))
    quant_files = getQuantCombinations(job_requests, fastq_ref, input_parameters, debug)

    # get user defined quantification names
    print("%s Status: Get quantification names ..." % (getTime()))
    input_parameters['quant_name_list'] = getQuantDict(possible_jobs, input_parameters)

    # check quantification progress
    # INPUT  -> quant_jobs = {'TOOL1_quant':'overwrite', 'TOOL2_quant':'ask', ...}
    #           quant_files = {'ALIGNTOOL1_fq1_fq2_fq3':['fq_name_1','fq_name_2','fq_name_3'],
    #                          'ALIGNTOOL2_fq1_fq2_fq3':['fq_name_1','fq_name_2','fq_name_3'],
    #                          'ALIGNTOOL1_fq4_fq5':['fq_name_4','fq_name_5'], ...}
    # OUTPUT -> combi_jobs = {'tool1_quant':{'ALIGNTOOL1_fq1_fq2_fq3':'overwrite', 'ALIGNTOOL2_fq1_fq2_fq3':'request', ...},
    #                         'tool2_quant':{'ALIGNTOOL1_fq1_fq2_fq3':'overwrite', 'ALIGNTOOL2_fq1_fq2_fq3':'request', ...}, ...}
    print("%s Status: Check quantification progress ..." % (getTime()))
    combi_jobs = checkProgress(quant_jobs, quant_files, input_parameters, 'quant', debug)

    # write fastq reference file for alignment compilations
    print("%s Status: Write fastq reference file ..." % (getTime()))
    input_parameters['fastq_ref_file'] = '%s/%s_fastq.ref' % (
        input_parameters['quant_dir_path'], input_parameters['data_set_name'])
    writeFastqRef(fastq_ref, input_parameters)

    # run quantification jobs
    print("%s Status: Run quantification jobs ..." % (getTime()))
    quant_jobs = runQsubJobs(combi_jobs, input_parameters, bash_quant_config, quant_files,
                             fastq1_files_uz, fastq2_files_uz, 'quant', debug)

    # remove combined alignment files
    ''' TODO: manage path with input parameters '''
    if input_parameters['rem_comb_align'].lower() in ['remove', 'true']:
        print("%s Status: Remove alignment combinations ..." % (getTime()))
        removeCombAlign(quant_files, input_parameters)

    ###############################################################################
    ## clean up

    # remove alignment tmp folder
    '''
    print("%s Status: Removing temp alignment folders ..." % (getTime()))
    for job in fastq_jobs.keys():
        search_dir = '%s/%s' % (input_parameters['align_dir_path'], job)
        for file in os.listdir(search_dir):
            is_dir = '%s/%s' % (search_dir, file)
            if os.path.isdir(is_dir):
                removeJob(is_dir, [''], debug)

    # remove quantification tmp folder
    print("%s Status: Removing temp quantification folders ..." % (getTime()))
    for job in combi_jobs.keys():
        search_dir = '%s/%s' % (input_parameters['quant_dir_path'], job)
        for file in os.listdir(search_dir):
            is_dir = '%s/%s' % (search_dir, file)
            if os.path.isdir(is_dir):
                removeJob(is_dir, [''], debug)
    '''

    # done
    print("%s Status: ARQ finished ..." % (getTime()))

                    dest='fastq2',
                    help="input file with fastq file paths (second end; paired end reads)")
parser.add_argument('-t', '--test',
                    dest='test',
                    action='store_true',
                    help="use predefined test data")
''' TODO: make full parameter list '''
parser._optionals.title = "arguments"
options = parser.parse_args()

# handle test
arq_path = ''
if options.test:
    arq_path = os.path.dirname(os.path.realpath(__file__))
    print("%s Warning: Test requested ... " % getTime())
    print("%s Warning: Setting parameter -p test_temp ... " % getTime())
    options.project = 'test_temp'
    print("%s Warning: Setting parameter -c %s/test_files/arq_proj_test.conf ... " % (getTime(), arq_path))
    options.config = '%s/test_files/arq_proj_test.conf' % arq_path

# check if either a config file or all required parameters are given
config_file_given = None not in [options.project, options.config]
required_arguments_given = None not in [
    options.project, options.prefix, options.dataset, options.fasta,
    options.gtf, options.fastq1
]
if not (config_file_given or required_arguments_given):
    print(
def makeMainDirectory(input_parameters, debug):
    # initialize parameters
    project_directory = input_parameters['proj_dir_path']
    progress_data = []
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: makeMainDirectory():'])
    if debug: debug_list.append(['-'])

    # directory list
    dir_list = ['index', 'align', 'quant', 'qsub', 'fastq', 'comb']

    # save project directories
    input_parameters['index_dir_path'] = '%s/indexes' % (project_directory)
    input_parameters['align_dir_path'] = '%s/alignments/%s' % (
        project_directory, input_parameters['data_set_name'])
    input_parameters['quant_dir_path'] = '%s/quantifications/%s' % (
        project_directory, input_parameters['data_set_name'])
    input_parameters['qsub_dir_path'] = '%s/qsub_data' % (project_directory)
    input_parameters['fastq_dir_path'] = '%s/fastq_data' % (project_directory)
    input_parameters['comb_dir_path'] = '%s/comb_data/%s' % (
        project_directory, input_parameters['data_set_name'])

    # make new project folders
    new_project = False
    if not os.path.isdir(project_directory):
        # main project folder
        print("%s Status: Create \'%s\' main folder ..." % (getTime(), input_parameters['p_name_suffix']))
        os.mkdir(project_directory)
        new_project = True

    # make arq data directories
    if not new_project:
        print("%s Warning: \'%s\' already exists! Checking progress ..." % (getTime(), project_directory))

    # make data folders
    if debug: debug_list.append(['dir key', 'dir path'])
    if debug: debug_list.append(['-'])
    for data_type in dir_list:
        if not os.path.isdir(input_parameters['%s_dir_path' % data_type]):
            print("%s Status: Create \'%s\' folder ..." % (getTime(), data_type))
            if debug:
                debug_list.append(['%s_dir_path' % data_type,
                                   input_parameters['%s_dir_path' % data_type]])
            os.makedirs('%s' % (input_parameters['%s_dir_path' % data_type]))
        else:
            print("%s Warning: \'%s\' folder already exists ..." % (getTime(), data_type))

    # remove qsub error file if it exists
    if os.path.isfile('%s/qsub.error' % (input_parameters['qsub_dir_path'])):
        print("%s Status: Remove \'qsub.error\' file ..." % (getTime()))
        os.remove('%s/qsub.error' % (input_parameters['qsub_dir_path']))

    # check tool progress
    if not new_project:
        print("%s Status: Check tool progress ..." % (getTime()))
        for mainDirs in [
                input_parameters['index_dir_path'],
                input_parameters['align_dir_path'],
                input_parameters['quant_dir_path']
        ]:
            if debug: debug_list.append(['mainDirs', mainDirs])
            # list all contents and check if content is a directory
            for jobName in os.listdir(mainDirs):
                subDirs = '%s/%s' % (mainDirs, jobName)
                if debug: debug_list.append(['subDirs', subDirs])
                if os.path.isdir(subDirs):
                    # save progress into progress_data
                    print("%s Warning: Job " % getTime() +
                          '{:<15}'.format("\'%s\'" % (jobName)) +
                          " ... is already done!")
                    progress_data.append(jobName)
                if debug:
                    debug_list.append([jobName, True if jobName in progress_data else False])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return parameters
    return input_parameters, progress_data

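# Resulting project layout (sketch; '<project>' and '<data_set>' stand in for
# proj_dir_path and data_set_name as assembled above):
#   <project>/indexes
#   <project>/alignments/<data_set>
#   <project>/quantifications/<data_set>
#   <project>/qsub_data
#   <project>/fastq_data
#   <project>/comb_data/<data_set>
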
def makeToolDirectories(job_requests, input_parameters, job_type, debug):
    # initialize debug
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: makeToolDirectories():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['job', 'setting'])
    if debug: debug_list.append(['-'])

    # initiate tool job requests
    job_list = {}
    for job in job_requests.keys():
        # save all tool jobs to job_list
        if re.search(r'_%s$' % job_type, job):
            job_list[job] = job_requests[job]
            if debug: debug_list.append([job, job_list[job]])

    # declare tool job folders
    # job options: 'request', 'overwrite', 'skip', 'skipped', 'finished', 'required', 'ask', 'new'
    if debug: debug_list.append(['-'])
    for job in job_list.keys():
        job_dir_name = '%s_dir' % job
        job_folder = '%s/%s' % (input_parameters['%s_dir_path' % job_type], job)
        input_parameters[job_dir_name] = job_folder
        if job_list[job] in ['request', 'overwrite', 'required', 'new']:
            # check if it is an alignment job and ask if all alignments should be redone
            if job_list[job] == 'overwrite' and job_type in ['align', 'quant']:
                print("%s Warning: You requested \'overwrite\' for job \'%s\'!" % (getTime(), job))
                overwrite_all = userCall(
                    "Type: y[es] to remove all / n[o] to decide for each job part", False)
                job_list[job] = 'overwrite' if overwrite_all else 'ask'
            # check for overwrite option and remove directory
            if job_list[job] == 'overwrite':
                removeJob(job_folder, [''], debug)
            # make new folder
            if not os.path.isdir(job_folder) and job_list[job] in ['request', 'overwrite', 'new']:
                print("%s Status: Create \'%s\' folder ..." % (getTime(), job))
                os.mkdir(job_folder)
                if job_list[job] == 'overwrite':
                    job_list[job] = 'request'
                if debug: debug_list.append([job, job_list[job]])
            elif not os.path.isdir(job_folder) and job_list[job] == 'required':
                if debug: printDebug(debug_list, format)
                print("Error: \'%s\' is required but doesn't exist!" % (job_folder))
                sys.exit()
            elif os.path.isdir(job_folder) and job_list[job] in ['required', 'ask', 'new']:
                pass
            else:
                if debug: printDebug(debug_list, format)
                print("Error: \'%s\' already exists!" % (job_folder))
                sys.exit()
        elif job_list[job] == 'skip':
            job_list[job] = 'skipped'
            if debug: debug_list.append([job, job_list[job]])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return tool jobs
    return job_list, input_parameters

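# Status handling sketch (values taken from the checks above; job names are
# placeholders): an 'overwrite' on an align/quant job either stays 'overwrite'
# (remove all, then becomes 'request' once the folder is recreated) or becomes
# 'ask' (decide per job part); 'skip' becomes 'skipped'; 'required' aborts if
# the folder is missing. A possible outcome:
#   {'TOOL1_align': 'request', 'TOOL2_align': 'ask', 'TOOL3_align': 'skipped'}
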
def runQsubJobs(job_list, input_parameters, bash_config, align_files, fq1_files, fq2_files, job_type, debug):
    # initialize debug
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: runQsubJobs():'])
    if debug: debug_list.append(['-'])
    if debug and job_type == 'align': debug_list.append(['job', 'fastq', 'mode'])
    if debug and job_type == 'index': debug_list.append(['job', 'mode'])
    if debug and job_type == 'quant': debug_list.append(['job', 'combination', 'mode'])
    if debug: debug_list.append(['-'])

    # initialize parameters
    file_status = {}
    job_queue = []
    queues_empty = False
    q_running = False
    bowtie_running = True if 'BOWTIE_index' in job_list.keys() else False
    TOOLS_running = []
    error_file = False

    # make job queues
    print("%s Status: Create queue ..." % (getTime()))
    for job in job_list.keys():
        queue = []
        # save jobs directly when index is requested
        if job_type == 'index':
            if job_list[job] == 'request':
                job_queue.append(job)
        # save fastq jobs when align is requested or combi align jobs when quant is requested
        elif job_type in ['align', 'quant']:
            for file_name in job_list[job].keys():
                if job_list[job][file_name] in ['overwrite', 'request', 'new']:
                    queue.append(file_name)
            file_status[job] = queue
            job_queue.append(job)

    # run jobs
    print("%s Status: Run jobs ..." % (getTime()))
    while job_queue or q_running:
        # set user naming
        user_naming = ''
        # set q_running to True, because job_queue isn't empty
        q_running = True
        # print tool queue
        print('%s Queue: %s' % (getTime(), job_queue))
        # take first job from queue
        job = job_queue.pop(0) if job_queue else ''
        # start qsub for index jobs
        if job_type == 'index' and job:
            # append TOPHAT_index to queue if bowtie isn't done yet
            if job == 'TOPHAT_index' and bowtie_running:
                job_queue.append(job)
            # start index job
            else:
                if debug: debug_list.append([job, job_list[job]])
                runQsubCommand(job, input_parameters, bash_config[job], debug)
                job_list[job] = 'running'
                if debug: debug_list.append([job, job_list[job]])
                if (input_parameters['single_tool_job'].lower() in ['single', 'true']):
                    TOOLS_running.append(job)
        elif job_type in ['align', 'quant'] and job:
            queue = file_status.pop(job) if file_status.keys() else []
            # print queue
            print('%s %s: %s' % (getTime(), re.match(r'[A-Z]+', job).group(0), queue))
            # submit quantification or alignment jobs
            while queue and not (job in TOOLS_running):
                file_name = queue.pop(0)
                if debug: debug_list.append([job, file_name, job_list[job][file_name]])
                if job_type == 'align':
                    input_parameters['fastq1_file'] = fq1_files[file_name]
                    input_parameters['fastq2_file'] = fq2_files[file_name]
                if job_type == 'quant':
                    """ TODO: get to work with combi alignments (kallisto) """
                    input_parameters['fastq1_file'] = fq1_files[align_files[file_name][0]]
                    input_parameters['fastq2_file'] = fq2_files[align_files[file_name][0]]
                    input_parameters['ALIGNER_align_dir'] = '%s/%s_align' % (
                        input_parameters['align_dir_path'], file_name.partition('_')[0])
                    input_parameters['alignment_prefix'] = align_files[file_name][0]
                    combi_string = ''
                    for align_name in align_files[file_name]:
                        combi_string = '%s%s/%s.bam ' % (
                            combi_string, input_parameters['ALIGNER_align_dir'], align_name)
                    combi_string = combi_string.strip()
                    input_parameters['combi_align_list'] = combi_string
                    input_parameters['comb_out_prefix'] = file_name
                    if job in input_parameters['quant_name_list'].keys():
                        user_naming = '_%s' % input_parameters['quant_name_list'][job]
                    else:
                        user_naming = '_default'
                    input_parameters['quant_ref_file'] = '%s/%s/%s%s.ref' % (
                        input_parameters['quant_dir_path'], job, file_name, user_naming)
                input_parameters['%s_out_prefix' % job_type] = '%s%s' % (file_name, user_naming)
                input_parameters['%s_out_dir' % job_type] = '%s/%s/%s%s' % (
                    input_parameters['%s_dir_path' % job_type], job, file_name, user_naming)
                # start qsub for alignment and quantification jobs
                if job_list[job][file_name] in ['overwrite', 'request', 'new']:
                    if job_type == 'quant':
                        if len(align_files[file_name]) > 1:
                            makeCombAlign(input_parameters)
                            input_parameters['comb_in_file'] = '%s/%s.bam' % (
                                input_parameters['comb_dir_path'],
                                input_parameters['comb_out_prefix'])
                        else:
                            input_parameters['comb_in_file'] = '%s/%s.bam' % (
                                input_parameters['ALIGNER_align_dir'],
                                input_parameters['alignment_prefix'])
                        # create quantification parameter ref file
                        writeQuantRef(input_parameters, bash_config[job])
                    qsub_name = '%s-%s' % (job, input_parameters['%s_out_prefix' % job_type])
                    runQsubCommand(qsub_name, input_parameters, bash_config[job], debug)
                    job_list[job][file_name] = 'running'
                    if debug: debug_list.append([job, file_name, job_list[job][file_name]])
                    # force only one job per tool
                    if (input_parameters['single_tool_job'].lower() in ['single', 'true']):
                        TOOLS_running.append(job)
            # save queue if it isn't empty (for when single_tool_job option is active)
            if queue:
                file_status[job] = queue
                job_queue.append(job)
        # print tool status
        if (input_parameters['single_tool_job'].lower() in ['single', 'true']):
            print('%s Running: %s' % (getTime(), TOOLS_running))
        print('%s Qstat: ... working ...' % getTime())
        # sleep this long when jobs are in the queue and bowtie is running
        if job_queue and bowtie_running:
            time.sleep(int(input_parameters['sleep_time_qs']))
        # sleep this long when jobs are in the queue and tools are running
        elif job_queue and TOOLS_running:
            time.sleep(int(input_parameters['sleep_time_qs']))
        # sleep this long when the queue is empty
        elif not job_queue:
            time.sleep(int(input_parameters['sleep_time_qs']))
        # get qstat job info
        (out, err) = subprocess.Popen('qstat', stdout=subprocess.PIPE).communicate()
        # set bowtie running to false if BOWTIE job is not in the queue and not running anymore
        if ('BOWTIE_index' not in job_queue) and not re.search('BOWTIE', out):
            bowtie_running = False
        # check if tools in TOOLS_running are still running
        # (iterate over a copy because finished tools are removed from the list)
        for active in TOOLS_running[:]:
            if not re.search(active[:8], out):
                TOOLS_running.remove(active)
        # check if any job is running
        if not out:
            q_running = False
        # there was an error with qsub ... exiting
        ''' TODO: make specific error detection (make new file inside bash script) and only dismiss dependent jobs '''
        if os.path.isfile('%s/qsub.error' % (input_parameters['qsub_dir_path'])):
            print("%s Warning: There was an error with qsub! Check qsub_data/qsub.error for more information." % getTime())
            if input_parameters['ignore_qs_error'].lower() not in ['ignore', 'true']:
                user = userCall("Continue? y[es] / n[o]", True)

    # update job info
    if debug: debug_list.append(['-'])
    for job in job_list.keys():
        if job_type == 'index':
            if job_list[job] == 'running':
                job_list[job] = 'finished'
                if debug: debug_list.append([job, job_list[job]])
        elif job_type == 'align':
            for fq_name in job_list[job].keys():
                if job_list[job][fq_name] == 'running':
                    job_list[job][fq_name] = 'finished'
                if debug: debug_list.append([job, job_list[job]])
                if debug: debug_list.append([job, fq_name, job_list[job][fq_name]])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    return job_list

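# Queue behaviour sketch (based on the loop above; job names are placeholders):
# index jobs are submitted under their own name, align/quant jobs as
# '<JOB>-<out_prefix>'; TOPHAT_index is re-queued until the BOWTIE index job has
# left qstat. With the 'single_tool_job' option set, a tool is re-queued and its
# remaining parts wait until qstat no longer lists the first 8 characters of the
# tool name, so only one job per tool runs at a time.
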
def checkProgress(job_list, job_parts, input_parameters, job_type, debug):
    # initialize debug
    debug_list = []
    format = '{:<15}'
    if debug: debug_list.append(['def: checkProgress():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['job', 'fastq', 'status', 'loop'])
    if debug: debug_list.append(['-'])

    # initialize parameters
    todo_jobs = {}
    job_parts_keys = []
    quant_align_combi_list = input_parameters['quant_align_combi_list']

    # make job list for fastq jobs
    for job in job_list.keys():
        # get job part keys
        job_parts_keys = job_parts.keys()
        # make modified job_list for quantification jobs
        user_naming = ''
        if job_type == 'quant':
            job_parts_keys = []
            for job_name in job_parts.keys():
                if job in input_parameters['quant_name_list'].keys():
                    # add user defined name tag
                    user_naming = '_%s' % input_parameters['quant_name_list'][job]
                    job_name = '%s%s' % (job_name, user_naming)
                else:
                    user_naming = '_default'
                    # add 'default' name tag if no user defined name tag was given for this job
                    job_name = '%s%s' % (job_name, user_naming)
                if ('%s_%s' % (job.partition('_')[0], job_name.partition('_')[0])) in quant_align_combi_list:
                    job_parts_keys.append(job_name)

        # make dict for each job type
        job_requests = {}
        # check each job when 'ask' was requested
        if job_list[job] in ['ask', 'new']:
            # check existing tool and job part data and save it to todo_jobs dict
            job_tool_dir = '%s/%s' % (input_parameters['%s_dir_path' % job_type], job)
            for jobName in os.listdir(job_tool_dir):
                jobBaseName = os.path.basename(os.path.splitext(jobName)[0])
                if debug: debug_list.append([job, jobName, jobBaseName, 'basename'])
                # only save existing jobs which are in the job part file list
                if jobBaseName in job_parts_keys:
                    if job_type == 'quant':
                        jobBaseName = re.sub(r'\_[a-zA-Z0-9]+$', '', jobBaseName)
                    job_requests['%s' % jobBaseName] = ''
                    if debug: debug_list.append([job, jobName, jobBaseName, 'bname in keys'])
            if debug: debug_list.append(['-'])

            # iterate through requested job part names
            for job_name in job_parts.keys():
                if (job_type in ['index', 'align']) or (
                        job_type == 'quant' and
                        (('%s_%s' % (job.partition('_')[0], job_name.partition('_')[0])) in quant_align_combi_list)):
                    # job string
                    job_string = '%s/%s/%s%s' % (
                        input_parameters['%s_dir_path' % job_type], job, job_name, user_naming)
                    # check if job was already done
                    if job_name in job_requests.keys() and job_list[job] == 'ask':
                        print("%s Warning: Job part \'%s%s\' for \'%s\' already exists!"
                              % (getTime(), job_name, user_naming, job))
                        overwrite = userCall(
                            "Type: y[es] to overwrite / n[o] to skip this job part", False)
                        if overwrite:
                            removeJob(job_string, ['', '.sam', '.bam', '.ref'], debug)
                            job_requests[job_name] = 'overwrite'
                            print("%s Status: Create \'%s%s\' folder for \'%s\' ..."
                                  % (getTime(), job_name, user_naming, job))
                            os.mkdir(job_string)
                        else:
                            job_requests[job_name] = 'skip'
                        if debug: debug_list.append([job, job_name, job_requests[job_name], 'job exists'])
                    elif job_name not in job_requests.keys():
                        job_requests[job_name] = 'request'
                        print("%s Status: Create \'%s%s\' folder for \'%s\' ..."
                              % (getTime(), job_name, user_naming, job))
                        os.mkdir(job_string)
                        if debug: debug_list.append([job, job_name, job_requests[job_name], 'new job'])
                    else:
                        job_requests[job_name] = 'skipped'
                        if debug: debug_list.append([job, job_name, job_requests[job_name], 'else'])
            if debug: debug_list.append(['-'])

        # save overwrite all and request jobs
        elif job_list[job] in ['overwrite', 'request']:
            for job_name in job_parts.keys():
                if (job_type in ['index', 'align']) or (
                        job_type == 'quant' and
                        (('%s_%s' % (job.partition('_')[0], job_name.partition('_')[0])) in quant_align_combi_list)):
                    # job string
                    job_string = '%s/%s/%s%s' % (
                        input_parameters['%s_dir_path' % job_type], job, job_name, user_naming)
                    print("%s Status: Create \'%s%s\' folder for \'%s\' ..."
                          % (getTime(), job_name, user_naming, job))
                    os.mkdir(job_string)
                    job_requests[job_name] = job_list[job]
                    if debug: debug_list.append([job, job_name, job_requests[job_name], 'overwrite or request'])

        # save to fastq jobs
        if not job_list[job] == 'skipped':
            todo_jobs[job] = job_requests
        if debug: debug_list.append(['-'])

    # add todo_jobs debug
    if debug: debug_list.append(['-'])
    if debug:
        for job in todo_jobs.keys():
            for part in todo_jobs[job]:
                debug_list.append([job, part, todo_jobs[job][part], 'todo_jobs'])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return fastq_jobs
    return todo_jobs

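# Return value sketch (shape matches the INPUT/OUTPUT comments in main(); tool
# and part names are placeholders): for job_type 'align'
#   todo_jobs = {'TOOL1_align': {'fq1': 'request', 'fq2': 'skip', ...}, ...}
# and for job_type 'quant' the inner keys are the aligner-fastq combinations
# (e.g. 'ALIGNTOOL1_fq1_fq2') restricted by quant_align_combi_list.
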
def checkJobRequests(progress_data, job_requests, input_parameters, debug):
    # initialize parameters
    debug_list = []
    format = '{:<15}'
    if debug: debug_list.append(['def: checkJobRequests():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['job', 'requests[job]', 'progress[job]', 'skip all', 'over all'])
    if debug: debug_list.append(['-'])

    # iterate through job requests and ask user to skip job if already done
    do_for_all = {
        'for_all_index': '',
        'for_all_align': '',
        'for_all_quant': '',
        'for_all_new': ''
    }
    # skip      -> skip all <TOOL> jobs
    # overwrite -> overwrite all <TOOL> jobs (in case of alignment/quantification, it will be asked for every alignment/quantification)
    # new       -> make only jobs which aren't done yet
    user_options = {
        's': 'skip', 'skip': 'skip', 's all': 'skip', 'skip all': 'skip',
        'o': 'overwrite', 'overwrite': 'overwrite', 'o all': 'overwrite', 'overwrite all': 'overwrite',
        'n': 'new', 'new': 'new', 'n all': 'new', 'new all': 'new'
    }

    # make config setting list for user_set_over, user_set_skip and user_set_new
    conf_settings = {}
    options = ['over', 'skip', 'new']
    for opt in options:
        for job in filter(None, input_parameters['user_set_%s' % opt].split(',')):
            conf_settings[job] = opt[0]

    # ask user
    for job in job_requests.keys():
        job_done = True if job in progress_data else False
        # call if job was already done
        if job_done and job == 'AFREE_align':
            job_requests[job] = 'skip'
        elif job_done:
            all_job = 'for_all_%s' % job.rpartition('_')[2]
            if job not in conf_settings.keys():
                user_input = do_for_all[all_job] if do_for_all[all_job] else raw_input(
                    "%s Warning: What to do with existing \'%s\' job \'%s\'? a[bort] / s[kip] / o[verwrite] / n[ew] [all]: "
                    % (getTime(), job[-5:], job))
            else:
                user_input = conf_settings[job]
            input_valid = False
            # only accept valid (case insensitive) input
            while not input_valid:
                # handle do for all cases
                if user_input.lower() in ['s all', 'skip all', 'o all', 'overwrite all', 'n all', 'new all']:
                    do_for_all[all_job] = user_options[user_input.lower()]
                # do input command
                if user_input.lower() in user_options.keys():
                    job_requests[job] = user_options[user_input.lower()]
                    input_valid = True
                    print("%s Status: Setting for job " % getTime() +
                          '{:<15}'.format("\'%s\'" % (job)) +
                          " -> \'%s\'!" % (job_requests[job]))
                    if debug:
                        debug_list.append([job, job_requests[job], job_done, user_input, do_for_all[all_job]])
                # abort if requested
                elif user_input.lower() in ['a', 'abort']:
                    if debug: printDebug(debug_list, format)
                    print("%s Warning: Script interrupted by user ... " % getTime())
                    sys.exit()
                # repeat until input is valid
                else:
                    user_input = raw_input(
                        "%s Warning: Wrong input! Use a[bort] / s[kip] / o[verwrite] / n[ew] [all]: " % getTime())
        # job not yet done
        else:
            job_requests[job] = 'request'
            print("%s Status: Setting for job " % getTime() +
                  '{:<15}'.format("\'%s\'" % (job)) +
                  " -> \'%s\'!" % (job_requests[job]))
            if debug: debug_list.append([job, job_requests[job], job_done, '', ''])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return updated job_requests
    return job_requests

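# Input handling sketch (mapping as defined in user_options above): answering
# 'o all' at the prompt sets do_for_all['for_all_align'] (or _index / _quant),
# so every further existing job of that type is marked 'overwrite' without
# asking again; per-job answers can also be preset in the config via
# user_set_over / user_set_skip / user_set_new. Jobs not found in
# progress_data are always set to 'request'.
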
def getArguments(input_files, input_options, input_parameters, debug):
    # initialize parameters
    debug_list = []
    format = '{:<17}'
    if debug: debug_list.append(['def: getArguments():'])
    if debug: debug_list.append(['-'])
    if debug: debug_list.append(['key', 'value'])
    if debug: debug_list.append(['-'])

    # check project argument and save new project name if project had a path
    input_parameters['proj_dir_prefix'], input_parameters['p_name_suffix'] = checkArgsProject(
        input_options['project'], debug)
    if debug: debug_list.append(['p_name_suffix', input_parameters['p_name_suffix']])

    # check for command line input files and save if valid
    ''' TODO: do for input_options '''
    for type, file in input_files.iteritems():
        if file and os.path.isfile("%s" % (file)) and not type == 'config':
            input_parameters[type] = file
            if debug: debug_list.append([type, file])
        elif file and not os.path.isfile("%s" % (file)):
            print("Error: No \'%s\' file named \'%s\'!" % (type, file))
            sys.exit()

    # check command line prefix option and save if not already declared
    prefix = input_options['prefix']
    if prefix and not os.path.isdir(prefix):
        print("Error: \'%s\' is not a directory!" % (prefix))
        sys.exit()
    elif prefix and not input_parameters['proj_dir_prefix']:
        input_parameters['proj_dir_prefix'] = prefix

    # get config arguments
    print("%s Status: Save config parameters ..." % (getTime()))
    input_parameters = getArgsConfig(input_files['config'], input_parameters, debug)
    if debug: debug_list.append(['proj_dir_prefix', input_parameters['proj_dir_prefix']])

    # configure test data if test was requested
    if input_parameters['test_call']:
        input_parameters['proj_dir_prefix'] = '%s/test_files' % input_parameters['arq_path']
        input_parameters['fasta_gen_file'] = '%s/test_files/EF204940.fa' % input_parameters['arq_path']
        input_parameters['gtf_index_file'] = '%s/test_files/ef204940.gtf' % input_parameters['arq_path']
        input_parameters['fastq1_links'] = '%s/test_files/arq_proj_test.fq1.info' % input_parameters['arq_path']
        input_parameters['fastq2_links'] = '%s/test_files/arq_proj_test.fq2.info' % input_parameters['arq_path']

    # define default parameters
    ''' TODO: make definition '''

    # validate arguments
    checkArguments(input_parameters, debug)

    # define index base name without path
    if not 'index_base_name' in input_parameters.keys():
        input_parameters['index_base_name'] = os.path.basename(
            os.path.splitext(input_parameters['fasta_gen_file'])[0])
    if debug: debug_list.append(['index_base_name', input_parameters['index_base_name']])

    # define tool directories
    ''' TODO: enable global tool command '''

    # save and validate fastq files
    print("%s Status: Save fastq files ..." % (getTime()))
    ''' TODO: make option for automatic fastq2 file read and enable 'paired end' option as input parameter '''
    fastq1_files, fastq_ref = getArgsFastq(input_parameters['fastq1_links'], input_parameters, debug)
    # initialize fastq2 container so it is defined even when no fastq2 links are given
    fastq2_files = {}
    if input_parameters['fastq2_links']:
        fastq2_files, fastq_ref2 = getArgsFastq(input_parameters['fastq2_links'], input_parameters, debug)
        for fq_name in fastq1_files.keys():
            if not ((re.sub(r'\_1.fq.gz$', '', fastq1_files[fq_name]) ==
                     re.sub(r'\_2.fq.gz$', '', fastq2_files[fq_name])) or
                    (re.sub(r'\-1.fastq.gz$', '', fastq1_files[fq_name]) ==
                     re.sub(r'\-2.fastq.gz$', '', fastq2_files[fq_name]))):
                print("%s Warning: The links of \'%s\' are not equal after substituting the enumeration!"
                      % (getTime(), fq_name))
                ''' TODO: DO TEST '''
                user = userCall("Continue? y[es] / n[o]", True)

    # declare project folder
    input_parameters['proj_dir_path'] = '%s/%s' % (
        input_parameters['proj_dir_prefix'], input_parameters['p_name_suffix'])
    if debug: debug_list.append(['proj_dir_path', input_parameters['proj_dir_path']])

    # print debug
    if debug: printDebug(debug_list, format)
    if debug: user = userCall("Continue? y[es] / n[o]", True)

    # return arguments
    return input_parameters, fastq1_files, fastq2_files, fastq_ref