def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
        # arguments did not validate: show usage and quit
        print usage
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))

    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only
    sample_subset = opts.sample_subset

    run_type = opts.run_type.strip()

    # no need to remove the whole output directory on 'overwrite';
    # per-sample folders are removed individually below

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print "Could not open or missing NCBI header file " + opts.ncbi_header
            print "Either disable option to CREATE_SEQUIN_FILE or provide a valid header file"
            sys.exit(0)
        if not path.exists(opts.ncbi_sbt):
            print """You must have an sbt file obtained from the NCBI "Create Submission Template" form
                     http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt
            sys.exit(0)
        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
        ncbi_sequin_params = None
        ncbi_sequin_sbt = None

    # load the parameter file path; fail early if the file cannot be read
    parameter_f = opts.parameter_fp
    if parameter_f and not path.exists(parameter_f):
        raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?" % parameter_f)

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n" + \
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" + \
              "       Please choose a different directory, or\n" + \
              "       run with the option \"-r overwrite\" to force overwriting it."
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    params = parse_metapaths_parameters(parameter_f)
    format = params['INPUT']['format']

    # load the sample inputs: expects either a fasta file or a
    # directory containing fasta and yaml file pairs
    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
                                       open_mode='w')

    input_output_list = {}
    # TODO: Check for illumina paired data... this complicates things a little.
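    # Illustrative note (assumed shape): input_output_list maps each sample's
    # input file to its per-sample output directory, e.g.
    #     { "/data/sampleA.fasta": "/out/sampleA" }
    # as built by create_an_input_output_pair() / create_input_output_pairs() below.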
    if path.isfile(input_fp):
        # the input is a single file
        # TODO: Check for illumina pattern, if so check for pairs
        input_output_list = create_an_input_output_pair(input_fp, output_dir, format,
                                                        globalerrorlogger=globalerrorlogger)
    elif path.exists(input_fp):
        # the input is a directory of samples
        input_output_list = create_input_output_pairs(input_fp, output_dir, format,
                                                      globalerrorlogger=globalerrorlogger)
    else:
        # neither a file nor a directory: report the error and bail out
        eprintf("ERROR\tNo valid input sample file or directory containing samples exists!\n")
        eprintf("ERROR\tAs provided as arguments in the -in option!\n")
        exit_process("ERROR\tAs provided as arguments in the -in option!\n")

    # these are the subset of samples to process, if specified;
    # an empty subset means process all the samples
    if sample_subset:
        remove_unspecified_samples(input_output_list, sample_subset, format,
                                   globalerrorlogger=globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())
    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)
    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")

    samplesData = {}
    # PART 1: everything before the blast step
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        if len(input_output_list):
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

                s = SampleData()
                s.setInputOutput(inputFile=input_file, sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.clearJobs()

                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # run the pipeline over the loaded samples
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger=globalerrorlogger,
                command_line_params=command_line_params,
                params=params,
                metapaths_config=metapaths_config,
                status_update_callback=status_update_callback,
                config_file=config_file,
                run_type=run_type,
                config_settings=config_settings,
                block_mode=block_mode,
                runid=runid
            )
        else:
            eprintf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))
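        # Note (assumption): a 'grid' value for BLAST_REFDB dispatches the
        # reference-database search to remote grid engines via blast_in_grid()
        # below; any other value leaves the step to run locally as part of
        # run_metapathways() above.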
        # blast the files
        blasting_system = get_parameter(params, 'metapaths_steps', 'BLAST_REFDB', default='yes')
        if blasting_system == 'grid':
            # blast the files on the grid
            input_files = sorted_input_output_list
            blast_in_grid(
                input_files,
                path.abspath(opts.output_dir),   # important to use opts.output_dir here
                config_params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type
            )
    except:
        globalerrorlogger.write("ERROR\t" + str(traceback.format_exc(10)))
        exit_process("ERROR:" + str(traceback.format_exc(10)))

    eprintf("            ***********              \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES\n")
    eprintf("              THE END                \n")
    eprintf("            ***********              \n")
    halt_process(4)
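# blast_in_grid: farms the (sample split, reference database) homology-search
# jobs out to the grid engines configured in the params file; BlastBroker
# handles splitting the inputs, creating the jobs, and tracking their status.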
def blast_in_grid(input_files, output_dir, config_params, metapaths_config, config_file, run_type):

    algorithm = get_parameter(config_params, 'annotation', 'algorithm', default='BLAST').upper()
    messagelogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name='metapathways_messages', suffix='txt'),
                                   open_mode='w')

    command_Status = get_parameter(config_params, 'metapaths_steps', 'BLAST_REFDB')
    config_settings = read_pipeline_configuration(config_file)

    orf_prediction_dir = "orf_prediction"
    output_run_statistics_dir = output_dir + PATHDELIM + "run_statistics" + PATHDELIM
    blast_results_dir = output_dir + PATHDELIM + "blast_results" + PATHDELIM
    output_results = output_dir + PATHDELIM + "results" + PATHDELIM

    # create the sample and input pairs
    samples_and_input = {}
    for input_file in input_files:
        # derive the sample name: strip the extension, keep the basename,
        # and replace any remaining dots with underscores
        sample_name = re.sub(r'[.][a-zA-Z]*$', '', input_file)
        sample_name = path.basename(sample_name)
        sample_name = re.sub('[.]', '_', sample_name)
        samples_and_input[sample_name] = output_dir + PATHDELIM + sample_name + PATHDELIM + \
                                         orf_prediction_dir + PATHDELIM + sample_name + ".qced.faa"

    # BLAST the ORFs against the reference databases for functional annotation
    dbstring = get_parameter(config_params, 'annotation', 'dbs', default=None)
    dbs = dbstring.split(",") if dbstring else []

    # parse the grid settings from the param file
    gridEnginePATTERN = re.compile(r'(grid_engine\d+)')
    trueOrYesPATTERN = re.compile(r'^[yYTt]')
    gridSettings = []
    for key in config_params:
        match = gridEnginePATTERN.match(key)
        if match is None:
            continue
        if 'active' in config_params[key]:
            # the grid is marked active ('active' starts with y/Y/t/T): add it
            if trueOrYesPATTERN.match(config_params[key]['active']):
                gridSettings.append(config_params[key])

    if not isValidInput(output_dir, samples_and_input, dbs, gridSettings,
                        config_settings=config_settings, messagelogger=messagelogger):
        sys.exit(0)

    blastbroker = BlastBroker(messagelogger)      # set up the broker with a message logger
    blastbroker.setBaseOutputFolder(output_dir)   # set up the output folder
    blastbroker.addSamples(samples_and_input)     # add the samples and the input files

    # add the databases and the algorithm against each sample
    for sample in samples_and_input:
        for db in dbs:
            blastbroker.addDatabase(sample, db)
        blastbroker.addAlgorithm(sample, algorithm)

    # set up the services and add them to the broker
    for gridsetting in gridSettings:
        gridsetting['messagelogger'] = messagelogger
        gridsetting['MetaPathwaysDir'] = config_settings['METAPATHWAYS_PATH']
        gridsetting['base_output_folder'] = blastbroker.base_output_folder
        gridsetting['blast_db_folder'] = config_settings['REFDBS'] + PATHDELIM + 'functional'
        try:
            blastservice = BlastService(gridsetting)
            blastbroker.addService(blastservice)
        except:
            print traceback.format_exc(10)

    # create the workspace folders
    if blastbroker.are_working_folders_available():
        messagelogger.write("STATUS: Local working folders for Grid found!\n")
    elif blastbroker.create_working_folders():
        messagelogger.write("OK: Successfully created the grid-related local working folders!\n")
    else:
        messagelogger.write("ERROR: Cannot create the grid working folders!\n")
        messagelogger.write("ERROR: Exiting blast in grid mode!\n")
        return
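    # Illustrative config stanza (assumption; the exact keys vary by setup):
    # each grid engine appears in the params file as, e.g.,
    #     grid_engine1 = { 'active': 'yes', ... }
    # and only stanzas whose 'active' value starts with y/Y/t/T were kept above.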
    batch_size = int(get_parameter(config_params, 'grid_submission', 'batch_size', default=1000))
    blastbroker.setBatchSize(batch_size)

    # check if the input files are already split
    messagelogger.write("STATUS: Checking if input files are already split!\n")
    for s in blastbroker.getSamples():
        if not blastbroker.doesValidSplitExist(s):
            messagelogger.write("STATUS: Did not find any previously split files for sample \"%s\"!\n" % (s))
            if not blastbroker.splitInput(s):   # no valid split yet: split now
                print "ERROR: Cannot split the files for some or all of the samples!\n"
                messagelogger.write("ERROR: Cannot split the files for some or all of the samples!\n")
                sys.exit(0)
            else:
                messagelogger.write("SUCCESS: Successfully split the files for some or all of the samples!\n")
        else:
            messagelogger.write("OK: Found previously split files for sample \"%s\"!\n" % (s))
    messagelogger.write("STATUS: Completed checks for file splits!\n")

    # load the list of splits
    blastbroker.load_list_splits()
    messagelogger.write("SUCCESS: Successfully loaded the list of file splits!\n")

    # create the (split, database) combinations as jobs for each sample
    blastbroker.createJobs(redo=False)
    messagelogger.write("SUCCESS: Successfully created the (split, database) pairs!\n")

    # make sure the latest job lists on file are loaded
    blastbroker.load_job_lists()
    messagelogger.write("SUCCESS: Successfully recovered the old/existing job list!\n")

    # for each sample, load the submitted and completed lists
    # and compute the load per server
    blastbroker.load_job_status_lists()
    messagelogger.write("SUCCESS: Successfully loaded the status of the jobs!\n")

    blastbroker.compute_performance()
    try:
        blastbroker.compute_server_loads()
    except:
        print traceback.format_exc(10)

    # blastbroker.launch_AWS_grid()
    blastbroker.setupStatsVariables()

    messagelogger.write("STATUS: Getting ready to submit jobs to the servers!\n")
    blastbroker.Do_Work()
    # blastbroker.stop_AWS_grid()
    blastbroker.Delete_Remote_Directories()

    message = "\n6. Blasting using Grid ORFs against reference database - "
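# A minimal entry-point sketch (assumption: shown only to illustrate how main()
# is invoked; the full script may define its own __main__ guard elsewhere).
if __name__ == "__main__":
    main(sys.argv[1:])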