Ejemplo n.º 1
0
def main(argv):
    """Command-line entry point: prepare every input sample and run the pipeline.

    Parses the options held by the module-level ``parser``, resolves the
    input, output, configuration and parameter locations, builds one
    ``SampleData`` object per input sample, hands them to
    ``run_metapathways`` and, when the ``BLAST_REFDB`` step is set to
    ``grid``, also calls ``blast_in_grid``.  Any exception raised while
    processing is written to the global error log and passed to
    ``exit_process``.  On the normal path the function finishes by calling
    ``halt_process(4)``.

    Args:
        argv: argument list; it is only echoed in the ``COMMAND`` banner.
            The options themselves come from ``parser.parse_args()``, which
            is called without arguments.
    """
    global parser
    (opts, args) = parser.parse_args()
    # NOTE(review): usage is printed when valid_arguments() is truthy, so the
    # helper presumably reports *inadequate* arguments despite its name --
    # confirm against its definition.
    if valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' +  ' '.join(argv))

    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only  # read here but not used further down
    sample_subset = opts.sample_subset
    run_type = opts.run_type.strip()

    # An 'overwrite' run does not remove the whole output directory here;
    # only the per-sample output folders are recreated (see the loop below).

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    # Both NCBI files must be given, and must exist, for the Sequin settings
    # to be forwarded to the samples.
    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print("Could not open or missing NCBI header file " + opts.ncbi_header)
            print("Either disable option to CREATE_SEQUIN_FILE or provide a valid header file")
            sys.exit(0)

        if not path.exists(opts.ncbi_sbt):
            print("You must must have a sbt file obtained from the NCBI \"Create Submission Template\" form \n \n"
                  "                 http://www.ncbi.nlm.nih.gov/WebSub/template.cgi " + opts.ncbi_sbt)
            sys.exit(0)

        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
        ncbi_sequin_params = None
        ncbi_sequin_sbt = None

    # try to load the parameter file
    # NOTE(review): nothing is opened at this point, only the path is read;
    # the file itself is parsed by parse_metapaths_parameters() further down.
    try:
        parameter_f = opts.parameter_fp
    except IOError:
        raise IOError(
            "Can't open parameters file (%s). Does it exist? Do you have read access?"
            % opts.parameter_fp)

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print("")
        print("ERROR: Cannot create output directory \"" + output_dir + "\"\n" +
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +
              "       Please choose a different directory, or \n" +
              "       run with the option \"-r  overwrite\" to force overwrite it.")
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    params = parse_metapaths_parameters(parameter_f)
    input_format = params['INPUT']['format']

    # Load the sample inputs: either a single fasta file or a directory
    # containing fasta and yaml file pairs.
    globalerrorlogger = WorkflowLogger(
        generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
        open_mode='w')

    input_output_list = {}
    # TODO: Check for illumina paired data... this complicates things a little.
    if path.isfile(input_fp):
        # a single input file
        # TODO: Check for illumina pattern, if so check for pairs
        input_output_list = create_an_input_output_pair(
            input_fp, output_dir, input_format, globalerrorlogger=globalerrorlogger)
    elif path.exists(input_fp):
        # an existing directory of samples
        input_output_list = create_input_output_pairs(
            input_fp, output_dir, input_format, globalerrorlogger=globalerrorlogger)
    else:
        # neither a file nor a directory: must be an error
        eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
        eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
        exit_process("ERROR\tAs provided as arguments in the -in option.!\n")

    # Restrict processing to the requested subset of samples; an empty
    # subset means every sample is processed.
    if sample_subset:
        remove_unspecified_samples(
            input_output_list, sample_subset, input_format,
            globalerrorlogger=globalerrorlogger)

    # add check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    parameter = Parameters()  # constructed but not used further down
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")

    samplesData = {}
    # PART1 before the blast

    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        if len(input_output_list):
            # the algorithm is the same for every sample, so look it up once
            algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]

                s = SampleData()
                s.setInputOutput(inputFile=input_file, sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.clearJobs()

                # 'overwrite' starts each sample from an empty output folder
                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # sample_output_dir is the folder of the last sample in the loop
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger=globalerrorlogger,
                command_line_params=command_line_params,
                params=params,
                metapaths_config=metapaths_config,
                status_update_callback=status_update_callback,
                config_file=config_file,
                run_type=run_type,
                config_settings=config_settings,
                block_mode=block_mode,
                runid=runid
            )
        else:
            eprintf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))

        # blast the files
        blasting_system = get_parameter(params, 'metapaths_steps', 'BLAST_REFDB', default='yes')
        if blasting_system == 'grid':
            # blasting the files on the grids
            input_files = sorted_input_output_list
            # Fixed: this used to index the undefined name `sampleData`,
            # which raised NameError on every grid run.
            # NOTE(review): input_file is the leftover loop variable (last
            # sample), and is unbound when there were no inputs -- confirm
            # that passing only that sample matches blast_in_grid's
            # signature.
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),   # important to use opts.
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid
            )

    except:
        # Top-level boundary: record the traceback, then stop the process.
        globalerrorlogger.write("ERROR\t" + str(traceback.format_exc(10)))
        exit_process("ERROR:" + str(traceback.format_exc(10)))

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(4)
Ejemplo n.º 2
0
def blast_in_grid(input_files, output_dir, config_params, metapaths_config, config_file, run_type):
    """Search the samples' predicted ORFs against the reference dbs on grids.

    For every input file a sample name is derived and mapped to the expected
    ``<output_dir>/<sample>/orf_prediction/<sample>.qced.faa`` file.  The
    active ``grid_engineN`` sections of ``config_params`` are collected, and
    the samples, databases, algorithm and one ``BlastService`` per grid are
    registered with a ``BlastBroker``.  The broker then splits the inputs
    where needed, creates and reloads its job lists, submits the work and
    finally deletes the remote directories.

    Args:
        input_files: iterable of input file paths; only their base names
            (without the trailing extension) are used.
        output_dir: base output folder, used for logs and per-sample paths.
        config_params: parsed parameters, indexed by section name.
        metapaths_config: not used in the visible part of this function.
        config_file: passed to ``read_pipeline_configuration``.
        run_type: not used in the visible part of this function.

    Returns:
        None.  Returns early when the grid working folders cannot be
        created; calls ``sys.exit(0)`` when the input validation or the
        splitting of a sample fails.
    """
    algorithm = get_parameter(config_params, 'annotation', 'algorithm', default='BLAST').upper()
    messagelogger = WorkflowLogger(
        generate_log_fp(output_dir, basefile_name='metapathways_messages', suffix='txt'),
        open_mode='w')

    # The next four values are not used in the visible part of the function;
    # they are kept in case later code relies on them.
    command_Status = get_parameter(config_params, 'metapaths_steps', 'BLAST_REFDB')

    config_settings = read_pipeline_configuration(config_file)

    orf_prediction_dir = "orf_prediction"
    output_run_statistics_dir = output_dir + PATHDELIM + "run_statistics" + PATHDELIM
    blast_results_dir = output_dir + PATHDELIM + "blast_results" + PATHDELIM
    output_results = output_dir + PATHDELIM + "results" + PATHDELIM

    # create the sample and input pairs
    samples_and_input = {}
    for input_file in input_files:
        # strip the trailing extension, drop the folder, then replace the
        # remaining dots with underscores
        sample_name = re.sub(r'[.][a-zA-Z]*$', '', input_file)
        sample_name = path.basename(sample_name)
        sample_name = re.sub('[.]', '_', sample_name)
        samples_and_input[sample_name] = (
            output_dir + PATHDELIM + sample_name + PATHDELIM +
            orf_prediction_dir + PATHDELIM + sample_name + ".qced.faa")

    # BLAST THE ORFs AGAINST THE REFERENCE DATABASES  FOR FUNCTIONAL ANNOTATION
    # NOTE(review): with no 'dbs' entry the default is None and .split()
    # raises AttributeError -- confirm whether that case can occur.
    dbstring = get_parameter(config_params, 'annotation', 'dbs', default=None)
    dbs = dbstring.split(",")

    # parse the grid settings from the param file
    gridEnginePATTERN = re.compile(r'(grid_engine\d+)')
    trueOrYesPATTERN = re.compile(r'^[yYTt]')

    gridSettings = []
    for key in config_params:
        if gridEnginePATTERN.match(key) is None:
            continue
        grid_section = config_params[key]
        # keep the grid only when it is marked active (value starts with
        # y/Y/t/T); sections without an 'active' entry are ignored
        if 'active' in grid_section and trueOrYesPATTERN.match(grid_section['active']):
            gridSettings.append(grid_section)

    if not isValidInput(output_dir, samples_and_input, dbs, gridSettings,
                        config_settings=config_settings, messagelogger=messagelogger):
        sys.exit(0)

    blastbroker = BlastBroker(messagelogger)     # setup the broker with a message logger
    blastbroker.setBaseOutputFolder(output_dir)  # set up the output folder
    blastbroker.addSamples(samples_and_input)    # add the samples and the input files

    # add databases against the samples
    for sample in samples_and_input:
        for db in dbs:
            blastbroker.addDatabase(sample, db)
        blastbroker.addAlgorithm(sample, algorithm)   # add the algorithms

    # setup services and add them to the Broker
    for gridsetting in gridSettings:
        gridsetting['messagelogger'] = messagelogger
        gridsetting['MetaPathwaysDir'] = config_settings['METAPATHWAYS_PATH']
        gridsetting['base_output_folder'] = blastbroker.base_output_folder
        gridsetting['blast_db_folder'] = config_settings['REFDBS'] + PATHDELIM + 'functional'

        try:
            blastservice = BlastService(gridsetting)
        except:
            print(traceback.format_exc(10))
            # Fixed: skip this grid.  Falling through used to call
            # addService() with an unbound name (first grid) or with the
            # previous grid's service a second time.
            continue

        blastbroker.addService(blastservice)

    # create the work space folders
    if blastbroker.are_working_folders_available():
        messagelogger.write("STATUS: Local working folders for Grid found!\n")
    elif blastbroker.create_working_folders():
        messagelogger.write("OK: Successfully created the grid related local working folders!\n")
    else:
        messagelogger.write("ERROR: Cannot create the grid working folders!\n")
        messagelogger.write("ERROR: Exiting blast in grid mode!\n")
        return

    # These two messages are logged back to back; the actual split check
    # runs below, after the batch size has been set.
    messagelogger.write("STATUS: Checking if input files are already split!\n")
    messagelogger.write("STATUS: Competed checks for file splits!\n")

    batch_size = int(get_parameter(config_params, 'grid_submission', 'batch_size', default=1000))
    blastbroker.setBatchSize(batch_size)

    # check if the input files are already split
    for s in blastbroker.getSamples():
        if not blastbroker.doesValidSplitExist(s):
            messagelogger.write("STATUS: Did not find any previously split files for sample \"%s\"!\n" % (s))
            if not blastbroker.splitInput(s):  # if not then split
                print("ERROR: Cannot split the files for some or all of the samples!\n")
                messagelogger.write("ERROR: Cannot split the files for some or all of the samples!\n")
                sys.exit(0)
            else:
                messagelogger.write("SUCCESS: Successfully split the files for some or all of the samples!\n")
        else:
            messagelogger.write("OK: Found previously split files for sample \"%s\"!\n" % (s))

    # load the list of splits
    blastbroker.load_list_splits()
    messagelogger.write("SUCCESS: Successfully loaded the list of file splits!\n")

    # create the database and split combinations as jobs for each sample
    blastbroker.createJobs(redo=False)
    messagelogger.write("SUCCESS: Successfully created the (split, database) pairs!\n")

    # make sure you loaded the latest job lists on file
    blastbroker.load_job_lists()
    messagelogger.write("SUCCESS: Successfully recovered the old/existing job list!\n")

    # for each sample load the submitted and completed lists
    # and compute the load per server
    blastbroker.load_job_status_lists()
    messagelogger.write("SUCCESS: Successfully loaded the status of the jobs!\n")

    blastbroker.compute_performance()

    # best effort: a failure here is printed and the run continues
    try:
        blastbroker.compute_server_loads()
    except:
        print(traceback.format_exc(10))

    # The AWS grid launch/stop hooks around Do_Work() are disabled:
    #blastbroker.launch_AWS_grid()
    blastbroker.setupStatsVariables()

    messagelogger.write("STATUS: Getting ready to submit jobs to the servers!\n")
    blastbroker.Do_Work()
    #blastbroker.stop_AWS_grid()

    blastbroker.Delete_Remote_Directories()

    message = "\n6. Blasting using Grid ORFs against reference database - "