def read_map_file(dbname_map_filename, field_to_description, hierarchical_map):
    try:
        map_file = open(dbname_map_filename, 'r')
        map_filelines = map_file.readlines()
        map_file.close()
    except IOError:
        eprintf("ERROR: Cannot open file %s\n", dbname_map_filename)
        exit_process()

    tempfields = ['', '', '', '', '', '', '']
    for line in map_filelines:
        pos = beginning_valid_field(line)
        if pos == -1:
            continue

        fields = [x.strip() for x in line.split('\t')]

        tempfields[pos] = fields[pos]
        if len(fields) > pos + 1:
            field_to_description[fields[pos]] = fields[pos + 1]
        else:
            field_to_description[fields[pos]] = fields[pos]

        i = 0
        temp_hierarchical_map = hierarchical_map
        while i < pos:
            temp_hierarchical_map = temp_hierarchical_map[tempfields[i]]
            i += 1

        temp_hierarchical_map[tempfields[i]] = {}
    fill_hierarchy_with_zeroes(hierarchical_map)
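# --- usage sketch (not from the original source): read_map_file expects a
# tab-indented map file; the column index of the first non-empty field
# (located by beginning_valid_field, assumed here to count leading tabs)
# gives the depth in the hierarchy.
def _demo_read_map_file():
    import os, tempfile
    fd, path = tempfile.mkstemp()
    os.write(fd, ("COG\tClusters of Orthologous Groups\n"
                  "\tC\tEnergy production and conversion\n").encode('ascii'))
    os.close(fd)
    field_to_description, hierarchical_map = {}, {}
    read_map_file(path, field_to_description, hierarchical_map)
    # hierarchical_map is now {'COG': {'C': {}}} (leaves later zero-filled);
    # field_to_description maps 'COG' and 'C' to their description columns
    os.remove(path)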
def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):

    """  creates a list of  input output pairs if input is  an input dir """
    clean = True
    if not re.search(r'^[a-zA-Z]',shortname):
         eprintf("ERROR\tSample name %s must begin with a letter!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s must begin with a letter!\n",shortname)
         clean = False

    if re.search(r'[.]',shortname):
         eprintf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         clean = False

    if len(shortname)<2:
         eprintf("ERROR\tSample name %s is too short!\n",shortname)
         if globalerrorlogger:
             globalerrorlogger.printf("ERROR\tSample name %s is too short1\n",shortname)
         clean = False

    if clean:
         return clean

    errmessage = """ Sample names before the  suffixes .fasta, .fas, .fna, .faa or .gbk, must  consist only of alphabets, digits and _; and should consist of at least two characters """
    eprintf("ERROR\t%s\n",errmessage)
    if globalerrorlogger:
        globalerrorlogger.printf("ERROR\t%s\n",errmessage)
        exit_process("ERROR\t" + errmessage + "Exiting!" + "\n")
    return False
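# --- usage sketch (illustrative, not in the original): with no logger passed,
# a bad name is reported to stderr and False is returned
def _demo_check_sample_names():
    for name in ['lagoon_42', '1lagoon', 'lagoon.v2', 'x']:
        print name, check_for_error_in_input_file_name(name)
    # expected: True, then False for the leading digit, the '.', and the
    # one-character name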
def process_blastout_file(blast_file, database, table, errorlogger=None):
    try:
        blastfile = open(blast_file, 'r')
    except IOError:
        eprintf("ERROR : Cannot write read file " + blast_file + " !")
        if errorlogger != None:
            errorlogger.write(
                "STATS_rRNA\tERROR\tCannot write read blast output file " +
                blast_file + " for database " + database)
        exit_process()

    blastLines = blastfile.readlines()
    blastfile.close()

    for line in blastLines:
        line = line.strip()
        fields = re.split('\t', line)
        if len(fields) < 12:
            continue
        fields[0] = str(fields[0].strip())
        fields[1] = str(fields[1].strip())
        fields[2] = float(fields[2].strip())
        fields[6] = int(fields[6].strip())
        fields[7] = int(fields[7].strip())
        fields[10] = float(fields[10].strip())
        fields[11] = float(fields[11].strip())
        table[fields[0]] = [
            fields[2], fields[10], fields[11], fields[1], fields[6], fields[7]
        ]
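# --- usage sketch (illustrative, not in the original): the parser expects the
# 12-column tabular BLAST format (-m8 / outfmt 6); one such line and the
# table entry it produces:
def _demo_process_blastout_file():
    import os, tempfile
    row = "read1\tAF123456.1\t98.5\t1200\t18\t0\t1\t1200\t35\t1234\t0.0\t2100\n"
    fd, path = tempfile.mkstemp()
    os.write(fd, row.encode('ascii'))
    os.close(fd)
    table = {}
    process_blastout_file(path, 'silva_104_rep_set', table)
    # table['read1'] == [98.5, 0.0, 2100.0, 'AF123456.1', 1, 1200]
    #                   (identity, evalue, bitscore, target, q_start, q_end)
    os.remove(path)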
def make_sure_map_file_exists(config_settings, dbname, globallogger = None):
    dbmapFile = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted' + PATHDELIM + dbname + "-names.txt"
    seqFilePath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname
    if not doFilesExist( [dbmapFile ] ):
         eprintf("WARNING: Trying to create database map file for %s\n", dbname)
         if globallogger!= None:
            globallogger.write("WARNING: Trying to create database map file for %s\n" %( dbname) )

         if not doFilesExist( [seqFilePath] ):
            eprintf("ERROR : You do not even have the raw sequence for Database  %s to format!\n", dbname)
            eprintf("      : Make sure you have the file %s\n", seqFilePath)

            if globallogger!= None:
               globallogger.write("ERROR \t You do not even have the raw sequence for Database  %s to format!\n" %( dbname))
               globallogger.write("Make sure you have the file %s\n" %( seqFilePath))

            exit_process()

         mapfile = open(dbmapFile,'w')
         seqFile = open(seqFilePath,'r')
         for line in seqFile:
             if re.match(r'>', line):
                 fprintf(mapfile, "%s\n",line.strip())
         seqFile.close()
         mapfile.close()

    return dbmapFile
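# --- note (illustrative, not in the original): the "-names.txt" map file
# generated above is just every FASTA header line from the raw database, e.g.
#
#   >AF123456.1 Escherichia coli 16S ribosomal RNA
#   >AY987554.1 Uncultured bacterium clone
#
# so downstream parsers can recover target descriptions without re-reading
# the full sequence file.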
def process_input(input, output, input_type , gene_list, append,  errorlogger = None):
    commentPATT = re.compile(r'^#')
    count = 0

    mode = 'w'
    if append:
       mode = 'a'

    gene_list = read_gene_list(gene_list)
    gene_dict = {}

    for gene in gene_list:
       gene_dict[gene.lower()] = gene # re.compile(r'[\/\s]' + gene + '[\/\s]') 

    if input_type == 'LAST2':
      q = 0
      t = 9
    elif input_type == 'LAST1':
      q = 0
      t = 1
    elif input_type == 'HMM':
      q = 2
      t = 0
    else:
      if errorlogger:
         errorlogger.write("PARSE_BLAST\tERROR\tUnrecognized input type %s\n" %(input_type))
      exit_process("PARSE_BLAST\tERROR\tUnrecognized input type %s\n" %(input_type))

    try:
        inputfile = open(input, 'r') 
        outputfile = open(output, mode) 
    except IOError:
        if errorlogger:
           errorlogger.write("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" %(input, output))
        exit_process("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" %(input, output))


    for line in inputfile:
        result = commentPATT.search(line)
        if result:
           continue

        fields = [ x.strip() for x in line.split('\t') ]
        if len(fields) < 3:
           continue

        orfid = fields[q]

        #if input_type=='LAST1' or input_type=='LAST2':
        target = find_gene_name(fields[t], gene_list, gene_dict)

        if target is None:
           continue

        fprintf(outputfile, "%s\t%s\n", orfid, gene_dict[target])
        count += 1


    outputfile.close()
    inputfile.close()
#    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)

    return count
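# --- note (illustrative, not in the original): the input_type just selects
# which tab-separated columns hold the ORF id (q) and the hit name (t):
# LAST2 -> (0, 9), LAST1 -> (0, 1), HMM -> (2, 0). read_gene_list and
# find_gene_name are helpers from this module that are assumed, not shown:
#
#   process_input('hits.last.txt', 'orf_to_gene.txt', 'LAST1',
#                 'genes_of_interest.txt', append=False)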
def write_run_parameters_file(fileName, parameters):
    try:
       paramFile = open(fileName, 'w')
    except IOError:
       eprintf("Cannot write run parameters to file %s!\n", fileName)
       exit_process("Cannot write run parameters to file %s" %(fileName) )

#       16s_rRNA      {'min_identity': '40', 'max_evalue': '0.000001', 'min_bitscore': '06', 'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
    paramFile.write("\nRun Date : " + str(date.today()) + " \n")

    paramFile.write("\n\nNucleotide Quality Control parameters[s.n")
    paramFile.write( "  min length" + "\t" + str(parameters['quality_control']['min_length']) + "\n")

    paramFile.write("\n\nORF prediction parameters[s.n")
    paramFile.write( "  min length" + "\t" + str(parameters['orf_prediction']['min_length']) + "\n")
    paramFile.write( "  algorithm" + "\t" + str(parameters['orf_prediction']['algorithm']) + "\n")


    paramFile.write("\n\nAmino acid quality control and annotation parameters[s.n")
    paramFile.write( "  min bit score" + "\t" + str(parameters['annotation']['min_score']) + "\n")
    paramFile.write( "  min seq length" + "\t" + str(parameters['annotation']['min_length']) + "\n")
    paramFile.write( "  annotation reference dbs" + "\t" + str(parameters['annotation']['dbs']) + "\n")
    paramFile.write( "  min BSR" + "\t" + str(parameters['annotation']['min_bsr']) + "\n")
    paramFile.write( "  max evalue" + "\t" + str(parameters['annotation']['max_evalue']) + "\n")

    paramFile.write("\n\nPathway Tools parameters[s.n")
    paramFile.write( "  taxonomic pruning " + "\t" + str(parameters['ptools_settings']['taxonomic_pruning']) + "\n")

    paramFile.write("\n\nrRNA search/match parameters[s.n")
    paramFile.write( "  min identity" + "\t" + str(parameters['rRNA']['min_identity']) + "\n")
    paramFile.write( "  max evalue" + "\t" + str(parameters['rRNA']['max_evalue']) + "\n")
    paramFile.write( "  rRNA reference dbs" + "\t" + str(parameters['rRNA']['refdbs']) + "\n")

    paramFile.close()
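# --- usage sketch (illustrative, not in the original): the minimal nested
# dict the writer above expects; every key below is dereferenced
# unconditionally (assumes "from datetime import date" as in this module)
def _demo_write_run_parameters_file():
    params = {
        'quality_control': {'min_length': 180},
        'orf_prediction':  {'min_length': 60, 'algorithm': 'prodigal'},
        'annotation':      {'min_score': 20, 'min_length': 60,
                            'dbs': 'metacyc,refseq', 'min_bsr': 0.4,
                            'max_evalue': 1e-6},
        'ptools_settings': {'taxonomic_pruning': 'no'},
        'rRNA':            {'min_identity': 40, 'max_evalue': 1e-6,
                            'refdbs': 'silva_104_rep_set'},
    }
    write_run_parameters_file('run_parameters.txt', params)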
    def next(self):
        if self.i % self.SIZE == 0:
            self.refillBuffer()
            if len(self.lines)==0:
                raise StopIteration()

        if self.i % self.SIZE < self.size:
            fields = [ x.strip()  for x in self.lines[self.i % self.SIZE].split('\t')]
            try:
                self.data = {}
                self.data['query'] = fields[self.fieldmap['query']]
                self.data['q_length'] = int(fields[self.fieldmap['q_length']])
                self.data['bitscore'] = float(fields[self.fieldmap['bitscore']])
                self.data['bsr'] = float(fields[self.fieldmap['bsr']])
                self.data['target'] = fields[self.fieldmap['target']]
                self.data['aln_length'] = float(fields[self.fieldmap['aln_length']])
                self.data['expect'] = float(fields[self.fieldmap['expect']])
                self.data['identity'] = float(fields[self.fieldmap['identity']])
                self.data['ec'] = fields[self.fieldmap['ec']]
                self.data['product'] = re.sub(r'=',' ',fields[self.fieldmap['product']])
                self.lineToProcess = self.lines[self.i % self.SIZE]
            except (KeyError, IndexError, ValueError):
                self.ERROR_COUNT += 1
                if self.MAX_READ_ERRORS_ALLOWED > self.ERROR_COUNT:
                    eprintf("%s\tWARNING\till-formatted line \"%s\" \t %s\n", self.STEP_NAME,  self.lines[self.i % self.SIZE], self.blastoutput)
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write("%s\tWARNING\till-formatted line :\"%s\" \t source : %s\n" %(self.STEP_NAME,  re.sub(r'\t', '<tab>', self.lines[self.i % self.SIZE]) , self.blastoutput))
                    # skip the ill-formatted line and move on to the next one
                    self.i = self.i + 1
                    return self.next()
                else:
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write("%s\tERROR\tThe number of errors in file %s exceeded the max tolerance %d\n" %(self.STEP_NAME, self.blastoutput,  self.MAX_READ_ERRORS_ALLOWED) )
                    exit_process()


                #              print "<<<<<<-------"
                #              print 'self size ' + str(self.size)
                #              print 'line ' + self.lines[self.i % self.SIZE]
                #              print 'num fields ' + str(len(fields))
                #              fields = [ x  for x in self.lines[self.i % self.SIZE].split('\t')]
                #              for field in fields:
                #                 print field
                #              print 'next line ' + self.lines[(self.i + 1) % self.SIZE]
                #              print ' field map ' + str(self.fieldmap)
                #              print 'index ' + str(self.i)
                #              print 'data ' + str(self.data)
                #              print 'fields ' + str(fields)
                #              print ' while processing file ' + self.blastoutput
                #              print ">>>>>>-------"
                #              import traceback
                #              print traceback.print_exc()

            self.i = self.i + 1
            return self.data
        else:
            self.lineToProcess = None
            self.blastoutputfile.close()
            raise StopIteration()
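    # --- note (not in the original): next() reads the blast output through a
    # fixed-size line buffer: every SIZE calls it refills self.lines via
    # refillBuffer() (assumed to set self.size to the number of lines actually
    # read), so files larger than memory can be streamed. On a malformed line
    # it skips ahead until a good line appears or the error budget runs out.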
def process_blastoutput(dbname, blastoutput,  mapfile, refscore_file, opts, errorlogger = None):

    blastparser =  BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger = errorlogger)


    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')
    
    fields = ['target','q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec' ]
    if opts.taxonomy:
       fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = opts.parsed_output

    # temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp =  output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w') 
    except IOError:
        if errorlogger:
           errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" %(output_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" %(output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s",'query')
    for field in fields:
         fprintf(outputfile,"\t%s",field)
    fprintf(outputfile, "\n")

    pattern = re.compile(r'(\d+_\d+)$')

    count = 0
    uniques = {}
    for data in blastparser:
        if not data:
          continue
        try:
          fprintf(outputfile, "%s",data['query'])

          result = pattern.search(data['query'])
          if result:
             name = result.group(1)
             uniques[name] = True
        except KeyError:
           print 'data is : ', data, '\n'
           return count, len(uniques)

        for field in fields:
           fprintf(outputfile, "\t%s",data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)

    return count, len(uniques)
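# --- usage sketch (illustrative, not in the original): the uniqueness count
# above keys on the trailing "<contig>_<orf>" numeric pair of a query name
def _demo_orf_pattern():
    import re
    pattern = re.compile(r'(\d+_\d+)$')
    m = pattern.search('sample7_contig_12_45')
    print m.group(1)   # -> '12_45'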
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)

    # is there a pathwaytools executable installed
    if False and not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        if errorlogger:
            errorlogger.printf(
                "ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" %
                     (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s " % (options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(),
                                  options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()
    try:
        if False:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " +
                   options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            StopPathwayTools()

    except:
        traceback.print_exc(10)
        eprintf("ERROR\tFailed to run extract pathways for %s : \n" %
                (options.sample_name))
        eprintf(
            "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
        )
        if errorlogger:
            errorlogger.write(
                "ERROR\tFailed to run extract pathways for %s : " %
                (options.sample_name))
            errorlogger.write(
                "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
            )
        StopPathwayTools()
def read_pipeline_configuration(file, globallogger):
    patternKEYVALUE = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
    try:
       configfile = open(file, 'r')
    except IOError:
       eprintf("ERROR : Did not find pipeline config %s!\n", file)
       globallogger.write("ERROR\tDid not find pipeline config %s!\n" %(file))
       exit_process()
    else:
       lines = configfile.readlines()
       configfile.close()

    config_settings = {}
    for line in lines:
        if not re.match("#",line) and len(line.strip()) > 0 :
           line = line.strip()
           result = patternKEYVALUE.search(line)
           
           if result == None or len(result.groups()) != 2:
              eprintf("     The following line in your config settings file is not set up yet\n")
              eprintf("     Please rerun the pipeline after setting up this line\n")
              eprintf("     Error in line : %s\n", line)
              globallogger.write(
                   "WARNING\t\n"+\
                   "     The following line in your config settings file is not set up yet\n"+\
                   "     Please rerun the pipeline after setting up this line\n"+\
                   "     Error in line : %s\n" %(line))
              exit_process()

           fields = result.groups()

           if PATHDELIM=='\\':
              config_settings[fields[0]] = re.sub(r'/',r'\\',fields[1])   
           else:
              config_settings[fields[0]] = re.sub(r'\\','/',fields[1])   

           
    config_settings['METAPATHWAYS_PATH'] = config_settings['METAPATHWAYS_PATH'] + PATHDELIM
    config_settings['REFDBS'] = config_settings['REFDBS'] + PATHDELIM
    
    check_config_settings(config_settings, file, globallogger)
    config_settings['configuration_file'] = file

    return config_settings
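# --- usage sketch (illustrative, not in the original): lines in the pipeline
# config are "KEY<whitespace>'value'", matched by patternKEYVALUE above, e.g.
#
#   METAPATHWAYS_PATH   '/opt/MetaPathways'
#   REFDBS              '/data/blastDB'
#   FORMATDB_EXECUTABLE 'executables/makeblastdb'
#
# read_pipeline_configuration would then return
#   {'METAPATHWAYS_PATH': '/opt/MetaPathways/', 'REFDBS': '/data/blastDB/', ...}
# (PATHDELIM appended to the two directory keys, path separators normalized).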
def checkParam_values(allcategorychoices, parameters, runlogger = None):
     for category in allcategorychoices:
        for choice in allcategorychoices[category]:
           if choice in parameters:

             if not parameters[choice] in allcategorychoices[category][choice]:
                 if runlogger:
                    runlogger.write('ERROR\tIncorrect setting in your parameter file '
                                    'for step %s as %s\n' %(choice, parameters[choice]))
                 eprintf("ERROR: Incorrect setting in your parameter file" +\
                         "       for step %s as %s\n", choice, parameters[choice])
                 exit_process()
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)

    if options.algorithm == 'BLAST':
        _execute_BLAST(options)
    elif options.algorithm == 'LAST':
        _execute_LAST(options)
    else:
        eprintf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        if errorlogger:
            errorlogger.printf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        exit_process("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
def process_blastoutput(dbname, blastoutput,  mapfile, refscore_file, opts, errorlogger = None):

    blastparser =  BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger = errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    
    fields = ['target','q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec' ]
    if opts.taxonomy:
       fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    # temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp =  output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w') 
    except IOError:
        if errorlogger:
           errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" %(output_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" %(output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s",'query')
    for field in fields:
         fprintf(outputfile,"\t%s",field)
    fprintf(outputfile, "\n")

    count = 0
    for data in blastparser:
        if not data:
          continue
        try:
          fprintf(outputfile, "%s",data['query'])
        except KeyError:
           print 'data is : ', data, '\n'
           sys.exit()
        for field in fields:
           fprintf(outputfile, "\t%s",data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)


    return count
    def permuteForLAST(self, words):
        try:
           temp = copy(words)
           words[0] = temp[6]  # query
           words[1] = temp[1]  # target
           words[2] = 100.0    # percent id
           words[3] = temp[3]  # aln length
           words[6] = temp[2]  # start
           words[7] = int(temp[2]) + int(temp[3]) - 1  # end = start + length - 1
           words[10] = 0.0     # evalue
           words[11] = temp[0] # score
        except:
           eprintf("ERROR : Invalid B/LAST output file %s \n" % (self.blastoutput))
           if self.error_and_warning_logger:
               self.error_and_warning_logger.write("ERROR : Invalid B/LAST output file %s" %(self.blastoutput))
           exit_process( "ERROR : Invalid B/LAST output file %s " % (self.blastoutput))
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open( blast_table_out,'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
       line = line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          eprintf("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))
          exit_process("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))
       # NOTE: the step that populates refscores from these fields (presumably
       # the per-query self-hit bitscore) appears to have been elided from this
       # listing; as written, refscores stays empty.

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def create_dictionary(databasemapfile,
                      annot_map,
                      query_dictionary,
                      errorlogger=None):
    if not query_dictionary:
        print "WARNING : empty query dictionary in parse B/LAST"

        if errorlogger:
            errorlogger.write(
                "WARNING : empty query dictionary in parse B/LAST\n")
        return

    seq_beg_pattern = re.compile(">")
    try:
        dbmapfile = open(databasemapfile, 'r')
    except IOError:
        if errorlogger:
            errorlogger.write(
                "PARSE_BLAST\tERROR\tCannot open database map file %s\t Please check the file manually\n"
                % (databasemapfile))
        exit_process("ERROR: Cannot open database map file %s\n" %
                     (databasemapfile))

    for line in dbmapfile:
        if seq_beg_pattern.search(line):
            words = line.rstrip().split()
            name = words[0].replace('>', '', 1)
            if not name in query_dictionary:
                continue
            words.pop(0)
            if len(words) == 0:
                annotation = 'hypothetical protein'
            else:
                annotation = ' '.join(words)

            annot_map[name] = annotation
    dbmapfile.close()

    if len(annot_map) == 0:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tFile " + databasemapfile +
                              " seems to be empty!\tCreate datbasemap file\n")
            errorlogger.write("Try re-running after deleting file : %s\n" %
                              (databasemapfile))
        exit_process("no anntations in file :" + databasemapfile)
    def __init__(self, dbname, blastoutput):
        self.lineToProcess = ""
        self.dbname = dbname
        self.blastoutput = blastoutput
        self.i = 0
        self.SIZE = 10000
        self.data = {}
        self.fieldmap = {}
        self.seq_beg_pattern = re.compile("^#")
        self.lines = []
        self.headerline = None

        self.MAX_READ_ERRORS_ALLOWED = 0
        self.ERROR_COUNT = 0
        self.STEP_NAME = 'CREATE_REPORT_FILES'  # formerly 'PARSE_BLAST'
        self.error_and_warning_logger = None

        try:
            self.blastoutputfile = open(blastoutput, 'r')
            line = self.blastoutputfile.readline()
            if not self.seq_beg_pattern.search(line):
                eprintf(
                    "First line must have field header names and begin with \"#\"\n"
                )
                exit_process()

            self.headerline = line.strip()
            self.lineToProcess = self.headerline
            header = re.sub('^#', '', line)
            fields = [x.strip() for x in header.rstrip().split('\t')]
            k = 0
            for x in fields:
                self.fieldmap[x] = k
                k += 1

        except (IOError, AttributeError):
            print "Cannot read the map file for database :" + dbname
            sys.exit(0)
def checkMetapathsteps(params, runlogger = None):
     choices = { 'metapaths_steps':{}, 'annotation':{}, 'INPUT':{} }

     choices['INPUT']['format']  = ['fasta', 'gbk_unannotated', 'gbk_annotated', 'gff_unannotated', 'gff_annotated']

     choices['annotation']['algorithm'] =  ['last', 'blast'] 

     choices['metapaths_steps']['PREPROCESS_FASTA']   = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['ORF_PREDICTION']  = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['GFF_TO_AMINO']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['FILTERED_FASTA']  = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['COMPUTE_REFSCORE']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['BLAST_REFDB'] = ['yes', 'skip', 'stop', 'redo', 'grid']
     choices['metapaths_steps']['PARSE_BLAST'] = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['SCAN_rRNA']   = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['STATS_rRNA']  = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['ANNOTATE']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['PATHOLOGIC_INPUT']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['GENBANK_FILE']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['CREATE_SEQUIN_FILE']  = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['CREATE_REPORT_FILES']  = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['SCAN_tRNA']   = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['MLTREEMAP_CALCULATION']   = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['MLTREEMAP_IMAGEMAKER']    = ['yes', 'skip', 'stop', 'redo']
     choices['metapaths_steps']['PATHOLOGIC']  = ['yes', 'skip', 'stop', 'redo']


     if params['metapaths_steps']:
        checkParam_values(choices, params['metapaths_steps'], runlogger)

     checkparams = {}
     checkparams['annotation'] = []
     checkparams['annotation'].append('dbs') 

     if not checkMissingParam_values(params, checkparams, runlogger):
        exit_process("Missing parameters")
    def __init__(self, dbname,  blastoutput, database_mapfile, refscore_file, opts, errorlogger =None):
        self.Size = 10000
        self.dbname = dbname
        self.ln2 = 0.69314718055994530941
        self.lnk = math.log(opts.k)
        self.Lambda = opts.Lambda
        self.blastoutput = blastoutput
        self.database_mapfile =database_mapfile
        self.refscore_file = refscore_file
        self.annot_map = {} 
        self.i=0
        self.opts = opts
        self.hits_counts = {}
        self.data = {}
        self.refscores = {}
        self.refBitScores = {}
        self.needToPermute = False

        self.MAX_READ_ERRORS_ALLOWED = 100
        self.ERROR_COUNT = 0
        self.STEP_NAME = 'PARSE_BLAST'
        self.error_and_warning_logger = errorlogger 


        #print "trying to open blastoutput file " + blastoutput
        query_dictionary = {}
        create_query_dictionary(self.blastoutput, query_dictionary, self.opts.algorithm, errorlogger =  errorlogger) 
        try:
            self.blastoutputfile = open(self.blastoutput,'r')
        except IOError:
            eprintf("\nERROR : cannot open B/LAST output file " + blastoutput + " to parse\n"+\
                      "      : make sure \"B/LAST\"ing was done for the particular database\n" )

            if self.error_and_warning_logger:
               self.error_and_warning_logger.write("ERROR : cannot open B/LAST output file %s to parse\n"
                                             "      : make sure \"B/LAST\"ing was done for "
                                             "the particular database\n" %(blastoutput) )
            exit_process( "Cannot open B/LAST output file " + blastoutput )

        try:
            self.create_refBitScores()
        except:
            traceback.print_exc(10)
            exit_process( "Error while reading from B/LAST refscore file " + self.refscore_file )

        try:
           create_dictionary(database_mapfile, self.annot_map, query_dictionary)
           query_dictionary = {}
        except AttributeError:
           eprintf("Cannot read the map file for database : %s\n" % (dbname))
           if errorlogger!= None:
              errorlogger.write("PARSE_BLAST\tERROR\tCannot read the map file %s for database : %s\tDelete the formatted files for the database in the \"formatted\" folder\n" %(database_mapfile, dbname))

           exit_process("Cannot read the map file for database  " + dbname)
    def isWithinCutoffs(self, words, data, cutoffs, annot_map, refbitscores):
        data['query'] = words[0]
    
        try:
           data['target'] = words[1]
        except:
           data['target'] = 0
    
        try:
           data['q_length'] = int(words[7]) - int(words[6]) + 1
        except:
           data['q_length'] = 0
    
        try:
           data['bitscore'] = float(words[11])
        except:
           data['bitscore'] = 0
    
        try:
           data['bsr'] = float(words[11])/refbitscores[words[0]]
        except:
           #print "words 0 " + str(refscores[words[0]])
           #print "words 11 " + str( words[11])
           data['bsr'] = 0
    
        try:
           data['expect'] = float(words[10])
        except:
           data['expect'] = 0
    
        try:
           data['aln_length'] = float(words[3])
        except:
           data['aln_length'] = 0
    
        try:
           data['identity'] = float(words[2])
        except:
           data['identity'] = 0
    
        try:
           data['product'] = annot_map[words[1]]
        except:
           eprintf("Sequence with name \"" + words[1] + "\" is not present in map file ")
           if self.error_and_warning_logger:   
              self.error_and_warning_logger.write("Sequence with name %s is not present in map file " %(words[1] ))
           self.incErrorCount()
           if self.maxErrorsReached():
               if self.error_and_warning_logger:   
                  self.error_and_warning_logger.write("Number of sequence absent in map file %s exceeds %d" %(self.blastoutput, self.ERROR_COUNT ))
               exit_process("Number of sequence absent in map file %s exceeds %d" %(self.blastoutput, self.ERROR_COUNT ))
             
    
           data['product'] = 'hypothetical protein'
    
        try:
           m = re.search(r'(\d+[.]\d+[.]\d+[.]\d+)', data['product'])
           if m != None:
             data['ec'] = m.group(0)
           else:
             data['ec'] = ''
        except:
            data['ec'] = ''
    
        if cutoffs.taxonomy:
           try:
              m = re.search(r'\[([^\[]+)\]', data['product'])
              if m != None:
                data['taxonomy'] = m.group(1)
              else:
                data['taxonomy'] = ''
           except:
                data['taxonomy'] = ''
    
        
        if cutoffs.remove_taxonomy:
           try:
              data['product'] = re.sub(r'\[([^\[]+)\]','', data['product'])
           except:
              data['product'] = ''
    
        if cutoffs.remove_ec:
           try:
              data['product'] = re.sub(r'\([Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\)', '', data['product'])
              data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\]', '', data['product'])
              data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.-]\]', '', data['product'])
              data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]-[.]-\]', '', data['product'])
              data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]-[.]-[.]-\]', '', data['product'])
           except:
              data['product'] = ''
    

        if float(data['q_length']) < cutoffs.min_length:
           return False
    
        if float(data['bitscore']) < cutoffs.min_score:
           return False
    
        if float(data['expect']) > cutoffs.max_evalue:
           return False
    
        if float(data['identity']) < cutoffs.min_identity:
           return False
    
        if float(data['bsr']) < cutoffs.min_bsr:
           return False
    
    # cutoff attributes consulted above: min_length, min_score, max_evalue,
    # min_identity, min_bsr (others defined elsewhere: limit, max_length,
    # min_query_coverage, max_gaps)

        return True
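    # --- usage sketch (illustrative, not in the original): the cutoffs
    # argument is any object carrying the attributes listed above, e.g.
    #
    #   class Cutoffs: pass
    #   cutoffs = Cutoffs()
    #   cutoffs.min_length, cutoffs.min_score = 60, 20
    #   cutoffs.max_evalue, cutoffs.min_identity, cutoffs.min_bsr = 1e-6, 40.0, 0.4
    #   cutoffs.taxonomy = cutoffs.remove_taxonomy = cutoffs.remove_ec = False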
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
       parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand = runcommand) !=0:
       if errorlogger:
          errorlogger.write("ERROR: Failed to BLAST the sequences against database %s : "  %(options.tax_databases[0]) )
          errorlogger.write("     : " + runcommand)
       exit_process("ERROR: Failed to BLAST the sequences against database %s : "  %(options.tax_databases[0]) +\
                    "     : " + runcommand)

    if not ( len(options.tax_databases) == len( options.blast_files) ):
       parser.error('Number of taxonomic databases and BLAST outputs should be the same')

    if not options.output:
       parser.error('Output file must be specified')
    # Incredible sanity check

    if not files_exist(options.blast_files):
        sys.exit(0)

    if not files_exist( options.tax_databases):
        sys.exit(0)
   
    params = {'length': int(options.length), 'similarity': float(options.similarity), 'evalue':float(options.evalue), 'bitscore':float(options.bitscore) }
    #print params['bitscore']
    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x], options.tax_databases[x], table[options.tax_databases[x]], errorlogger = errorlogger)
        
 
    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x], table[options.tax_databases[x]],  params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
              reads[key] = True
        
        dbname  =  re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" %(str(priority),  dbname,  str(len(reads))))
        priority += 1
    outputfile = open(options.output, 'w')
    fprintf(outputfile, "#Similarity cutoff :\t" +  str(params['similarity']) +'\n')
    fprintf(outputfile, "#Length cutoff :\t" +  str(params['length']) +'\n')
    fprintf(outputfile, "#Evalue cutoff :\t" +  str(params['evalue']) +'\n')
    fprintf(outputfile, "#Bit score cutoff :\t" +  str(params['bitscore']) +'\n')
    fprintf(outputfile, "#Number of rRNA sequences detected:\t" +  str(len(reads)) +'\n\n')

    
    for x in range(0, len(options.tax_databases)):
    #  printf('\t%s\t\t\t', re.sub(r'^.*/','', options.tax_databases[x]))
      fprintf(outputfile, '\t%s\t\t\t', re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    #printf('\n')
    fprintf(outputfile,'\n')
       

    #printf('%s', 'read')
    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s', 'sequence', 'start', 'end', 'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile,'\n')

    for read in reads:
        #printf('%s', read)
        fprintf(outputfile,'%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
               fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s', str(table[options.tax_databases[x]][read][4]), str(table[options.tax_databases[x]][read][5]), str(table[options.tax_databases[x]][read][0]),str(table[options.tax_databases[x]][read][1]),str(table[options.tax_databases[x]][read][2]), str(table[options.tax_databases[x]][read][6]))
            else:
               fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile,'\n')
    outputfile.close() 

    # collect the exact reads 
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
               database_hits[read] = [ table[options.tax_databases[x]][read][4], table[options.tax_databases[x]][read][5]]

    # pick the hits, trim them according to the match and write them
    if options.fasta:
      selected_sequences={}
      read_select_fasta_sequences(database_hits, selected_sequences, options.fasta)
      for read in database_hits:
         selected_sequences[read] = selected_sequences[read][database_hits[read][0]:database_hits[read][1]] 
      write_selected_sequences(selected_sequences, options.output +'.fasta')
def check_an_format_refdb(dbname, seqType,  config_settings, params, globallogger = None): 

    algorithm=  get_parameter( params,'annotation','algorithm').upper()
    
    suffixes=[]
    
    # we do not use LAST for searching the taxonomic databases, e.g., greengenes, silva, etc.
    # if the db formatting request is done with nucl and LAST, we switch to BLAST-based formatting
    if algorithm == 'LAST' and seqType == 'nucl':
       algorithm = 'BLAST'
    
    if algorithm == 'LAST' and seqType == 'prot':
        suffixes = [ 'des', 'sds', 'suf', 'bck', 'prj', 'ssp', 'tis' ]
    
    if algorithm == 'BLAST':
      if seqType=='prot':
        suffixes = ['phr', 'psq', 'pin']
    
      if seqType=='nucl':
        suffixes = ['nhr', 'nsq', 'nin']
        
    # formatted DB directories
    taxonomic_formatted = config_settings['REFDBS'] + PATHDELIM + 'taxonomic' + PATHDELIM + 'formatted'
    functional_formatted = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted'
    # check if the formatted folder exists; if not, create it
    for d in [taxonomic_formatted, functional_formatted]:
        if not createFolderIfNotFound(d):
            eprintf("WARNING : Creating formatted subdirectory in blastDB folder.\n")
    
    # formatted database output paths
    if seqType == 'nucl':
       seqPath= config_settings['REFDBS'] + PATHDELIM + 'taxonomic' + PATHDELIM +  dbname
       formattedDBPath = taxonomic_formatted + PATHDELIM +  dbname
    elif seqType == 'prot':
       seqPath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM +  dbname
       formattedDBPath = functional_formatted + PATHDELIM +  dbname
    else:
       eprintf("ERROR : Undefined sequnce type for %s!\n", dbname) 
       if globallogger!=None:
          globallogger.write("ERROR \t Undefined sequnce type for %s!\n" %( dbname) )
       exit_process()
    
    # database formatting executables paths
    if algorithm == 'LAST' and seqType =='prot':
      executable  = config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['LASTDB_EXECUTABLE']
    else: # algorithm == 'BLAST':
      executable  = config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['FORMATDB_EXECUTABLE']

    if not (formatted_db_exists(formattedDBPath,  suffixes) ):
        eprintf("WARNING : You do not seem to have Database %s formatted!\n", dbname)
        if globallogger!=None:
          globallogger.write("WARNING\t You do not seem to have Database %s formatted!\n" %(dbname) )
        if check_if_raw_sequences_exist(seqPath):
            eprintf("          Found raw sequences for  Database %s in folder %s!\n", dbname, seqPath)
            eprintf("          Trying to format on the fly .... for %s!\n", algorithm )
            if globallogger!=None:
               globallogger.write("WARNING\t Found raw sequences for  Database %s in folder %s!\n" %(dbname, seqPath) )
               globallogger.write("Trying to format on the fly .... for %s!\n" %(algorithm ) )

            result =format_db(executable, seqType, seqPath, formattedDBPath, algorithm)
            if result ==True:
                eprintf("          Formatting successful!\n")
                return 
            else:
                eprintf("          Formatting failed! Please consider formatting manually or do not try to annotate with this database!\n")
                if globallogger!=None:
                  globallogger.write("ERROR\tFormatting failed! Please consider formatting manually or do not try to annotate with this database!\n")
                exit_process()

        eprintf("ERROR : You do not even have the raw sequence for Database %s to format!\n", dbname)
        eprintf("        in the folder %s\n", seqPath)
        eprintf("        Please put the appropriate files in folder \"blastDB\"\n")
        if globallogger!=None:
            globallogger.write("ERROR \t You do not even have the raw sequence for Database %s to format!\n" %( dbname) )
            globallogger.write("in the folder %s\n" %(seqPath))
            globallogger.write("Please put the appropriate files in folder \"blastDB\"\n")
        exit_process()
def check_config_settings(config_settings, file, globalerrorlogger = None):
   essentialItems= ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
   missingItems = []

   for key, value in  config_settings.items():
      # these are not files or executables

      if key in ['NUM_CPUS', 'FORMATTED_DB_SIZE' ]:
        continue

      if key in ['FORMATDB_EXECUTABLE', 'BLASTP_EXECUTABLE', 'BLASTN_EXECUTABLE' ] and value=='':
        continue 



      # make sure  MetaPathways directory is present
      if key in ['METAPATHWAYS_PATH' ]:
         if not path.isdir( config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 1.Currently it is set to \"%s\"\n",  config_settings[key] )  

            if globalerrorlogger!=None:
               globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"  %(key, file))  
               globalerrorlogger.write("       Currently it is set to \"%s\". Please correct it and try again.\n" %(config_settings[key] )  )
            missingItems.append(key) 
         continue


      # make sure  REFDB directories are present
      if key in [ 'REFDBS' ]:
         if not path.isdir( config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 2.Currently it is set to \"%s\"\n", config_settings[key] )  
            if globalerrorlogger!=None:
                globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key,file))
                globalerrorlogger.write("Currently it is set to \"%s\". Please correct it and try again.\n" %( config_settings[key]) )  
            missingItems.append(key) 
         continue

      # make sure EXECUTABLES_DIR directories are present
      if key in [ 'EXECUTABLES_DIR']:
         if not path.isdir( config_settings['METAPATHWAYS_PATH'] + PATHDELIM +  config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 3.Currently it is set to \"%s\"\n", config_settings[key] )  
            if globalerrorlogger!=None:
               globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key, file))  
               globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path.\n" %( config_settings[key] )) 
            missingItems.append(key) 
         continue

      if key in [ 'ACCESSION_TO_TAXONID']:
         if not path.isfile( config_settings['REFDBS'] + PATHDELIM +   'ncbi_tree' + PATHDELIM +  config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 7.Currently it is set to \"%s\"\n", config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM +config_settings[key] )  
            if globalerrorlogger!=None:
               globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key, file))  
               globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path to compute LCA with accession id translation.\n" %( config_settings[key] )) 
            missingItems.append(key) 
         continue


      # make sure RESOURCES_DIR directories are present
      if key in [ 'RESOURCES_DIR']:
         if not path.isdir( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 4.Currently it is set to \"%s\"\n",  config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key] )  
            if globalerrorlogger!=None:
               globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key, file))
               globalerrorlogger.write("Currently it is set to \"%s\"\n" %( config_settings[key]))  
            missingItems.append(key) 
         continue

      # make sure  MetaPaths directory is present
      if key in ['PYTHON_EXECUTABLE' , 'PATHOLOGIC_EXECUTABLE' ]:
         if not path.isfile( config_settings[key]) :
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing)  in configuration file \"%s\"\n", key, file)  
            eprintf("ERROR: 5.Currently it is set to \"%s\"\n", config_settings[key] )  
            if globalerrorlogger!=None:
               globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key, file)) 
               globalerrorlogger.write("Currently it is set to \"%s\"\n" %( config_settings[key] ) )
            missingItems.append(key) 
         continue

      # ignore pgdb folder for now
      if key in ['PGDB_FOLDER' ]:
          continue
      
      # check if the desired file exists. if not, then print a message
      if not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM +  value)\
        and  not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value ) :
           eprintf("ERROR:Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)  
           eprintf("6.Currently it is set to \"%s\"\n", config_settings['METAPATHWAYS_PATH']+ PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value ) 
           if globalerrorlogger!=None:
              globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" %(key, file) )
              globalerrorlogger.write("Currently it is set to \"%s\"\n" %(config_settings['METAPATHWAYS_PATH'] + value)) 
           missingItems.append(key) 
           continue
     
   stop_execution = False
   for item in missingItems:
      if item in essentialItems:
         eprintf("ERROR\t Essential field in setting %s is missing in configuration file!\n", item)
         if globalerrorlogger!=None:
            globalerrorlogger.write("ERROR\tEssential field in setting %s is missing in configuration file!\n" %(item))
         stop_execution = True

   if stop_execution:
      eprintf("ERROR: Terminating execution due to missing essential fields in the configuration file!\n")
      if globalerrorlogger!=None:
         globalerrorlogger.write("ERROR\tTerminating execution due to missing essential fields in the configuration file!\n")
      exit_process()
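# Hedged stand-ins for the printing helpers (eprintf, printf, fprintf)
# used throughout these snippets; their real implementations are not shown
# here. These minimal versions are assumptions consistent with the call
# sites above: printf-style formatting applied only when extra args exist.
import sys

def eprintf(fmt, *args):
    sys.stderr.write(fmt % args if args else fmt)

def printf(fmt, *args):
    sys.stdout.write(fmt % args if args else fmt)

def fprintf(fileobj, fmt, *args):
    fileobj.write(fmt % args if args else fmt)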
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)
    if options.inputfolder ==None:
       parser.error('ERROR\tInput folder for Pathologic not found')
    else:
      # required files to be able to build ePGDB
      files = [ 
                #options.inputfolder + PATHDELIM + '0.pf',
                # options.inputfolder + PATHDELIM + '0.fasta',
                options.inputfolder + PATHDELIM + 'genetic-elements.dat',  
                options.inputfolder + PATHDELIM + 'organism-params.dat'
              ]

      if files_exist( files , errorlogger = errorlogger):
        exit_process("ERROR\tCannot find all inputs for Pathologic in folder %s : "  %(options.inputfolder) )

    # is there a pathwaytools executable installed
    if not path.exists(options.ptoolsExec):
       eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
       if errorlogger:
          errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n",  options.ptoolsExec)
       exit_process("ERROR\tPathwayTools executable %s not found!\n" %(options.ptoolsExec))


    # command to build the ePGDB
    command = "%s -patho %s"  %(options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
       command += " -no-taxonomic-pruning "

    if options.no_web_cel_overview:
       command += " -no-web-cel-overview"

    command += " -tip"
    command += " -api"

    status =0
    fix_pgdb_input_files(options.pgdbdir, pgdbs = [])


    if not path.exists(options.pgdbdir):
      status  = runPathologicCommand(runcommand = command) 
      fix_pgdb_input_files(options.pgdbdir, pgdbs = [])
    if status!=0:
       eprintf("ERROR\tFailed to run Pathologic on input %s : \n" %(options.inputfolder))
       eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
       if errorlogger:
          errorlogger.write("ERROR\tFailed to run Pathologic on input %s : " %(options.inputfolder))
          errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again")
          errorlogger.write("     : " + command)
          insert_error(9)
       sys.exit(0)
       #exit_process("ERROR\tFailed to run Pathologic on input %s : "  %(options.inputfolder) )


    if not path.exists(options.reactions_list):
       try:
           pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
           pythonCyc.setDebug() # disable pathway debug statements
           printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
           resultLines = pythonCyc.getReactionListLines()
           #pythonCyc.stopPathwayTools()
           reaction_list_file = open(options.reactions_list + ".tmp", 'w')
           for line in resultLines:
              fprintf(reaction_list_file,"%s\n",line.strip())
           reaction_list_file.close()
           rename(options.reactions_list + ".tmp", options.reactions_list)
           StopPathwayTools()

       except:
           traceback.print_exc(10)
           eprintf("ERROR\tFailed to run extract pathways for %s : \n" %(options.sample_name))
           eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
           if errorlogger:
               errorlogger.write("ERROR\tFailed to run extract pathways for %s : " %(options.sample_name))
               errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
           insert_error(9)
           StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)
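# Hedged sketch of files_exist(), which is not defined in this snippet.
# The caller above treats a truthy return as "cannot find all inputs", so
# one reading consistent with that call site is to return the list of
# missing files (truthy exactly when something is absent):
from os import path

def files_exist(files, errorlogger=None):
    missing = [f for f in files if not path.exists(f)]
    if missing and errorlogger:
        for f in missing:
            errorlogger.printf("ERROR\tCannot find input file %s\n", f)
    return missing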
def main(argv, errorlogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)

    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    sample_name = opts.sample_name
    folder_path = opts.folder_path
    results = []

    try:
        STEP_NAME = "GATHER_STATS"
        # read the nucleotide sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'nuc')
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tCannot read nuc stats file\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the amino acid sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'amino')
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tCannot read amino stats file\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the blast/last hits
        status = get_BLAST_LAST_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tReading BLAST HITS\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the selected parsed blast/last hits
        status = get_BLAST_LAST_parsed_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tReading parsed BLAST HITS\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the annotated gff hits
        status = get_annotation_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the functional and taxonomic hits
        status = get_functional_taxonomic_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the number of ORFs that are used for mapping to functional categories
        status = get_ORF_annotations_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the rRNA hits
        status = get_rRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the tRNA hits
        status = get_tRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        stats_file_name = folder_path + PATHDELIM + 'run_statistics' + PATHDELIM + sample_name + '.run.stats.txt'

        try:
            statsfile = open(stats_file_name, 'w')
        except:
            print "ERROR : Cannot open stats file " + stats_file_name
            sys.exit(0)

        for pair in results:
            fprintf(statsfile, '%s\t%s\n', pair[0], pair[1])
        statsfile.close()
    except:
        exit_process()
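# Hedged sketch of get_stats_from_stats_file(), whose real reader is not
# in this snippet. Assuming each per-sample stats file is itself two
# tab-separated columns (statistic name, value) under a hypothetical
# layout, a minimal version matching the caller's contract (a list of
# pairs on success, None on failure) could be:
def get_stats_from_stats_file(sample_name, folder_path, seq_type):
    # hypothetical file name and location; the real layout may differ
    stats_path = folder_path + PATHDELIM + 'run_statistics' + PATHDELIM +\
                 sample_name + '.' + seq_type + '.stats.txt'
    try:
        statsfile = open(stats_path, 'r')
    except IOError:
        return None
    pairs = []
    for line in statsfile:
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) == 2:
            pairs.append((fields[0], fields[1]))
    statsfile.close()
    return pairs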
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger = None):
    readerhandles = []

    if verbose:
       eprintf("Processing for database  : %s\n", dbname)

    if len(filenames)==0:
       eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n", dbname)
       exit_process()

    try:
       for i in range(len(filenames)):
         #print filenames
         readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) )
    except IOError:
      eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
      exit_process()

    # set error and warning parameters 
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
       outputfile = open(outputfilename, 'w')
       fieldmapHeaderLine = readerhandles[0].getHeaderLine()
       fprintf(outputfile, "%s\n",fieldmapHeaderLine)
    except IOError:
       eprintf("ERROR: Cannot create output file : %s\n", outputfilename)
       exit_process()

    values = []
    for i in range(len(filenames)):
       iterate = iter(readerhandles[i])
       try :
          next(iterate)
          line = readerhandles[i].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          shortORFId = getShortORFId(fields[0])
          values.append( (i, orfRanks[shortORFId], line) )
       except:
          outputfile.close()
          return

    S = len(filenames)
    BuildHeap(S, values)

    while S>0:
       try:
          iterate = iter(readerhandles[values[0][0]])
          line = readerhandles[values[0][0]].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          #print fields[0], orfRanks[fields[0]]
          fprintf(outputfile, "%s\n",line)
          next(iterate)

          line = readerhandles[values[0][0]].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          shortORFId = getShortORFId(fields[0])
          values[0] = (values[0][0], orfRanks[shortORFId], line)
       except:
          #import traceback
          #traceback.print_exc()
          #print 'finished ' + str(S)
          values[0] = values[S-1]
          S = S - 1

       if S>0:
          Heapify(values, 0, S)

    outputfile.close()
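# Hedged sketch of the min-heap helpers used by the k-way merge above.
# BuildHeap(S, values) and Heapify(values, i, S) are not defined in this
# snippet; versions consistent with how they are called, ordering the
# (file index, ORF rank, line) tuples by their rank field, would be:
def Heapify(values, i, S):
    # sift values[i] down within the first S entries (min-heap on rank)
    while True:
        left, right, smallest = 2 * i + 1, 2 * i + 2, i
        if left < S and values[left][1] < values[smallest][1]:
            smallest = left
        if right < S and values[right][1] < values[smallest][1]:
            smallest = right
        if smallest == i:
            return
        values[i], values[smallest] = values[smallest], values[i]
        i = smallest

def BuildHeap(S, values):
    # heapify bottom-up so values[0] holds the smallest-ranked line
    for i in range(S // 2 - 1, -1, -1):
        Heapify(values, i, S)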
Example #38
def sigint_handler(signum, frame):
    eprintf("Received TERMINATION signal\n")
    exit_process()
Example #39
def check_config_settings(config_settings, file, globalerrorlogger=None):
    essentialItems = ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
    missingItems = []

    for key, value in config_settings.items():
        # make sure  MetaPathways directory is present
        if key in ['METAPATHWAYS_PATH']:
            if not path.isdir(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 1.Currently it is set to \"%s\"\n",
                        config_settings[key])

                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write(
                        "       Currently it is set to \"%s\"\n" %
                        (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure  REFDB directories are present
        if key in ['REFDBS']:
            if not path.isdir(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 2.Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure EXECUTABLES_DIR directories are present
        if key in ['EXECUTABLES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] +
                              PATHDELIM + config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 3.Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure RESOURCES_DIR directories are present
        if key in ['RESOURCES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] +
                              PATHDELIM + config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                    key, file)
                eprintf(
                    "ERROR: 4.Currently it is set to \"%s\"\n",
                    config_settings['METAPATHWAYS_PATH'] + PATHDELIM +
                    config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure  MetaPaths directory is present
        if key in ['PYTHON_EXECUTABLE', 'PATHOLOGIC_EXECUTABLE']:
            if not path.isfile(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 5.Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # ignore pgdb folder for now
        if key in ['PGDB_FOLDER']:
            continue

        # check if the desired file exists. if not, then print a message
        if not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM +  value)\
          and  not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value ) :
            eprintf(
                "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n",
                key, file)
            eprintf("6.Currently it is set to \"%s\"\n",
                    config_settings['METAPATHWAYS_PATH'] + PATHDELIM + value)
            if globalerrorlogger != None:
                globalerrorlogger.write(
                    "ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n"
                    % (key, file))
                globalerrorlogger.write(
                    "Currently it is set to \"%s\"\n" %
                    (config_settings['METAPATHWAYS_PATH'] + PATHDELIM + value))
            missingItems.append(key)
            continue

    stop_execution = False
    for item in missingItems:
        if item in essentialItems:
            eprintf(
                "ERROR\tEssential setting %s is missing in the configuration file!\n",
                item)
            if globalerrorlogger != None:
                globalerrorlogger.write(
                    "ERROR\tEssential setting %s is missing in the configuration file!\n"
                    % (item))
            stop_execution = True

    if stop_execution:
        eprintf(
            "ERROR: Terminating execution due to missing essential fields in the configuration file!\n"
        )
        if globalerrorlogger != None:
            globalerrorlogger.write(
                "ERROR\tTerminating execution due to missing essential fields in the configuration file!\n"
            )
        exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)
    if options.inputfolder == None:
        parser.error('ERROR\tInput folder for Pathologic not found')
    else:
        # required files to be able to build ePGDB
        files = [
            options.inputfolder + PATHDELIM + '0.pf',
            # options.inputfolder + PATHDELIM + '0.fasta',
            options.inputfolder + PATHDELIM + 'genetic-elements.dat',
            options.inputfolder + PATHDELIM + 'organism-params.dat'
        ]

        if files_exist(files, errorlogger=errorlogger):
            exit_process(
                "ERROR\tCannot find all inputs for Pathologic in folder %s : "
                % (options.inputfolder))

    # is there a pathwaytools executable installed
    if not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        if errorlogger:
            errorlogger.printf(
                "ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" %
                     (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s -patho %s" % (options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
        command += " -no-taxonomic-pruning "

    if options.no_web_cel_overview:
        command += " -no-web-cel-overview"

    command += " -api"

    status = 0
    fix_pgdb_input_files(options.pgdbdir, pgdbs=[])

    if not path.exists(options.pgdbdir):
        status = runPathologicCommand(runcommand=command)
        fix_pgdb_input_files(options.pgdbdir, pgdbs=[])
    if status != 0:
        eprintf("ERROR\tFailed to run Pathologic on input %s : \n" %
                (options.inputfolder))
        eprintf(
            "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
        )
        if errorlogger:
            errorlogger.write(
                "ERROR\tFailed to run Pathologic on input %s : " %
                (options.inputfolder))
            errorlogger.write(
                "INFO\tKill any other PathwayTools instance running on the machine and try again"
            )
            errorlogger.write("     : " + command)
        exit_process("ERROR\tFailed to run Pathologic on input %s : " %
                     (options.inputfolder))

    if not path.exists(options.reactions_list):
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " +
                   options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            rename(options.reactions_list + ".tmp", options.reactions_list)

            StopPathwayTools()

        except:
            traceback.print_exc(10)
            eprintf("ERROR\tFailed to run extract pathways for %s : \n" %
                    (options.sample_name))
            eprintf(
                "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
            )
            if errorlogger:
                errorlogger.write(
                    "ERROR\tFailed to run extract pathways for %s : " %
                    (options.sample_name))
                errorlogger.write(
                    "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
                )
            StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)
Example #41
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
       print usage
       sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' +  ' '.join(argv))
    # initialize the input directory or file
    input_fp = opts.input_fp 
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only
    sample_subset= opts.sample_subset

    run_type = opts.run_type.strip()


    '''no need to remove the whole directory'''
#    if run_type == 'overwrite':
#       force_remove_dir=True
#    else:
#       force_remove_dir=False

    if opts.config_file:
       config_file= opts.config_file
    else:
       config_file = cmd_folder + PATHDELIM + metapaths_config
    
    if opts.ncbi_header and opts.ncbi_sbt:
       if not path.exists(opts.ncbi_header):
          print "Could not open or missing NCBI header file " + opts.ncbi_header
          print "Either disable option to CREATE_SEQUIN_FILE or provide a valid header file"
          sys.exit(0)

       if  not path.exists(opts.ncbi_sbt):
          print """You must must have a sbt file obtained from the NCBI \"Create Submission Template\" form \n 
                 http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt
          sys.exit(0)

       ncbi_sequin_params = path.abspath(opts.ncbi_header)
       ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
       ncbi_sequin_params = None
       ncbi_sequin_sbt = None

    # try to load the parameter file    
    try:
        parameter_f = opts.parameter_fp
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.parameter_fp

    
    try:
       if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
             makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir  + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r  overwrite\" to force overwrite it."
        sys.exit(1)

        
    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates
    
    command_line_params={}
    command_line_params['verbose']= opts.verbose

    params=parse_metapaths_parameters(parameter_f)
    format = params['INPUT']['format']

    """ load the sample inputs  it expects either a fasta 
        file or  a directory containing fasta and yaml file pairs
    """

    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name= 'global_errors_warnings'), open_mode='w') 
    
    input_output_list = {}
    # TODO: Check for illumina paired data... this complicates things a little. 
    if path.isfile(input_fp):   
       """ check if it is a file """
       # TODO: Check for illumina pattern, if so check for pairs
       input_output_list = create_an_input_output_pair(input_fp, output_dir, format, globalerrorlogger = globalerrorlogger)
    else:
       if path.exists(input_fp):   
          """ check if dir exists """
          input_output_list = create_input_output_pairs(input_fp, output_dir, format, globalerrorlogger=globalerrorlogger)
       else:   
          """ must be an error """
          eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
          eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
          exit_process("ERROR\tAs provided as arguments in the -in option.!\n")
   
    """ these are the subset of sample to process if specified
        in case of an empty subset process all the sample """
    if sample_subset:
       remove_unspecified_samples(input_output_list, sample_subset, format, globalerrorlogger = globalerrorlogger)


    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    parameter =  Parameters()
    if not staticDiagnose(config_settings, params, logger = globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")

    
    
    samplesData = {}
    # PART1 before the blast

    block_mode = opts.block_mode
    runid = opts.runid

    try:
         # load the sample information 
         if len(input_output_list): 
              for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()
   
                s = SampleData() 
                s.setInputOutput(inputFile = input_file, sample_output_dir = sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.clearJobs()
   
                if run_type=='overwrite' and  path.exists(sample_output_dir):
                   shutil.rmtree(sample_output_dir)
                   makedirs(sample_output_dir)
                if not  path.exists(sample_output_dir):
                   makedirs(sample_output_dir)
   
                s.prepareToRun()
                samplesData[input_file] = s
   
              # load the sample information 
              run_metapathways(
                   samplesData,
                   sample_output_dir,
                   output_dir,
                   globallogger = globalerrorlogger,
                   command_line_params=command_line_params,
                   params=params,
                   metapaths_config=metapaths_config,
                   status_update_callback=status_update_callback,
                   config_file=config_file,
                   run_type = run_type, 
                   config_settings = config_settings,
                   block_mode = block_mode,
                   runid = runid
              )
         else: 
              eprintf("ERROR\tNo input files in the specified folder %s to process!\n",sQuote(input_fp) )
              globalerrorlogger.printf("ERROR\tNo input files in the specified folder %s to process!\n",sQuote(input_fp) )
   
        
         # blast the files
     
         blasting_system =    get_parameter(params,  'metapaths_steps', 'BLAST_REFDB', default='yes')
         if blasting_system =='grid':
            #  blasting the files on the grid
             input_files = sorted_input_output_list
             blast_in_grid(
                   samplesData[input_file],
                   input_files, 
                   path.abspath(opts.output_dir),   #important to use opts.
                   params=params,
                   metapaths_config=metapaths_config,
                   config_file=config_file,
                   run_type = run_type,
                   runid = runid
                )
     
    except:
       globalerrorlogger.write( "ERROR\t" + str(traceback.format_exc(10)))
       exit_process("ERROR:" + str(traceback.format_exc(10)))


    
    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(4)
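# Hedged sketch of get_parameter() as used above, e.g.
# get_parameter(params, 'annotation', 'algorithm', default='LAST').
# Assuming params is the nested dict returned by
# parse_metapaths_parameters(), a minimal lookup with a fallback default:
def get_parameter(params, category, field, default=None):
    # return params[category][field] when present, else the default
    if params and category in params and field in params[category]:
        return params[category][field]
    return default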
    def isWithinCutoffs(self, words, data, cutoffs, annot_map, refbitscores):

        try:
            orfid = ShortORFId(words[0])
        except:
            orfid = words[0]

        data['query'] = orfid

        try:
            data['target'] = words[1]
        except:
            data['target'] = 0

        try:
            data['q_length'] = int(words[7]) - int(words[6]) + 1
        except:
            data['q_length'] = 0

        try:
            data['bitscore'] = float(words[11])
        except:
            data['bitscore'] = 0

        try:
            data['bsr'] = float(words[11]) / refbitscores[orfid]
        except:
            #print "words 0 " + str(refscores[words[0]])
            #print "words 11 " + str( words[11])
            data['bsr'] = 0

        try:
            data['expect'] = float(words[10])
        except:
            data['expect'] = 0

        try:
            data['aln_length'] = float(words[3])
        except:
            data['aln_length'] = 0

        try:
            data['identity'] = float(words[2])
        except:
            data['identity'] = 0

        try:
            data['product'] = annot_map[words[1]]
        except:
            eprintf("Sequence with name \"" + words[1] +
                    "\" is not present in map file\n")
            if self.error_and_warning_logger:
                self.error_and_warning_logger.write(
                    "Sequence with name %s is not present in map file " %
                    (words[1]))
            self.incErrorCount()
            if self.maxErrorsReached():
                if self.error_and_warning_logger:
                    self.error_and_warning_logger.write(
                        "Number of sequences absent in map file %s exceeds %d" %
                        (self.blastoutput, self.ERROR_COUNT))
                exit_process(
                    "Number of sequences absent in map file %s exceeds %d" %
                    (self.blastoutput, self.ERROR_COUNT))
            data['product'] = 'hypothetical protein'

        try:
            m = re.search(r'(\d+[.]\d+[.]\d+[.]\d+)', data['product'])
            if m != None:
                data['ec'] = m.group(0)
            else:
                data['ec'] = ''
        except:
            data['ec'] = ''

        if cutoffs.taxonomy:
            try:
                m = re.search(r'\[([^\[]+)\]', data['product'])
                if m != None:
                    data['taxonomy'] = m.group(1)
                else:
                    data['taxonomy'] = ''
            except:
                data['taxonomy'] = ''

        if cutoffs.remove_taxonomy:
            try:
                data['product'] = re.sub(r'\[([^\[]+)\]', '', data['product'])
            except:
                data['product'] = ''

        if cutoffs.remove_ec:
            try:
                data['product'] = re.sub(
                    r'\([Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\)', '',
                    data['product'])
                data['product'] = re.sub(
                    r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\]', '',
                    data['product'])
                data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.-]\]',
                                         '', data['product'])
                data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.-.-]\]', '',
                                         data['product'])
                data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.-.-.-]\]', '',
                                         data['product'])
            except:
                data['product'] = ''

        if data['q_length'] < cutoffs.min_length:
            return False

        if data['bitscore'] < cutoffs.min_score:
            return False

        if data['expect'] > cutoffs.max_evalue:
            return False

        if data['identity'] < cutoffs.min_identity:
            return False

        if data['bsr'] < cutoffs.min_bsr:
            return False

        # other cutoff fields available on the cutoffs object: min_length,
        # min_score, max_evalue, min_identity, limit, max_length,
        # min_query_coverage, max_gaps, min_bsr
        return True
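# Illustration (hypothetical values) of the 12-column BLAST/LAST tabular
# row that isWithinCutoffs() indexes above via words[...]:
#   0 query id    1 subject id   2 % identity   3 aln length
#   4 mismatches  5 gap opens    6 q. start     7 q. end
#   8 s. start    9 s. end      10 e-value     11 bit score
words = "ORF_1\tgi|1234|ref\t98.5\t240\t3\t0\t1\t240\t10\t249\t1e-50\t450".split('\t')
q_length = int(words[7]) - int(words[6]) + 1   # 240, as computed above
bitscore = float(words[11])                    # 450.0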
Example #43
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
        print usage
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))
    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only

    sample_subset = removeSuffix(opts.sample_subset)

    run_type = opts.run_type.strip()
    '''no need to remove the whole directory'''
    #    if run_type == 'overwrite':
    #       force_remove_dir=True
    #    else:
    #       force_remove_dir=False

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print "Could not open or missing NCBI header file " + opts.ncbi_header
            print "Either disable option to CREATE_SEQUIN_FILE or provide a valid header file"
            sys.exit(0)

        if not path.exists(opts.ncbi_sbt):
            print """You must must have a sbt file obtained from the NCBI \"Create Submission Template\" form \n 
                 http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt
            sys.exit(0)

        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
        ncbi_sequin_params = None
        ncbi_sequin_sbt = None

    # try to load the parameter file
    try:
        if opts.parameter_fp:
            parameter_fp = opts.parameter_fp
        else:
            parameter_fp = cmd_folder + PATHDELIM + metapaths_param
    except IOError:
        raise IOError, (
            "Can't open parameters file (%s). Does it exist? Do you have read access?"
            % opts.parameter_fp)

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir  + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r  overwrite\" to force overwrite it."
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    params = parse_metapaths_parameters(parameter_fp)
    """ load the sample inputs  it expects either a fasta 
        file or  a directory containing fasta and yaml file pairs
    """

    globalerrorlogger = WorkflowLogger(generate_log_fp(
        output_dir, basefile_name='global_errors_warnings'),
                                       open_mode='w')

    input_output_list = {}
    if path.isfile(input_fp):
        """ check if it is a file """
        input_output_list = create_an_input_output_pair(
            input_fp, output_dir, globalerrorlogger=globalerrorlogger)
    else:
        if path.exists(input_fp):
            """ check if dir exists """
            input_output_list = create_input_output_pairs(
                input_fp, output_dir, globalerrorlogger=globalerrorlogger)
        else:
            """ must be an error """
            eprintf(
                "ERROR\tNo valid input sample file or directory containing samples exists!\n"
            )
            eprintf("ERROR\tCheck the path provided with the -in option!\n")
            exit_process(
                "ERROR\tCheck the path provided with the -in option!\n")
    """ these are the subset of sample to process if specified
        in case of an empty subset process all the sample """

    # remove all samples that are not specified, unless sample_subset is empty
    remove_unspecified_samples(input_output_list,
                               sample_subset,
                               globalerrorlogger=globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())

    filetypes = check_file_types(sorted_input_output_list)

    # stop on invalid samples
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
        globalerrorlogger.printf(
            "ERROR\tInvalid inputs found. Check for file with bad format or characters!\n"
        )
        halt_process(opts.delay)

    # make sure the sample files are found
    report_missing_filenames(input_output_list,
                             sample_subset,
                             logger=globalerrorlogger)

    #check the pipeline configuration
    config_settings = read_pipeline_configuration(config_file,
                                                  globalerrorlogger)

    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf(
            "ERROR\tFailed to pass the test for required scripts and inputs before run\n"
        )
        globalerrorlogger.printf(
            "ERROR\tFailed to pass the test for required scripts and inputs before run\n"
        )
        halt_process(opts.delay)

    samplesData = {}
    # PART1 before the blast

    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        print "RUNNING MetaPathways version 2.5.2"
        if len(input_output_list):
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params,
                                          'annotation',
                                          'algorithm',
                                          default='LAST').upper()
                s = SampleData()
                s.setInputOutput(inputFile=input_file,
                                 sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                if params["INPUT"]['format'] in [
                        "gbk-annotated", "gff-annotated"
                ]:
                    s.setParameter('ANNOTATED', True)
                else:
                    s.setParameter('ANNOTATED', False)
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()

                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # load the sample information
            run_metapathways(samplesData,
                             sample_output_dir,
                             output_dir,
                             globallogger=globalerrorlogger,
                             command_line_params=command_line_params,
                             params=params,
                             metapaths_config=metapaths_config,
                             status_update_callback=status_update_callback,
                             config_file=config_file,
                             run_type=run_type,
                             config_settings=config_settings,
                             block_mode=block_mode,
                             runid=runid)
        else:
            eprintf(
                "ERROR\tNo valid input files specified to process in folder %s!\n",
                sQuote(input_fp))
            globalerrorlogger.printf(
                "ERROR\tNo valid input files to process in folder %s!\n",
                sQuote(input_fp))

        # blast the files

        blasting_system = get_parameter(params,
                                        'metapaths_steps',
                                        'BLAST_REFDB',
                                        default='yes')
        if blasting_system == 'grid':
            #  blasting the files on the grid
            input_files = sorted_input_output_list
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),  #important to use opts.
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid)

    except:
        exit_process(str(traceback.format_exc(10)), logger=globalerrorlogger)

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(opts.delay)
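
# NOTE: get_parameter() is called repeatedly above but is not defined in this
# excerpt. A minimal sketch of what such a helper presumably does (a nested
# dictionary lookup that falls back to a default) is given below; the real
# MetaPathways implementation may differ.
def _get_parameter_sketch(params, section, key, default=None):
    """Return params[section][key] if present, otherwise the default."""
    try:
        return params[section][key]
    except (KeyError, TypeError):
        return default

# Hypothetical usage, mirroring the call sites above:
#   _get_parameter_sketch({'annotation': {'algorithm': 'last'}},
#                         'annotation', 'algorithm', default='LAST')  # -> 'last'
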
def main(argv):
    global parser

    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
       print usage
       sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("%-10s:%s\n" %('COMMAND', sys.argv[0] + ' ' +  ' '.join(argv)) )
    # initialize the input directory or file
    input_fp = opts.input_fp 
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only

    sample_subset = removeSuffix(opts.sample_subset)

    run_type = opts.run_type.strip()


    '''no need to remove the whole directory'''
#    if run_type == 'overwrite':
#       force_remove_dir=True
#    else:
#       force_remove_dir=False

    if opts.config_file:
       config_file= opts.config_file
    else:
       config_file = cmd_folder + PATHDELIM + metapaths_config
    

    # try to load the parameter file    
    try:
       if opts.parameter_fp:
          parameter_fp= opts.parameter_fp
       else:
          parameter_fp = cmd_folder + PATHDELIM + metapaths_param
    except IOError:
        raise IOError, ( "Can't open parameters file (%s). Does it exist? Do you have read access?" % opts.parameter_fp )

    
    try:
       if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
             makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir  + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r  overwrite\" to force overwrite it."
        sys.exit(2)

        
    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates
    
    command_line_params={}
    command_line_params['verbose']= opts.verbose

    if not path.exists(parameter_fp):
        eprintf("%-10s: No parameters file %s found!\n" %('WARNING', parameter_fp))
        eprintf("%-10s: Creating a parameters file %s found!\n" %('INFO', parameter_fp))
        create_metapaths_parameters(parameter_fp, cmd_folder)

    params=parse_metapaths_parameters(parameter_fp)

    """ load the sample inputs  it expects either a fasta 
        file or  a directory containing fasta and yaml file pairs
    """

    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name= 'global_errors_warnings'), open_mode='w') 

    input_output_list = {}
    if path.isfile(input_fp):   
       """ check if it is a file """
       input_output_list = create_an_input_output_pair(input_fp, output_dir,  globalerrorlogger=globalerrorlogger)
    else:
       if path.exists(input_fp):   
          """ check if dir exists """
          input_output_list = create_input_output_pairs(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
       else:   
          """ must be an error """
          eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
          eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
          exit_process("ERROR\tAs provided as arguments in the -in option.!\n")
   
    """ these are the subset of sample to process if specified
        in case of an empty subset process all the sample """

    # remove all samples that are not specified unless sample_subset is empty
    remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())

    filetypes = check_file_types(sorted_input_output_list) 

    # stop on invalid samples
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
       globalerrorlogger.printf("ERROR\tInvalid inputs found. Check for file with bad format or characters!\n")
       halt_process(opts.delay)

    # make sure the sample files are found
    report_missing_filenames(input_output_list, sample_subset, logger=globalerrorlogger)


    # check the pipeline configuration

    if not path.exists(config_file):
        eprintf("%-10s: No config file %s found!\n" %('WARNING', config_file))
        eprintf("%-10s: Creating a config file %s!\n" %('INFO', config_file))
        if not environment_variables_defined():
           sys.exit(0)
        create_metapaths_configuration(config_file, cmd_folder)

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)


    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger = globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        return 
    
    samplesData = {}
    # PART1 before the blast

    block_mode = opts.block_mode
    runid = opts.runid

    try:
         # load the sample information 
         print "RUNNING MetaPathways version FogDog 3.0"
         if len(input_output_list): 
              for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()
   
                s = SampleData() 
                s.setInputOutput(inputFile = input_file, sample_output_dir = sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()
   
                if run_type=='overwrite' and  path.exists(sample_output_dir):
                   shutil.rmtree(sample_output_dir)
                   makedirs(sample_output_dir)
                if not  path.exists(sample_output_dir):
                   makedirs(sample_output_dir)
   
                s.prepareToRun()
                samplesData[input_file] = s
   
              # run the pipeline stages on the samples
              run_metapathways(
                   samplesData,
                   sample_output_dir,
                   output_dir,
                   globallogger = globalerrorlogger,
                   command_line_params=command_line_params,
                   params=params,
                   metapaths_config=metapaths_config,
                   status_update_callback=status_update_callback,
                   config_file=config_file,
                   run_type = run_type, 
                   config_settings = config_settings,
                   block_mode = block_mode,
                   runid = runid
              )
         else:
              eprintf("ERROR\tNo valid input files, or no files specified, to process in folder %s!\n",sQuote(input_fp) )
              globalerrorlogger.printf("ERROR\tNo valid input files to process in folder %s!\n",sQuote(input_fp) )
   
    except:
       exit_process(str(traceback.format_exc(10)), logger= globalerrorlogger )


    
    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
def check_config_settings(config_settings, file, globalerrorlogger=None):
    essentialItems = ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
    missingItems = []

    for key, value in config_settings.items():
        # these are not files or executables

        if key in ['NUM_CPUS', 'FORMATTED_DB_SIZE']:
            continue

        if key in [
                'FORMATDB_EXECUTABLE', 'BLASTP_EXECUTABLE', 'BLASTN_EXECUTABLE'
        ] and value == '':
            continue

        # make sure  MetaPathways directory is present
        if key in ['METAPATHWAYS_PATH']:
            if not path.isdir(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 1.Currently it is set to \"%s\"\n",
                        config_settings[key])

                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write(
                        "       Currently it is set to \"%s\". Please correct it and try again.\n"
                        % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure  REFDB directories are present
        if key in ['REFDBS']:
            if not path.isdir(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 2.Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write(
                        "Currently it is set to \"%s\". Please correct it and try again.\n"
                        % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure EXECUTABLES_DIR directories are present
        if key in ['EXECUTABLES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] +
                              PATHDELIM + config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: 3.Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write(
                        "Currently it is set to \"%s\". Please correct the path.\n"
                        % (config_settings[key]))
                missingItems.append(key)
            continue

        if key in ['ACCESSION_TO_TAXONID']:
            if not path.isfile(config_settings['REFDBS'] + PATHDELIM +
                               'ncbi_tree' + PATHDELIM + config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf(
                    "ERROR: Currently it is set to \"%s\"\n",
                    config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' +
                    PATHDELIM + config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write(
                        "Currently it is set to \"%s\". Please correct the path to compute LCA with accession id translation.\n"
                        % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure RESOURCES_DIR directories are present
        if key in ['RESOURCES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] +
                              PATHDELIM + config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf(
                    "ERROR: Currently it is set to \"%s\"\n",
                    config_settings['METAPATHWAYS_PATH'] + PATHDELIM +
                    config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure  MetaPaths directory is present
        if key in ['PYTHON_EXECUTABLE', 'PATHOLOGIC_EXECUTABLE']:
            if not path.isfile(config_settings[key]):
                eprintf(
                    "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                    key, file)
                eprintf("ERROR: Currently it is set to \"%s\"\n",
                        config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write(
                        "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                        % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" %
                                            (config_settings[key]))
                missingItems.append(key)
            continue

        # ignore pgdb folder for now
        if key in ['PGDB_FOLDER']:
            continue

        # check if the desired file exists. if not, then print a message
        if not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM +  value)\
          and  not path.isfile( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value ) :
            eprintf(
                "ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n",
                key, file)
            eprintf(
                "Currently it is set to \"%s\"\n",
                config_settings['METAPATHWAYS_PATH'] + PATHDELIM +
                config_settings['EXECUTABLES_DIR'] + PATHDELIM + value)
            if globalerrorlogger != None:
                globalerrorlogger.write(
                    "ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n"
                    % (key, file))
                globalerrorlogger.write(
                    "Currently it is set to \"%s\"\n" %
                    (config_settings['METAPATHWAYS_PATH'] + PATHDELIM +
                     config_settings['EXECUTABLES_DIR'] + PATHDELIM + value))
            missingItems.append(key)
            continue

    stop_execution = False
    for item in missingItems:
        if item in essentialItems:
            eprintf(
                "ERROR\tEssential setting %s is missing in the configuration file!\n",
                item)
            if globalerrorlogger != None:
                globalerrorlogger.write(
                    "ERROR\tEssential setting %s is missing in the configuration file!\n"
                    % (item))
            stop_execution = True

    if stop_execution:
        eprintf(
            "ERROR: Terminating execution due to missing essential fields in the configuration file!\n"
        )
        if globalerrorlogger != None:
            globalerrorlogger.write(
                "ERROR\tTerminating execution due to missing essential fields in the configuration file!\n"
            )
        exit_process()
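
# A minimal, hypothetical usage sketch for check_config_settings(): only the
# keys handled explicitly above get special treatment; every other key is
# assumed to name a file under METAPATHWAYS_PATH or its EXECUTABLES_DIR.
# The paths below are placeholders, not shipped defaults.
example_config_settings = {
    'METAPATHWAYS_PATH': '/opt/MetaPathways/',  # must be an existing directory
    'EXECUTABLES_DIR': 'executables',           # relative to METAPATHWAYS_PATH
    'RESOURCES_DIR': 'resources',               # relative to METAPATHWAYS_PATH
    'REFDBS': '/data/refdbs/',                  # must be an existing directory
    'NUM_CPUS': '4',                            # skipped: not a file or path
    'FORMATDB_EXECUTABLE': '',                  # skipped when left empty
}
# check_config_settings(example_config_settings, 'template_config.txt')
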
def read_pipeline_configuration(file, globallogger):
    patternKEYVALUE = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
    try:
        configfile = open(file, 'r')
    except IOError:
        eprintf("ERROR: Did not find pipeline config %s!\n", file)
        if globallogger != None:
            globallogger.write("ERROR\tDid not find pipeline config %s!\n" %
                               (file))
        exit_process()
    else:
        lines = configfile.readlines()
        configfile.close()

    config_settings = {}
    for line in lines:
        if not re.match("#", line) and len(line.strip()) > 0:
            line = line.strip()
            result = patternKEYVALUE.search(line)

            try:
                if len(result.groups()) == 2:
                    fields = result.groups()
                else:
                    eprintf(
                        "     The following line in your config settings file is not set up yet\n"
                    )
                    eprintf(
                        "     Please rerun the pipeline after setting up this line\n"
                    )
                    eprintf("     Error in line : %s\n", line)
                    globallogger.write(
                         "WARNING\t\n"+\
                         "     The following line in your config settings file is not set up yet\n"+\
                         "     Please rerun the pipeline after setting up this line\n"+\
                         "     Error in line : %s\n" %(line))

                    exit_process()
            except:
                eprintf(
                    "     The following line in your config settings file is not set up yet\n"
                )
                eprintf(
                    "     Please rerun the pipeline after setting up this line\n"
                )
                eprintf("     Error in line : %s\n", line)
                globallogger.write(
                     "WARNING\t\n"+\
                     "     The following line in your config settings file is not set up yet\n"+\
                     "     Please rerun the pipeline after setting up this line\n"+\
                     "     Error in line : %s\n" %(line))
                exit_process()

            if PATHDELIM == '\\':
                config_settings[fields[0]] = re.sub(r'/', r'\\', fields[1])
            else:
                config_settings[fields[0]] = re.sub(r'\\', '/', fields[1])

    config_settings[
        'METAPATHWAYS_PATH'] = config_settings['METAPATHWAYS_PATH'] + PATHDELIM
    config_settings['REFDBS'] = config_settings['REFDBS'] + PATHDELIM

    check_config_settings(config_settings, file, globallogger)
    config_settings['configuration_file'] = file

    return config_settings
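
# The parser above accepts one "KEY  'value'" pair per line, with the value in
# single quotes; '#'-prefixed and blank lines are ignored. A tiny runnable
# illustration of the accepted line format (the key and path are made up):
import re

_demo_pattern = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
_demo_match = _demo_pattern.search("METAPATHWAYS_PATH\t'/opt/MetaPathways'")
assert _demo_match.groups() == ('METAPATHWAYS_PATH', '/opt/MetaPathways')
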
def merge_sorted_parsed_files(dbname,
                              filenames,
                              outputfilename,
                              orfRanks,
                              verbose=False,
                              errorlogger=None):
    linecount = 0
    readerhandles = []

    if verbose:
        eprintf("Processing database  : %s\n", dbname)

    if len(filenames) == 0:
        eprintf(
            "WARNING : Cannot find any B/LAST output file for database : %s\n",
            dbname)
        exit_process()

    try:
        for i in range(len(filenames)):
            #print filenames
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except OSError:
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()

    # set error and warning parameters
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except OSError:
        eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
        exit_process()

    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except:
            outputfile.close()
            return

    S = len(filenames)
    BuildHeap(S, values)

    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            #print fields[0], orfRanks[fields[0]]
            fprintf(outputfile, "%s\n", line)
            next(iterate)

            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except:
            #print 'finished ' + str(S)
            values[0] = values[S - 1]
            S = S - 1

        if S > 0:
            Heapify(values, 0, S)

    #print 'line count ' + str(linecount)
    outputfile.close()
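
# merge_sorted_parsed_files() above performs a k-way merge of per-split parsed
# B/LAST files, ordered by ORF rank, using a hand-rolled binary heap
# (BuildHeap/Heapify). A self-contained sketch of the same technique with the
# standard library's heapq (names here are illustrative, not pipeline APIs):
import heapq

def _kway_merge_by_rank(sorted_streams, rank):
    """Yield items from several individually sorted iterables in global
    rank order; `rank` plays the role orfRanks plays above."""
    heap = []
    iterators = [iter(stream) for stream in sorted_streams]
    for index, iterator in enumerate(iterators):
        for item in iterator:
            heapq.heappush(heap, (rank(item), index, item))
            break  # prime the heap with one item per stream
    while heap:
        _, index, item = heapq.heappop(heap)
        yield item
        for nxt in iterators[index]:
            heapq.heappush(heap, (rank(nxt), index, nxt))
            break  # refill from the stream the popped item came from

# Example: merging already-sorted runs of integers by their own value.
# list(_kway_merge_by_rank([[1, 4, 7], [2, 5], [3, 6]], rank=lambda x: x))
# -> [1, 2, 3, 4, 5, 6, 7]
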
def checkMetapathsteps(params, runlogger=None):
    choices = {'metapaths_steps': {}, 'annotation': {}, 'INPUT': {}}

    choices['INPUT']['format'] = [
        'fasta', 'gbk_unannotated', 'gbk_annotated', 'gff_unannotated',
        'gff_annotated'
    ]

    choices['annotation']['algorithm'] = ['last', 'blast']

    choices['metapaths_steps']['PREPROCESS_FASTA'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['ORF_PREDICTION'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['GFF_TO_AMINO'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['FILTERED_FASTA'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['COMPUTE_REFSCORE'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['BLAST_REFDB'] = [
        'yes', 'skip', 'stop', 'redo', 'grid'
    ]
    choices['metapaths_steps']['PARSE_BLAST'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['SCAN_rRNA'] = ['yes', 'skip', 'stop', 'redo']
    choices['metapaths_steps']['STATS_rRNA'] = ['yes', 'skip', 'stop', 'redo']
    choices['metapaths_steps']['ANNOTATE'] = ['yes', 'skip', 'stop', 'redo']
    choices['metapaths_steps']['PATHOLOGIC_INPUT'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['GENBANK_FILE'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['CREATE_SEQUIN_FILE'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['CREATE_REPORT_FILES'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['SCAN_tRNA'] = ['yes', 'skip', 'stop', 'redo']
    choices['metapaths_steps']['MLTREEMAP_CALCULATION'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['MLTREEMAP_IMAGEMAKER'] = [
        'yes', 'skip', 'stop', 'redo'
    ]
    choices['metapaths_steps']['PATHOLOGIC'] = ['yes', 'skip', 'stop', 'redo']

    if params['metapaths_steps']:
        checkParam_values(choices, params['metapaths_steps'], runlogger)

    checkparams = {}
    checkparams['annotation'] = []
    checkparams['annotation'].append('dbs')

    if not checkMissingParam_values(params, checkparams, runlogger):
        exit_process("Missing parameters")
Example #49
def sigint_handler(signum, frame):
    eprintf("Received TERMINATION signal\n")
    exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
        parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand=runcommand) != 0:
        if errorlogger:
            errorlogger.write(
                "ERROR: Failed to BLAST the sequences against database %s : " %
                (options.tax_databases[0]))
            errorlogger.write("     : " + runcommand)
        exit_process("ERROR: Failed to BLAST the sequences against database %s : "  %(options.tax_databases[0]) +\
                     "     : " + runcommand)

    if not (len(options.tax_databases) == len(options.blast_files)):
        parser.error(
            'Number of taxonomic databases and BLAST outputs should be the same'
        )

    if not options.output:
        parser.error('Output file must be specified')
    # sanity checks: make sure the input files exist

    if not files_exist(options.blast_files):
        sys.exit(0)

    if not files_exist(options.tax_databases):
        sys.exit(0)

    params = {
        'length': int(options.length),
        'similarity': float(options.similarity),
        'evalue': float(options.evalue),
        'bitscore': float(options.bitscore)
    }
    #print params['bitscore']
    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x],
                              options.tax_databases[x],
                              table[options.tax_databases[x]],
                              errorlogger=errorlogger)

    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x],
                                     table[options.tax_databases[x]], params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
                reads[key] = True

        dbname = re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" %
                             (str(priority), dbname, str(len(reads))))
        priority += 1
    outputfile = open(options.output, 'w')
    fprintf(outputfile,
            "#Similarity cutoff :\t" + str(params['similarity']) + '\n')
    fprintf(outputfile, "#Length cutoff :\t" + str(params['length']) + '\n')
    fprintf(outputfile, "#Evalue cutoff :\t" + str(params['evalue']) + '\n')
    fprintf(outputfile,
            "#Bit score cutoff :\t" + str(params['bitscore']) + '\n')
    fprintf(outputfile,
            "#Number of rRNA sequences detected:\t" + str(len(reads)) + '\n\n')

    for x in range(0, len(options.tax_databases)):
        #  printf('\t%s\t\t\t', re.sub(r'^.*/','', options.tax_databases[x]))
        fprintf(outputfile, '\t%s\t\t\t',
                re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    #printf('\n')
    fprintf(outputfile, '\n')

    #printf('%s', 'read')
    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s', 'sequence', 'start',
                'end', 'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile, '\n')

    for read in reads:
        #printf('%s', read)
        fprintf(outputfile, '%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s',
                        str(table[options.tax_databases[x]][read][4]),
                        str(table[options.tax_databases[x]][read][5]),
                        str(table[options.tax_databases[x]][read][0]),
                        str(table[options.tax_databases[x]][read][1]),
                        str(table[options.tax_databases[x]][read][2]),
                        str(table[options.tax_databases[x]][read][6]))
            else:
                fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile, '\n')
    outputfile.close()

    # collect the exact reads
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                database_hits[read] = [
                    table[options.tax_databases[x]][read][4],
                    table[options.tax_databases[x]][read][5]
                ]

    # pick the hits, trim them according to the match and write them
    if options.fasta:
        selected_sequences = {}
        read_select_fasta_sequences(database_hits, selected_sequences,
                                    options.fasta)
        for read in database_hits:
            selected_sequences[read] = selected_sequences[read][
                database_hits[read][0]:database_hits[read][1]]
        write_selected_sequences(selected_sequences, options.output + '.fasta')
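
# The final step above trims each selected read to its best hit's coordinates
# with plain string slicing. A self-contained illustration (the sequence and
# coordinates are made up, not pipeline output):
_demo_seq = "AAACCCGGGTTT"
_demo_start, _demo_end = 3, 9      # what database_hits[read] holds above
_demo_trimmed = _demo_seq[_demo_start:_demo_end]
assert _demo_trimmed == "CCCGGG"
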