コード例 #1
0
def blast(parameters, logFile):
  '''
  Perform the homology search using the different BLAST package programs. This
  module offers retrocompatibility to legacy blast.

  parameters: dictionary with the current run configuration. The keys read
    here are "out_directory", "prefix", "replace", "homology", "e_value",
    "db_file" and "in_file", plus "legacy_blast" or "blast+" (whose first
    element names the binary), the entry named after that binary (what is
    placed first on the command-line) and "<binary>_params" (extra arguments).
    The key "replace" is set to True whenever the search is (re)computed.
  logFile: writable file object. The command-line is recorded there and the
    standard error of the BLAST program is redirected to it.

  Return None. The program is terminated through sys.exit when the configured
  homology package is not supported or when the execution fails.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## Get output file name and check whether has been previously generated or
  ## not. It will also affect whether the variable REPLACE is set or not
  outFile = ("%s.homology.blast.out") % (oFile)

  ## If output file exist and it is not set to replace it, just go back to the
  ## main function. Otherwise, set the replace parameter to TRUE in order to
  ## replace any already generated file downstream
  if lookForFile(outFile) and not parameters["replace"]:
    return
  parameters["replace"] = True

  ## Generate command-line depending on which BLAST package is being used.
  package = parameters["homology"][0]
  if package == "legacy_blast":
    binary = parameters["legacy_blast"][0]
    params = parameters[binary +"_params"]
    cmd = ("%s %s -e %s -d %s -i %s -o %s") % (parameters[binary], params, \
      str(parameters["e_value"]), parameters["db_file"], parameters["in_file"],\
      outFile)

  elif package == "blast+":
    binary = parameters["blast+"][0]
    params = parameters[binary +"_params"]
    cmd = ("%s %s -evalue %s -db %s -query %s -out %s") % (parameters[binary], \
      params, str(parameters["e_value"]), parameters["db_file"], \
      parameters["in_file"], outFile)

  ## Any other value would leave 'cmd' and 'binary' undefined and the function
  ## would crash later on with an unrelated error. Stop here with a clear
  ## message instead
  else:
    sys.exit(("ERROR: Unsupported homology search package '%s'") % (package))

  name = getfqdn()
  print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (name, cmd), file = \
    logFile)
  ## Flush before launching the program: its standard error goes straight to
  ## the same file and should be placed after this header
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile)
  except OSError as e:
    sys.exit("ERROR: Execution failed: " + str(e))

  if proc.wait() != 0:
    sys.exit(("ERROR: Execution failed: '%s'") % (parameters[binary]))

  ## Remove any error file generated during the legacy_blast execution - We try
  ## to delete this file only if it is empty.
  ## NOTE(review): this relies on lookForFile reporting only non-empty files -
  ## confirm against its definition
  if not lookForFile("error.log"):
    ## Best-effort removal, same as 'rm -f': a missing file is not an error
    try:
      os.remove("error.log")
    except OSError:
      pass
コード例 #2
0
def blast(parameters, logFile):
    '''
    Perform the homology search using the different BLAST package programs.
    This module offers retrocompatibility to legacy blast.

    parameters: dictionary with the current run configuration. The keys read
      here are "out_directory", "prefix", "replace", "homology", "e_value",
      "db_file" and "in_file", plus "legacy_blast" or "blast+" (whose first
      element names the binary), the entry named after that binary (what is
      placed first on the command-line) and "<binary>_params" (extra
      arguments). The key "replace" is set to True whenever the search is
      (re)computed.
    logFile: writable file object. The command-line is recorded there and the
      standard error of the BLAST program is redirected to it.

    Return None. The program is terminated through sys.exit when the
    configured homology package is not supported or when the execution fails.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    ## Get output file name and check whether has been previously generated or
    ## not. It will also affect whether the variable REPLACE is set or not
    outFile = ("%s.homology.blast.out") % (oFile)

    ## If output file exist and it is not set to replace it, just go back to
    ## the main function. Otherwise, set the replace parameter to TRUE in order
    ## to replace any already generated file downstream
    if lookForFile(outFile) and not parameters["replace"]:
        return
    parameters["replace"] = True

    ## Generate command-line depending on which BLAST package is being used.
    package = parameters["homology"][0]
    if package == "legacy_blast":
        binary = parameters["legacy_blast"][0]
        params = parameters[binary + "_params"]
        cmd = ("%s %s -e %s -d %s -i %s -o %s") % (
            parameters[binary], params, str(parameters["e_value"]),
            parameters["db_file"], parameters["in_file"], outFile)

    elif package == "blast+":
        binary = parameters["blast+"][0]
        params = parameters[binary + "_params"]
        cmd = ("%s %s -evalue %s -db %s -query %s -out %s") % (
            parameters[binary], params, str(parameters["e_value"]),
            parameters["db_file"], parameters["in_file"], outFile)

    ## Any other value would leave 'cmd' and 'binary' undefined and the
    ## function would crash later on with an unrelated error. Stop here with a
    ## clear message instead
    else:
        sys.exit(
            ("ERROR: Unsupported homology search package '%s'") % (package))

    name = getfqdn()
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (name, cmd),
          file=logFile)
    ## Flush before launching the program: its standard error goes straight to
    ## the same file and should be placed after this header
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile)
    except OSError as e:
        sys.exit("ERROR: Execution failed: " + str(e))

    if proc.wait() != 0:
        sys.exit(("ERROR: Execution failed: '%s'") % (parameters[binary]))

    ## Remove any error file generated during the legacy_blast execution - We
    ## try to delete this file only if it is empty.
    ## NOTE(review): this relies on lookForFile reporting only non-empty
    ## files - confirm against its definition
    if not lookForFile("error.log"):
        ## Best-effort removal, same as 'rm -f': a missing file is not an error
        try:
            os.remove("error.log")
        except OSError:
            pass
コード例 #3
0
def checkAlignment(ifile_1, ifile_2, iformat_1="fasta", iformat_2="fasta"):
    '''
    Read two given input files and check both contain the same sequences and
    the same input strings.

    ifile_1, ifile_2: paths of the two sequence files to compare.
    iformat_1, iformat_2: format names handed to SeqIO.parse for each file.

    Return True when both files hold exactly the same sequence identifiers and,
    for each of them, the same letters once any non-alphabetic character has
    been removed and the sequence upper-cased. Return False otherwise - an
    explanatory message is written to the standard error, except when one of
    the files is not found.
    '''

    ## We introduce a delay to ensure data is already written in the disk.
    ## With high-computing facilities, sometimes there are some problems of
    ## writing to disk the already computed results
    if not lookForFile(ifile_1) or not lookForFile(ifile_2, attempts=5):
        return False

    ## Read both input files - removing ambiguous characters and checking for
    ## duplicate names. We used regular expressions for removing any character
    inSeqs_1 = {}
    for record in SeqIO.parse(ifile_1, iformat_1):
        if record.id in inSeqs_1:
            print(("ERROR: Repeated sequence '%s' ['%s']") %
                  (record.id, ifile_1), file=sys.stderr)
            return False
        seq = re.sub(r'[^a-zA-Z]', '', str(record.seq))
        inSeqs_1.setdefault(record.id, seq.upper().strip())

    inSeqs_2 = {}
    for record in SeqIO.parse(ifile_2, iformat_2):
        if record.id in inSeqs_2:
            print(("ERROR: Repeated sequence '%s' ['%s']") %
                  (record.id, ifile_2), file=sys.stderr)
            return False
        seq = re.sub(r'[^a-zA-Z]', '', str(record.seq))
        inSeqs_2.setdefault(record.id, seq.upper().strip())

    ## If there are inconsistencies among sequences, inform about them
    if set(inSeqs_1) != set(inSeqs_2):
        print(("ERROR: Non-overlapping sequences identifier detected " +
               "between input ['%s'] and output ['%s'] files ") %
              (ifile_1, ifile_2), file=sys.stderr)
        return False

    ## Check that sequences in both files contain the same residues
    for seq in inSeqs_1:
        if inSeqs_1[seq] != inSeqs_2[seq]:
            print(("ERROR: Different sequence composition for '%s' bet" +
                   "ween input ['%s'] and output ['%s'] files") %
                  (seq, ifile_1, ifile_2), file=sys.stderr)
            return False

    ## If everything is OK, inform about it
    return True
コード例 #4
0
def checkAlignment(ifile_1, ifile_2, iformat_1 = "fasta", iformat_2 = "fasta"):
  '''
  Compare the content of two sequence files. The result is True only when both
  files list the same sequence identifiers and every identifier is associated
  to the same letters in both of them - anything but letters is discarded and
  the comparison is case-insensitive. Otherwise the result is False and, but
  for missing files, the reason is written to the standard error.
  '''

  ## The second file is looked for with a limited number of attempts. It gives
  ## some time for results being written to disk to show up
  if not (lookForFile(ifile_1) and lookForFile(ifile_2, attempts = 5)):
    return False

  ## Load each file in turn into its own identifier -> residues dictionary.
  ## A repeated identifier within a file stops the comparison right away
  loaded = []
  for path, fmt in ((ifile_1, iformat_1), (ifile_2, iformat_2)):
    residues = {}
    for entry in SeqIO.parse(path, fmt):
      if entry.id in residues:
        message = ("ERROR: Repeated sequence '%s' ['%s']") % (entry.id, path)
        print(message, file = sys.stderr)
        return False
      letters = re.sub(r'[^a-zA-Z]', '', str(entry.seq))
      residues[entry.id] = letters.upper().strip()
    loaded.append(residues)
  first, second = loaded

  ## Both files have to contain exactly the same identifiers
  if set(first) != set(second):
    message = ("ERROR: Non-overlapping sequences identifier detected "
      + "between input ['%s'] and output ['%s'] files ") % (ifile_1, ifile_2)
    print(message, file = sys.stderr)
    return False

  ## ... and the same residues for each one of them
  for name in first:
    if first[name] == second[name]:
      continue
    message = ("ERROR: Different sequence composition for '%s' bet"
      + "ween input ['%s'] and output ['%s'] files") % (name, ifile_1, ifile_2)
    print(message, file = sys.stderr)
    return False

  ## No differences have been found
  return True
コード例 #5
0
def reverseSequences(binary, in_file, out_file, replace, logFile):
    '''
    Reverse the input sequences using readAl for that purpose.

    binary: readAl executable, placed first on the command-line.
    in_file: file with the sequences to reverse (passed with '-in').
    out_file: file to generate (passed with '-out').
    replace: when not set, an already available output file is kept.
    logFile: writable file object where the command-line is recorded and where
      the standard output/error of the program are redirected.

    Return False when the output file is already available and it should not
    be replaced, True once the program has finished successfully. On any
    execution failure the program exits with the code exit_codes["readal"].
    '''

    ## Check whether the output file already exists. If it is not set to
    ## replace it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## Define the command-line for getting the sequences reverse independently
    ## of being aligned or not and of the input format
    cmd = ("%s -in %s -out %s -reverse") % (binary, in_file, out_file)

    name = getfqdn()
    print(("###\n###\treadAl - reverse seqs"), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    ## Flush the header before launching the program. The program writes
    ## straight to the same file, so anything still buffered here would end up
    ## after its output
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes["readal"])

    if proc.wait() != 0:
        print(("ERROR: Execution failed: readAl"), file=sys.stderr)
        sys.exit(exit_codes["readal"])

    return True
コード例 #6
0
def convertInputFile_Format(label, binary, in_file, out_file, out_format,
  logFile, replace):
  '''
  Generate a copy of the input file in another format by calling an external
  program as '<binary> -in <in_file> -out <out_file> -<out_format>'.

  The result is False when the output file is already available and it should
  not be replaced, and True once the program has finished successfully. If the
  program cannot be launched or it reports a failure, the whole execution is
  stopped using the exit code registered for 'label' in exit_codes.
  '''

  ## An existing output file is kept unless replacing it has been requested
  if lookForFile(out_file) and not replace:
    return False

  ## Command-line to execute
  command = ("%s -in %s -out %s -%s") % (binary, in_file, out_file, out_format)

  ## Keep track in the log of what is executed and where. The log is flushed
  ## before the program starts writing into the very same file
  host = getfqdn()
  tag = label.upper()
  print(("###\n###\t%s - get format") % (tag), file = logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (host, command), file = logFile)
  logFile.flush()

  try:
    child = sp.Popen(command, shell = True, stderr = logFile, stdout = logFile)
  except OSError as error:
    print("ERROR: Execution failed: " + str(error), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Wait for the program and check how it finished
  status = child.wait()
  if status != 0:
    print(("ERROR: Execution failed: %s") % (tag), file = sys.stderr)
    sys.exit(exit_codes[label])

  return True
コード例 #7
0
def convertInputFile_Format(label, binary, in_file, out_file, out_format,
                            logFile, replace):
    '''
    Generate a copy of the input file in another format by calling an
    external program as '<binary> -in <in_file> -out <out_file> -<out_format>'.

    The result is False when the output file is already available and it
    should not be replaced, and True once the program has finished
    successfully. If the program cannot be launched or it reports a failure,
    the whole execution is stopped using the exit code registered for 'label'
    in exit_codes.
    '''

    ## An existing output file is kept unless replacing it has been requested
    if lookForFile(out_file) and not replace:
        return False

    ## Command-line to execute
    command = ("%s -in %s -out %s -%s") % (binary, in_file, out_file,
                                           out_format)

    ## Keep track in the log of what is executed and where. The log is flushed
    ## before the program starts writing into the very same file
    host = getfqdn()
    tag = label.upper()
    print(("###\n###\t%s - get format") % (tag), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (host, command),
          file=logFile)
    logFile.flush()

    try:
        child = sp.Popen(command, shell=True, stderr=logFile, stdout=logFile)
    except OSError as error:
        print("ERROR: Execution failed: " + str(error), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Wait for the program and check how it finished
    status = child.wait()
    if status != 0:
        print(("ERROR: Execution failed: %s") % (tag), file=sys.stderr)
        sys.exit(exit_codes[label])

    return True
コード例 #8
0
def reverseSequences(binary, in_file, out_file, replace, logFile):
  '''
  Reverse the input sequences using readAl for that purpose.

  binary: readAl executable, placed first on the command-line.
  in_file: file with the sequences to reverse (passed with '-in').
  out_file: file to generate (passed with '-out').
  replace: when not set, an already available output file is kept.
  logFile: writable file object where the command-line is recorded and where
    the standard output/error of the program are redirected.

  Return False when the output file is already available and it should not be
  replaced, True once the program has finished successfully. On any execution
  failure the program exits with the code exit_codes["readal"].
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  ## Define the command-line for getting the sequences reverse independently of
  ## being aligned or not and of the input format
  cmd = ("%s -in %s -out %s -reverse") % (binary, in_file, out_file)

  name = getfqdn()
  print(("###\n###\treadAl - reverse seqs"), file = logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile)
  ## Flush the header before launching the program. The program writes straight
  ## to the same file, so anything still buffered here would end up after its
  ## output
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file = sys.stderr)
    sys.exit(exit_codes["readal"])

  if proc.wait() != 0:
    print(("ERROR: Execution failed: readAl"), file = sys.stderr)
    sys.exit(exit_codes["readal"])

  return True
コード例 #9
0
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace,
                      in_file=None, compare_msa=None, force_refer_msa=None,
                      cds=None):
    '''
    Trim a multiple sequence alignment by running a customized trimAl
    command-line. Optional inputs (CDS file for back-translation, set of
    alignments to compare, alignment to force as reference and input
    alignment) are only included when they are provided.

    The result is False when the output file is already available and it
    should not be replaced, and True once the program has finished
    successfully. On any execution failure the program exits with the code
    registered for 'label' in exit_codes.
    '''

    ## An existing output file is kept unless replacing it has been requested
    if lookForFile(out_file) and not replace:
        return False

    ## Optional arguments in the same order they are placed on the
    ## command-line. Only those with a value are taken into account
    optional = (
        ("-backtrans", cds),
        ("-compareset", compare_msa),
        ("-forceselect", force_refer_msa),
        ("-in", in_file),
    )

    options = ""
    for flag, value in optional:
        if value:
            options = ("%s %s %s ") % (options, flag, value)
    command = ("%s %s -out %s %s") % (binary, options, out_file, parameters)

    ## Starting time, host and exact command-line are written to the log
    host = getfqdn()
    start = datetime.datetime.now()
    stamp = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\tTrimming Input MSA\t%s") % (stamp), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (host, command),
          file=logFile)
    logFile.flush()

    try:
        child = sp.Popen(command, shell=True, stderr=logFile, stdout=logFile)
    except OSError as error:
        print("ERROR: Execution failed: " + str(error), file=sys.stderr)
        sys.exit(exit_codes[label])

    status = child.wait()
    if status != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Report how long the program has been running
    finish = datetime.datetime.now()
    elapsed = (finish - start) if start else 0
    print(("###\tTime\t%s\n###") % (format_time(elapsed)), file=logFile)
    logFile.flush()

    return True
コード例 #10
0
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace,
  in_file = None, compare_msa = None, force_refer_msa = None, cds = None):
  '''
  Trim a multiple sequence alignment by running a customized trimAl
  command-line. Optional inputs (CDS file for back-translation, set of
  alignments to compare, alignment to force as reference and input alignment)
  are only included when they are provided.

  The result is False when the output file is already available and it should
  not be replaced, and True once the program has finished successfully. On any
  execution failure the program exits with the code registered for 'label' in
  exit_codes.
  '''

  ## An existing output file is kept unless replacing it has been requested
  if lookForFile(out_file) and not replace:
    return False

  ## Optional arguments in the same order they are placed on the command-line.
  ## Only those with a value are taken into account
  optional = (("-backtrans", cds), ("-compareset", compare_msa),
    ("-forceselect", force_refer_msa), ("-in", in_file))

  options = ""
  for flag, value in optional:
    if value:
      options = ("%s %s %s ") % (options, flag, value)
  command = ("%s %s -out %s %s") % (binary, options, out_file, parameters)

  ## Starting time, host and exact command-line are written to the log
  host = getfqdn()
  start = datetime.datetime.now()
  stamp = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\tTrimming Input MSA\t%s") % (stamp), file = logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (host, command), file = logFile)
  logFile.flush()

  try:
    child = sp.Popen(command, shell = True, stderr = logFile, stdout = logFile)
  except OSError as error:
    print("ERROR: Execution failed: " + str(error), file = sys.stderr)
    sys.exit(exit_codes[label])

  status = child.wait()
  if status != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Report how long the program has been running
  finish = datetime.datetime.now()
  elapsed = (finish - start) if start else 0
  print(("###\tTime\t%s\n###") % (format_time(elapsed)), file = logFile)
  logFile.flush()

  return True
コード例 #11
0
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):
    '''
    Function to format the command-line of different phylogenetic tree
    reconstruction programs and execute such command lines.

    label: program to use - "phyml", "codonphyml", "fasttree" or "raxml". Any
      other value terminates the program with exit_codes["generic"].
    binary: executable placed first on the command-line.
    parameters: extra command-line arguments, given as a string.
    in_file: input alignment.
    out_file: expected output file - only placed on the command-line for
      "fasttree".
    stats_file: log/stats file - only placed on the command-line for
      "fasttree".
    logFile: writable file object where the command-line is recorded and where
      the standard output/error of the program are redirected.
    replace: when not set, an already available output file is kept.

    Return False when the output file is already available and it should not
    be replaced. If the program cannot be launched, the execution is stopped
    with the code exit_codes[label].
    NOTE(review): the visible definition ends right after the process is
    launched - it neither waits for it nor returns a value on that path.
    '''

    ## Check whether the output file already exists. If it is not set to
    ## replace it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["phyml", "codonphyml"]:
        cmd = ("%s -i %s %s") % (binary, in_file, parameters)

    elif label in ["fasttree"]:
        cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
          out_file, in_file)

    elif label in ["raxml"]:
        ## The same random number is used as seed and as part of the run name
        random_seed = randint(1, 10000)
        suffix = ("%s_%d") % (label, random_seed)

        cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, \
          in_file, parameters)

    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    ## The header and the date go on the same line: no newline after the first
    ## piece
    print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end="",
          file=logFile)
    print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd),
          file=logFile)
    logFile.flush()

    try:
        ## We add a small pipeline to avoid information written in the same
        ## line
        proc = sp.Popen(cmd,
                        shell=True,
                        stderr=logFile,
                        stdout=logFile,
                        stdin=sp.PIPE)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])
コード例 #12
0
def replaceRareAminoAcids(in_file, out_file, replace, logFile, combinations, \
  back = False, in_format = "fasta"):
    '''
    Replace rare amino-acids occurrence by wildcards, and vice-versa. The
    output is always written as '>identifier' followed by the sequence.

    in_file: file with the input sequences.
    out_file: file to generate.
    replace: when not set, an already available output file is kept.
    logFile: writable file object where a report of the substitutions is
      written.
    combinations: whitespace-separated list of 'source:destination' pairs.
    back: when set, each pair is applied the other way around.
    in_format: format name handed to SeqIO.parse.

    Return False when the output file is already available and it should not
    be replaced, True otherwise.
    '''

    ## Check whether the output file already exists. If it is not set to
    ## replace it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    subs = {}
    ## split() without arguments already discards any surrounding whitespace
    for comb in combinations.split():
        ## Depending on the direction of the conversion, make it on one way or
        ## in the way around
        src, dst = comb.split(":")[::-1] if back else comb.split(":")
        subs.setdefault(src, dst)

    ## Record some stats about which amino-acids and how many times they have
    ## been detected
    stats = {letter: 0 for letter in subs}

    ## The output file is closed even if reading the input fails half-way
    with open(out_file, "w") as oFile:
        for record in SeqIO.parse(in_file, in_format):
            seq = str(record.seq)
            for letter in subs:
                ## Count before replacing. Counting the replacement afterwards
                ## would also include the occurrences already present in the
                ## input sequence
                stats[letter] += seq.count(letter)
                seq = seq.replace(letter, subs[letter])
            print((">%s\n%s") % (record.id, splitSequence(seq)), file=oFile)

    output = "|\t".join([("'%s' > '%s'\tfreq: %d") % (aa, subs[aa], stats[aa]) \
      for aa in stats if stats[aa] > 0])

    name = getfqdn()
    print(("###\n###\t[%s]\tSubstituting Rare Amino-Acids") % (name),
          file=logFile)
    print(("###\tReport\t%s") % (output), file=logFile)
    logFile.flush()

    return True
コード例 #13
0
def replaceRareAminoAcids(in_file, out_file, replace, logFile, combinations, \
  back = False, in_format = "fasta"):
  '''
  Replace rare amino-acids occurrence by wildcards, and vice-versa. The output
  is always written as '>identifier' followed by the sequence.

  in_file: file with the input sequences.
  out_file: file to generate.
  replace: when not set, an already available output file is kept.
  logFile: writable file object where a report of the substitutions is
    written.
  combinations: whitespace-separated list of 'source:destination' pairs.
  back: when set, each pair is applied the other way around.
  in_format: format name handed to SeqIO.parse.

  Return False when the output file is already available and it should not be
  replaced, True otherwise.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  subs = {}
  ## split() without arguments already discards any surrounding whitespace
  for comb in combinations.split():
    ## Depending on the direction of the conversion, make it on one way or in
    ## the way around
    src, dst = comb.split(":")[::-1] if back else comb.split(":")
    subs.setdefault(src, dst)

  ## Record some stats about which amino-acids and how many times they have been
  ## detected
  stats = {letter: 0 for letter in subs}

  ## The output file is closed even if reading the input fails half-way
  with open(out_file, "w") as oFile:
    for record in SeqIO.parse(in_file, in_format):
      seq = str(record.seq)
      for letter in subs:
        ## Count before replacing. Counting the replacement afterwards would
        ## also include the occurrences already present in the input sequence
        stats[letter] += seq.count(letter)
        seq = seq.replace(letter, subs[letter])
      print((">%s\n%s") % (record.id, splitSequence(seq)), file = oFile)

  output = "|\t".join([("'%s' > '%s'\tfreq: %d") % (aa, subs[aa], stats[aa]) \
    for aa in stats if stats[aa] > 0])

  name = getfqdn()
  print(("###\n###\t[%s]\tSubstituting Rare Amino-Acids") % (name), file = \
    logFile)
  print(("###\tReport\t%s") % (output), file = logFile)
  logFile.flush()

  return True
コード例 #14
0
def get_likelihood(label, stats_file):
    '''
    Get the likelihood value of a tree from the stats/log file generated by
    the program used to reconstruct it.

    label: program which generated the file. "phyml"/"codonphyml" and
      "fasttree" have their own parsers; any other value is parsed as RAxML
      output.
    stats_file: file to parse.

    Return the likelihood as a float, or None when the file is not available
    or it does not contain any likelihood line. For "fasttree" the lowest of
    the values found is returned. A ValueError/IndexError is raised when a
    PhyML or RAxML likelihood line does not have the expected fields.
    '''

    ## Check whether the STATS file is available or not
    if not lookForFile(stats_file):
        return None

    logLK = None
    ## PHYML/CodonPhyML
    if label in ["phyml", "codonphyml"]:
        with open(stats_file, "r") as handle:
            for line in handle:
                if not line.startswith(". Log-likelihood"):
                    continue
                logLK = float(line.split()[2])
                break

    ## FastTree
    elif label in ["fasttree"]:
        with open(stats_file, "r") as handle:
            for line in handle:
                if line.lower().find("loglk") == -1:
                    continue
                ## Lines mentioning the likelihood without a value on the
                ## third column are skipped
                try:
                    value = float(line.split("\t")[2])
                except (IndexError, ValueError):
                    continue
                ## Compare against None: a likelihood of 0.0 is a valid value
                if logLK is None or value < logLK:
                    logLK = value

    ## RAXML - it is also the fallback for any other label. It is not applied
    ## to the previous programs anymore since it could override the value
    ## already found for them
    else:
        with open(stats_file, "r") as handle:
            for line in handle:
                lowered = line.lower()
                if not lowered.startswith("final") or \
                  lowered.find("score") == -1:
                    continue
                logLK = float(line.split()[-1])
                break

    ## Return the likelihood value for the current tree
    return logLK
コード例 #15
0
def get_likelihood(label, stats_file):
  '''
  Get the likelihood value of a tree from the stats/log file generated by the
  program used to reconstruct it.

  label: program which generated the file. "phyml"/"codonphyml" and "fasttree"
    have their own parsers; any other value is parsed as RAxML output.
  stats_file: file to parse.

  Return the likelihood as a float, or None when the file is not available or
  it does not contain any likelihood line. For "fasttree" the lowest of the
  values found is returned. A ValueError/IndexError is raised when a PhyML or
  RAxML likelihood line does not have the expected fields.
  '''

  ## Check whether the STATS file is available or not
  if not lookForFile(stats_file):
    return None

  logLK = None
  ## PHYML/CodonPhyML
  if label in ["phyml", "codonphyml"]:
    with open(stats_file, "r") as handle:
      for line in handle:
        if not line.startswith(". Log-likelihood"):
          continue
        logLK = float(line.split()[2])
        break

  ## FastTree
  elif label in ["fasttree"]:
    with open(stats_file, "r") as handle:
      for line in handle:
        if line.lower().find("loglk") == -1:
          continue
        ## Lines mentioning the likelihood without a value on the third column
        ## are skipped
        try:
          value = float(line.split("\t")[2])
        except (IndexError, ValueError):
          continue
        ## Compare against None: a likelihood of 0.0 is a valid value
        if logLK is None or value < logLK:
          logLK = value

  ## RAXML - it is also the fallback for any other label. It is not applied to
  ## the previous programs anymore since it could override the value already
  ## found for them
  else:
    with open(stats_file, "r") as handle:
      for line in handle:
        lowered = line.lower()
        if not lowered.startswith("final") or lowered.find("score") == -1:
          continue
        logLK = float(line.split()[-1])
        break

  ## Return the likelihood value for the current tree
  return logLK
コード例 #16
0
  if lookForDirectory(args.outDir, False):
    sys.exit(("ERROR: Output ROOT folder already exist '%s'") % (args.outDir))
    
  args.outDir = os.path.abspath(args.outDir)
  ## ... and try to create it in case it doesn't exist 
  if not lookForDirectory(args.outDir, create = True):
    sys.exit(("ERROR: ROOT folder '%s' cannot be created") % (args.outDir))

  ## Create folders to store the jobs file and (potentially) the configuration
  ## file and input databases
  lookForDirectory(os.path.join(args.outDir, "jobs"))
  lookForDirectory(os.path.join(args.outDir, "Data"))
  lookForDirectory(os.path.join(args.outDir, "BlastDB"))

  ## Check parameters related to files / directories
  if not lookForFile(os.path.abspath(args.script)):
    sys.exit(("ERROR: Check input SCRIPT file '%s'") % (args.script))
  args.script = os.path.abspath(args.script)

  ## Databases and configuration files will be, by default, copied into the new
  ## data structure. It will guarantee to have everything under the same ROOT
  ## folder
  if not lookForFile(os.path.abspath(args.configFile)):
    sys.exit(("ERROR: Check input CONFIG file '%s'") % (args.configFile))
  args.configFile = os.path.abspath(args.configFile)

  config = ("%s/jobs/%s") % (args.outDir, os.path.split(args.configFile)[1]) \
    if args.copy else args.configFile

  if not lookForFile(os.path.abspath(args.dbFile)):
    sys.exit(("ERROR: Check input TARGET SEQUENCES file '%s'") % (args.dbFile))
コード例 #17
0
ファイル: pipeline.py プロジェクト: hongzhonglu/phylomizer
  ## Get current directory - we will use this for normalizing input files and
  ## directories to their absolute paths
  current_directory = os.getcwd()

  ## Assign input parameters directly to the dictionary which will contain all
  ## current run configuration.
  parameters = {}
  parameters.setdefault("replace", args.replace)

  ## Assign which step is being executed. It is useful to know whether the log
  ## file should be replaced or not - even when the flag "replace" is set
  parameters.setdefault("step", 0)

  ## Check parameters related to files / directories
  if not lookForFile(args.inFile):
    sys.exit(("ERROR: Check input QUERY SEQUENCE/s file '%s'") % (args.inFile))
  parameters.setdefault("in_file", os.path.abspath(args.inFile))

  if not lookForFile(args.dbFile):
    sys.exit(("ERROR: Check input TARGET SEQUENCES file '%s' [Mode: HOMOLOGY "
    + "SEARCH]") % (args.dbFile))
  parameters.setdefault("db_file", os.path.abspath(args.dbFile))

  if args.cdsFile:
    if not lookForFile(args.cdsFile):
      sys.exit(("ERROR: Check input CDS file '%s'") % (args.cdsFile))
    parameters.setdefault("cds", os.path.abspath(args.cdsFile))

  if not lookForFile(args.configFile):
    sys.exit(("ERROR: Check input CONFIG file '%s'") % (args.configFile))
コード例 #18
0
def phylogenetic_trees(parameters):
    '''
    Reconstruct phylogenetic trees according to the input parameters.

    For every requested tree approach, each selected evolutionary model is
    handed to perform_tree(), the likelihood of each resulting tree is read
    with get_likelihood() and the models are ranked by that value. The ranking
    is stored in '<out_directory>/<prefix>.tree.<program>.rank.<approach>' and
    the best <numb_models> models are the ones evaluated in the next approach.

    parameters: dictionary holding the current run configuration. Keys read
      here are "out_directory", "prefix", "replace", "tree", the name of the
      selected program, "evol_models", "readal" and "in_file". Optional keys
      are "verbose", "step" (only read when "verbose" is 1), "<program>_params",
      "numb_models", "min_seqs", "tree_approach", one entry per approach name
      and "raxml_model_suffix".

    Returns the same dictionary. It is modified in place: "evol_models" becomes
    a list, "numb_models" an integer, "tree_approach" a set, and "in_file" /
    "replace" may be updated when the input alignment has to be converted.

    The function finishes the program via sys.exit() when the configuration is
    not complete, when there are fewer sequences than required (exit code 80)
    or when no likelihood can be recovered for a model.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode since print() writes strings and
    ## it fails on streams opened in binary mode
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, 'w')

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters[
            "step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t" +
           "%s\n###") % (date),
          file=logFile)
    logFile.flush()

    ## Get which program will be used to reconstruct phylogenetic trees. Check
    ## such program is listed among the available binaries
    if not "tree" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for "
            + "the Phylogenetic TREE reconstruction step")

    prog = parameters["tree"][0]
    if not prog in parameters:
        sys.exit(
            ("ERROR: Selected program '%s' is not available accordding to the "
             "the configuration file") % (prog))

    ## Get binary as well as any default parameters for the selected program
    binary = parameters[prog]
    key = ("%s_params") % (prog)
    progr_params = parameters[key] if key in parameters else ""

    if not "evol_models" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for "
            + "the <evol_models> parameter")

    ## If the evolutionary model list is not appropriately formatted, do it
    if isinstance(parameters["evol_models"], str):
        parameters["evol_models"] = list(
            map(strip, parameters["evol_models"].split()))

    ## Check if <numb_models> parameter is defined and how many models are
    ## requested to be evaluated. The value is converted to a string before
    ## lowercasing it: this function stores it back as an integer, so a second
    ## call with the same dictionary would otherwise fail
    if not "numb_models" in parameters or \
      str(parameters["numb_models"]).lower() == "all":
        parameters["numb_models"] = len(parameters["evol_models"])
    parameters["numb_models"] = int(parameters["numb_models"])

    if not parameters["numb_models"] in range(
            1,
            len(parameters["evol_models"]) + 1):
        sys.exit(
            ("ERROR: Check how many evolutionary models has been asked to re" +
             "construct '%d'") % (parameters["numb_models"]))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Create a temporary FASTA file which will be used to detect the sequence
    ## number on the input alignment and the presence of rare amino-acids
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
                            parameters["in_file"], TEMPFILE.name, "fasta",
                            logFile, parameters["replace"])
    TEMPFILE.flush()

    numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

    ## Set the minimum number of sequences required to reconstruct an alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
      min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to " +
               "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
              file=logFile)
        sys.exit(80)

    ## Check which approaches should be used for the phylogenetic reconstruction
    ## and whether there are specific program's parameters for them
    if not "tree_approach" in parameters:
        parameters["tree_approach"] = ["ml"]

    ## Remove potential duplicates and lowercase all approaches for the tree
    ## reconstruction
    parameters["tree_approach"] = set([p.lower() for p in \
      parameters["tree_approach"]])

    ## We will first look for Neighbour Joining tree reconstruction, then for
    ## Maximum likelihood and then for any other approach defined in the config
    ## file
    tree_approaches = []
    if "nj" in parameters["tree_approach"]:
        tree_approaches.append("nj")
    if "ml" in parameters["tree_approach"]:
        tree_approaches.append("ml")
    others = parameters["tree_approach"] - set(["nj", "ml"])
    if others != set():
        tree_approaches += sorted(others)

    ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
    ## present in the input alignment
    if prog in ["raxml"]:
        ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
        if selenocys or pyrrolys:
            out_file = ("%s.no_rare_aa") % (parameters["in_file"])

            if replaceRareAminoAcids(TEMPFILE.name, out_file,
                                     parameters["replace"], logFile,
                                     "U:X O:X"):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    ## The temporary FASTA file is not used beyond this point - release it
    ## independently of the selected program
    TEMPFILE.close()

    ## When using FastTree force the conversion of input alignment to FASTA format
    ## since it may crash reading standard interleave PHYLIP format files
    if prog in ["fasttree"]:

        in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
          parameters["in_file"], logFile)

        if in_file_format != "fasta":
            out_file = ("%s.fa") % (parameters["in_file"])
            if (convertInputFile_Format("readal", parameters["readal"], \
              parameters["in_file"], out_file, "fasta", logFile,
              parameters["replace"])):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    replace = parameters["replace"]
    selected_models = parameters["evol_models"]
    ## Reconstruct trees for each approach considering evolutionary models order
    ## according their likelihood values
    for approach in tree_approaches:

        ## Save results - we will use such data for selecting the best -if required-
        ## models fitting to the input data
        results = {}

        ## Format the chosen program's parameters according to the default ones and
        ## the specific ones for the current approach
        params = ("%s ") % (progr_params)
        params += parameters[approach] if approach in parameters else ""

        for model in selected_models:
            out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
            stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach,
                                                    model)

            if prog in ["phyml"]:
                exec_params = ("%s -m %s") % (params, model)

            ## Get additional model -if any- for codons
            elif prog in ["codonphyml"]:
                exec_params = ("%s -m %s") % (params, model)

                add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
                  if p.startswith("fmodel")]

                if len(add_model) == 1:
                    add_model = add_model.pop()
                    model = ("%s_%s") % (model, add_model)
                    out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog,
                                                          approach, model)
                    stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog,
                                                            approach, model)

            elif prog in ["fasttree"]:
                ## On FastTree is selected by default JTT model for AAs - so we don't
                ## set-up that model
                exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
                  and model.lower() != "jc" else params
                model = model.upper()

            ## In the case of RAxML, we would concatenate the model to an specific
            ## input parameter
            elif prog in ["raxml"]:
                final_model = model
                ## It is possible to add some suffixes to the evolutionary models
                ## in RAxML - There is not better/easy way to code this option
                if "raxml_model_suffix" in parameters:
                    final_model += parameters["raxml_model_suffix"]
                exec_params = " ".join([
                    ("-%s%s") % (p, final_model if p.startswith("m ") else "")
                    for p in map(strip, params.split("-")) if p
                ])

            ## Build the phylogenetic tree using any of the available methods and
            ## register if any downstream file should be redone.
            if perform_tree(prog, binary, exec_params, parameters["in_file"],
                            out_file, stats_file, logFile,
                            parameters["replace"]):
                replace = True

            ## Get the likelihood for each of the reconstructed models
            log_lk = get_likelihood(prog, stats_file)

            if not log_lk:
                print(("ERROR: Impossible to the Log likelihood values "
                  + "for '%s' model using this program '%s'") % (model, prog), file = \
                  sys.stderr)
                sys.exit(exit_codes[prog])

            results.setdefault(model, log_lk)

        ## Get the models sorted by their likelihood values
        records = sorted(results.items(), key=itemgetter(1), reverse=True)

        ## Ranking content as it is - or would be - stored in the ranking file
        ranking = "\n".join([("%s\t%s") % (r[0], r[1]) for r in records])

        ## Set the filename which stores the ranking
        rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

        update = False
        ## Check the content of the rankings file - if any.
        ## Marked the file as updatable if there is any discrepancy
        if not replace and lookForFile(rank_file):

            ## Plain "r" mode is used: the "U" flag is not accepted anymore by
            ## open() in recent Python versions
            with open(rank_file, "r") as handle:
                old_content = "\n".join([
                    "\t".join(list(map(strip, line.split("\t"))))
                    for line in handle
                ])

            ## Decide whether ranking file should be updated after comparing current
            ## content with newly generated content
            update = old_content != ranking

        ## If the file containing the ranking doesn't exist, generate it.
        ## Update the file content if the replace flag is set to true or the content
        ## has changed - since the phylogenetic tree reconstruction step is the most
        ## expensive one - in terms of time/memory consumption - we are not setting
        ## replace flag to True even when this file is generated/updated. On this
        ## way, we can take advantage of any tree generated in any downstream step.
        if not lookForFile(rank_file) or replace or update:
            with open(rank_file, "w") as handle:
                print(ranking, file=handle)

        ## Select a given number of models for the next iteration - if any
        selected_models = [
            pair[0] for pair in records[:parameters["numb_models"]]
        ]

        ## Remove the Codon Frequency model from potential new iterations
        if prog in ["codonphyml"] and add_model:
            selected_models = [
                m.replace("_" + add_model, "") for m in selected_models
                if m.endswith(add_model)
            ]

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t" + "%s") %
          (date),
          file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s" + "\n###") %
          (total),
          file=logFile)
    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        ## NOTE(review): as written, the '/^M/d' expression removes any line
        ## starting with a literal 'M' - confirm whether a carriage return
        ## (Ctrl-M) was intended instead
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
              % (oFile), file=sys.stderr)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
コード例 #19
0
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):
    '''
    Format the command-line of different phylogenetic tree reconstruction
    programs, execute it and rename the resulting files.

    label: program identifier. Supported values are "phyml", "codonphyml",
      "fasttree" and "raxml"; any other value finishes the execution with
      the generic exit code.
    binary: executable placed at the beginning of the command-line.
    parameters: string with the program's parameters, inserted as it is.
    in_file: input alignment passed to the program.
    out_file: final name for the reconstructed tree.
    stats_file: final name for the program's statistics/log output.
    logFile: open stream receiving the command-line, the timing lines and
      the program's stdout/stderr.
    replace: when it is not set and <out_file> already exists, nothing is
      executed.

    Returns False when the execution is skipped, True once the program has
    been run. On any failure the program is finished via sys.exit() using
    the exit code registered for <label>.
    '''

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["phyml", "codonphyml"]:
        cmd = ("%s -i %s %s") % (binary, in_file, parameters)

    elif label in ["fasttree"]:
        cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
          out_file, in_file)

    elif label in ["raxml"]:
        ## The same random number is used as seed and as part of the run name,
        ## which is later needed to locate the generated files
        random_seed = randint(1, 10000)
        suffix = ("%s_%d") % (label, random_seed)

        cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file, \
          parameters)

    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
      file = logFile)
    print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
      logFile)
    logFile.flush()

    try:
        ## We add a small pipeline to avoid information written in the same line
        proc = sp.Popen(cmd,
                        shell=True,
                        stderr=logFile,
                        stdout=logFile,
                        stdin=sp.PIPE)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Some input is sent through stdin - presumably to answer the interactive
    ## prompts some programs may show; verify against the programs' behaviour
    proc.communicate(b'\n\nY\n')

    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file=sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    ## Process program's output and rename output files according to our own
    ## scheme
    if label in ["phyml", "codonphyml"]:

        ## Since resulting tree/stats file have slightly changed between version,
        ## we have to control for that.
        tree_file = ("%s_%s_tree.txt") % (in_file, label)
        sts_file = ("%s_%s_stats.txt") % (in_file, label)
        if not lookForFile(tree_file, attempts=2):
            tree_file = ("%s_%s_tree") % (in_file, label)
            sts_file = ("%s_%s_stats") % (in_file, label)

        try:
            sp.call(("mv %s %s") % (tree_file, out_file), shell=True)
            sp.call(("mv %s %s") % (sts_file, stats_file), shell=True)
        except OSError:
            print(("ERROR: Impossible to rename '%s' output files") \
              % (label.upper()), file=sys.stderr)
            sys.exit(exit_codes[label])

    elif label in ["raxml"]:
        try:
            sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file),
                    shell=True)
            sp.call(("mv RAxML_info.%s %s") % (suffix, stats_file), shell=True)
        except OSError:
            print(("ERROR: Impossible to rename RAxML output files"), file = \
              sys.stderr)
            sys.exit(exit_codes[label])

        ## Append any other file generated for this run to the statistics file,
        ## each one under a banner with its name, and remove it afterwards.
        ## Files are handled through context managers so they are closed even
        ## if something fails, and they are read using the plain "r" mode since
        ## the "U" flag is not accepted anymore by open() in recent Python
        ## versions
        with open(stats_file, "a+") as oFile:
            for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
                fileName = os.path.split(oth_file)[1]
                hz_line = "#" * (len(fileName) + 4)
                print(("%s\n%s\n%s") % (hz_line, fileName, hz_line),
                      file=oFile)
                with open(oth_file, "r") as handle:
                    print(("%s") % (handle.read()), file=oFile)
                sp.call(("rm -f %s") % (oth_file), shell=True)

    return True
コード例 #20
0
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
  '''
  Build and run the command-line of a multiple sequence alignment program.

  label: program identifier. Supported values are "muscle", "kalign",
    "clustalw", "clustal_omega", "mafft", "prank", "dialign_tx", "t_coffee"
    and "m_coffee"; any other value finishes the execution with the generic
    exit code.
  binary: executable placed at the beginning of the command-line.
  parameters: string with the program's parameters, inserted as it is.
  in_file: file containing the sequences to be aligned.
  out_file: file where the alignment should be stored.
  logFile: open stream receiving the command-line, the timing lines and the
    program's stdout/stderr.
  replace: when it is not set and <out_file> already exists, nothing is
    executed.

  Returns False when the execution is skipped and True once the alignment has
  been generated and checked. On any failure the program is finished via
  sys.exit() using the exit code registered for <label>.
  '''

  ## Nothing to do when the alignment is already there and it should be kept
  if lookForFile(out_file) and not replace:
    return False

  ## Command-line templates for those programs taking their arguments in the
  ## order: binary, parameters, input file, output file
  templates = {
    "muscle":        "%s %s -in %s -out %s",
    "kalign":        "%s %s -in %s -out %s",
    "clustalw":      "%s %s -INFILE=%s -OUTFILE=%s",
    "clustal_omega": "%s %s --in %s --out %s",
    "mafft":         "%s %s %s > %s",
    "prank":         "%s %s -d=%s -o=%s",
    ## Starting for newer DiAlign-TX versions
    "dialign_tx":    "%s %s %s %s",
  }
  coffee_modes = ("t_coffee", "m_coffee")

  if label in templates:
    cmd = templates[label] % (binary, parameters, in_file, out_file)

  ## T-Coffee needs some ENV variables pointing to a writable, per-user
  ## folder before being launched
  elif label in coffee_modes:
    sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell = True)
    user_folder = ("/tmp/tcoffee/%s") % (getuser())
    sp.call(("mkdir -p -m0777 %s") % (user_folder), shell = True)
    for variable in ("LOCKDIR_4_TCOFFEE", "TMP_4_TCOFFEE"):
      os.putenv(variable, user_folder)

    cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters, out_file)

  ## Unknown program: finish with a generic error
  else:
    sys.exit(exit_codes["generic"])

  ## Register host, starting time and the exact command-line in the log
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  header = ("###\n###\t%s - Alignment\t%s") % (label.upper(), date)
  print(header, file = logFile)
  command_line = ("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd)
  print(command_line, file = logFile)
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file = sys.stderr)
    sys.exit(exit_codes[label])

  exit_status = proc.wait()
  if exit_status != 0:
    message = ("ERROR: Execution failed: %s [exit code != -1]") \
      % (label.upper())
    print(message, file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## Elapsed time is obtained as the difference between both timestamps
  elapsed = (final - start) if start else 0
  total = format_time(elapsed)
  print(("###\tTime\t%s\n###") % (total), file = logFile)
  logFile.flush()

  ## PRANK adds a suffix - depending on the selected output format - to the
  ## requested output name, so the generated file is moved to <out_file>
  if label in ["prank"]:
    if "-f=" not in parameters:
      suffix = "fas"
    elif "-f=nexus" in parameters:
      suffix = "nex"
    else:
      suffix = "phy"

    if lookForFile(out_file + ".best." + suffix):
      sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file), shell = True)

  ## Any mode of T-Coffee leaves behind a guide tree named after the input
  ## file: remove it
  if label in coffee_modes:
    base_name = os.path.split(in_file)[1]
    guide_tree = ".".join(base_name.split(".")[:-1])
    sp.call(("rm -f %s.dnd") % (guide_tree), shell = True)

  ## Compare input and output files. If something went wrong, report it and
  ## finish the current execution
  if not checkAlignment(in_file, out_file):
    print(("ERROR: Check input '%s' and output '%s' alignments") % (in_file, \
      out_file), file = sys.stderr)
    print(("ERROR: Execution failed: %s [file check]") % \
      (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  return True
コード例 #21
0
def phylogenetic_trees(parameters):
  '''
  Reconstruct phylogenetic trees according to the input parameters.

  For every requested tree approach, each selected evolutionary model is
  handed to perform_tree(), the likelihood of each resulting tree is read
  with get_likelihood() and the models are ranked by that value. The ranking
  is stored in '<out_directory>/<prefix>.tree.<program>.rank.<approach>' and
  the best <numb_models> models are the ones evaluated in the next approach.

  parameters: dictionary holding the current run configuration. Keys read
    here are "out_directory", "prefix", "replace", "tree", the name of the
    selected program, "evol_models", "readal" and "in_file". Optional keys
    are "verbose", "step" (only read when "verbose" is 1), "<program>_params",
    "numb_models", "min_seqs", "tree_approach", one entry per approach name
    and "raxml_model_suffix".

  Returns the same dictionary. It is modified in place: "evol_models" becomes
  a list, "numb_models" an integer, "tree_approach" a set, and "in_file" /
  "replace" may be updated when the input alignment has to be converted.

  The function finishes the program via sys.exit() when the configuration is
  not complete, when there are fewer sequences than required (exit code 80)
  or when no likelihood can be recovered for a model.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value.
  ## The null device is opened in text mode since print() writes strings and
  ## it fails on streams opened in binary mode
  if not "verbose" in parameters or parameters["verbose"] == 0:
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t"
    + "%s\n###") % (date), file=logFile)
  logFile.flush()

  ## Get which program will be used to reconstruct phylogenetic trees. Check
  ## such program is listed among the available binaries
  if not "tree" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the Phylogenetic TREE reconstruction step")

  prog = parameters["tree"][0]
  if not prog in parameters:
    sys.exit(("ERROR: Selected program '%s' is not available accordding to the "
      "the configuration file") % (prog))

  ## Get binary as well as any default parameters for the selected program
  binary = parameters[prog]
  key = ("%s_params") % (prog)
  progr_params = parameters[key] if key in parameters else ""

  if not "evol_models" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the <evol_models> parameter")

  ## If the evolutionary model list is not appropriately formatted, do it
  if isinstance(parameters["evol_models"], str):
    parameters["evol_models"] = list(map(strip, parameters["evol_models"].split()))

  ## Check if <numb_models> parameter is defined and how many models are
  ## requested to be evaluated. The value is converted to a string before
  ## lowercasing it: this function stores it back as an integer, so a second
  ## call with the same dictionary would otherwise fail
  if not "numb_models" in parameters or \
    str(parameters["numb_models"]).lower() == "all":
    parameters["numb_models"] = len(parameters["evol_models"])
  parameters["numb_models"] = int(parameters["numb_models"])

  if not parameters["numb_models"] in range(1,len(parameters["evol_models"])+1):
    sys.exit(("ERROR: Check how many evolutionary models has been asked to re"
      + "construct '%d'") % (parameters["numb_models"]))

  ## Check whether "readAl" is available or not. It is useful for sequences
  ## manipulation independently of the input format.
  if not "readal" in parameters:
    sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

  ## Create a temporary FASTA file which will be used to detect the sequence
  ## number on the input alignment and the presence of rare amino-acids
  TEMPFILE = tempfile.NamedTemporaryFile()
  convertInputFile_Format("readal", parameters["readal"], parameters["in_file"],
    TEMPFILE.name, "fasta", logFile, parameters["replace"])
  TEMPFILE.flush()

  numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

  ## Set the minimum number of sequences required to reconstruct an alignment
  min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
    min_seqs_analysis)

  ## Finish when there are not enough sequences to make an alignment
  if numSeqs < min_seqs:
    print(("### INFO: It is necessary, at least, %d sequences to "
      + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file=logFile)
    sys.exit(80)

  ## Check which approaches should be used for the phylogenetic reconstruction
  ## and whether there are specific program's parameters for them
  if not "tree_approach" in parameters:
    parameters["tree_approach"] = ["ml"]

  ## Remove potential duplicates and lowercase all approaches for the tree
  ## reconstruction
  parameters["tree_approach"] = set([p.lower() for p in \
    parameters["tree_approach"]])

  ## We will first look for Neighbour Joining tree reconstruction, then for
  ## Maximum likelihood and then for any other approach defined in the config
  ## file
  tree_approaches = []
  if "nj" in parameters["tree_approach"]:
    tree_approaches.append("nj")
  if "ml" in parameters["tree_approach"]:
    tree_approaches.append("ml")
  others = parameters["tree_approach"] - set(["nj", "ml"])
  if others != set():
    tree_approaches += sorted(others)

  ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
  ## present in the input alignment
  if prog in ["raxml"]:
    ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
    if selenocys or pyrrolys:
      out_file = ("%s.no_rare_aa") % (parameters["in_file"])

      if replaceRareAminoAcids(TEMPFILE.name, out_file, parameters["replace"],
        logFile, "U:X O:X"):
        parameters["replace"] = True
      parameters["in_file"] = out_file

  ## The temporary FASTA file is not used beyond this point - release it
  ## independently of the selected program
  TEMPFILE.close()

  ## When using FastTree force the conversion of input alignment to FASTA format
  ## since it may crash reading standard interleave PHYLIP format files
  if prog in ["fasttree"]:

    in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
      parameters["in_file"], logFile)

    if in_file_format != "fasta":
      out_file = ("%s.fa") % (parameters["in_file"])
      if (convertInputFile_Format("readal", parameters["readal"], \
        parameters["in_file"], out_file, "fasta", logFile,
        parameters["replace"])):
        parameters["replace"] = True
      parameters["in_file"] = out_file

  replace = parameters["replace"]
  selected_models = parameters["evol_models"]
  ## Reconstruct trees for each approach considering evolutionary models order
  ## according their likelihood values
  for approach in tree_approaches:

    ## Save results - we will use such data for selecting the best -if required-
    ## models fitting to the input data
    results = {}

    ## Format the chosen program's parameters according to the default ones and
    ## the specific ones for the current approach
    params = ("%s ") % (progr_params)
    params += parameters[approach] if approach in parameters else ""

    for model in selected_models:
      out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
      stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      if prog in ["phyml"]:
        exec_params = ("%s -m %s") % (params, model)

      ## Get additional model -if any- for codons
      elif prog in ["codonphyml"]:
        exec_params = ("%s -m %s") % (params, model)

        add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
          if p.startswith("fmodel")]

        if len(add_model) == 1:
          add_model = add_model.pop()
          model = ("%s_%s") % (model, add_model)
          out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
          stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      elif prog in ["fasttree"]:
        ## On FastTree is selected by default JTT model for AAs - so we don't
        ## set-up that model
        exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
          and model.lower() != "jc" else params
        model = model.upper()

      ## In the case of RAxML, we would concatenate the model to an specific
      ## input parameter
      elif prog in ["raxml"]:
        final_model = model
        ## It is possible to add some suffixes to the evolutionary models
        ## in RAxML - There is not better/easy way to code this option
        if "raxml_model_suffix" in parameters:
          final_model += parameters["raxml_model_suffix"]
        exec_params = " ".join([("-%s%s") %(p, final_model if p.startswith("m ")
          else "") for p in map(strip, params.split("-")) if p])

      ## Build the phylogenetic tree using any of the available methods and
      ## register if any downstream file should be redone.
      if perform_tree(prog, binary, exec_params, parameters["in_file"],
        out_file, stats_file, logFile, parameters["replace"]):
          replace = True

      ## Get the likelihood for each of the reconstructed models
      log_lk = get_likelihood(prog, stats_file)

      if not log_lk:
        print(("ERROR: Impossible to the Log likelihood values "
          + "for '%s' model using this program '%s'") % (model, prog), file = \
          sys.stderr)
        sys.exit(exit_codes[prog])

      results.setdefault(model, log_lk)

    ## Get the models sorted by their likelihood values
    records = sorted(results.items(), key = itemgetter(1), reverse = True)

    ## Ranking content as it is - or would be - stored in the ranking file
    ranking = "\n".join([("%s\t%s") % (r[0], r[1]) for r in records])

    ## Set the filename which stores the ranking
    rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

    update = False
    ## Check the content of the rankings file - if any.
    ## Marked the file as updatable if there is any discrepancy
    if not replace and lookForFile(rank_file):

      ## Plain "r" mode is used: the "U" flag is not accepted anymore by
      ## open() in recent Python versions
      with open(rank_file, "r") as handle:
        old_content = "\n".join(["\t".join(list(map(strip, line.split("\t"))))
          for line in handle])

      ## Decide whether ranking file should be updated after comparing current
      ## content with newly generated content
      update = old_content != ranking

    ## If the file containing the ranking doesn't exist, generate it.
    ## Update the file content if the replace flag is set to true or the content
    ## has changed - since the phylogenetic tree reconstruction step is the most
    ## expensive one - in terms of time/memory consumption - we are not setting
    ## replace flag to True even when this file is generated/updated. On this
    ## way, we can take advantage of any tree generated in any downstream step.
    if not lookForFile(rank_file) or replace or update:
      with open(rank_file, "w") as handle:
        print(ranking, file = handle)

    ## Select a given number of models for the next iteration - if any
    selected_models = [pair[0] for pair in records[:parameters["numb_models"]]]

    ## Remove the Codon Frequency model from potential new iterations
    if prog in ["codonphyml"] and add_model:
      selected_models = [m.replace("_"+ add_model, "") for m in selected_models
        if m.endswith(add_model)]

  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t"
    + "%s") % (date), file=logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s"
    + "\n###") % (total), file=logFile)
  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    ## NOTE(review): as written, the '/^M/d' expression removes any line
    ## starting with a literal 'M' - confirm whether a carriage return
    ## (Ctrl-M) was intended instead
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file=sys.stderr)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
コード例 #22
0
def homology(parameters):
  '''
  Run the homology search step: launch the configured search program, filter
  its hits and write the selected sequences to the output folder.

  Work is carried out inside parameters["out_directory"]; the previous working
  directory is restored before returning. Output files are named after
  "<out_directory>/<prefix>": ".seqs.md5", ".seqs" and - only for the
  'prot2codon'/'prot2nuc' datatypes - ".seqs_cds". Existing files are kept
  unless parameters["replace"] is set; once any of them is (re)written the
  flag is switched on.

  parameters: dictionary with the pipeline settings. It is modified in place
    ("replace", "in_file" and, when applicable, "cds") and it is also the
    value returned.

  The program is terminated through sys.exit() when the configuration is
  inconsistent, when the BLAST database files cannot be found, when the search
  reports no hit (exit status 80) or when a selected sequence is absent from
  the CDS file.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value.
  ## The null device is opened in text mode since print() writes text to it
  if not "verbose" in parameters or parameters["verbose"] == 0:
    logFile = open(os.devnull, "w")

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such tool
  ## is listed among the available binaries
  if not "homology" in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  tool = parameters["homology"][0]
  if not tool in parameters:
    ## The tool name has to be interpolated, otherwise a literal '%s' is shown
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not among"
      + " available methods") % (tool))

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  uses_cds = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

  if "cds" in parameters and not uses_cds:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and uses_cds:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted.
  if tool in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n:nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist check whether input database has been
      ## split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has "
            + "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## If the homology search step should be perfomed using BLAST, call the
    ## appropiate function
    blast(parameters, logFile)
    tag = "blast"

  elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Any other tool would leave the output tag undefined: stop with an explicit
  ## message rather than failing later with a NameError
  else:
    sys.exit(("ERROR: Homology search using '%s' is not supported") % (tool))

  ## Check whether the output file contains any result. Empty lines and those
  ## starting by "#" are not counted
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  with open(inFile, "r") as handle:
    for line in handle:
      if not line.strip() or line.startswith("#"):
        continue
      homologs += 1
  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") % \
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    ## The digest is computed over bytes: text identifiers are encoded first,
    ## byte strings are passed untouched
    joined_ids = "".join(sorted(selected_sequences.keys()))
    if not isinstance(joined_ids, bytes):
      joined_ids = joined_ids.encode("utf-8")
    seqs_md5 = md5(joined_ids).hexdigest()

    with open(outFile, "w") as output_file:
      print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = output_file)

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters.
  outFile = ("%s.seqs") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    with open(outFile, "w") as output_file:
      for seqId in sorted(selected_sequences):
        print((">%s\n%s") % (seqId, selected_sequences[seqId][1]), file = \
          output_file)

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if uses_cds:
    cdsFile = ("%s.seqs_cds") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True

      found = set()
      with open(cdsFile, "w") as output_file:
        for record in SeqIO.parse(parameters["cds"], "fasta"):
          if not record.id in selected_sequences:
            continue
          seq = splitSequence(str(record.seq))
          print((">%s\n%s") % (record.id, seq), file = output_file)
          found.add(record.id)

      ## Every selected sequence must have its CDS counterpart
      missing = set(selected_sequences.keys()) - found
      if missing:
        missed = ",".join(sorted(missing))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date  = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## The handle on the null device is released as well; stderr is left open
  elif logFile is not sys.stderr:
    logFile.close()

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associate CDS file with the resulting cds file. It will be used
  ## to make the back-translation in a hypothetical MSA step
  if uses_cds:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
コード例 #23
0
def filter_results(parameters, logFile):
  '''
  Filter the homology search results and return the selected sequences.

  Hits are read from "<out_directory>/<prefix>.homology.<tag>.out", <tag> being
  "hmmer" or "blast" depending on parameters["homology"][0], and only the first
  line found for each target is considered. HMMER hits are filtered by two
  e-values (sequence and best found domain); BLAST hits are filtered by e-value
  plus coverage - ratio of the aligned region vs. the seed sequence length.
  Accepted lines are sorted, optionally headed by the query self-hit when
  parameters["force_seed_sequence"] is set, truncated to parameters["hits"]
  entries and written to "<out_directory>/<prefix>.homology.<tag>.filter".

  If that ".filter" file already exists and parameters["replace"] is not set,
  the targets listed there are loaded instead of filtering again. Otherwise
  parameters["replace"] is switched on.

  parameters: dictionary with the pipeline settings.
  logFile: not used by this function.

  Returns a dictionary mapping every selected target identifier to the entry
  read_database() provides for it. The program is terminated through
  sys.exit() when the search output file is missing.
  '''
  ## Local import: list.sort() only takes a comparison function through 'key'
  from functools import cmp_to_key

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## Get tag for the input/output file. It will depend on which method has been
  ## used to perform the homology seach
  tag = "hmmer" if parameters["homology"][0] in ["phmmer", "jackhmmer", \
    "hmmer_search"] else "blast" if parameters["homology"][0] in \
    ["legacy_blast",  "blast+"] else ""

  ## Get input file
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  ## If input file doesn't exist, stop the execution
  if not lookForFile(inFile):
    sys.exit(("ERROR: Check previously generated file '%s'") % (inFile))

  ## Get output file name and check whether has been previously generated or
  ## not. It will also affect whether the variable REPLACE is set or not
  outFile = ("%s.homology.%s.filter") % (oFile, tag)

  ## If output file exist and it is not set to replace it, then load the
  ## selected sequences and go back to the main function. Otherwise, set the
  ## replace parameter to TRUE in other to replace any already generated file
  ## downstream
  if lookForFile(outFile) and not parameters["replace"]:
    ## Get selected sequences. It will be used to produce MD5s key as well as to
    ## generate the sequences FASTA file
    target_sequences = set()
    with open(outFile, "r") as handle:
      for line in handle:
        ## Parse line
        f = list(map(strip, line.split()))
        parsed = [elem for elem in parseComments([e for e in f if e]) if elem]
        ## A filter file generated from an empty selection holds just a blank
        ## line: nothing can be taken from lines without fields
        if not parsed:
          continue
        ## Include only target sequences - we assume query sequence had been
        ## include as part of the filtered results
        target_sequences.add(parsed[0] if tag == "hmmer" else parsed[1])

    ## We read selected sequences from input database and return it to the main
    ## function
    return read_database(parameters["db_file"], target_sequences)

  ## We set the replace flag to true in order to reconstruct any downstream file
  parameters["replace"] = True

  input_lines, target_sequences, query_line = [], set(), None
  with open(inFile, "r") as handle:
    for line in handle:
      ## Parse line
      parsed_line = [element for element in parseComments([e for e in \
        map(strip, line.split()) if e]) if element]

      ## Discard empty lines or those starting by "#"
      if not parsed_line:
        continue

      ## Detect the target sequence which is placed at different columns
      ## depending whether it is blast or hmmer package which generated the
      ## output
      target = parsed_line[0] if tag == "hmmer" else parsed_line[1]

      ## We also include the query sequence, it is only important for the
      ## BLAST-based search
      query = parsed_line[2] if tag == "hmmer" else parsed_line[0]

      ## Store the self-hit line - on this way we make sure we will include the
      ## query protein among the finally selected sequences despite any cut-off
      if target == query and not query_line:
        query_line = [parsed_line]

      ## If current target sequence has not been found yet, register it
      if not target in target_sequences:
        input_lines.append(parsed_line)
        target_sequences.add(target)

  ## We make sure query sequence is included
  sequences = read_database(parameters["db_file"], target_sequences)
  seed_seqs = read_database(parameters["in_file"])

  ## Depending on how the search was performed, we will filter-out data
  ## by e-values and coverage (BLAST only) or not
  e_value = float(parameters["e_value"])
  coverage = float(parameters["coverage"])
  hits = -1 if not "hits" in parameters or parameters["hits"] == "no_limit" \
    else int(parameters["hits"])

  accepted_lines, accepted_targets = [], set()
  for line in input_lines:
    ## If the current target has been already found, move to next hit
    target = line[0] if tag == "hmmer" else line[1]
    if target in accepted_targets:
      continue
    ## Depending on the package, filter just by two e-values (sequence and best
    ## found domain) or by sequence e-value + coverage between sequences
    if tag == "hmmer":
      if float(line[4]) > e_value or float(line[7]) > e_value:
        continue
    elif tag == "blast":
      ## To make sure we have the seed sequence used to perform the homology
      ## search, we read it independently of the input sequence database.
      ## NOTE(review): the first field of a read_database() entry is used as
      ## the sequence length - confirm against read_database()
      seed = line[0]
      seedSeq = (seed_seqs[seed] if seed in seed_seqs else sequences[seed])[0]
      covTarget = ((int(line[7]) - int(line[6]))+1)/float(seedSeq)
      if covTarget < coverage or float(line[-2]) > e_value:
        continue

    ## Store current line and target sequence
    accepted_lines.append(line)
    accepted_targets.add(target)

  ## Sort by e-values (and bit-score for BLAST only) accepted lines. The
  ## sorting helpers used to be given as the positional comparison argument of
  ## list.sort(), so they are wrapped to be usable as sorting key
  compare = sort_blast_hits if tag == "blast" else sort_hmmer_hits
  accepted_lines.sort(key = cmp_to_key(compare))

  ## Recover query ID from query line. Including the starting sequence depends
  ## on the configuration
  if query_line:
    query = query_line[0][2] if tag == "hmmer" else query_line[0][0]
    if not query in accepted_targets and parameters["force_seed_sequence"]:
      accepted_lines = query_line + accepted_lines

  if hits != -1 and len(accepted_lines) > hits:
    accepted_lines = accepted_lines[:hits]

  ## Get selected sequences. It will be used to produce MD5s key as well as to
  ## generate the sequences FASTA file
  selected_sequences = {}
  for line in accepted_lines:
    sequence_id = line[0] if tag == "hmmer" else line[1]
    selected_sequences.setdefault(sequence_id, sequences[sequence_id])

  ## Dump accepted lines making sure the file is flushed and closed
  out = ["\t".join([str(x).ljust(6) for x in l]) for l in accepted_lines]
  with open(outFile, "w") as handle:
    print("\n".join(out), file = handle)

  return selected_sequences
コード例 #24
0
def hmmer(parameters, logFile):
  '''
  Launch the homology search with the HMMER program named by
  parameters["homology"][0], writing its tabular output to
  "<out_directory>/<prefix>.homology.hmmer.out".

  Nothing is done when that file already exists and parameters["replace"] is
  not set; otherwise the flag is switched on. For "hmmsearch" a profile is
  built first with hmmbuild from the input converted to FASTA, and
  parameters["in_file"] is redirected to the resulting ".hmm" file.

  parameters: dictionary with the pipeline settings; modified in place.
  logFile: open handle receiving the command lines and the programs' output.

  The program is terminated through sys.exit() if any command cannot be
  started or finishes with a non-zero status.
  '''

  def launch(command_line, failed_label):
    ## Register the command line together with the host name, execute it
    ## through the shell and stop everything if something goes wrong
    host = getfqdn()
    print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") % (host, command_line),
      file = logFile)
    logFile.flush()

    try:
      child = sp.Popen(command_line, shell = True, stderr = logFile,
        stdout = logFile)
    except OSError as e:
      sys.exit("ERROR: Execution failed: " + str(e))

    if child.wait() != 0:
      sys.exit(("ERROR: Execution failed: '%s'") % (failed_label))

  ## Common prefix for every file generated in the output folder
  prefix = os.path.join(parameters["out_directory"], parameters["prefix"])

  ## File expected to hold the search results
  results = ("%s.homology.hmmer.out") % (prefix)

  ## Reuse previous results unless asked to replace them. From here on any
  ## downstream file has to be regenerated
  if lookForFile(results) and not parameters["replace"]:
    return
  parameters["replace"] = True

  tool = parameters["homology"][0]

  ## Searching with a Multiple Sequence Alignment requires building a HMM
  ## before performing the search itself
  if tool == "hmmsearch":
    if not ("readal" in parameters and "hmmbuild" in parameters):
      sys.exit(("ERROR: Check your CONFIG file to search whether 'readAl' and "
        + "'hmmbuild' are available"))

    ## Temporary FASTA file used as input for HMMBuild
    scratch = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
      parameters["in_file"], scratch.name, "fasta", logFile,
      parameters["replace"])
    scratch.flush()

    ## Force the amino-acids alphabet for proteins, otherwise let the program
    ## guess it
    if parameters["residue_datatype"].startswith("prot"):
      alphabet = "--amino"
    else:
      alphabet = ""
    profile = ("%s.homology.hmmer.hmm") % (prefix)

    launch(("%s --informat afa %s %s %s") % (parameters["hmmbuild"], alphabet,
      profile, scratch.name), "hmmbuild")

    ## The search is performed with the profile instead of the original input
    parameters["in_file"] = profile
    scratch.close()

  ## Compose and run the command line of the selected HMMER program
  options = parameters["hmmer_params"]
  launch(("%s %s -E %s --tblout %s %s %s") % (parameters[tool], options,
    str(parameters["e_value"]), results, parameters["in_file"],
    parameters["db_file"]), parameters[tool])
コード例 #25
0
def hmmer(parameters, logFile):
    '''
    Launch the homology search with the HMMER program named by
    parameters["homology"][0], writing its tabular output to
    "<out_directory>/<prefix>.homology.hmmer.out".

    Nothing is done when that file already exists and parameters["replace"]
    is not set; otherwise the flag is switched on. For "hmmsearch" a profile
    is built first with hmmbuild from the input converted to FASTA, and
    parameters["in_file"] is redirected to the resulting ".hmm" file.

    parameters: dictionary with the pipeline settings; modified in place.
    logFile: open handle receiving the command lines and the programs' output.

    The program is terminated through sys.exit() if any command cannot be
    started or finishes with a non-zero status.
    '''

    def launch(command_line, failed_label):
        ## Register the command line together with the host name, execute it
        ## through the shell and stop everything if something goes wrong
        host = getfqdn()
        print(("###\n###\t[%s]\tCommand-line\t%s\n###\n") %
              (host, command_line), file=logFile)
        logFile.flush()

        try:
            child = sp.Popen(command_line, shell=True, stderr=logFile,
                             stdout=logFile)
        except OSError as e:
            sys.exit("ERROR: Execution failed: " + str(e))

        if child.wait() != 0:
            sys.exit(("ERROR: Execution failed: '%s'") % (failed_label))

    ## Common prefix for every file generated in the output folder
    prefix = os.path.join(parameters["out_directory"], parameters["prefix"])

    ## File expected to hold the search results
    results = ("%s.homology.hmmer.out") % (prefix)

    ## Reuse previous results unless asked to replace them. From here on any
    ## downstream file has to be regenerated
    if lookForFile(results) and not parameters["replace"]:
        return
    parameters["replace"] = True

    tool = parameters["homology"][0]

    ## Searching with a Multiple Sequence Alignment requires building a HMM
    ## before performing the search itself
    if tool == "hmmsearch":
        if not ("readal" in parameters and "hmmbuild" in parameters):
            sys.exit((
                "ERROR: Check your CONFIG file to search whether 'readAl' and "
                + "'hmmbuild' are available"))

        ## Temporary FASTA file used as input for HMMBuild
        scratch = tempfile.NamedTemporaryFile()
        convertInputFile_Format("readal", parameters["readal"],
                                parameters["in_file"], scratch.name, "fasta",
                                logFile, parameters["replace"])
        scratch.flush()

        ## Force the amino-acids alphabet for proteins, otherwise let the
        ## program guess it
        if parameters["residue_datatype"].startswith("prot"):
            alphabet = "--amino"
        else:
            alphabet = ""
        profile = ("%s.homology.hmmer.hmm") % (prefix)

        launch(("%s --informat afa %s %s %s") %
               (parameters["hmmbuild"], alphabet, profile, scratch.name),
               "hmmbuild")

        ## The search is performed with the profile instead of the original
        ## input
        parameters["in_file"] = profile
        scratch.close()

    ## Compose and run the command line of the selected HMMER program
    options = parameters["hmmer_params"]
    launch(("%s %s -E %s --tblout %s %s %s") %
           (parameters[tool], options, str(parameters["e_value"]), results,
            parameters["in_file"], parameters["db_file"]),
           parameters[tool])
コード例 #26
0
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
    '''
    Format the command-line of a multiple sequence alignment program and
    launch it through the shell.

    label: program identifier selecting the command-line layout ("muscle",
      "kalign", "clustalw", "clustal_omega", "mafft", "prank", "dialign_tx",
      "t_coffee" or "m_coffee").
    binary: program to be executed.
    parameters: extra options placed verbatim in the command-line.
    in_file / out_file: input sequences and output alignment files.
    logFile: open handle receiving the command-line and the program output.
    replace: when not set, an already existing out_file is kept.

    Returns False if out_file already exists and replace is not set. The
    program is terminated with exit_codes["generic"] for any other label and
    with exit_codes[label] if the process cannot be started.

    NOTE(review): as defined here the process is started but neither waited
    for nor checked, and nothing is returned after launching it - confirm
    whether the remainder of the routine is missing.
    '''

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["muscle", "kalign"]:
        cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file,
                                          out_file)

    elif label in ["clustalw"]:
        cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file, \
          out_file)

    elif label in ["clustal_omega"]:
        cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file,
                                            out_file)

    ## MAFFT writes the alignment to the standard output
    elif label in ["mafft"]:
        cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file)

    elif label in ["prank"]:
        cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file)

    ## Starting for newer DiAlign-TX versions
    elif label in ["dialign_tx"]:
        cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file)

    ## On t-coffee case, we need to set-up some ENV variables to be able to run
    ## smoothly the program
    elif label in ["t_coffee", "m_coffee"]:

        sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell=True)
        drc = ("/tmp/tcoffee/%s") % (getuser())
        sp.call(("mkdir -p -m0777 %s") % (drc), shell=True)
        os.putenv("LOCKDIR_4_TCOFFEE", drc)
        os.putenv("TMP_4_TCOFFEE", drc)

        cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters,
                                          out_file)

    ## In any other case, finish with a generic error
    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    ## print() with an explicit 'file' argument replaces the former
    ## 'print >> handle' statements
    print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date),
          file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])
コード例 #27
0
        print >> sys.stderr, ("ERROR: Execution failed: %s [exit code != -1]") \
          % (label.upper())
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print >> logFile, ("###\tTime\t%s\n###") % (total)
    logFile.flush()

    ## If we are working with PRANK, move output file - which should have a suffix
    ## depending on the output format
    if label in ["prank"]:
        suffix = "fas" if parameters.find("-f=") == -1 else \
          "nex" if parameters.find("-f=nexus") != -1 else "phy"
        if lookForFile(out_file + ".best." + suffix):
            sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file),
                    shell=True)

    ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove the
    ## guide tree generate during the program execution
    if label in ["t_coffee", "m_coffee"]:
        guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1])
        sp.call(("rm -f %s.dnd") % (guide_tree), shell=True)

    ## Check whether the output alignment has been already generated.
    ## In case something goes wrong, remove the output file and finish the
    ## current execution
    if not checkAlignment(in_file, out_file):
        print in_file, out_file
        print >> sys.stderr, ("ERROR: Execution failed: %s [file check]") % \
コード例 #28
0
def filter_results(parameters, logFile):
    '''
    Filter the homology search results and return the selected sequences.

    Hits are read from "<out_directory>/<prefix>.homology.<tag>.out", <tag>
    being "hmmer" or "blast" depending on parameters["homology"][0], and only
    the first line found for each target is considered. HMMER hits are
    filtered by two e-values (sequence and best found domain); BLAST hits are
    filtered by e-value plus coverage - ratio of the aligned region vs. the
    seed sequence length. Accepted lines are sorted, optionally headed by the
    query self-hit when parameters["force_seed_sequence"] is set, truncated to
    parameters["hits"] entries and written to
    "<out_directory>/<prefix>.homology.<tag>.filter".

    If that ".filter" file already exists and parameters["replace"] is not
    set, the targets listed there are loaded instead of filtering again.
    Otherwise parameters["replace"] is switched on.

    parameters: dictionary with the pipeline settings.
    logFile: not used by this function.

    Returns a dictionary mapping every selected target identifier to the
    entry read_database() provides for it. The program is terminated through
    sys.exit() when the search output file is missing.
    '''
    ## Local import: list.sort() only takes a comparison function through 'key'
    from functools import cmp_to_key

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    ## Get tag for the input/output file. It will depend on which method has been
    ## used to perform the homology seach
    tag = "hmmer" if parameters["homology"][0] in ["phmmer", "jackhmmer", \
      "hmmer_search"] else "blast" if parameters["homology"][0] in \
      ["legacy_blast",  "blast+"] else ""

    ## Get input file
    inFile = ("%s.homology.%s.out") % (oFile, tag)
    ## If input file doesn't exist, stop the execution
    if not lookForFile(inFile):
        sys.exit(("ERROR: Check previously generated file '%s'") % (inFile))

    ## Get output file name and check whether has been previously generated or
    ## not. It will also affect whether the variable REPLACE is set or not
    outFile = ("%s.homology.%s.filter") % (oFile, tag)

    ## If output file exist and it is not set to replace it, then load the
    ## selected sequences and go back to the main function. Otherwise, set the
    ## replace parameter to TRUE in other to replace any already generated file
    ## downstream
    if lookForFile(outFile) and not parameters["replace"]:
        ## Get selected sequences. It will be used to produce MD5s key as well as to
        ## generate the sequences FASTA file
        target_sequences = set()
        with open(outFile, "r") as handle:
            for line in handle:
                ## Parse line
                f = list(map(strip, line.split()))
                parsed = [
                    elem for elem in parseComments([e for e in f if e])
                    if elem
                ]
                ## A filter file generated from an empty selection holds just
                ## a blank line: nothing can be taken from lines without fields
                if not parsed:
                    continue
                ## Include only target sequences - we assume query sequence
                ## had been include as part of the filtered results
                target_sequences.add(
                    parsed[0] if tag == "hmmer" else parsed[1])

        ## We read selected sequences from input database and return it to the main
        ## function
        return read_database(parameters["db_file"], target_sequences)

    ## We set the replace flag to true in order to reconstruct any downstream file
    parameters["replace"] = True

    input_lines, target_sequences, query_line = [], set(), None
    with open(inFile, "r") as handle:
        for line in handle:
            ## Parse line
            parsed_line = [element for element in parseComments([e for e in \
              map(strip, line.split()) if e]) if element]

            ## Discard empty lines or those starting by "#"
            if not parsed_line:
                continue

            ## Detect the target sequence which is placed at different columns
            ## depending whether it is blast or hmmer package which generated
            ## the output
            target = parsed_line[0] if tag == "hmmer" else parsed_line[1]

            ## We also include the query sequence, it is only important for
            ## the BLAST-based search
            query = parsed_line[2] if tag == "hmmer" else parsed_line[0]

            ## Store the self-hit line - on this way we make sure we will
            ## include the query protein among the finally selected sequences
            ## despite any cut-off
            if target == query and not query_line:
                query_line = [parsed_line]

            ## If current target sequence has not been found yet, register it
            if not target in target_sequences:
                input_lines.append(parsed_line)
                target_sequences.add(target)

    ## We make sure query sequence is included
    sequences = read_database(parameters["db_file"], target_sequences)
    seed_seqs = read_database(parameters["in_file"])

    ## Depending on how the search was performed, we will filter-out data
    ## by e-values and coverage (BLAST only) or not
    e_value = float(parameters["e_value"])
    coverage = float(parameters["coverage"])
    hits = -1 if not "hits" in parameters or parameters["hits"] == "no_limit" \
      else int(parameters["hits"])

    accepted_lines, accepted_targets = [], set()
    for line in input_lines:
        ## If the current target has been already found, move to next hit
        target = line[0] if tag == "hmmer" else line[1]
        if target in accepted_targets:
            continue
        ## Depending on the package, filter just by two e-values (sequence and best
        ## found domain) or by sequence e-value + coverage between sequences
        if tag == "hmmer":
            if float(line[4]) > e_value or float(line[7]) > e_value:
                continue
        elif tag == "blast":
            ## To make sure we have the seed sequence used to perform the homology
            ## search, we read it independently of the input sequence database.
            ## NOTE(review): the first field of a read_database() entry is used
            ## as the sequence length - confirm against read_database()
            seed = line[0]
            seedSeq = (seed_seqs[seed]
                       if seed in seed_seqs else sequences[seed])[0]
            covTarget = ((int(line[7]) - int(line[6])) + 1) / float(seedSeq)
            if covTarget < coverage or float(line[-2]) > e_value:
                continue

        ## Store current line and target sequence
        accepted_lines.append(line)
        accepted_targets.add(target)

    ## Sort by e-values (and bit-score for BLAST only) accepted lines. The
    ## sorting helpers used to be given as the positional comparison argument
    ## of list.sort(), so they are wrapped to be usable as sorting key
    compare = sort_blast_hits if tag == "blast" else sort_hmmer_hits
    accepted_lines.sort(key=cmp_to_key(compare))

    ## Recover query ID from query line. Including the starting sequence depends
    ## on the configuration
    if query_line:
        query = query_line[0][2] if tag == "hmmer" else query_line[0][0]
        if not query in accepted_targets and parameters["force_seed_sequence"]:
            accepted_lines = query_line + accepted_lines

    if hits != -1 and len(accepted_lines) > hits:
        accepted_lines = accepted_lines[:hits]

    ## Get selected sequences. It will be used to produce MD5s key as well as to
    ## generate the sequences FASTA file
    selected_sequences = {}
    for line in accepted_lines:
        sequence_id = line[0] if tag == "hmmer" else line[1]
        selected_sequences.setdefault(sequence_id, sequences[sequence_id])

    ## Dump accepted lines making sure the file is flushed and closed
    out = ["\t".join([str(x).ljust(6) for x in l]) for l in accepted_lines]
    with open(outFile, "w") as handle:
        print("\n".join(out), file=handle)

    return selected_sequences
コード例 #29
0
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile,
                    replace):
    '''
    Format the command-line of a multiple sequence alignment program and
    execute it.

    Parameters:
      label      -- program identifier selecting the command-line layout. The
                    supported values are "muscle", "kalign", "clustalw",
                    "clustal_omega", "mafft", "prank", "dialign_tx",
                    "t_coffee" and "m_coffee"
      binary     -- executable to be called
      parameters -- string with additional command-line options. It is passed
                    verbatim to the shell
      in_file    -- path to the input sequences file
      out_file   -- path where the resulting alignment has to be stored
      logFile    -- open file object receiving the log lines as well as the
                    program stdout/stderr
      replace    -- when it is not set, an already existing output file is kept

    Returns False when the output file already exists and 'replace' is not
    set. Returns True once the alignment has been generated and checked.

    The execution is finished, via sys.exit, when the label is not supported,
    when the program cannot be launched or ends with a non-zero exit code, and
    when checkAlignment() does not accept the resulting file.
    '''

    ## Imported locally so this function carries its own dependency
    from shlex import quote

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    ## File names are quoted so paths containing blanks or shell metacharacters
    ## reach the program as a single argument. 'parameters' is left untouched
    ## since it may hold several options which the shell has to split
    src = quote(in_file)
    dst = quote(out_file)

    if label in ["muscle", "kalign"]:
        cmd = ("%s %s -in %s -out %s") % (binary, parameters, src, dst)

    elif label in ["clustalw"]:
        cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, src, dst)

    elif label in ["clustal_omega"]:
        cmd = ("%s %s --in %s --out %s") % (binary, parameters, src, dst)

    ## MAFFT writes the alignment to stdout, so it is redirected by the shell
    elif label in ["mafft"]:
        cmd = ("%s %s %s > %s") % (binary, parameters, src, dst)

    elif label in ["prank"]:
        cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, src, dst)

    ## Starting for newer DiAlign-TX versions
    elif label in ["dialign_tx"]:
        cmd = ("%s %s %s %s") % (binary, parameters, src, dst)

    ## On t-coffee case, a per-user folder is created under /tmp/tcoffee and
    ## exported through two ENV variables before running the program
    elif label in ["t_coffee", "m_coffee"]:

        sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell=True)
        drc = ("/tmp/tcoffee/%s") % (getuser())
        sp.call(("mkdir -p -m0777 %s") % (drc), shell=True)
        os.putenv("LOCKDIR_4_TCOFFEE", drc)
        os.putenv("TMP_4_TCOFFEE", drc)

        cmd = ("%s %s %s -outfile %s") % (binary, src, parameters, dst)

    ## In any other case, finish with a generic error
    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date),
          file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Any exit code other than 0 is handled as a failure
    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s [exit code != 0]")
              % (label.upper()), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Log the time elapsed between both timestamps
    final = datetime.datetime.now()
    total = format_time(final - start)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    ## If we are working with PRANK, move output file - which should have a
    ## suffix depending on the output format
    if label in ["prank"]:
        if "-f=" not in parameters:
            suffix = "fas"
        elif "-f=nexus" in parameters:
            suffix = "nex"
        else:
            suffix = "phy"

        best = out_file + ".best." + suffix
        if lookForFile(best):
            sp.call(("mv %s %s") % (quote(best), dst), shell=True)

    ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove
    ## the guide tree generated during the program execution. It is looked for
    ## in the current directory, named after the input file without its last
    ## extension
    if label in ["t_coffee", "m_coffee"]:
        guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1])
        sp.call(("rm -f %s") % (quote(guide_tree + ".dnd")), shell=True)

    ## Check whether the output alignment has been properly generated. In case
    ## something goes wrong, finish the current execution. The output file is
    ## left in place
    if not checkAlignment(in_file, out_file):
        print(("ERROR: Check input '%s' and output '%s' alignments")
              % (in_file, out_file), file=sys.stderr)
        print(("ERROR: Execution failed: %s [file check]")
              % (label.upper()), file=sys.stderr)
        sys.exit(exit_codes[label])

    return True
コード例 #30
0
    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print >> logFile, ("###\tTime\t%s\n###") % (total)
    logFile.flush()

    ## Process program's output and rename output files according to our own
    ## scheme
    if label in ["phyml", "codonphyml"]:

        ## Since resulting tree/stats file have slightly changed between version,
        ## we have to control for that.
        tree_file = ("%s_%s_tree.txt") % (in_file, label)
        sts_file = ("%s_%s_stats.txt") % (in_file, label)
        if not lookForFile(tree_file, attempts=2):
            tree_file = ("%s_%s_tree") % (in_file, label)
            sts_file = ("%s_%s_stats") % (in_file, label)

        try:
            sp.call(("mv %s %s") % (tree_file, out_file), shell=True)
            sp.call(("mv %s %s") % (sts_file, stats_file), shell=True)
        except OSError:
            print >> sys.stderr, ("ERROR: Impossible to rename '%s' output files") \
              % (label.upper())
            sys.exit(exit_codes[label])

    elif label in ["raxml"]:
        try:
            sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file),
                    shell=True)
コード例 #31
0
def perform_tree(label, binary, parameters, in_file, out_file, stats_file,
  logFile, replace):
  '''
  Format the command-line of a phylogenetic tree reconstruction program,
  execute it and rename its output files.

  Parameters:
    label      -- program identifier. The supported values are "phyml",
                  "codonphyml", "fasttree" and "raxml"
    binary     -- executable to be called
    parameters -- string with additional command-line options. It is passed
                  verbatim to the shell
    in_file    -- path to the input alignment
    out_file   -- path where the resulting tree has to be stored
    stats_file -- path where the program statistics/log has to be stored
    logFile    -- open file object receiving the log lines as well as the
                  program stdout/stderr
    replace    -- when it is not set, an already existing output file is kept

  Returns False when the output file already exists and 'replace' is not set.
  Returns True once the program has finished and its output files have been
  renamed.

  The execution is finished, via sys.exit, when the label is not supported,
  when the program cannot be launched or ends with a non-zero exit code, and
  when the expected output files cannot be renamed.
  '''

  ## Imported locally so this function carries its own dependencies
  from shlex import quote
  from shutil import move

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  ## File names are quoted so paths containing blanks or shell metacharacters
  ## reach the program as a single argument. 'parameters' is left untouched
  ## since it may hold several options which the shell has to split
  if label in ["phyml", "codonphyml"]:
    cmd = ("%s -i %s %s") % (binary, quote(in_file), parameters)

  elif label in ["fasttree"]:
    cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, \
      quote(stats_file), quote(out_file), quote(in_file))

  ## The same random number is used as seed and as part of the run name
  elif label in ["raxml"]:
    random_seed = randint(1, 10000)
    suffix = ("%s_%d") % (label, random_seed)

    cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, \
      quote(in_file), parameters)

  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
    file = logFile)
  print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
    logFile)
  logFile.flush()

  try:
    ## stdin is a pipe so some input can be fed to the program below
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile,
      stdin = sp.PIPE)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file=sys.stderr)
    sys.exit(exit_codes[label])

  ## Presumably these lines answer interactive prompts from the program
  proc.communicate(b'\n\nY\n')

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Log the time elapsed between both timestamps
  final = datetime.datetime.now()
  total = format_time(final - start)
  print(("###\tTime\t%s\n###") % (total), file=logFile)
  logFile.flush()

  ## Process program's output and rename output files according to our own
  ## scheme. Files are moved from Python, rather than through a 'mv' shell
  ## call, so a failure raises OSError and is actually reported
  if label in ["phyml", "codonphyml"]:

    ## Since resulting tree/stats file have slightly changed between version,
    ## we have to control for that.
    tree_file = ("%s_%s_tree.txt") % (in_file, label)
    sts_file = ("%s_%s_stats.txt") % (in_file, label)
    if not lookForFile(tree_file, attempts = 2):
      tree_file = ("%s_%s_tree") % (in_file, label)
      sts_file = ("%s_%s_stats") % (in_file, label)

    try:
      move(tree_file, out_file)
      move(sts_file, stats_file)
    except OSError:
      print(("ERROR: Impossible to rename '%s' output files") \
        % (label.upper()), file=sys.stderr)
      sys.exit(exit_codes[label])

  elif label in ["raxml"]:
    try:
      move(("RAxML_bestTree.%s") % (suffix), out_file)
      move(("RAxML_info.%s") % (suffix), stats_file)
    except OSError:
      print(("ERROR: Impossible to rename RAxML output files"), file = \
        sys.stderr)
      sys.exit(exit_codes[label])

    ## Append to the stats file, under a header with its name, every other
    ## file listed for this run and remove it afterwards
    with open(stats_file, "a+") as oFile:
      for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
        fileName = os.path.split(oth_file)[1]
        hz_line = "#" * (len(fileName) + 4)
        print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file = oFile)
        ## Plain "r" mode: the "U" flag is not accepted by recent Python
        ## versions
        with open(oth_file, "r") as handle:
          print(handle.read(), file = oFile)
        sp.call(("rm -f %s") % (quote(oth_file)), shell = True)

  return True
コード例 #32
0
def homology(parameters):
    '''
    Perform the homology search step: run the configured tool, filter its
    results and write the files describing the selected sequences.

    The step runs from inside parameters["out_directory"]. The original
    working directory is restored before returning.

    Keys read from 'parameters':
      out_directory, prefix -- used to build every output file name
      verbose               -- optional. 0 or missing: the log is discarded;
                               1: the log goes to '<prefix>.log'; 2: the log
                               goes to stderr
      replace, step         -- 'replace' controls whether existing files are
                               regenerated; both select the log file mode when
                               'verbose' is 1
      homology              -- list whose first item is the tool to be used
      residue_datatype      -- has to be "prot2codon" or "prot2nuc" when, and
                               only when, a CDS file is given
      cds                   -- optional path to a FASTA file with the CDS
      db_file               -- BLAST only, base name of the formatted database

    Returns the same 'parameters' dictionary, updated: "in_file" points to
    '<prefix>.seqs', "cds" points to '<prefix>.seqs_cds' in the
    prot2codon/prot2nuc modes and "replace" is set to True whenever an output
    file is (re)generated.

    The execution is finished, via sys.exit, on configuration problems, when
    the BLAST database files are not found, when the search does not report
    any hit (exit code 80) and when a selected sequence has no CDS.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    current_directory = os.getcwd()
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode since print() writes str objects,
    ## which are rejected by a binary file object
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, "w")

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters[
            "step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file=logFile)
    logFile.flush()

    ## Get which tool will be used to perform the homology search. Check such tool
    ## is listed among the available binaries
    if not "homology" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is not tool set for "
            + "the homology search")

    tool = parameters["homology"][0]
    if not tool in parameters:
        sys.exit((
            "ERROR: Check your configuration file. This tool '%s' is not among"
            + " available methods") % (tool))

    ## Check whether if an special mode has been selected - for instance
    ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
    ## If not mode is define, we will work with a datatype - normally proteins
    uses_cds = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

    if "cds" in parameters and not uses_cds:
        sys.exit(
            "ERROR: To use an additional CDS file, you should set the <parame"
            + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

    if not "cds" in parameters and uses_cds:
        sys.exit(
            "ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
            + "'prot2nuc', an input CDS file is needed")

    ## If the homology search will use any program from the BLAST package, check
    ## whether the TARGET SEQUENCES file has been already formatted.
    if tool in ["legacy_blast", "blast+"]:

        ## Get database sequence type - p: protein or n:nucleotide
        dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

        ## Check if BLAST DB associated files already exist or not
        for extension in ["hr", "in", "sq"]:
            filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)
            if lookForFile(filename):
                continue

            ## If the input file doesn't exist check whether input database has
            ## been split into different volumes
            alternative = ("%s.00.%s%s") % (parameters["db_file"], dt,
                                            extension)
            if not lookForFile(alternative):
                db_file = parameters["db_file"]
                sys.exit((
                    "ERROR: Check your input TARGET SEQUENCES file '%s' has "
                    + "been formated using 'formatdb'/'makeblastdb'") %
                         (db_file))

        ## If the homology search step should be perfomed using BLAST, call the
        ## appropiate function
        blast(parameters, logFile)
        tag = "blast"

    elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
        hmmer(parameters, logFile)
        ## Set the tag for the output files
        tag = "hmmer"

    ## Without this branch 'tag' would be unbound some lines below
    else:
        sys.exit((
            "ERROR: Check your configuration file. Homology search tool '%s' "
            + "is not supported") % (tool))

    ## Check whether the output file contains any result: at least one line
    ## which is neither blank nor a comment. Plain "r" mode is used since the
    ## "U" flag is not accepted by recent Python versions
    inFile = ("%s.homology.%s.out") % (oFile, tag)
    with open(inFile, "r") as handle:
        has_hits = any(line.strip() and not line.startswith("#")
                       for line in handle)
    if not has_hits:
        print(("INFO: NO Homologous sequences found for '%s'") % \
          parameters["prefix"], file = sys.stderr)
        sys.exit(80)

    ## Filter homology search data. A dictionary containing selected sequences,
    ## including the sequences themselves
    selected_sequences = filter_results(parameters, logFile)

    ## Generate a MD5 file containing selected sequences for the current run.
    ## MD5s are used to recompute the same phylogenetic tree starting from other
    ## seqs - with identical similarity search results - in the set of homologs
    outFile = ("%s.seqs.md5") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(outFile) or parameters["replace"]:
        parameters["replace"] = True

        ## The sorted IDs are encoded before hashing - assumes 'md5' is
        ## hashlib.md5, which only accepts bytes
        joined_ids = "".join(sorted(selected_sequences.keys()))
        seqs_md5 = md5(joined_ids.encode("utf-8")).hexdigest()
        with open(outFile, "w") as output_file:
            print(("%s\t%s") % (parameters["prefix"], seqs_md5),
                  file=output_file)

    ## Generate a file containing the selected sequences after performing the
    ## homology search and filtering its output according to a set of parameters.
    outFile = ("%s.seqs") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(outFile) or parameters["replace"]:
        parameters["replace"] = True

        with open(outFile, "w") as output_file:
            for seqId in sorted(selected_sequences):
                print((">%s\n%s") % (seqId, selected_sequences[seqId][1]),
                      file=output_file)

    ## If a CDS input file is set, use it to associate to homologous protein
    ## sequences their corresponding CDS
    if uses_cds:
        cdsFile = ("%s.seqs_cds") % (oFile)

        ## Check whether the file already exists or not.
        if not lookForFile(cdsFile) or parameters["replace"]:
            parameters["replace"] = True

            found = set()
            with open(cdsFile, "w") as output_file:
                for record in SeqIO.parse(parameters["cds"], "fasta"):
                    if not record.id in selected_sequences:
                        continue
                    seq = splitSequence(str(record.seq))
                    print((">%s\n%s") % (record.id, seq), file=output_file)
                    found.add(record.id)

            ## Every selected sequence needs its own CDS
            missing = set(selected_sequences.keys()) - found
            if missing:
                missed = ",".join(sorted(missing))
                sys.exit((
                    "ERROR: Check your input CDS file '%s'. Impossible to find "
                    "homologs sequences [missing:'%s']") %
                         (parameters["cds"], missed))

    ## Print how much time was needed to perform the whole homology search step
    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file=logFile)

    ## Log the time elapsed between both timestamps
    total = format_time(final - start)
    print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file=logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        ## NOTE(review): the second expression reads as a caret followed by
        ## 'M', which sed understands as "lines starting with M" - confirm
        ## whether a literal carriage return was intended
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
              % (oFile), file = sys.stderr)

    ## The handle opened on the null device has to be released as well. stderr
    ## is never closed
    elif logFile is not sys.stderr:
        logFile.close()

    ## Update the input file parameter and return the dictionary containing all
    ## parameters. Those parameters may be used in other steps
    parameters["in_file"] = outFile

    ## Update the associate CDS file with the resulting cds file. It will be used
    ## to make the back-translation in a hypothetical MSA step
    if uses_cds:
        parameters["cds"] = ("%s.seqs_cds") % (oFile)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters