def trimmingAlignment(label, binary, parameters, out_file, logFile, replace,
                      in_file = None, compare_msa = None,
                      force_refer_msa = None, cds = None):
    '''
    Build and run a trimming command-line for a multiple sequence alignment.

    Optional arguments - when set - are added to the command-line as
    "-backtrans" (cds), "-compareset" (compare_msa), "-forceselect"
    (force_refer_msa) and "-in" (in_file). The command-line, the host name and
    the elapsed time are written to logFile, which also receives the program
    stdout/stderr.

    Returns False - without running anything - when out_file is already
    there and replace is not set; True once the program finished with a zero
    exit status. Any execution problem ends the program with the exit code
    registered for label.
    '''

    ## Keep any previous result unless we are asked to replace it
    already_there = lookForFile(out_file)
    if already_there and not replace:
        return False

    ## Optional arguments, in the order they appear in the command-line. Each
    ## template appends its flag to whatever has been accumulated so far
    optional_args = (
        ("%s -backtrans %s ", cds),
        ("%s -compareset %s ", compare_msa),
        ("%s -forceselect %s ", force_refer_msa),
        ("%s -in %s ", in_file),
    )

    options = ""
    for template, value in optional_args:
        if value:
            options = (template) % (options, value)

    cmd = ("%s %s -out %s %s") % (binary, options, out_file, parameters)

    ## Register where, when and what is going to be executed
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\tTrimming Input MSA\t%s") % (date), file = logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file = sys.stderr)
        sys.exit(exit_codes[label])

    ## Any non-zero exit status is considered a failure
    exit_status = proc.wait()
    if exit_status != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file = sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()

    ## Elapsed time is the difference between both timestamps
    elapsed = (final - start) if start else 0
    total = format_time(elapsed)
    print(("###\tTime\t%s\n###") % (total), file = logFile)
    logFile.flush()

    return True
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace,
                      in_file = None, compare_msa = None,
                      force_refer_msa = None, cds = None):
    '''
    Build and run a trimming command-line for a multiple sequence alignment.

    Optional arguments - when set - are added to the command-line as
    "-backtrans" (cds), "-compareset" (compare_msa), "-forceselect"
    (force_refer_msa) and "-in" (in_file). The command-line, the host name and
    the elapsed time are written to logFile, which also receives the program
    stdout/stderr.

    Returns False - without running anything - when out_file is already
    there and replace is not set; True once the program finished with a zero
    exit status. Any execution problem ends the program with the exit code
    registered for label.
    '''

    ## Keep any previous result unless we are asked to replace it
    already_there = lookForFile(out_file)
    if already_there and not replace:
        return False

    ## Optional arguments, in the order they appear in the command-line. Each
    ## template appends its flag to whatever has been accumulated so far
    optional_args = (
        ("%s -backtrans %s ", cds),
        ("%s -compareset %s ", compare_msa),
        ("%s -forceselect %s ", force_refer_msa),
        ("%s -in %s ", in_file),
    )

    options = ""
    for template, value in optional_args:
        if value:
            options = (template) % (options, value)

    cmd = ("%s %s -out %s %s") % (binary, options, out_file, parameters)

    ## Register where, when and what is going to be executed
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\tTrimming Input MSA\t%s") % (date), file = logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file = sys.stderr)
        sys.exit(exit_codes[label])

    ## Any non-zero exit status is considered a failure
    exit_status = proc.wait()
    if exit_status != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file = sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()

    ## Elapsed time is the difference between both timestamps
    elapsed = (final - start) if start else 0
    total = format_time(elapsed)
    print(("###\tTime\t%s\n###") % (total), file = logFile)
    logFile.flush()

    return True
def alignment(parameters):
    '''
    Run the Multiple Sequence Alignment step of the pipeline.

    The input sequences in parameters["in_file"] are aligned with every
    program listed in parameters["alignment"] - in forward and, when
    parameters["both_direction"] is set, reverse direction. If more than one
    alignment is generated and a "consensus" program is configured, a
    meta-alignment is built. The resulting alignment is converted to phylip
    format and, depending on the configuration, back-translated using the
    "cds" file and/or trimmed using the "trimming" program.

    The function works inside parameters["out_directory"] and gets back to
    the original working directory before returning. parameters["replace"]
    is set to True as soon as any file is generated, so downstream files are
    regenerated as well.

    Returns the same parameters dictionary with "in_file" pointing to the
    final alignment. Configuration problems end the program via sys.exit with
    an error message; exit code 80 is used when there are fewer sequences
    than the required minimum.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will
    ## be generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode: print() writes str objects and
    ## fails on a file opened in binary mode
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, "w")

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters["step"] == 0 \
            else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tMultiple Sequence Alignment\tSTART\t%s"
        + "\n###") % (date), file = logFile)
    logFile.flush()

    ## Get which program/s will be used to align the input sequences. Check
    ## such program/s are listed among the available binaries
    if not "alignment" in parameters:
        sys.exit("ERROR: Check your configuration file. There is no definition for "
            + "the ALIGNMENT step")

    for program in parameters["alignment"]:
        if not program in parameters:
            sys.exit(("ERROR: Selected program '%s' is not available accordding to "
                "the configuration file") % (program))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Evaluate whether input sequences will be aligned following one
    ## direction, forward - left to right - or both directions meaning
    ## forward/reverse
    if isinstance(parameters["both_direction"], str):
        parameters["both_direction"] = \
            parameters["both_direction"].lower() == "true"

    ## In normal cases, we don't really need to define a specific datatype to
    ## build alignments but we need the variable defined to avoid crashes in
    ## the checks below - therefore the default is set before any of them
    if not "residue_datatype" in parameters:
        parameters["residue_datatype"] = ""

    ## Whether the alignment has to be back-translated to codons/nucleotides
    backtranslate = parameters["residue_datatype"] in \
        ["prot2codon", "prot2nuc"]

    ## Check whether if an special mode has been selected - for instance
    ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
    if "cds" in parameters and not backtranslate:
        sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
            + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

    if not "cds" in parameters and backtranslate:
        sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
            + "'prot2nuc', an input CDS file is needed")

    ## Get some information such as number of input sequences and the
    ## presence of selenocysteine/pyrrolysine residues
    numSeqs, selenocys, pyrrolys = \
        check_count_sequences(parameters["in_file"])

    ## Set the minimum number of sequences required to reconstruct an
    ## alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
        min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to "
            + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
            file = logFile)
        sys.exit(80)

    ## Otherwise, process the input sequence, substitute rare amino-acids and
    ## reverse input sequences when neccesary

    ## Reverse input sequences if needed it
    if parameters["both_direction"]:

        ## If get an positive answer means, the reverse sequence file has
        ## been generated and therefore any downstream file should be
        ## over-written
        out_file = ("%s.seqs.reverse") % (oFile)

        if reverseSequences(parameters["readal"], parameters["in_file"],
            out_file, parameters["replace"], logFile):
            parameters["replace"] = True

    ## Substitute rare amino-acids if needed it
    if selenocys or pyrrolys:

        out_file = ("%s.seqs.no_rare_aa") % (oFile)

        ## If the output file has been generated, over-write, if any,
        ## downstream files
        if replaceRareAminoAcids(parameters["in_file"], out_file,
            parameters["replace"], logFile, parameters["in_letter"]):
            parameters["replace"] = True

        ## If there is a reverse file, replace also the rare amino-acids in
        ## that one
        if parameters["both_direction"]:

            in_file = ("%s.seqs.reverse") % (oFile)
            out_file = ("%s.seqs.no_rare_aa.reverse") % (oFile)

            ## Replace any downstream file is the current one is generated
            ## again
            if replaceRareAminoAcids(in_file, out_file,
                parameters["replace"], logFile, parameters["in_letter"]):
                parameters["replace"] = True

    ## Set in which directions alignments will be reconstructed
    directions = ["forward"]
    if parameters["both_direction"]:
        directions.append("reverse")

    generated_alignments = set()
    ## Once all required sequence files has been set-up, proceed to build the
    ## alignments itself.
    for prog in parameters["alignment"]:

        ## Get binary as well as any input parameters for each aligner and
        ## the output file extension
        binary = parameters[prog]

        key = ("%s_params") % (prog)
        params = parameters[key] if key in parameters else ""

        altern_ext = ("%s%s") % (prog[:2], prog[-1])
        extension = file_extension[prog] if prog in file_extension \
            else altern_ext

        ## Generate as many alignments as needed
        for direc in directions:

            ## Set the input file depending on the presence of rare
            ## amino-acids
            if direc == "forward":
                in_file = ("%s.seqs.no_rare_aa") % (oFile) if selenocys \
                    or pyrrolys else parameters["in_file"]
            else:
                in_file = ("%s.seqs.no_rare_aa.reverse") % (oFile) \
                    if selenocys or pyrrolys \
                    else ("%s.seqs.reverse") % (oFile)

            out_file = ("%s.alg.%s%s.%s") % (oFile, "no_rare_aa."
                if selenocys or pyrrolys else "", direc, extension)

            ## Perfom alignment and check whether it has been generated or
            ## already exist
            if perfomAlignment(prog, binary, params, in_file, out_file,
                logFile, parameters["replace"]):
                parameters["replace"] = True

            ## If any Selenocysteine or Pyrrolyseine is present, generate the
            ## final alignment removing the wild cards and putting back the
            ## original amino-acids
            if selenocys or pyrrolys:
                ## Get real output filename
                alt_file = ("%s.alg.%s.%s") % (oFile, direc, extension)

                ## Make the change and record whether files has been
                ## generated de-novo
                if replaceRareAminoAcids(out_file, alt_file,
                    parameters["replace"], logFile, parameters["in_letter"],
                    back = True):
                    parameters["replace"] = True

                ## We over-write out_file variable with the current outfile
                ## name. We will store such output file in case a
                ## meta-alignment has to be generated
                out_file = alt_file

            ## For reverse alignment, get its reverse - meaning get residues
            ## according to the initial order
            if direc == "reverse":
                in_file = ("%s.alg.reverse.%s") % (oFile, extension)
                out_file = ("%s.alg.reverse.forw.%s") % (oFile, extension)

                if reverseSequences(parameters["readal"], in_file, out_file,
                    parameters["replace"], logFile):
                    parameters["replace"] = True

            ## Store all output alignments
            generated_alignments.add(out_file)

    if len(generated_alignments) > 1 and "consensus" in parameters:
        prog = parameters["consensus"][0]
        if not prog in parameters:
            sys.exit(("ERROR: Selected program '%s' is not available accordding to "
                "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and
        ## the output file extension
        binary = parameters[prog]
        prog_params = ("%s_params") % (prog)

        params = parameters[prog_params] if prog_params in parameters else ""
        params = ("%s -aln %s") % (params, " ".join(generated_alignments))

        out_file = ("%s.alg.metalig") % (oFile)
        if perfomAlignment(prog, binary, params, parameters["in_file"],
            out_file, logFile, parameters["replace"]):
            parameters["replace"] = True

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"], out_file,
            out_file, "phylip", logFile, parameters["replace"])

    ## Set the current output alignment as the one generated at a previous
    ## step. Note the selected alignment is removed from the set, which
    ## affects the "more than one alignment" check done for the trimming
    else:
        out_file = generated_alignments.pop()

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"], out_file,
            out_file, "phylip", logFile, parameters["replace"])

    ## Either we have to trim the final alignment or we have to backtranslate
    ## to codons/nucleotides, we will need to check for a program - hopefully
    ## trimAl - to make the job
    if backtranslate or "trimming" in parameters:

        ## Back-translation relies on the trimming program as well - report a
        ## configuration problem rather than failing on a missing key
        if not "trimming" in parameters:
            sys.exit("ERROR: Check your configuration file. There is no "
                + "definition for the TRIMMING step")

        prog = parameters["trimming"][0]
        if not prog in parameters:
            sys.exit(("ERROR: Selected program '%s' is not available accordding to "
                "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and
        ## the output file extension
        binary = parameters[prog]

    ## If the modes "prot2codon" or "prot2nuc" are selected - backtranslated
    ## the untrimmed/final alignment
    if backtranslate:

        prog_params = ("%s_cds") % (prog)
        params = parameters[prog_params] if prog_params in parameters else ""

        if (trimmingAlignment(prog, binary, params, out_file + "_cds",
            logFile, parameters["replace"], in_file = out_file,
            cds = parameters["cds"])):
            parameters["replace"] = True

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"],
            out_file + "_cds", out_file + "_cds", "phylip", logFile,
            parameters["replace"])

    ## If set, trim resulting alignment
    if "trimming" in parameters:
        prog = parameters["trimming"][0]
        if not prog in parameters:
            sys.exit(("ERROR: Selected program '%s' is not available accordding to "
                "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and
        ## the output file extension
        prog_params = ("%s_params") % (prog)
        params = parameters[prog_params] if prog_params in parameters else ""

        clean_file = ("%s.alg.clean") % (oFile)

        prog_params = ("%s_compare") % (prog)
        if len(generated_alignments) > 1:
            if prog_params in parameters:
                params = ("%s %s") % (params, parameters[prog_params])

            ## The list of alignments must be completely written - and the
            ## file closed - before the trimming program reads it
            path_file = ("%s.alg.paths") % (oFile)
            with open(path_file, "w") as paths_handle:
                print("\n".join(generated_alignments), file = paths_handle)

            trimmingAlignment(prog, binary, params, clean_file, logFile,
                parameters["replace"], compare_msa = path_file,
                force_refer_msa = out_file)

            ## If the backtranslation to codon/nucleotides is required, do it
            if backtranslate:
                prog_params = ("%s_cds") % (prog)
                if prog_params in parameters:
                    params = ("%s %s") % (params, parameters[prog_params])

                trimmingAlignment(prog, binary, params, clean_file + "_cds",
                    logFile, parameters["replace"], compare_msa = path_file,
                    force_refer_msa = out_file, cds = parameters["cds"])

        else:
            trimmingAlignment(prog, binary, params, clean_file, logFile,
                parameters["replace"], in_file = out_file)

            ## If the backtranslation to codon/nucleotides is required, do it
            if backtranslate:
                prog_params = ("%s_cds") % (prog)
                if prog_params in parameters:
                    params = ("%s %s") % (params, parameters[prog_params])

                trimmingAlignment(prog, binary, params, clean_file + "_cds",
                    logFile, parameters["replace"], in_file = out_file,
                    cds = parameters["cds"])

        ## After the trimming, set the final output file as the trimmed file
        out_file = clean_file + ("_cds" if backtranslate else "")

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tMultipple Sequence Alignment\tEND\t"
        + "%s") % (date), file = logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tMultiple Sequence Alignment\t%s"
        + "\n###") % (total), file = logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
                % (oFile), file = sys.stderr)

    ## Release the handle on the null device - stderr is left untouched
    elif logFile is not sys.stderr:
        logFile.close()

    ## Update the input file parameter and return the dictionary containing
    ## all parameters. Those parameters may be used in other steps
    parameters["in_file"] = out_file

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile,
                    replace):
    '''
    Format and execute the command-line of the multiple sequence alignment
    program identified by label. Programs without an specific command-line
    format end the execution with the generic exit code.

    The command-line, the host name and the elapsed time are written to
    logFile, which also receives the program stdout/stderr.

    Returns False - without running anything - when out_file is already
    there and replace is not set; True when the program finished with a zero
    exit status and the output passed the checkAlignment control. Any other
    situation ends the program with the exit code registered for label.
    '''

    ## Keep any previous result unless we are asked to replace it
    if lookForFile(out_file) and not replace:
        return False

    ## Each program has its own way of receiving input/output files
    if label in ("muscle", "kalign"):
        cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file,
            out_file)

    elif label == "clustalw":
        cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file,
            out_file)

    elif label == "clustal_omega":
        cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file,
            out_file)

    ## MAFFT writes the alignment to its standard output
    elif label == "mafft":
        cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file)

    elif label == "prank":
        cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file)

    ## Starting for newer DiAlign-TX versions
    elif label == "dialign_tx":
        cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file)

    ## On t-coffee case, we need to set-up some ENV variables to be able to
    ## run smoothly the program
    elif label in ("t_coffee", "m_coffee"):
        sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell = True)
        user_folder = ("/tmp/tcoffee/%s") % (getuser())
        sp.call(("mkdir -p -m0777 %s") % (user_folder), shell = True)
        os.putenv("LOCKDIR_4_TCOFFEE", user_folder)
        os.putenv("TMP_4_TCOFFEE", user_folder)
        cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters,
            out_file)

    ## In any other case, finish with a generic error
    else:
        sys.exit(exit_codes["generic"])

    ## Register where, when and what is going to be executed
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date),
        file = logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file = logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file = sys.stderr)
        sys.exit(exit_codes[label])

    exit_status = proc.wait()
    if exit_status != 0:
        print(("ERROR: Execution failed: %s [exit code != -1]") \
            % (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()

    ## Elapsed time is the difference between both timestamps
    elapsed = (final - start) if start else 0
    total = format_time(elapsed)
    print(("###\tTime\t%s\n###") % (total), file = logFile)
    logFile.flush()

    ## If we are working with PRANK, move output file - which should have a
    ## suffix depending on the output format
    if label == "prank":
        if "-f=" not in parameters:
            suffix = "fas"
        elif "-f=nexus" in parameters:
            suffix = "nex"
        else:
            suffix = "phy"

        if lookForFile(out_file + ".best." + suffix):
            sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file),
                shell = True)

    ## If any mode of t_coffee is used: t_coffee or m_coffee, we should
    ## remove the guide tree generate during the program execution. The guide
    ## tree is named after the input file name without its last extension
    if label in ("t_coffee", "m_coffee"):
        name_fields = os.path.basename(in_file).split(".")
        guide_tree = ".".join(name_fields[:-1])
        sp.call(("rm -f %s.dnd") % (guide_tree), shell = True)

    ## Check whether the output alignment has been already generated.
    ## In case something goes wrong, finish the current execution
    if not checkAlignment(in_file, out_file):
        print(("ERROR: Check input '%s' and output '%s' alignments") \
            % (in_file, out_file), file = sys.stderr)
        print(("ERROR: Execution failed: %s [file check]") \
            % (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    return True
## Reconstruct the Multiple Sequence Alignment for the selected sequences parameters.update(alignment(parameters)) ## Assign which step is being executed. It is useful to know whether the log ## file should be replaced or not - even when the flag "replace" is set parameters["step"] = 2 ## Reconstruct the Multiple Sequence Alignment for the input Sequences phylogenetic_trees(parameters) ## Get final time final = datetime.datetime.now() ## We return a DELTA object comparing both timestamps steps = "', '".join(args.steps) total = format_time(final - start if start else 0) ## Dump into stderr - when requested all verbose info or just stderr if parameters["verbose"] > 0: print(("\n###\tTOTAL Time\t[ '%s' ]\t%s\n###") % (steps, total), file = \ sys.stderr) ## Dump into logfile - when requested all verbose info or just logfile if parameters["verbose"] == 1: ## Get output folder/generic filename - Set output filename and log file oFile = os.path.join(parameters["out_directory"], parameters["prefix"]) logFile = open(oFile + ".log", "a+") print(("\n###\tTOTAL Time\t[ '%s' ]\t%s\n###") % (steps, total), file = \ logFile) logFile.close()
def phylogenetic_trees(parameters):
    '''
    Reconstruct phylogenetic trees according to the input parameters.

    For every approach in parameters["tree_approach"] - "nj" first, then "ml"
    and then any other one sorted by name - a tree is built with the program
    in parameters["tree"] for each selected evolutionary model. Models are
    ranked by their likelihood, the ranking is stored in a
    "<prefix>.tree.<program>.rank.<approach>" file and the best
    parameters["numb_models"] models are used for the next approach.

    The function works inside parameters["out_directory"] and gets back to
    the original working directory before returning. parameters["in_file"]
    and parameters["replace"] may be updated when the input alignment has to
    be converted before building the trees.

    Returns the same parameters dictionary. Configuration problems end the
    program via sys.exit with an error message; exit code 80 is used when
    there are fewer sequences than the required minimum.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will
    ## be generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode: print() writes str objects and
    ## fails on a file opened in binary mode
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, "w")

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters["step"] == 0 \
            else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t"
        + "%s\n###") % (date), file = logFile)
    logFile.flush()

    ## Get which program will be used to reconstruct phylogenetic trees.
    ## Check such program is listed among the available binaries
    if not "tree" in parameters:
        sys.exit("ERROR: Check your configuration file. There is no definition for "
            + "the Phylogenetic TREE reconstruction step")

    prog = parameters["tree"][0]
    if not prog in parameters:
        sys.exit(("ERROR: Selected program '%s' is not available accordding to the "
            "the configuration file") % (prog))

    ## Get binary as well as any default parameters for the selected program
    binary = parameters[prog]
    key = ("%s_params") % (prog)
    progr_params = parameters[key] if key in parameters else ""

    if not "evol_models" in parameters:
        sys.exit("ERROR: Check your configuration file. There is no definition for "
            + "the <evol_models> parameter")

    ## If the evolutionary model list is not appropiately formated, do it
    if isinstance(parameters["evol_models"], str):
        parameters["evol_models"] = list(
            map(strip, parameters["evol_models"].split()))

    ## Check if <numb_models> parameter is defined and how many models are
    ## requested to be evaluated. The value is handled as a string so a
    ## number already converted - e.g. by a previous call - is accepted too
    if not "numb_models" in parameters or \
        str(parameters["numb_models"]).lower() == "all":
        parameters["numb_models"] = len(parameters["evol_models"])
    parameters["numb_models"] = int(parameters["numb_models"])

    if not parameters["numb_models"] in range(
        1, len(parameters["evol_models"]) + 1):
        sys.exit(("ERROR: Check how many evolutionary models has been asked to re"
            + "construct '%d'") % (parameters["numb_models"]))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Create a temporary FASTA file which will be used to detect the
    ## sequence number on the input alignment and the presence of rare
    ## amino-acids
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
        parameters["in_file"], TEMPFILE.name, "fasta", logFile,
        parameters["replace"])
    TEMPFILE.flush()

    numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

    ## Set the minimum number of sequences required to reconstruct an
    ## alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
        min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to "
            + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
            file = logFile)
        sys.exit(80)

    ## Check which approaches should be used for the phylogenetic
    ## reconstruction and whether there are specific program's parameters for
    ## them
    if not "tree_approach" in parameters:
        parameters["tree_approach"] = ["ml"]

    ## Remove potential duplicates and lowercase all approaches for the tree
    ## reconstruction
    parameters["tree_approach"] = set([p.lower() for p in \
        parameters["tree_approach"]])

    ## We will first look for Neighbour Joining tree reconstruction, then for
    ## Maximum likelihood and then for any other approach defined in the
    ## config file
    tree_approaches = []
    if "nj" in parameters["tree_approach"]:
        tree_approaches.append("nj")
    if "ml" in parameters["tree_approach"]:
        tree_approaches.append("ml")
    others = parameters["tree_approach"] - set(["nj", "ml"])
    if others != set():
        tree_approaches += sorted(others)

    ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines
    ## are present in the input alignment
    if prog in ["raxml"]:
        ## If Selenocysteines or Pyrrolysines are present, substitute them by
        ## "X"
        if selenocys or pyrrolys:
            out_file = ("%s.no_rare_aa") % (parameters["in_file"])

            if replaceRareAminoAcids(TEMPFILE.name, out_file,
                parameters["replace"], logFile, "U:X O:X"):
                parameters["replace"] = True
            parameters["in_file"] = out_file
    TEMPFILE.close()

    ## When using FastTree force the conversion of input alignment to FASTA
    ## format since it may crash reading standard interleave PHYLIP format
    ## files
    if prog in ["fasttree"]:

        in_file_format, aligned = getFileFormat("readal",
            parameters["readal"], parameters["in_file"], logFile)

        if in_file_format != "fasta":
            out_file = ("%s.fa") % (parameters["in_file"])
            if (convertInputFile_Format("readal", parameters["readal"],
                parameters["in_file"], out_file, "fasta", logFile,
                parameters["replace"])):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    ## Local flag: it records whether any tree has been generated de-novo and
    ## it only affects the ranking files - parameters["replace"] is not
    ## modified by the tree reconstruction itself
    replace = parameters["replace"]
    selected_models = parameters["evol_models"]

    ## Reconstruct trees for each approach considering evolutionary models
    ## order according their likelihood values
    for approach in tree_approaches:

        ## Save results - we will use such data for selecting the best -if
        ## required- models fitting to the input data
        results = {}

        ## Format the choosen program's parameters according to the default
        ## ones and the specific ones for the current approach
        params = ("%s ") % (progr_params)
        params += parameters[approach] if approach in parameters else ""

        for model in selected_models:
            out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach,
                model)
            stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach,
                model)

            if prog in ["phyml"]:
                exec_params = ("%s -m %s") % (params, model)

            ## Get additional model -if any- for codons
            elif prog in ["codonphyml"]:
                exec_params = ("%s -m %s") % (params, model)

                add_model = [p.split()[1] for p in map(strip,
                    exec_params.split("-")) if p.startswith("fmodel")]

                if len(add_model) == 1:
                    add_model = add_model.pop()
                    model = ("%s_%s") % (model, add_model)
                    out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog,
                        approach, model)
                    stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog,
                        approach, model)

            elif prog in ["fasttree"]:
                ## On FastTree is selected by default JTT model for AAs - so
                ## we don't set-up that model
                exec_params = ("%s -%s") % (params, model) \
                    if model.lower() != "jtt" and model.lower() != "jc" \
                    else params
                model = model.upper()

            ## In the case of RAxML, we would concatenate the model to an
            ## specific input parameter
            elif prog in ["raxml"]:
                final_model = model
                ## It is possible to add some suffixes to the evolutionary
                ## models in RAxML - There is not better/easy way to code
                ## this option
                if "raxml_model_suffix" in parameters:
                    final_model += parameters["raxml_model_suffix"]
                exec_params = " ".join([
                    ("-%s%s") % (p, final_model if p.startswith("m ") else "")
                    for p in map(strip, params.split("-")) if p
                ])

            ## Build the phylogenetic tree using any of the available methods
            ## and register if any downstream file should be redone.
            if perform_tree(prog, binary, exec_params, parameters["in_file"],
                out_file, stats_file, logFile, parameters["replace"]):
                replace = True

            ## Get the likelihood for each of the reconstructed models
            log_lk = get_likelihood(prog, stats_file)

            if not log_lk:
                print(("ERROR: Impossible to the Log likelihood values "
                    + "for '%s' model using this program '%s'")
                    % (model, prog), file = sys.stderr)
                sys.exit(exit_codes[prog])

            results.setdefault(model, log_lk)

        ## Get the models sorted by their likelihood values
        records = sorted(iter(results.items()), key = itemgetter(1),
            reverse = True)

        ## Set the filename which stores the ranking
        rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

        update = False
        ## Check the content of the rankings file - if any.
        ## Marked the file as updatable if there is any discrepancy
        if not replace and lookForFile(rank_file):

            ## Plain "r" mode already handles any kind of line ending - the
            ## "U" flag is not accepted by recent python versions
            with open(rank_file, "r") as rank_handle:
                old_content = "\n".join([
                    "\t".join(list(map(strip, line.split("\t"))))
                    for line in rank_handle
                ])

            newly_generated = "\n".join([("%s\t%s") % (r[0], r[1])
                for r in records])

            ## Decide whether ranking file should be updated after comparing
            ## current content with newly generated content
            update = old_content != newly_generated

        ## If the file containing the ranking doesn't exist, generate it.
        ## Update the file content if the replace flag is set to true or the
        ## content has changed - since the phylogenetic tree reconstruction
        ## step is the most expensive one - in terms of time/memory
        ## consumption - we are not setting replace flag to True even when
        ## this file is generated/updated. On this way, we can take adventage
        ## of any tree generated in any downstream step.
        if not lookForFile(rank_file) or replace or update:
            with open(rank_file, "w") as rank_handle:
                print("\n".join([("%s\t%s") % (r[0], r[1]) for r in records]),
                    file = rank_handle)

        ## Select a given number of models for the next iteration - if any
        selected_models = [
            pair[0] for pair in records[:parameters["numb_models"]]
        ]

        ## Remove the Codon Frequency model from potential new iterations
        if prog in ["codonphyml"] and add_model:
            selected_models = [
                m.replace("_" + add_model, "") for m in selected_models
                if m.endswith(add_model)
            ]

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t"
        + "%s") % (date), file = logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s"
        + "\n###") % (total), file = logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
                % (oFile), file = sys.stderr)

    ## Release the handle on the null device - stderr is left untouched
    elif logFile is not sys.stderr:
        logFile.close()

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):
  '''
  Format the command-line of the selected phylogenetic tree reconstruction
  program, execute it and rename its output files to the pipeline scheme.

  Arguments:
    label       program identifier. Supported values are "phyml", "codonphyml",
                "fasttree" and "raxml"; any other value ends the run with the
                generic exit code.
    binary      executable placed at the beginning of the command-line.
    parameters  string holding additional command-line arguments.
    in_file     input alignment handed over to the program.
    out_file    final filename for the reconstructed tree.
    stats_file  final filename for the program statistics.
    logFile     open file object which receives the log lines as well as the
                stdout/stderr of the executed program.
    replace     when it is False and lookForFile(out_file) succeeds, nothing is
                executed.

  Returns False when the output file is already there and it must not be
  replaced, otherwise True after the program has been executed. The function
  calls sys.exit() if the program cannot be launched or if it finishes with a
  non-zero exit status.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  if label in ["phyml", "codonphyml"]:
    cmd = ("%s -i %s %s") % (binary, in_file, parameters)

  elif label in ["fasttree"]:
    cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file,
      out_file, in_file)

  elif label in ["raxml"]:
    ## The random seed is also used to tag RAxML output files. That tag is used
    ## below to locate and rename them
    random_seed = randint(1, 10000)
    suffix = ("%s_%d") % (label, random_seed)

    cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file,
      parameters)

  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ',
    file = logFile)
  print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd),
    file = logFile)
  logFile.flush()

  try:
    ## stdin is a pipe because some answers are sent to the program just after
    ## launching it - see communicate() call below
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile,
      stdin = sp.PIPE)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Feed the program's standard input and wait until it finishes
  proc.communicate(b'\n\nY\n')

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file = logFile)
  logFile.flush()

  ## Process program's output and rename output files according to our own
  ## scheme
  if label in ["phyml", "codonphyml"]:

    ## Since resulting tree/stats file have slightly changed between version,
    ## we have to control for that.
    tree_file = ("%s_%s_tree.txt") % (in_file, label)
    sts_file = ("%s_%s_stats.txt") % (in_file, label)
    if not lookForFile(tree_file, attempts = 2):
      tree_file = ("%s_%s_tree") % (in_file, label)
      sts_file = ("%s_%s_stats") % (in_file, label)

    try:
      sp.call(("mv %s %s") % (tree_file, out_file), shell = True)
      sp.call(("mv %s %s") % (sts_file, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename '%s' output files")
        % (label.upper()), file = sys.stderr)
      sys.exit(exit_codes[label])

  elif label in ["raxml"]:
    try:
      sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file), shell = True)
      sp.call(("mv RAxML_info.%s %s") % (suffix, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename RAxML output files"),
        file = sys.stderr)
      sys.exit(exit_codes[label])

    ## Append to the stats file the content of every file returned by
    ## listDirectory() for the current suffix. Each file is removed once it has
    ## been copied. Context managers guarantee handles are closed on errors
    with open(stats_file, "a+") as oFile:
      for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
        fileName = os.path.split(oth_file)[1]
        hz_line = "#" * (len(fileName) + 4)

        print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file = oFile)
        ## Plain "r" mode is used: the "U" flag is rejected by open() since
        ## Python 3.11 and universal newlines are the default in text mode
        with open(oth_file, "r") as iFile:
          print(("%s") % (iFile.read()), file = oFile)
        sp.call(("rm -f %s") % (oth_file), shell = True)

  return True
def alignment(parameters):
  '''
  Run the Multiple Sequence Alignment step of the pipeline.

  The function works inside parameters["out_directory"] and goes back to the
  original working directory before returning. Depending on the configuration
  it reverses the input sequences, masks rare amino-acids, runs every program
  listed in parameters["alignment"], builds a consensus alignment, converts the
  result to phylip format and trims and/or back-translates it.

  Arguments:
    parameters  dictionary with the pipeline configuration. Among others, these
                keys are read: "out_directory", "prefix", "verbose", "replace",
                "step", "alignment", "readal", "both_direction", "cds",
                "residue_datatype", "in_file", "in_letter", "min_seqs",
                "consensus" and "trimming".

  Returns the same dictionary. "in_file" points to the final alignment,
  "replace" is set to True as soon as any file is generated, "both_direction"
  is normalized to a boolean and "residue_datatype" is always defined.

  The function calls sys.exit() when the configuration is not valid, when there
  are not enough sequences (exit code 80) or when any called helper does it.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## Text mode is mandatory: print() writes str objects, which a file opened
    ## in binary mode rejects with a TypeError
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tMultiple Sequence Alignment\tSTART\t%s" +
    "\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which program/s will be used to align the input sequences. Check such
  ## program/s are listed among the available binaries
  if not "alignment" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the ALIGNMENT step")

  for program in parameters["alignment"]:
    if not program in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (program))

  ## Check whether "readAl" is available or not. It is useful for sequences
  ## manipulation independently of the input format.
  if not "readal" in parameters:
    sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

  ## Evaluate whether input sequences will be aligned following one direction,
  ## forward - left to right - or both directions meaning forward/reverse
  if isinstance(parameters["both_direction"], str):
    parameters["both_direction"] = \
      parameters["both_direction"].lower() == "true"

  ## In normal cases, we don't really need to define a specific datatype to
  ## build alignments but we need the variable defined to avoid crashes in the
  ## checks below - that is why the default is set before them
  if not "residue_datatype" in parameters:
    parameters["residue_datatype"] = ""

  backtranslate = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  if "cds" in parameters and not backtranslate:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and backtranslate:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## Get some information such as number of input sequences and the presence of
  ## selenocysteine/pyrrolysine residues
  numSeqs, selenocys, pyrrolys = check_count_sequences(parameters["in_file"])
  rare_aa = selenocys or pyrrolys

  ## Set the minimum number of sequences required to reconstruct an alignment
  min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else
    min_seqs_analysis)

  ## Finish when there are not enough sequences to make an alignment
  if numSeqs < min_seqs:
    print(("### INFO: It is necessary, at least, %d sequences to " +
      "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file = logFile)
    sys.exit(80)

  ## Otherwise, process the input sequence, substitute rare amino-acids and
  ## reverse input sequences when neccesary

  ## Reverse input sequences if needed it
  if parameters["both_direction"]:

    ## If get an positive answer means, the reverse sequence file has been
    ## generated and therefore any downstream file should be over-written
    out_file = ("%s.seqs.reverse") % (oFile)

    if reverseSequences(parameters["readal"], parameters["in_file"],
      out_file, parameters["replace"], logFile):
      parameters["replace"] = True

  ## Substitute rare amino-acids if needed it
  if rare_aa:
    out_file = ("%s.seqs.no_rare_aa") % (oFile)

    ## If the output file has been generated, over-write, if any, downstream
    ## files
    if replaceRareAminoAcids(parameters["in_file"], out_file,
      parameters["replace"], logFile, parameters["in_letter"]):
      parameters["replace"] = True

    ## If there is a reverse file, replace also the rare amino-acids in that one
    if parameters["both_direction"]:
      in_file = ("%s.seqs.reverse") % (oFile)
      out_file = ("%s.seqs.no_rare_aa.reverse") % (oFile)

      ## Replace any downstream file is the current one is generated again
      if replaceRareAminoAcids(in_file, out_file, parameters["replace"],
        logFile, parameters["in_letter"]):
        parameters["replace"] = True

  ## Set in which directions alignments will be reconstructed
  directions = ["forward"]
  if parameters["both_direction"]:
    directions.append("reverse")

  generated_alignments = set()
  ## Once all required sequence files has been set-up, proceed to build the
  ## alignments itself.
  for prog in parameters["alignment"]:

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]

    key = ("%s_params") % (prog)
    params = parameters[key] if key in parameters else ""

    ## When there is no registered extension, build one using the two first
    ## letters and the last one of the program name
    altern_ext = ("%s%s") % (prog[:2], prog[-1])
    extension = file_extension[prog] if prog in file_extension else altern_ext

    ## Generate as many alignments as needed
    for direc in directions:

      ## Set the input file depending on the presence of rare amino-acids
      if direc == "forward":
        in_file = ("%s.seqs.no_rare_aa") % (oFile) if rare_aa \
          else parameters["in_file"]
      else:
        in_file = ("%s.seqs.no_rare_aa.reverse") % (oFile) if rare_aa \
          else ("%s.seqs.reverse") % (oFile)

      out_file = ("%s.alg.%s%s.%s") % (oFile, "no_rare_aa." if rare_aa
        else "", direc, extension)

      ## Perfom alignment and check whether it has been generated or already
      ## exist
      if perfomAlignment(prog, binary, params, in_file, out_file,
        logFile, parameters["replace"]):
        parameters["replace"] = True

      ## If any Selenocysteine or Pyrrolyseine is present, generate the final
      ## alignment removing the wild cards and putting back the original amino-
      ## acids
      if rare_aa:
        ## Get real output filename
        alt_file = ("%s.alg.%s.%s") % (oFile, direc, extension)

        ## Make the change and record whether files has been generated de-novo
        if replaceRareAminoAcids(out_file, alt_file, parameters["replace"],
          logFile, parameters["in_letter"], back = True):
          parameters["replace"] = True

        ## We over-write out_file variable with the current outfile name.We will
        ## store such output file in case a meta-alignment has to be generated
        out_file = alt_file

      ## For reverse alignment, get its reverse - meaning get residues according
      ## to the initial order
      if direc == "reverse":
        in_file = ("%s.alg.reverse.%s") % (oFile, extension)
        out_file = ("%s.alg.reverse.forw.%s") % (oFile, extension)

        if reverseSequences(parameters["readal"], in_file, out_file,
          parameters["replace"], logFile):
          parameters["replace"] = True

      ## Store all output alignments
      generated_alignments.add(out_file)

  if len(generated_alignments) > 1 and "consensus" in parameters:
    prog = parameters["consensus"][0]
    if not prog in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (prog))

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]
    prog_params = ("%s_params") % (prog)

    params = parameters[prog_params] if prog_params in parameters else ""
    params = ("%s -aln %s") % (params, " ".join(generated_alignments))

    out_file = ("%s.alg.metalig") % (oFile)
    if perfomAlignment(prog, binary, params, parameters["in_file"], out_file,
      logFile, parameters["replace"]):
      parameters["replace"] = True

    ## Make such untrimmed alignment it is in phylip format
    convertInputFile_Format("readal", parameters["readal"], out_file, out_file,
      "phylip", logFile, parameters["replace"])

  ## Set the current output alignment as the one generated at a previous step
  else:
    ## NOTE(review): pop() removes one arbitrary element from the set, so the
    ## size check made at the trimming stage sees one alignment less
    out_file = generated_alignments.pop()

    ## Make such untrimmed alignment it is in phylip format
    convertInputFile_Format("readal", parameters["readal"], out_file, out_file,
      "phylip", logFile, parameters["replace"])

  ## Either we have to trim the final alignment or we have to backtranslate to
  ## codons/nucleotides, we will need to check for a program - hopefully
  ## trimAl - to make the job
  if backtranslate or "trimming" in parameters:

    ## The back-translation is made by the trimming program. Finish with a
    ## clear message instead of a KeyError when it has not been configured
    if not "trimming" in parameters:
      sys.exit("ERROR: Check your configuration file. There is no definition "
        + "for the TRIMMING step, which is needed to back-translate the "
        + "alignment")

    prog = parameters["trimming"][0]
    if not prog in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (prog))

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]

    ## If the modes "prot2codon" or "prot2nuc" are selected - backtranslated the
    ## untrimmed/final alignment
    if backtranslate:

      prog_params = ("%s_cds") % (prog)
      params = parameters[prog_params] if prog_params in parameters else ""

      if trimmingAlignment(prog, binary, params, out_file + "_cds", logFile,
        parameters["replace"], in_file = out_file, cds = parameters["cds"]):
        parameters["replace"] = True

      ## Make such untrimmed alignment it is in phylip format
      convertInputFile_Format("readal", parameters["readal"],
        out_file + "_cds", out_file + "_cds", "phylip", logFile,
        parameters["replace"])

    ## If set, trim resulting alignment - at this point "trimming" is always
    ## among the parameters
    prog_params = ("%s_params") % (prog)
    params = parameters[prog_params] if prog_params in parameters else ""

    clean_file = ("%s.alg.clean") % (oFile)

    prog_params = ("%s_compare") % (prog)
    if len(generated_alignments) > 1:
      if prog_params in parameters:
        params = ("%s %s") % (params, parameters[prog_params])

      ## The file is closed - and therefore flushed - before the trimming
      ## program reads it
      path_file = ("%s.alg.paths") % (oFile)
      with open(path_file, "w") as paths:
        print("\n".join(generated_alignments), file = paths)

      trimmingAlignment(prog, binary, params, clean_file, logFile,
        parameters["replace"], compare_msa = path_file,
        force_refer_msa = out_file)

      ## If the backtranslation to codon/nucleotides is required, do it
      if backtranslate:
        prog_params = ("%s_cds") % (prog)
        if prog_params in parameters:
          params = ("%s %s") % (params, parameters[prog_params])
        trimmingAlignment(prog, binary, params, clean_file + "_cds", logFile,
          parameters["replace"], compare_msa = path_file,
          force_refer_msa = out_file, cds = parameters["cds"])

    else:
      trimmingAlignment(prog, binary, params, clean_file, logFile,
        parameters["replace"], in_file = out_file)

      ## If the backtranslation to codon/nucleotides is required, do it
      if backtranslate:
        prog_params = ("%s_cds") % (prog)
        if prog_params in parameters:
          params = ("%s %s") % (params, parameters[prog_params])
        trimmingAlignment(prog, binary, params, clean_file + "_cds", logFile,
          parameters["replace"], in_file = out_file, cds = parameters["cds"])

    ## After the trimming, set the final output file as the trimmed file
    out_file = clean_file + ("_cds" if backtranslate else "")

  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tMultipple Sequence Alignment\tEND\t" +
    "%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tMultiple Sequence Alignment\t%s" +
    "\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file")
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = out_file

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
  '''
  Build and execute the command-line of a multiple sequence alignment program.

  Supported labels are "muscle", "kalign", "clustalw", "clustal_omega", "mafft",
  "prank", "dialign_tx", "t_coffee" and "m_coffee". Any other label finishes
  the execution with the generic exit code.

  Returns False when the output file is already there and it must not be
  replaced, otherwise True after the alignment has been generated and checked.
  The function calls sys.exit() if the program cannot be launched, if it ends
  with a non-zero exit status or if checkAlignment() rejects the output.
  '''

  ## Nothing to do when the output is already available and it should be kept
  if not replace and lookForFile(out_file):
    return False

  ## Command-line templates for those programs getting the arguments in this
  ## order: binary, parameters, input file and output file
  templates = {
    "muscle": "%s %s -in %s -out %s",
    "kalign": "%s %s -in %s -out %s",
    "clustalw": "%s %s -INFILE=%s -OUTFILE=%s",
    "clustal_omega": "%s %s --in %s --out %s",
    "mafft": "%s %s %s > %s",
    "prank": "%s %s -d=%s -o=%s",
    ## Starting for newer DiAlign-TX versions
    "dialign_tx": "%s %s %s %s",
  }
  coffee_modes = ("t_coffee", "m_coffee")

  if label in templates:
    cmd = templates[label] % (binary, parameters, in_file, out_file)

  ## On t-coffee case, some ENV variables are set-up before running the program
  elif label in coffee_modes:
    sp.call("mkdir -p -m0777 /tmp/tcoffee", shell = True)
    user_folder = "/tmp/tcoffee/%s" % (getuser())
    sp.call("mkdir -p -m0777 %s" % (user_folder), shell = True)
    for variable in ("LOCKDIR_4_TCOFFEE", "TMP_4_TCOFFEE"):
      os.putenv(variable, user_folder)
    cmd = "%s %s %s -outfile %s" % (binary, in_file, parameters, out_file)

  ## In any other case, finish with a generic error
  else:
    sys.exit(exit_codes["generic"])

  ## Write down when and where the command-line is executed
  host = getfqdn()
  start = datetime.datetime.now()
  stamp = start.strftime("%H:%M:%S %m/%d/%y")

  header = "###\n###\t%s - Alignment\t%s" % (label.upper(), stamp)
  print(header, file = logFile)
  print("###\t[%s]\tCommand-line\t%s\n###" % (host, cmd), file = logFile)
  logFile.flush()

  try:
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as error:
    print("ERROR: Execution failed: " + str(error), file = sys.stderr)
    sys.exit(exit_codes[label])

  exit_status = proc.wait()
  if exit_status != 0:
    failure = "ERROR: Execution failed: %s [exit code != -1]" % (label.upper())
    print(failure, file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## Time difference between both timestamps
  elapsed = final - start if start else 0
  print("###\tTime\t%s\n###" % (format_time(elapsed)), file = logFile)
  logFile.flush()

  ## PRANK adds a suffix to the output file depending on the output format.
  ## Move that file to the expected output filename
  if label == "prank":
    if "-f=" not in parameters:
      suffix = "fas"
    elif "-f=nexus" in parameters:
      suffix = "nex"
    else:
      suffix = "phy"

    if lookForFile(out_file + ".best." + suffix):
      sp.call("mv %s.best.%s %s" % (out_file, suffix, out_file), shell = True)

  ## For any t_coffee mode, remove the guide tree generated during the program
  ## execution. It is named after the input file without its last extension
  if label in coffee_modes:
    pieces = os.path.split(in_file)[1].split(".")
    guide_tree = ".".join(pieces[:-1])
    sp.call("rm -f %s.dnd" % (guide_tree), shell = True)

  ## Finish the current execution when the output alignment does not pass the
  ## check against the input file
  if not checkAlignment(in_file, out_file):
    print("ERROR: Check input '%s' and output '%s' alignments" % (in_file,
      out_file), file = sys.stderr)
    print("ERROR: Execution failed: %s [file check]" % (label.upper()),
      file = sys.stderr)
    sys.exit(exit_codes[label])

  return True
def homology(parameters):
  '''
  Run the homology search step of the pipeline.

  The function works inside parameters["out_directory"] and goes back to the
  original working directory before returning. It runs the configured search
  tool, filters its results and writes three files using the output prefix:
  ".seqs.md5", ".seqs" and - for "prot2codon"/"prot2nuc" modes - ".seqs_cds".

  NOTE(review): this file contains two textually equivalent definitions of
  homology(); Python keeps the last one bound.

  Arguments:
    parameters  dictionary with the pipeline configuration. Among others, these
                keys are read: "out_directory", "prefix", "verbose", "replace",
                "step", "homology", "residue_datatype", "cds" and "db_file".

  Returns the same dictionary. "in_file" points to the ".seqs" file, "replace"
  is set to True as soon as any file is generated and, for "prot2codon" and
  "prot2nuc" modes, "cds" points to the ".seqs_cds" file.

  The function calls sys.exit() when the configuration is not valid, when no
  homologous sequence is found (exit code 80) or when any selected sequence is
  missing in the input CDS file.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## Text mode is mandatory: print() writes str objects, which a file opened
    ## in binary mode rejects with a TypeError
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such tool
  ## is listed among the available binaries
  if not "homology" in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  tool = parameters["homology"][0]
  if not tool in parameters:
    ## The tool name is now interpolated - the message used to be printed with
    ## a literal '%s'
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not among"
      + " available methods") % (tool))

  backtranslate = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  if "cds" in parameters and not backtranslate:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and backtranslate:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted.
  if tool in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n:nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist check whether input database has been
      ## split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has " +
            "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## If the homology search step should be perfomed using BLAST, call the
    ## appropiate function
    blast(parameters, logFile)
    tag = "blast"

  elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Check whether the output file contains any result. Blank lines and those
  ## starting by "#" are not counted
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  ## Plain "r" mode is used: the "U" flag is rejected by open() since Python
  ## 3.11 and universal newlines are the default in text mode
  with open(inFile, "r") as search_output:
    for line in search_output:
      if not line.strip() or line.startswith("#"):
        continue
      homologs += 1

  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") %
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    ## NOTE(review): assumes md5 is hashlib.md5, which only accepts bytes in
    ## Python 3 - confirm against the imports of this file
    identifiers = "".join(sorted(selected_sequences.keys()))
    seqs_md5 = md5(identifiers.encode("utf-8")).hexdigest()
    with open(outFile, "w") as output_file:
      print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = output_file)

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters.
  outFile = ("%s.seqs") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    with open(outFile, "w") as output_file:
      for seqId in sorted(selected_sequences):
        print((">%s\n%s") % (seqId, selected_sequences[seqId][1]),
          file = output_file)

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if backtranslate:

    cdsFile = ("%s.seqs_cds") % (oFile)
    ## Check whether the file already exists or not.
    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True

      found = set()
      with open(cdsFile, "w") as output_file:
        for record in SeqIO.parse(parameters["cds"], "fasta"):
          if not record.id in selected_sequences:
            continue
          seq = splitSequence(str(record.seq))
          print((">%s\n%s") % (record.id, seq), file = output_file)
          found.add(record.id)

      ## Every selected sequence must have its CDS counterpart
      missing = set(selected_sequences.keys()) - found
      if missing:
        missed = ",".join(sorted(missing))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file")
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associate CDS file with the resulting cds file. It will be used
  ## to make the back-translation in a hypothetical MSA step
  if backtranslate:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
def homology(parameters):
  '''
  Run the homology search step of the pipeline.

  The function works inside parameters["out_directory"] and goes back to the
  original working directory before returning. It runs the configured search
  tool, filters its results and writes three files using the output prefix:
  ".seqs.md5", ".seqs" and - for "prot2codon"/"prot2nuc" modes - ".seqs_cds".

  NOTE(review): this file contains two textually equivalent definitions of
  homology(); being the later one, this is the definition Python keeps bound.

  Arguments:
    parameters  dictionary with the pipeline configuration. Among others, these
                keys are read: "out_directory", "prefix", "verbose", "replace",
                "step", "homology", "residue_datatype", "cds" and "db_file".

  Returns the same dictionary. "in_file" points to the ".seqs" file, "replace"
  is set to True as soon as any file is generated and, for "prot2codon" and
  "prot2nuc" modes, "cds" points to the ".seqs_cds" file.

  The function calls sys.exit() when the configuration is not valid, when no
  homologous sequence is found (exit code 80) or when any selected sequence is
  missing in the input CDS file.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## Text mode is mandatory: print() writes str objects, which a file opened
    ## in binary mode rejects with a TypeError
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such tool
  ## is listed among the available binaries
  if not "homology" in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  tool = parameters["homology"][0]
  if not tool in parameters:
    ## The tool name is now interpolated - the message used to be printed with
    ## a literal '%s'
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not among"
      + " available methods") % (tool))

  backtranslate = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  if "cds" in parameters and not backtranslate:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and backtranslate:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted.
  if tool in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n:nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist check whether input database has been
      ## split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has " +
            "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## If the homology search step should be perfomed using BLAST, call the
    ## appropiate function
    blast(parameters, logFile)
    tag = "blast"

  elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Check whether the output file contains any result. Blank lines and those
  ## starting by "#" are not counted
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  ## Plain "r" mode is used: the "U" flag is rejected by open() since Python
  ## 3.11 and universal newlines are the default in text mode
  with open(inFile, "r") as search_output:
    for line in search_output:
      if not line.strip() or line.startswith("#"):
        continue
      homologs += 1

  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") %
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    ## NOTE(review): assumes md5 is hashlib.md5, which only accepts bytes in
    ## Python 3 - confirm against the imports of this file
    identifiers = "".join(sorted(selected_sequences.keys()))
    seqs_md5 = md5(identifiers.encode("utf-8")).hexdigest()
    with open(outFile, "w") as output_file:
      print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = output_file)

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters.
  outFile = ("%s.seqs") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    with open(outFile, "w") as output_file:
      for seqId in sorted(selected_sequences):
        print((">%s\n%s") % (seqId, selected_sequences[seqId][1]),
          file = output_file)

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if backtranslate:

    cdsFile = ("%s.seqs_cds") % (oFile)
    ## Check whether the file already exists or not.
    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True

      found = set()
      with open(cdsFile, "w") as output_file:
        for record in SeqIO.parse(parameters["cds"], "fasta"):
          if not record.id in selected_sequences:
            continue
          seq = splitSequence(str(record.seq))
          print((">%s\n%s") % (record.id, seq), file = output_file)
          found.add(record.id)

      ## Every selected sequence must have its CDS counterpart
      missing = set(selected_sequences.keys()) - found
      if missing:
        missed = ",".join(sorted(missing))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file")
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associate CDS file with the resulting cds file. It will be used
  ## to make the back-translation in a hypothetical MSA step
  if backtranslate:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
def phylogenetic_trees(parameters):
    ''' Reconstruct phylogenetic trees for the alignment in parameters["in_file"].

    For each tree approach ("nj" first, then "ml", then any other configured
    approach in alphabetical order) one tree is built per evolutionary model
    through perform_tree(). The models are ranked by decreasing value returned
    by get_likelihood(), the ranking is stored in
    "<out_directory>/<prefix>.tree.<program>.rank.<approach>" and only the
    first parameters["numb_models"] models are evaluated by the next approach.

    parameters -- dictionary holding the pipeline configuration. Keys read:
      "out_directory", "prefix", "replace", "in_file", "tree", "evol_models",
      "readal", the entry named after the selected program (its binary) and,
      optionally, "verbose", "step" (only when "verbose" is 1),
      "<program>_params", "numb_models", "min_seqs", "tree_approach",
      "raxml_model_suffix" and one entry per approach with extra arguments.

    Returns the same dictionary. "evol_models", "numb_models" and
    "tree_approach" are normalised in place; "in_file" and "replace" may be
    updated when the input alignment has to be converted beforehand.

    Calls sys.exit() when the configuration is incomplete, when the alignment
    has fewer sequences than required (exit status 80) or when no likelihood
    can be obtained for a reconstructed tree.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode: every message below is a 'str'
    ## sent through print(), which a binary handle rejects with a TypeError
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, "w")

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t"
      + "%s\n###") % (date), file=logFile)
    logFile.flush()

    ## Get which program will be used to reconstruct phylogenetic trees. Check
    ## such program is listed among the available binaries
    if not "tree" in parameters:
        sys.exit("ERROR: Check your configuration file. There is no definition for "
          + "the Phylogenetic TREE reconstruction step")

    prog = parameters["tree"][0]
    if not prog in parameters:
        sys.exit(("ERROR: Selected program '%s' is not available accordding to the "
          "the configuration file") % (prog))

    ## Get binary as well as any default parameters for the selected program
    binary = parameters[prog]
    key = ("%s_params") % (prog)
    progr_params = parameters[key] if key in parameters else ""

    if not "evol_models" in parameters:
        sys.exit("ERROR: Check your configuration file. There is no definition for "
          + "the <evol_models> parameter")

    ## If the evolutionary model list is not appropriately formatted, do it.
    ## str.split() without arguments already discards surrounding blanks
    if isinstance(parameters["evol_models"], str):
        parameters["evol_models"] = parameters["evol_models"].split()

    ## Check if <numb_models> parameter is defined and how many models are
    ## requested to be evaluated. The value is normalised through str() since it
    ## is stored back as an integer below, and this function may receive a
    ## dictionary which has been already processed
    if not "numb_models" in parameters or \
      str(parameters["numb_models"]).lower() == "all":
        parameters["numb_models"] = len(parameters["evol_models"])
    parameters["numb_models"] = int(parameters["numb_models"])

    if not parameters["numb_models"] in range(1, len(parameters["evol_models"]) + 1):
        sys.exit(("ERROR: Check how many evolutionary models has been asked to re"
          + "construct '%d'") % (parameters["numb_models"]))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Create a temporary FASTA file which will be used to detect the sequence
    ## number on the input alignment and the presence of rare amino-acids
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"], parameters["in_file"],
      TEMPFILE.name, "fasta", logFile, parameters["replace"])
    TEMPFILE.flush()

    numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

    ## Set the minimum number of sequences required to reconstruct an alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else
      min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to "
          + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file=logFile)
        sys.exit(80)

    ## Check which approaches should be used for the phylogenetic reconstruction
    ## and whether there are specific program's parameters for them
    if not "tree_approach" in parameters:
        parameters["tree_approach"] = ["ml"]

    ## Remove potential duplicates and lowercase all approaches for the tree
    ## reconstruction
    parameters["tree_approach"] = set([p.lower() for p in
      parameters["tree_approach"]])

    ## We will first look for Neighbour Joining tree reconstruction, then for
    ## Maximum likelihood and then for any other approach defined in the config
    ## file
    tree_approaches = []
    if "nj" in parameters["tree_approach"]:
        tree_approaches.append("nj")
    if "ml" in parameters["tree_approach"]:
        tree_approaches.append("ml")
    others = parameters["tree_approach"] - set(["nj", "ml"])
    if others != set():
        tree_approaches += sorted(others)

    ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
    ## present in the input alignment
    if prog in ["raxml"]:
        ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
        if selenocys or pyrrolys:
            out_file = ("%s.no_rare_aa") % (parameters["in_file"])

            if replaceRareAminoAcids(TEMPFILE.name, out_file, parameters["replace"],
              logFile, "U:X O:X"):
                parameters["replace"] = True
            parameters["in_file"] = out_file
    TEMPFILE.close()

    ## When using FastTree force the conversion of input alignment to FASTA format
    ## since it may crash reading standard interleave PHYLIP format files
    if prog in ["fasttree"]:
        in_file_format, aligned = getFileFormat("readal", parameters["readal"],
          parameters["in_file"], logFile)

        if in_file_format != "fasta":
            out_file = ("%s.fa") % (parameters["in_file"])
            if (convertInputFile_Format("readal", parameters["readal"],
              parameters["in_file"], out_file, "fasta", logFile,
              parameters["replace"])):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    replace = parameters["replace"]
    selected_models = parameters["evol_models"]

    ## Codon frequency model appended to the model names when using CodonPhyML.
    ## It remains as None unless exactly one '-fmodel' option is detected
    add_model = None

    ## Reconstruct trees for each approach considering evolutionary models order
    ## according their likelihood values
    for approach in tree_approaches:

        ## Save results - we will use such data for selecting the best -if
        ## required- models fitting to the input data
        results = {}

        ## Format the chosen program's parameters according to the default ones
        ## and the specific ones for the current approach
        params = ("%s ") % (progr_params)
        params += parameters[approach] if approach in parameters else ""

        for model in selected_models:
            out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
            stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

            if prog in ["phyml"]:
                exec_params = ("%s -m %s") % (params, model)

            ## Get additional model -if any- for codons
            elif prog in ["codonphyml"]:
                exec_params = ("%s -m %s") % (params, model)

                fmodels = [p.split()[1] for p in
                  (chunk.strip() for chunk in exec_params.split("-"))
                  if p.startswith("fmodel")]

                ## Only an unambiguous definition is used to tag output files
                add_model = fmodels[0] if len(fmodels) == 1 else None
                if add_model is not None:
                    model = ("%s_%s") % (model, add_model)
                    out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach,
                      model)
                    stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach,
                      model)

            elif prog in ["fasttree"]:
                ## On FastTree is selected by default JTT model for AAs - so we
                ## don't set-up that model
                exec_params = ("%s -%s") % (params, model) \
                  if model.lower() != "jtt" and model.lower() != "jc" else params
                model = model.upper()

            ## In the case of RAxML, we would concatenate the model to an specific
            ## input parameter
            elif prog in ["raxml"]:
                final_model = model
                ## It is possible to add some suffixes to the evolutionary models
                ## in RAxML - There is not better/easy way to code this option
                if "raxml_model_suffix" in parameters:
                    final_model += parameters["raxml_model_suffix"]
                exec_params = " ".join([("-%s%s") % (p, final_model
                  if p.startswith("m ") else "")
                  for p in (chunk.strip() for chunk in params.split("-")) if p])

            ## Build the phylogenetic tree using any of the available methods and
            ## register if any downstream file should be redone.
            if perform_tree(prog, binary, exec_params, parameters["in_file"],
              out_file, stats_file, logFile, parameters["replace"]):
                replace = True

            ## Get the likelihood for each of the reconstructed models
            log_lk = get_likelihood(prog, stats_file)

            if not log_lk:
                print(("ERROR: Impossible to the Log likelihood values "
                  + "for '%s' model using this program '%s'") % (model, prog),
                  file=sys.stderr)
                sys.exit(exit_codes[prog])

            results.setdefault(model, log_lk)

        ## Get the models sorted by their likelihood values
        records = sorted(results.items(), key=itemgetter(1), reverse=True)
        ranking = "\n".join([("%s\t%s") % (r[0], r[1]) for r in records])

        ## Set the filename which stores the ranking
        rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

        update = False
        ## Check the content of the rankings file - if any.
        ## Marked the file as updatable if there is any discrepancy
        if not replace and lookForFile(rank_file):
            ## Plain "r" mode already applies universal newlines on Python 3; the
            ## former "U" flag is rejected by recent interpreters
            with open(rank_file, "r") as handle:
                old_content = "\n".join(["\t".join([field.strip()
                  for field in line.split("\t")]) for line in handle])

            ## Decide whether ranking file should be updated after comparing
            ## current content with newly generated content
            update = old_content != ranking

        ## If the file containing the ranking doesn't exist, generate it.
        ## Update the file content if the replace flag is set to true or the
        ## content has changed - since the phylogenetic tree reconstruction step
        ## is the most expensive one - in terms of time/memory consumption - we
        ## are not setting replace flag to True even when this file is
        ## generated/updated. On this way, we can take advantage of any tree
        ## generated in any downstream step.
        if not lookForFile(rank_file) or replace or update:
            with open(rank_file, "w") as handle:
                print(ranking, file=handle)

        ## Select a given number of models for the next iteration - if any
        selected_models = [pair[0] for pair in records[:parameters["numb_models"]]]

        ## Remove the Codon Frequency model from potential new iterations
        if prog in ["codonphyml"] and add_model:
            selected_models = [m.replace("_" + add_model, "")
              for m in selected_models if m.endswith(add_model)]

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t"
      + "%s") % (date), file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s"
      + "\n###") % (total), file=logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        ## NOTE(review): '/^M/d' removes every line starting with a literal "M";
        ## presumably a Ctrl-M (carriage return) was intended - confirm
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file")
              % (oFile), file=sys.stderr)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
def perform_tree(label, binary, parameters, in_file, out_file, stats_file,
  logFile, replace):
    ''' Format and execute the command-line of a phylogenetic tree reconstruction
    program, then store its tree and statistics files under the given names.

    label      -- program identifier: "phyml", "codonphyml", "fasttree" or
                  "raxml". Any other value ends the program with
                  exit_codes["generic"].
    binary     -- executable to be called.
    parameters -- string with the command-line arguments for the program.
    in_file    -- input alignment.
    out_file   -- file which will contain the reconstructed tree.
    stats_file -- file which will contain the program's statistics/log.
    logFile    -- open handle receiving log messages as well as the program's
                  standard output/error.
    replace    -- when it is not set and lookForFile() reports out_file as
                  already available, nothing is executed.

    Returns False when the execution has been skipped, True otherwise. Calls
    sys.exit(exit_codes[label]) when the program cannot be launched, when it
    finishes with a non-zero status or when its output files cannot be renamed.
    '''
    ## Local import: it is only needed for renaming the output files
    import shutil

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["phyml", "codonphyml"]:
        cmd = ("%s -i %s %s") % (binary, in_file, parameters)

    elif label in ["fasttree"]:
        cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file,
          out_file, in_file)

    elif label in ["raxml"]:
        ## The random number is used both as seed and as part of the run name
        random_seed = randint(1, 10000)
        suffix = ("%s_%d") % (label, random_seed)

        cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file,
          parameters)

    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end=' ',
      file=logFile)
    print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd),
      file=logFile)
    logFile.flush()

    try:
        ## We add a small pipeline to avoid information written in the same line
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile,
          stdin=sp.PIPE)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Feed a fixed answer through standard input to the program
    proc.communicate(b'\n\nY\n')

    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()), file=sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    ## Process program's output and rename output files according to our own
    ## scheme
    renames, rename_error = [], None

    if label in ["phyml", "codonphyml"]:
        ## Since resulting tree/stats file have slightly changed between
        ## version, we have to control for that.
        tree_file = ("%s_%s_tree.txt") % (in_file, label)
        sts_file = ("%s_%s_stats.txt") % (in_file, label)
        if not lookForFile(tree_file, attempts=2):
            tree_file = ("%s_%s_tree") % (in_file, label)
            sts_file = ("%s_%s_stats") % (in_file, label)

        renames = [(tree_file, out_file), (sts_file, stats_file)]
        rename_error = ("ERROR: Impossible to rename '%s' output files") \
          % (label.upper())

    elif label in ["raxml"]:
        renames = [(("RAxML_bestTree.%s") % (suffix), out_file),
          (("RAxML_info.%s") % (suffix), stats_file)]
        rename_error = "ERROR: Impossible to rename RAxML output files"

    ## shutil.move raises an OSError when a file cannot be renamed, whereas a
    ## failing shell 'mv' only returns a non-zero status which went unnoticed
    for source, target in renames:
        try:
            shutil.move(source, target)
        except OSError:
            print(rename_error, file=sys.stderr)
            sys.exit(exit_codes[label])

    if label in ["raxml"]:
        ## Append any other file generated during this RAxML run to the stats
        ## file, under a header with its name, and remove it afterwards
        with open(stats_file, "a+") as oFile:
            for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
                fileName = os.path.split(oth_file)[1]
                hz_line = "#" * (len(fileName) + 4)

                print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file=oFile)
                ## Plain "r" mode: the former "U" flag is rejected by recent
                ## interpreters and universal newlines are already the default
                with open(oth_file, "r") as handle:
                    print(handle.read(), file=oFile)

                ## Best-effort removal, as the former 'rm -f' call
                try:
                    os.remove(oth_file)
                except OSError:
                    pass

    return True