def from_phytrees(cls, phytrees_file):
    """
    Create a PhyTrees object retrieving all the information from previously
    saved PhyTrees tree and report files. If 'phytrees_file' contains a
    relative path, the current working directory will be used to get the
    absolute path.

    The report file is looked up next to the tree file, replacing its
    extension by ".rep". Its first line holds the number of trees, the
    second one is the "History:" header and every following non-empty line
    is one history entry: timestamp, file path and file format.

    Arguments :
        phytrees_file  ( string )
            Tree file generated by PhyTrees.write().

    Raises :
        ValueError
            If the number of trees read doesn't match the number stored in
            the report document, or if a history line has too few fields.
    """
    data_filepath = get_abspath(phytrees_file)
    report_filepath = os.path.splitext(data_filepath)[0] + '.rep'
    # Load all the contents into a new PhyTrees object
    tree_list = list(Phylo.parse(data_filepath, 'newick'))
    report = []
    with open(report_filepath, 'r') as report_file:
        str_num_trees = report_file.readline()
        num_trees = int(str_num_trees.split(':')[-1])
        if len(tree_list) != num_trees:
            message = 'The number of trees at report file doesn\'t match ' \
                      'the number of trees loaded'
            raise ValueError(message)
        # Ignore "History:" line
        report_file.readline()
        for line in report_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline)
                continue
            if len(fields) < 3:
                raise ValueError('Malformed report line: '
                                 '{!r}'.format(line.strip()))
            if len(fields) == 3:
                # Timestamp without inner whitespace
                date_time, filepath, fileformat = fields
            else:
                # The timestamp is written as "<date> <time>", so it spans
                # the first two fields; the file format is always the last
                # field and whatever lies in between is the file path
                date_time = ' '.join(fields[:2])
                filepath = ' '.join(fields[2:-1])
                fileformat = fields[-1]
            report.append((date_time, filepath, fileformat))
    return cls(tree_list, report)
def gen_args(args, infile_path, outfile):
    """
    Build the list of arguments (binary excluded) used to call Consense.

    Arguments :
        args  ( string )
            Keyword or arguments to use in the call of Consense, excluding
            infile and outfile arguments. Not used to build the list.
        infile_path  ( string )
            Input alignment file path.
        outfile  ( string )
            Consensus tree output file. When empty or None, a path inside
            the system temporary directory is generated instead.

    Returns :
        list
            Two-element list: the input file path and the output file path.
    """
    if not outfile:
        # No output file requested: the consensus tree will be retrieved
        # from a randomly named file placed in the temporary directory
        random_name = tempfile.gettempprefix() + \
                      next(tempfile._get_candidate_names())
        outfile_path = os.path.join(tempfile.gettempdir(), random_name)
    else:
        outfile_path = get_abspath(outfile)
    return [infile_path, outfile_path]
def write(self, bioseqs_file):
    """
    Dump every sequence of the BioSeqs object into 'bioseqs_file' (GENBANK
    format) and its history into a sibling report file, named after
    'bioseqs_file' with the extension replaced by ".rep". Relative paths are
    resolved against the current working directory. Existing files are
    overwritten without warning.

    Arguments :
        bioseqs_file  ( string )
            New BioSeqs sequence file.

    Raises :
        IOError
            If the path provided doesn't exist.
    """
    seq_path = get_abspath(bioseqs_file)
    rep_path = os.path.splitext(seq_path)[0] + '.rep'
    # One line per history entry, fields separated by a blank
    history_lines = [' '.join(entry) for entry in self._report]
    history_text = '\n'.join(history_lines)
    try:
        SeqIO.write(viewvalues(self.data), seq_path, 'genbank')
        with open(rep_path, 'w') as report_file:
            content = 'Num. sequences: {:d}\nHistory:\n' \
                      '{:s}'.format(len(self), history_text)
            report_file.write(content)
    except IOError:
        # I/O errors are propagated untouched
        raise
    except:
        # Any other failure: drop whatever was written, then re-raise
        for written_path in (seq_path, rep_path):
            if os.path.lexists(written_path):
                os.remove(written_path)
        raise
def write(self, phytrees_file):
    """
    Dump every tree of the PhyTrees object into 'phytrees_file' (newick
    format) and its history into a sibling report file, named after
    'phytrees_file' with the extension replaced by ".rep". Relative paths
    are resolved against the current working directory. Existing files are
    overwritten without warning.

    Arguments :
        phytrees_file  ( string )
            New PhyTrees tree file.

    Raises :
        IOError
            If the path provided doesn't exist.
    """
    tree_path = get_abspath(phytrees_file)
    rep_path = os.path.splitext(tree_path)[0] + '.rep'
    # One line per history entry, fields separated by a blank
    history_lines = [' '.join(entry) for entry in self._report]
    history_text = '\n'.join(history_lines)
    try:
        Phylo.write(self.data, tree_path, 'newick')
        with open(rep_path, 'w') as report_file:
            content = 'Num. trees: {:d}\nHistory:\n' \
                      '{:s}'.format(len(self), history_text)
            report_file.write(content)
    except IOError:
        # I/O errors are propagated untouched
        raise
    except:
        # Any other failure: drop whatever was written, then re-raise
        for written_path in (tree_path, rep_path):
            if os.path.isfile(written_path):
                os.remove(written_path)
        raise
def from_bioseqs(cls, bioseqs_file):
    """
    Create a BioSeqs object retrieving all the information from previously
    saved BioSeqs sequence and report files. If 'bioseqs_file' contains a
    relative path, the current working directory will be used to get the
    absolute path.

    The report file is looked up next to the sequence file, replacing its
    extension by ".rep". Its first line holds the number of sequences, the
    second one is the "History:" header and every following non-empty line
    is one history entry: timestamp, source type, source and details.

    Arguments :
        bioseqs_file  ( string )
            Sequence file generated by BioSeqs.write().

    Raises :
        ValueError
            If the number of sequences read doesn't match the number stored
            in the report document, or if a history line has too few fields.
    """
    data_filepath = get_abspath(bioseqs_file)
    report_filepath = os.path.splitext(data_filepath)[0] + '.rep'
    # Load all the contents into a new BioSeqs object
    seq_dict = SeqIO.to_dict(SeqIO.parse(data_filepath, 'genbank'))
    report = []
    with open(report_filepath, 'r') as report_file:
        str_num_seqs = report_file.readline()
        num_seqs = int(str_num_seqs.split(':')[-1])
        if len(seq_dict) != num_seqs:
            message = 'The number of sequences at report file doesn\'t ' \
                      'match the number of sequences loaded'
            raise ValueError(message)
        # Ignore "History:" line
        report_file.readline()
        for line in report_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline)
                continue
            if len(fields) < 4:
                raise ValueError('Malformed report line: '
                                 '{!r}'.format(line.strip()))
            if len(fields) == 4:
                # Timestamp without inner whitespace
                date_time, src_type, src, details = fields
            else:
                # The timestamp is written as "<date> <time>", so it spans
                # the first two fields; the details are always the last
                # field and whatever follows the source type is the source
                date_time = ' '.join(fields[:2])
                src_type = fields[2]
                src = ' '.join(fields[3:-1])
                details = fields[-1]
            report.append((date_time, src_type, src, details))
    return cls(seq_dict, report)
def get_results(command, output):
    """
    Extract resultant phylogeny and its log-likelihood score from 'output'
    and files generated during the execution of 'command'.

    Arguments :
        command  ( list )
            FastTree's command line executed. It must contain the "-log"
            option followed by the log file path.
        output  ( string )
            Output from 'command' execution (tree in NEWICK format).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If 'command' has no "-log" option, or if the log file contains
            no "TreeLogLk" entry to read the score from.
    """
    phylogeny = Phylo.read(StringIO(output), 'newick')
    # Read the log file to get the log-likelihood score of the final phylogeny
    index = command.index('-log') + 1
    logfile_path = get_abspath(command[index])
    score = None
    with open(logfile_path, 'r') as logfile:
        # It is located at the last line that matches "TreeLogLk.*" pattern,
        # so keep overwriting the score until the end of the file
        for line in logfile:
            if 'TreeLogLk' in line:
                score = float(line.split('\t')[2])
    if score is None:
        # Fail with a meaningful error instead of an unbound local variable
        raise ValueError('No "TreeLogLk" entry found in log file '
                         '"{}"'.format(logfile_path))
    return (phylogeny, score)
def gen_args(args, infile_path, outfile):
    """
    Assemble the argument list (without the binary) for a Consense call.

    Arguments :
        args  ( string )
            Keyword or arguments to use in the call of Consense, excluding
            infile and outfile arguments. Not used to build the list.
        infile_path  ( string )
            Input alignment file path.
        outfile  ( string )
            Consensus tree output file. A falsy value makes the function
            pick a path inside the system temporary directory.

    Returns :
        list
            [input file path, output file path].
    """
    if outfile:
        destination = get_abspath(outfile)
    else:
        # The consensus tree is written to a randomly named file of the
        # temporary directory so it can be read back afterwards
        tmp_dir = tempfile.gettempdir()
        tmp_name = tempfile.gettempprefix() + \
                   next(tempfile._get_candidate_names())
        destination = os.path.join(tmp_dir, tmp_name)
    argument_list = [infile_path, destination]
    return argument_list
def from_bioseqs(cls, bioseqs_file):
    """
    Create a BioSeqs object retrieving all the information from previously
    saved BioSeqs sequence and report files. If 'bioseqs_file' contains a
    relative path, the current working directory will be used to get the
    absolute path.

    The report file is looked up next to the sequence file, replacing its
    extension by ".rep". Its first line holds the number of sequences, the
    second one is the "History:" header and every following non-empty line
    is one history entry: timestamp, source type, source and details.

    Arguments :
        bioseqs_file  ( string )
            Sequence file generated by BioSeqs.write().

    Raises :
        ValueError
            If the number of sequences read doesn't match the number stored
            in the report document, or if a history line has too few fields.
    """
    data_filepath = get_abspath(bioseqs_file)
    report_filepath = os.path.splitext(data_filepath)[0] + ".rep"
    # Load all the contents into a new BioSeqs object
    seq_dict = SeqIO.to_dict(SeqIO.parse(data_filepath, "genbank"))
    report = []
    with open(report_filepath, "r") as report_file:
        str_num_seqs = report_file.readline()
        num_seqs = int(str_num_seqs.split(":")[-1])
        if len(seq_dict) != num_seqs:
            # Parenthesised so both literals are part of the same message
            message = ("The number of sequences at report file doesn't "
                       "match the number of sequences loaded")
            raise ValueError(message)
        # Ignore "History:" line
        report_file.readline()
        for line in report_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline)
                continue
            if len(fields) < 4:
                raise ValueError("Malformed report line: "
                                 "{!r}".format(line.strip()))
            if len(fields) == 4:
                # Timestamp without inner whitespace
                date_time, src_type, src, details = fields
            else:
                # The timestamp is written as "<date> <time>", so it spans
                # the first two fields; the details are always the last
                # field and whatever follows the source type is the source
                date_time = " ".join(fields[:2])
                src_type = fields[2]
                src = " ".join(fields[3:-1])
                details = fields[-1]
            report.append((date_time, src_type, src, details))
    return cls(seq_dict, report)
def write(self, bioseqs_file):
    """
    Store the sequences held by the BioSeqs object in 'bioseqs_file' using
    the GENBANK format, together with a report file that shares its name
    but carries the ".rep" extension. A relative 'bioseqs_file' is resolved
    against the current working directory. Files that already exist are
    overwritten without warning.

    Arguments :
        bioseqs_file  ( string )
            New BioSeqs sequence file.

    Raises :
        IOError
            If the path provided doesn't exist.
    """
    target_path = get_abspath(bioseqs_file)
    report_path = os.path.splitext(target_path)[0] + ".rep"
    # Flatten the history into text: one entry per line
    report_body = "\n".join(" ".join(item) for item in self._report)
    try:
        SeqIO.write(viewvalues(self.data), target_path, "genbank")
        with open(report_path, "w") as handle:
            header = "Num. sequences: {:d}\nHistory:\n"
            handle.write((header + "{:s}").format(len(self), report_body))
    except IOError:
        # I/O errors are propagated untouched
        raise
    except:
        # Any other failure: remove partial output before re-raising
        if os.path.lexists(target_path):
            os.remove(target_path)
        if os.path.lexists(report_path):
            os.remove(report_path)
        raise
def get_results(command, output):
    """
    Extract resultant phylogeny and its log-likelihood score from 'output'
    and files generated during the execution of 'command'.

    Arguments :
        command  ( list )
            FastTree's command line executed. It must contain the "-log"
            option followed by the log file path.
        output  ( string )
            Output from 'command' execution (tree in NEWICK format).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If 'command' has no "-log" option, or if the log file contains
            no "TreeLogLk" entry to read the score from.
    """
    phylogeny = Phylo.read(StringIO(output), "newick")
    # Read the log file to get the log-likelihood score of the final phylogeny
    index = command.index("-log") + 1
    logfile_path = get_abspath(command[index])
    score = None
    with open(logfile_path, "r") as logfile:
        # It is located at the last line that matches "TreeLogLk.*" pattern,
        # so keep overwriting the score until the end of the file
        for line in logfile:
            if "TreeLogLk" in line:
                score = float(line.split("\t")[2])
    if score is None:
        # Fail with a meaningful error instead of an unbound local variable
        raise ValueError('No "TreeLogLk" entry found in log file '
                         '"{}"'.format(logfile_path))
    return (phylogeny, score)
def from_treefile(cls, treefile, fileformat):
    """
    Build a PhyTrees object out of the trees stored in 'treefile'. A
    relative 'treefile' is resolved against the current working directory.
    The report of the new object holds a single entry with the current
    timestamp, the absolute file path and the file format.

    Arguments :
        treefile  ( string )
            Input tree file.
        fileformat  ( string )
            Input file format.

    Raises :
        IOError
            If the path or the file provided doesn't exist.

    * The file format must be supported by Bio.Phylo.
    * If the file format provided doesn't correspond to the actual file
    format, an empty tree list will be created.
    """
    source_path = get_abspath(treefile)
    # Parse every tree first, then stamp the history entry
    trees = [tree for tree in Phylo.parse(source_path, fileformat)]
    timestamp = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
    history = [(timestamp, source_path, fileformat)]
    return cls(trees, history)
def cleanup(command):
    """
    Remove the temporary file created (if any) in gen_args() function.

    The file is only deleted when it lives directly in the system temporary
    directory and it actually exists: Consense may have failed before
    writing it, and a missing file must not turn the cleanup into a new
    error that hides the original one.

    Arguments :
        command  ( list )
            Consense's command line executed. The output file path is
            expected at index 2 (binary, infile, outfile).
    """
    outfile_path = get_abspath(command[2])
    if (os.path.dirname(outfile_path) == tempfile.gettempdir()
            and os.path.lexists(outfile_path)):
        os.remove(outfile_path)
def cleanup(command):
    """
    Remove the temporary file created (if any) in gen_args() function.

    The file is only deleted when it lives directly in the system temporary
    directory and it actually exists: Consense may have failed before
    writing it, and a missing file must not turn the cleanup into a new
    error that hides the original one.

    Arguments :
        command  ( list )
            Consense's command line executed. The output file path is
            expected at index 2 (binary, infile, outfile).
    """
    outfile_path = get_abspath(command[2])
    is_temporary = os.path.dirname(outfile_path) == tempfile.gettempdir()
    if is_temporary and os.path.lexists(outfile_path):
        os.remove(outfile_path)
def cleanup(command):
    """
    Delete the log file referenced by 'command' when it is a leftover from
    gen_args(), i.e. when it sits directly in the system temporary directory
    and still exists.

    Arguments :
        command  ( list )
            FastTree's command line executed. The log file path is the
            element that follows the "-log" option.
    """
    log_position = command.index("-log") + 1
    log_path = get_abspath(command[log_position])
    in_tmp_dir = os.path.dirname(log_path) == tempfile.gettempdir()
    if not in_tmp_dir:
        return
    if os.path.lexists(log_path):
        os.remove(log_path)
def cleanup(command):
    """
    Get rid of the log file created by gen_args() for a FastTree run. Only
    a file located directly in the system temporary directory, and that
    still exists, is removed.

    Arguments :
        command  ( list )
            FastTree's command line executed. The log file path is the
            element that follows the '-log' option.
    """
    option_index = command.index('-log')
    tmp_candidate = get_abspath(command[option_index + 1])
    parent_dir = os.path.dirname(tmp_candidate)
    if parent_dir == tempfile.gettempdir():
        if os.path.lexists(tmp_candidate):
            os.remove(tmp_candidate)
def from_seqfile(cls, seqfile, fileformat):
    """
    Build a BioSeqs object out of the sequences stored in 'seqfile'. A
    relative 'seqfile' is resolved against the current working directory.
    The report of the new object holds a single entry with the current
    timestamp, the 'local' source type, the absolute file path and the file
    format.

    Arguments :
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.

    Raises :
        IOError
            If the path or the file provided doesn't exist.

    * The file format must be supported by Bio.SeqIO.
    * If the file format provided doesn't correspond to the actual file
    format, an empty sequence dictionary will be created.
    """
    source_path = get_abspath(seqfile)
    records_by_id = {}
    for rec in SeqIO.parse(source_path, fileformat):
        # Some formats (e.g. FASTA) give the Seq object a generic alphabet
        # that cannot be written to a GENBANK file. Replace it with a
        # specific one: try DNA first and fall back to protein if the
        # sequence doesn't fit
        generic = isinstance(rec.seq.alphabet,
                             Alphabet.SingleLetterAlphabet)
        if generic:
            rec.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACDNA()
            if not Alphabet._verify_alphabet(rec.seq):
                rec.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACProtein()
        records_by_id[rec.id] = rec
    timestamp = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
    history = [(timestamp, 'local', source_path, fileformat)]
    return cls(records_by_id, history)
def from_seqfile(cls, seqfile, fileformat):
    """
    Read every sequence of 'seqfile' into a new BioSeqs object, keyed by
    record id. A relative 'seqfile' is resolved against the current working
    directory. A fresh report is generated with one entry: current
    timestamp, "local" source type, absolute file path and file format.

    Arguments :
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.

    Raises :
        IOError
            If the path or the file provided doesn't exist.

    * The file format must be supported by Bio.SeqIO.
    * If the file format provided doesn't correspond to the actual file
    format, an empty sequence dictionary will be created.
    """
    abs_path = get_abspath(seqfile)
    loaded = {}
    for entry in SeqIO.parse(abs_path, fileformat):
        # A generic single-letter alphabet (as assigned when parsing e.g.
        # FASTA) raises an error when writing GENBANK files, so it is
        # narrowed down to DNA, or to protein when DNA doesn't validate
        if isinstance(entry.seq.alphabet, Alphabet.SingleLetterAlphabet):
            entry.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACDNA()
            is_dna = Alphabet._verify_alphabet(entry.seq)
            if not is_dna:
                entry.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACProtein()
        loaded[entry.id] = entry
    now = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    return cls(loaded, [(now, "local", abs_path, fileformat)])
def map_seqs(record_list, tree_file, file_format, subset_size, overlapping,
             binary='dcm'):
    """
    Generate a map of the sequences in sets, of at most 'subset_size', with
    the specified overlapping using the padded-Recursive-DMC3 decomposition
    (PRD) from DACTAL system. If 'tree_file' contains a relative path, the
    current working directory will be used to get the absolute path.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        tree_file  ( string )
            Input tree file.
        file_format  ( string )
            Tree file format.
        subset_size  ( int )
            Maximum subset size.
        overlapping  ( int )
            Number of overlapping sequences between any two resultant
            subsets.
        binary  ( Optional[string] )
            Name or path of the DCM binary file.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the
            corresponding sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            When 'subset_size' < (4 * 'overlapping').
        RuntimeError
            If the call to the dcm command raises an exception.
        IOError
            If the dcm tool can't generate a decomposition for the
            'subset_size' and 'overlapping' values given.

    * The tree file format must be supported by Bio.Phylo.
    * The worker pool and every temporary file are released even when the
    decomposition fails.
    """
    if subset_size < (4 * overlapping):
        raise ValueError('The maximum subset size must be greater than or ' \
                         'equal to 4 times the overlapping value')
    # else : # subset_size >= (4 * overlapping)
    # If the input file format is not supported by the PRD process, convert it
    # to a temporary supported file
    infile_path = get_abspath(tree_file)
    tmpfile = None
    pool = None
    further_decomp = []
    try:
        if file_format.lower() != 'newick':
            tmpfile = tempfile.NamedTemporaryFile()
            Phylo.convert(infile_path, file_format, tmpfile.name, 'newick')
            infile_path = tmpfile.name
        # The first decomposition process will be always executed, so there
        # is no need to overload this stage with the multiprocess generation
        set_list, further_decomp = _prd_decomposition(infile_path,
                                                      subset_size,
                                                      str(overlapping),
                                                      binary)
        # Parallelization of the recursive decomposition of the different
        # subtrees. All new subtrees are attached to 'further_decomp' file
        # list so we can launch at most one process per core, speeding up
        # the whole process
        start = 0
        if further_decomp:
            pool = multiprocessing.Pool(processes=NUMCORES)
        while start < len(further_decomp):
            end = min(start + NUMCORES, len(further_decomp))
            results = [pool.apply_async(_prd_decomposition,
                                        args=(further_decomp[i], subset_size,
                                              str(overlapping), binary,))
                       for i in range(start, end)]
            # Collect the results of all the processes launched
            for pool_result in results:
                new_sets, new_files = pool_result.get()
                set_list += new_sets
                further_decomp += new_files
            start = end
    finally:
        # Every submitted task has been collected on success, so terminating
        # is safe; on failure it stops the pending work right away
        if pool is not None:
            pool.terminate()
            pool.join()
        # Remove all the temporal files created for the multiprocessing stage
        for file_path in further_decomp:
            if os.path.lexists(file_path):
                os.remove(file_path)
        if tmpfile is not None:
            # Closing the handle also deletes the converted tree file
            tmpfile.close()
    record_dict = {record.id: record for record in record_list}
    # Map all the resultant sets with an unique set id and replace the
    # sequence ids by their corresponding Bio.SeqRecord object
    set_dict = {}
    num_zeros = len(str(len(set_list)))
    for index, seq_id_list in enumerate(set_list, 1):
        set_id = 'prdset{}'.format(str(index).zfill(num_zeros))
        set_dict[set_id] = [record_dict[seq_id] for seq_id in seq_id_list]
    return set_dict
def get_subsets(method, seqfile, fileformat='genbank', *args, **kwargs):
    """
    Division of all the sequences stored in the sequence input file into
    subsets applying the 'method' function. If 'seqfile' contains a relative
    path, the current working directory will be used to get the absolute
    path.

    Arguments :
        method  ( string )
            Desired partition method (case-insensitive): genes, naive rows
            or cols, padded-Recursive-DCM3.
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.
        args & kwargs
            Non-keyworded and keyworded arguments passed to the selected
            method (in every execution mode).

    Returns :
        dict
            Dictionary with the set identifiers as keys and the
            corresponding sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            If there is no corresponding method to 'method' value.
        IOError
            If the path or the file provided doesn't exist.
        IOError
            If the file format provided doesn't correspond to the actual
            one (no sequence could be read from the file).

    * The file format must be supported by Bio.SeqIO.
    * For "rows" method, if the number of input sequences is lower than the
    number of sets multiplied by the number of cores, the resulting sets
    might be fewer than the number requested.
    """
    method_key = method.lower()
    if method_key not in _METHOD_TO_FUNC:
        message = 'The method "{}" isn\'t included in ' \
                  'MEvoLib.Cluster'.format(method)
        raise ValueError(message)
    # else : # method_key in _METHOD_TO_FUNC
    # Get the mapping function and the sequence file path
    mapseqs_func = _METHOD_TO_FUNC[method_key]
    filepath = get_abspath(seqfile)
    if method_key in ('prd', 'genes'):
        # Non data-driven (through input slicing) parallelizable methods
        seq_list = (x for x in SeqIO.parse(filepath, fileformat))
        return mapseqs_func(seq_list, *args, **kwargs)
    # Data-driven (through input slicing) parallelizable methods. A plain
    # list is enough: each slice is copied to its worker process anyway
    seq_list = list(SeqIO.parse(filepath, fileformat))
    num_seqs = len(seq_list)
    if num_seqs == 0:
        # Nothing to slice; a zero slice size would crash range() below
        raise IOError('No sequences could be read from "{}" using the "{}" '
                      'format'.format(filepath, fileformat))
    # Launch one process per available CPU core. Integer ceiling division,
    # independent of the interpreter's "/" semantics
    slice_size = -(-num_seqs // NUMCORES)
    pool = multiprocessing.Pool(processes=NUMCORES)
    try:
        results = [pool.apply_async(mapseqs_func,
                                    args=(seq_list[start:start+slice_size],)
                                         + args,
                                    kwds=kwargs)
                   for start in range(0, num_seqs, slice_size)]
        output = [p.get() for p in results]
    finally:
        # Release the worker processes whatever the outcome
        pool.terminate()
        pool.join()
    # Build the final sets dictionary merging the results of all executed
    # processes. Slices may produce different set identifiers (e.g. a
    # shorter last slice), so merge by key instead of assuming equal keys
    set_dict = output[0]
    for result in output[1:]:
        for key, records in result.items():
            set_dict.setdefault(key, []).extend(records)
    return set_dict
def get_subsets(method, seqfile, fileformat='genbank', *args, **kwargs):
    """
    Division of all the sequences stored in the sequence input file into
    subsets applying the 'method' function. If 'seqfile' contains a relative
    path, the current working directory will be used to get the absolute
    path.

    Arguments :
        method  ( string )
            Desired partition method (case-insensitive): genes, naive rows
            or cols, padded-Recursive-DCM3.
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.
        args & kwargs
            Non-keyworded and keyworded arguments passed to the selected
            method (in every execution mode).

    Returns :
        dict
            Dictionary with the set identifiers as keys and the
            corresponding sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            If there is no corresponding method to 'method' value.
        IOError
            If the path or the file provided doesn't exist.
        IOError
            If the file format provided doesn't correspond to the actual
            one (no sequence could be read from the file).

    * The file format must be supported by Bio.SeqIO.
    * For "rows" method, if the number of input sequences is lower than the
    number of sets multiplied by the number of cores, the resulting sets
    might be fewer than the number requested.
    """
    method_key = method.lower()
    if method_key not in _METHOD_TO_FUNC:
        message = 'The method "{}" isn\'t included in ' \
                  'MEvoLib.Cluster'.format(method)
        raise ValueError(message)
    # else : # method_key in _METHOD_TO_FUNC
    # Get the mapping function and the sequence file path
    mapseqs_func = _METHOD_TO_FUNC[method_key]
    filepath = get_abspath(seqfile)
    if method_key in ('prd', 'genes'):
        # Non data-driven (through input slicing) parallelizable methods
        seq_list = (x for x in SeqIO.parse(filepath, fileformat))
        set_dict = mapseqs_func(seq_list, *args, **kwargs)
    else:
        # Data-driven (through input slicing) parallelizable methods. A
        # plain list is enough: each slice is copied to its worker anyway
        seq_list = list(SeqIO.parse(filepath, fileformat))
        num_seqs = len(seq_list)
        if num_seqs == 0:
            # Nothing to slice; a zero slice size would crash range() below
            raise IOError('No sequences could be read from "{}" using the '
                          '"{}" format'.format(filepath, fileformat))
        # Launch one process per available CPU core. Integer ceiling
        # division, independent of the interpreter's "/" semantics
        slice_size = -(-num_seqs // NUMCORES)
        pool = multiprocessing.Pool(processes=NUMCORES)
        try:
            results = []
            for start in range(0, num_seqs, slice_size):
                chunk = seq_list[start:start+slice_size]
                results.append(pool.apply_async(mapseqs_func,
                                                args=(chunk,) + args,
                                                kwds=kwargs))
            output = [p.get() for p in results]
        finally:
            # Release the worker processes whatever the outcome
            pool.terminate()
            pool.join()
        # Build the final sets dictionary merging the results of all
        # executed processes. Slices may produce different set identifiers
        # (e.g. a shorter last slice), so merge by key instead of assuming
        # that every result holds the same keys
        set_dict = output[0]
        for result in output[1:]:
            for key, records in result.items():
                set_dict.setdefault(key, []).extend(records)
    return set_dict
def get_alignment(binary, infile, infile_format, args='default', outfile=None,
                  outfile_format='fasta', **kwargs):
    """
    Run the alignment tool 'binary' over the sequences of 'infile' and
    return the alignment parsed from its standard output (read as FASTA).
    When 'outfile' is given, the alignment is also saved there in
    'outfile_format', overwriting any existing file without warning.
    Relative 'infile' and 'outfile' paths are resolved against the current
    working directory.

    Tools not registered in MEvoLib can still be used by passing in
    '**kwargs' the keys "informats" (list of supported infile formats) and
    "incmd" (infile argument, e.g. "-in").

    Arguments :
        binary  ( string )
            Name or path of the alignment tool.
        infile  ( string )
            Unaligned input sequence file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the alignment tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
        outfile  ( Optional[string] )
            Alignment output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, FASTA format.
        **kwargs  ( Optional[dict] )
            "informats" and "incmd" entries, required only for tools not
            included in the current version of MEvoLib.

    Returns :
        Bio.Align.MultipleSeqAlignment
            Resultant alignment.

    Raises :
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * The input file format must be supported by Bio.SeqIO.
    * The output file format must be supported by Bio.AlignIO.
    """
    # Settings come from the tool's library when it is a known tool, and
    # from **kwargs otherwise
    tool_name = os.path.basename(binary).lower()
    if tool_name not in _TOOL_TO_LIB:
        sprt_infile_formats = kwargs['informats']
        infile_cmd = kwargs['incmd']
        keywords = dict()
    else:
        tool_lib = _TOOL_TO_LIB[tool_name]
        sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
        infile_cmd = tool_lib.INFILE_CMD
        keywords = tool_lib.KEYWORDS
    infile_path = get_abspath(infile)
    # Unsupported input formats are converted into a temporary file using
    # the first format the tool supports. 'tmpfile' must stay referenced
    # until the tool has finished reading it
    if infile_format.lower() not in sprt_infile_formats:
        tmpfile = tempfile.NamedTemporaryFile()
        SeqIO.convert(infile_path, infile_format, tmpfile.name,
                      sprt_infile_formats[0])
        infile_path = tmpfile.name
    # 'args' is either a keyword of the tool or a raw argument string
    if args in keywords:
        arg_list = keywords[args]
    else:
        arg_list = args.split(' ')
    # Full command line; empty elements are dropped here
    pieces = [binary] + arg_list + [infile_cmd, infile_path]
    command = list(filter(None, pieces))
    try:
        output = subprocess.check_output(command, stderr=DEVNULL,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        message = 'Running "{}" raised an exception'.format(' '.join(e.cmd))
        raise RuntimeError(message)
    alignment = AlignIO.read(StringIO(output), 'fasta')
    if outfile:
        # Save the resultant alignment in the given outfile and format
        AlignIO.write(alignment, get_abspath(outfile), outfile_format)
    return alignment
def get_phylogeny(binary, infile, infile_format, args='default', outfile=None,
                  outfile_format='newick', bootstraps=0):
    """
    Infer the phylogeny from the input alignment using the phylogenetic
    inference tool and arguments given. The resultant phylogeny is returned
    as a Bio.Phylo.BaseTree object and saved in the output file (if
    provided). If 'infile' or 'outfile' contain a relative path, the current
    working directory will be used to get the absolute path. If the output
    file already exists, the old file will be overwritten without any
    warning.

    Arguments :
        binary  ( string )
            Name or path of the phylogenetic inference tool.
        infile  ( string )
            Sequence alignment file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the phylogenetic
            inference tool, excluding infile and outfile arguments. By
            default, 'default' arguments are used.
        outfile  ( Optional[string] )
            Phylogenetic tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format.
        bootstraps  ( Optional[int] )
            Number of bootstraps to generate. By default, 0 (only use the
            input alignment).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the phylogenetic inference tool command raises
            an exception.

    * The input file format must be supported by Bio.AlignIO.
    * The output file format must be supported by Bio.Phylo.
    * The tool's cleanup() is executed once the tool has been run, whether
    the results could be processed or not.
    """
    # Get the variables associated with the given phylogenetic inference tool
    bin_name = os.path.basename(binary).lower()
    if bin_name not in _PHYLO_TOOL_TO_LIB:
        message = 'The phylogenetic inference tool "{}" isn\'t included in ' \
                  'MEvoLib.Inference'.format(bin_name)
        raise ValueError(message)
    tool_lib = _PHYLO_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    # Get the command line to run in order to get the resultant phylogeny
    infile_path = get_abspath(infile)
    # If the input file format is not supported by the phylogenetic inference
    # tool, convert it to a temporary supported file. 'tmpfile' must stay
    # referenced until the tool has finished reading it
    if infile_format.lower() not in sprt_infile_formats:
        tmpfile = tempfile.NamedTemporaryFile()
        AlignIO.convert(infile_path, infile_format, tmpfile.name,
                        sprt_infile_formats[0])
        infile_path = tmpfile.name
    # Create full command line list
    command = [binary] + gen_args(args, infile_path, bootstraps)
    # Run the phylogenetic inference process handling any Runtime exception
    try:
        output = subprocess.check_output(command, stderr=DEVNULL,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        cleanup(command)
        message = 'Running "{}" raised an exception'.format(' '.join(e.cmd))
        raise RuntimeError(message)
    try:
        phylogeny, score = get_results(command, output)
        if outfile:
            # Save the resultant phylogeny in the given outfile and format
            outfile_path = get_abspath(outfile)
            Phylo.write(phylogeny, outfile_path, outfile_format)
    finally:
        # Temporary files must go away even if parsing the results or
        # writing the output file fails
        cleanup(command)
    # Return the resultant phylogeny as a Bio.Phylo.BaseTree object and its
    # log-likelihood score
    return (phylogeny, score)
def map_seqs(record_list, tree_file, file_format, subset_size, overlapping,
             binary='dcm'):
    """
    Generate a map of the sequences in sets, of at most 'subset_size', with
    the specified overlapping using the padded-Recursive-DCM3 decomposition
    (PRD) from DACTAL system. If 'tree_file' contains a relative path, the
    current working directory will be used to get the absolute path.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        tree_file  ( string )
            Input tree file.
        file_format  ( string )
            Tree file format.
        subset_size  ( int )
            Maximum subset size.
        overlapping  ( int )
            Number of overlapping sequences between any two resultant subsets.
        binary  ( Optional[string] )
            Name or path of the DCM binary file.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            When 'subset_size' < (4 * 'overlapping').
        RuntimeError
            If the call to the dcm command raises an exception.
        IOError
            If the dcm tool can't generate a decomposition for the
            'subset_size' and 'overlapping' values given.
        KeyError
            If a sequence identifier of the decomposition has no matching
            record in 'record_list'.

    * The tree file format must be supported by Bio.Phylo.
    """
    if subset_size < (4 * overlapping):
        raise ValueError('The maximum subset size must be greater than or '
                         'equal to 4 times the overlapping value')
    # If the input file format is not supported by the PRD process, convert it
    # to a temporary supported file
    infile_path = get_abspath(tree_file)
    if file_format.lower() != 'newick':
        # 'tmpfile' must stay referenced until the function returns: the
        # temporary file is deleted as soon as the object is closed
        tmpfile = tempfile.NamedTemporaryFile()
        Phylo.convert(infile_path, file_format, tmpfile.name, 'newick')
        infile_path = tmpfile.name
    # The first decomposition process will be always executed, so there is no
    # need to overload this stage with the multiprocess generation
    set_list, further_decomp = _prd_decomposition(infile_path, subset_size,
                                                  str(overlapping), binary)
    # Parallelization of the recursive decomposition of the different subtrees.
    # All new subtrees are attached to 'further_decomp' file list so we can
    # launch at most one process per core, speeding up the whole process. The
    # pool is only spawned when there is something left to decompose
    if further_decomp:
        pool = multiprocessing.Pool(processes=NUMCORES)
        try:
            start = 0
            while start < len(further_decomp):
                end = min(start + NUMCORES, len(further_decomp))
                results = [pool.apply_async(_prd_decomposition,
                                            args=(further_decomp[i],
                                                  subset_size,
                                                  str(overlapping), binary,))
                           for i in range(start, end)]
                # Collect the results of all the processes launched
                for pool_result in results:
                    output = pool_result.get()
                    set_list += output[0]
                    further_decomp += output[1]
                start = end
        finally:
            # Always release the worker processes. On success every result
            # has already been collected, so terminating discards nothing; on
            # failure it avoids waiting for work nobody will read
            pool.terminate()
            pool.join()
    # Remove all the temporary files created for the multiprocessing stage
    for file_path in further_decomp:
        os.remove(file_path)
    record_dict = {record.id: record for record in record_list}
    # Map all the resultant sets with an unique set id and replace the sequence
    # ids by their corresponding Bio.SeqRecord object
    set_dict = {}
    num_zeros = len(str(len(set_list)))
    for index, seq_id_list in enumerate(set_list, 1):
        set_id = 'prdset{}'.format(str(index).zfill(num_zeros))
        set_dict[set_id] = [record_dict[seq_id] for seq_id in seq_id_list]
    return set_dict
def get_alignment(binary, infile, infile_format, args='default', outfile=None,
                  outfile_format='fasta', **kwargs):
    """
    Run an alignment tool over the sequences of 'infile' and return the
    resulting alignment as a Bio.Align.MultipleSeqAlignment object, saving it
    in 'outfile' as well when one is given. Relative 'infile' and 'outfile'
    paths are resolved against the current working directory, and an existing
    output file is overwritten without any warning.

    Tools that aren't included in MEvoLib can still be used by passing the
    keys "informats" (list of supported infile formats) and "incmd" (the
    infile argument of the tool) through '**kwargs'. No keyword is defined
    for those tools, so 'args' is always split on spaces and passed as is.

    The standard output of the tool is parsed as a FASTA alignment.

    Arguments :
        binary  ( string )
            Name or path of the alignment tool.
        infile  ( string )
            Unaligned input sequence file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the alignment tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
        outfile  ( Optional[string] )
            Alignment output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, FASTA format.
        **kwargs  ( Optional[dict] )
            Keyworded arguments required to execute alignment tools not
            included in the current version of MEvoLib: the list of supported
            infile formats under the "informats" key, and the infile argument
            (e.g. "-in") under the "incmd" key.

    Returns :
        Bio.Align.MultipleSeqAlignment
            Resultant alignment.

    Raises :
        KeyError
            If the tool isn't included in MEvoLib and "informats" or "incmd"
            is missing from '**kwargs'.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * The input file format must be supported by Bio.SeqIO.
    * The output file format must be supported by Bio.AlignIO.
    """
    # The tool is identified by the lowercased file name of the binary
    tool_name = os.path.basename(binary).lower()
    if tool_name in _TOOL_TO_LIB:
        tool_lib = _TOOL_TO_LIB[tool_name]
        supported_formats = tool_lib.SPRT_INFILE_FORMATS
        infile_flag = tool_lib.INFILE_CMD
        keywords = tool_lib.KEYWORDS
    else:
        # Unknown tool: its description must come through **kwargs
        supported_formats = kwargs['informats']
        infile_flag = kwargs['incmd']
        keywords = dict()
    input_path = get_abspath(infile)
    # Convert the input to the first supported format (in a temporary file)
    # when the tool can't read the given one
    if infile_format.lower() not in supported_formats:
        converted = tempfile.NamedTemporaryFile()
        SeqIO.convert(input_path, infile_format, converted.name,
                      supported_formats[0])
        input_path = converted.name
    # 'args' is either a keyword of the tool or a raw argument string
    if args in keywords:
        tool_args = keywords[args]
    else:
        tool_args = args.split(' ')
    # Assemble the command line, dropping every empty element
    pieces = [binary] + tool_args + [infile_flag, input_path]
    command = [piece for piece in pieces if piece]
    # Run the tool, turning a non-zero exit status into a RuntimeError
    try:
        output = subprocess.check_output(command, stderr=DEVNULL,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            'Running "{}" raised an exception'.format(' '.join(e.cmd)))
    alignment = AlignIO.read(StringIO(output), 'fasta')
    if outfile:
        # Save the alignment in the requested file and format
        AlignIO.write(alignment, get_abspath(outfile), outfile_format)
    return alignment
def get_consensus_tree(binary, infile, infile_format, args='default',
                       outfile=None, outfile_format='newick'):
    """
    Calculate the consensus tree of the input trees file with the given
    arguments. The resultant consensus tree is returned as a Bio.Phylo.BaseTree
    object and saved in the output file (if provided). If 'infile' or 'outfile'
    contain a relative path, the current working directory will be used to get
    the absolute path. If the output file already exists, the old file will be
    overwritten without any warning.

    The tool's 'cleanup' hook is always executed once the command line has
    been built, whether the run succeeds or any later step fails.

    Arguments :
        binary  ( string )
            Name or path of the consensus tool.
        infile  ( string )
            Input phylogenetic trees file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the consensus tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
            * For Consense, the second character will be used as separator of
              the different arguments.
        outfile  ( Optional[string] )
            Consensus tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format. Any other format
            makes the output file be rewritten with Bio.Phylo.

    Returns :
        Bio.Phylo.BaseTree
            Resultant consensus tree.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the consensus tool command raises an exception.
        IOError
            If the consensus tool didn't generate a consensus tree (indicated
            by user's options/arguments).

    * The input file format must be supported by Bio.Phylo.
    * The output file format must be supported by Bio.Phylo.
    """
    # Only the (case-insensitive) file name of the binary selects the wrapper
    bin_name = os.path.basename(binary).lower()
    if bin_name not in _CONS_TOOL_TO_LIB:
        message = ('The consensus tool "{}" isn\'t included in '
                   'MEvoLib.PhyloAssemble').format(bin_name)
        raise ValueError(message)
    tool_lib = _CONS_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    gen_stdin_content = tool_lib.gen_stdin_content
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    # Get the command line to run in order to get the consensus tree
    infile_path = get_abspath(infile)
    # If the input file format is not supported by the consensus tool, convert
    # it to a temporary supported file
    if infile_format.lower() not in sprt_infile_formats:
        # 'tmpfile' must stay referenced until the function returns: the
        # temporary file is deleted as soon as the object is closed
        tmpfile = tempfile.NamedTemporaryFile()
        Phylo.convert(infile_path, infile_format, tmpfile.name,
                      sprt_infile_formats[0])
        infile_path = tmpfile.name
    # Create full command line list
    command = [binary] + gen_args(args, infile_path, outfile)
    # Generate the standard input file content
    stdin_content = gen_stdin_content(args)
    # Create the input file with the given options
    with tempfile.NamedTemporaryFile(mode='w+') as stdin_file:
        stdin_file.write(stdin_content)
        # Rewind so the tool reads the options from the beginning
        stdin_file.seek(0)
        try:
            # Run the consensus process handling any Runtime exception
            try:
                subprocess.check_call(command, stdin=stdin_file,
                                      stdout=DEVNULL, stderr=DEVNULL,
                                      universal_newlines=True)
            except subprocess.CalledProcessError as e:
                message = 'Running "{}" raised an exception'.format(
                    ' '.join(e.cmd))
                raise RuntimeError(message)
            consensus_tree = get_results(command)
            if outfile and outfile_format.lower() != 'newick':
                # The tool is handed the output path directly, so the file
                # holds the tool's own output; honour the requested format by
                # rewriting it.
                # NOTE(review): assumes the native output of every supported
                # consensus tool is NEWICK -- confirm
                Phylo.write(consensus_tree, get_abspath(outfile),
                            outfile_format)
        finally:
            # Run the tool's cleanup even when no consensus tree could be
            # retrieved, not only on success or on a failed tool run
            cleanup(command)
    # Return the resultant consensus tree as a Bio.Phylo.BaseTree object
    return consensus_tree
def get_consensus_tree(binary, infile, infile_format, args='default',
                       outfile=None, outfile_format='newick'):
    """
    Calculate the consensus tree of the input trees file with the given
    arguments. The resultant consensus tree is returned as a Bio.Phylo.BaseTree
    object and saved in the output file (if provided). If 'infile' or 'outfile'
    contain a relative path, the current working directory will be used to get
    the absolute path. If the output file already exists, the old file will be
    overwritten without any warning.

    The tool's 'cleanup' hook is always executed once the command line has
    been built, whether the run succeeds or any later step fails.

    Arguments :
        binary  ( string )
            Name or path of the consensus tool.
        infile  ( string )
            Input phylogenetic trees file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the consensus tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
            * For Consense, the second character will be used as separator of
              the different arguments.
        outfile  ( Optional[string] )
            Consensus tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format. Any other format
            makes the output file be rewritten with Bio.Phylo.

    Returns :
        Bio.Phylo.BaseTree
            Resultant consensus tree.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the consensus tool command raises an exception.
        IOError
            If the consensus tool didn't generate a consensus tree (indicated
            by user's options/arguments).

    * The input file format must be supported by Bio.Phylo.
    * The output file format must be supported by Bio.Phylo.
    """
    # Only the (case-insensitive) file name of the binary selects the wrapper
    bin_name = os.path.basename(binary).lower()
    if bin_name not in _CONS_TOOL_TO_LIB:
        message = ('The consensus tool "{}" isn\'t included in '
                   'MEvoLib.PhyloAssemble').format(bin_name)
        raise ValueError(message)
    tool_lib = _CONS_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    gen_stdin_content = tool_lib.gen_stdin_content
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    # Get the command line to run in order to get the consensus tree
    infile_path = get_abspath(infile)
    # If the input file format is not supported by the consensus tool, convert
    # it to a temporary supported file
    if infile_format.lower() not in sprt_infile_formats:
        # 'tmpfile' must stay referenced until the function returns: the
        # temporary file is deleted as soon as the object is closed
        tmpfile = tempfile.NamedTemporaryFile()
        Phylo.convert(infile_path, infile_format, tmpfile.name,
                      sprt_infile_formats[0])
        infile_path = tmpfile.name
    # Create full command line list
    command = [binary] + gen_args(args, infile_path, outfile)
    # Generate the standard input file content
    stdin_content = gen_stdin_content(args)
    # Create the input file with the given options
    with tempfile.NamedTemporaryFile(mode='w+') as stdin_file:
        stdin_file.write(stdin_content)
        # Rewind so the tool reads the options from the beginning
        stdin_file.seek(0)
        try:
            # Run the consensus process handling any Runtime exception
            try:
                subprocess.check_call(command, stdin=stdin_file,
                                      stdout=DEVNULL, stderr=DEVNULL,
                                      universal_newlines=True)
            except subprocess.CalledProcessError as e:
                message = 'Running "{}" raised an exception'.format(
                    ' '.join(e.cmd))
                raise RuntimeError(message)
            consensus_tree = get_results(command)
            if outfile and outfile_format.lower() != 'newick':
                # The tool is handed the output path directly, so the file
                # holds the tool's own output; honour the requested format by
                # rewriting it.
                # NOTE(review): assumes the native output of every supported
                # consensus tool is NEWICK -- confirm
                Phylo.write(consensus_tree, get_abspath(outfile),
                            outfile_format)
        finally:
            # Run the tool's cleanup even when no consensus tree could be
            # retrieved, not only on success or on a failed tool run
            cleanup(command)
    # Return the resultant consensus tree as a Bio.Phylo.BaseTree object
    return consensus_tree