def run(self, input_tree, msa_file, num_replicates, model, base_type, frac, output_dir):
    """Decorate a tree with support values from bootstrapped alignments.

    Replicates are produced by dispatching self._producer over the
    replicate indices; the resulting trees are expected in the
    'replicates' sub-directory of the output directory and are used to
    place support values on the input tree.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution; one of 'wag', 'lg', or 'jtt'.
    base_type : str
        Indicates if bases are nucleotides ('nt') or amino acids ('prot').
    frac : float
        Fraction of alignment to subsample.
    output_dir : str
        Directory for bootstrap trees.

    Returns
    -------
    str
        Path of the tree decorated with bootstrap support values.
    """

    assert (model in ['wag', 'lg', 'jtt'])
    assert (base_type in ['nt', 'prot'])

    # settings kept on the instance; presumably consumed by self._producer
    self.frac = frac
    self.base_type = base_type
    self.model = model

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # full alignment from which the replicates are drawn
    self.msa = seq_io.read(msa_file)

    # generate the replicates
    self.logger.info('Calculating bootstrap replicates:')
    workers = Parallel(self.cpus)
    workers.run(self._producer, None, range(num_replicates), self._progress)

    # one tree file is expected per replicate
    replicate_trees = [
        os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(idx) + '.tree')
        for idx in range(num_replicates)
    ]

    # place support values on the input tree
    decorated_name = remove_extension(input_tree) + '.bootstrap.tree'
    output_tree = os.path.join(output_dir, decorated_name)
    bootstrap_support(input_tree, replicate_trees, output_tree)

    return output_tree
def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma, num_replicates, output_dir, cpus):
    """Add non-parametric bootstrap support values to a tree.

    Replicates are produced by dispatching self._bootstrap over the
    replicate indices; each replicate tree is expected at
    <output_dir>/rep_<index>/bootstrap.tree.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Sequence type of the alignment: 'nt' or 'prot' (case-insensitive).
    model_str : str
        Model of evolution: 'wag', 'lg', 'jtt', or 'gtr' (case-insensitive).
    gamma : bool
        Indicates if GAMMA model should be used.
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.

    Returns
    -------
    str
        Path of the tree decorated with bootstrap support values.
    """

    assert (seq_type.upper() in ['NT', 'PROT'])
    assert (model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR'])

    # settings kept on the instance; presumably consumed by self._bootstrap
    self.output_dir = output_dir
    self.seq_type = seq_type
    self.model = model_str
    self.gamma = gamma

    self.msa = seq_io.read(msa_file)

    # generate the replicates
    workers = Parallel(cpus)
    replicate_ids = list(range(num_replicates))
    workers.run(self._bootstrap, None, replicate_ids, None)

    # one tree file is expected per replicate directory
    replicate_trees = [
        os.path.join(self.output_dir, 'rep_%d' % idx, 'bootstrap.tree')
        for idx in replicate_ids
    ]

    # place support values on the input tree
    tree_name, _ = os.path.splitext(os.path.basename(input_tree))
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, replicate_trees, output_tree)

    return output_tree
def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
    """Add non-parametric bootstrap support values to a tree.

    Replicates are produced by dispatching self._bootstrap over the
    replicate indices; each replicate tree is expected at
    <output_dir>/rep_<index>/RAxML_bestTree.support.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    model_str : str
        Model of evolution: 'WAG' or 'LG' (case-insensitive).
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.

    Returns
    -------
    str
        Path of the tree decorated with bootstrap support values.
    """

    # presumably verifies the seqmagick executable is available -- confirm
    check_on_path('seqmagick')

    assert(model_str.upper() in ['WAG', 'LG'])

    # settings kept on the instance; presumably consumed by self._bootstrap
    self.output_dir = output_dir
    self.model = model_str
    self.msa = seq_io.read(msa_file)

    # generate the replicates
    workers = Parallel(cpus)
    replicate_ids = list(range(num_replicates))
    workers.run(self._bootstrap, None, replicate_ids, None)

    # one tree file is expected per replicate directory
    replicate_trees = [
        os.path.join(output_dir, 'rep_%d' % idx, 'RAxML_bestTree.support')
        for idx in replicate_ids
    ]

    # place support values on the input tree
    tree_name, _ = os.path.splitext(os.path.basename(input_tree))
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, replicate_trees, output_tree)

    return output_tree
def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
    """Perform non-parametric bootstrapping.

    Replicate trees are written to a temporary directory which is
    removed before returning, including when an error occurs.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'.
    model_str : str
        Specified either the 'wag' or 'jtt' model.
    num_replicates : int
        Number of replicates to perform.
    output_tree : str
        Output file containing tree with bootstrap values.
    cpus : int
        Number of cpus to use.
    """

    assert(seq_type in ['nt', 'prot'])
    assert(model_str in ['wag', 'jtt'])

    self.replicate_dir = tempfile.mkdtemp()
    try:
        # settings kept on the instance; presumably consumed by self._bootstrap
        self.seq_type = seq_type
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        parallel = Parallel(cpus)
        parallel.run(self._bootstrap, None, range(num_replicates), None)

        # calculate support values
        rep_tree_files = [
            os.path.join(self.replicate_dir, 'bootstrap.tree.' + str(rep_index) + '.tre')
            for rep_index in range(num_replicates)
        ]

        bootstrap_support(input_tree, rep_tree_files, output_tree)
    finally:
        # mkdtemp() leaves deletion to the caller, so clean up even when
        # replicate or support calculation fails
        shutil.rmtree(self.replicate_dir)
def run(self, input_tree, msa_file, num_replicates, model, gamma, base_type, frac, boot_dir, output_dir):
    """Bootstrap multiple sequence alignment.

    Support values are calculated either from replicates generated here
    (when no bootstrap directory is given) or from previously computed
    replicate trees found in the bootstrap directory.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution; one of 'wag', 'lg', or 'jtt'.
    gamma : bool
        Stored as self.gamma; presumably selects the gamma rate model
        during replicate inference -- TODO confirm against _producer.
    base_type : str
        Indicates if bases are nucleotides ('nt') or amino acids ('prot').
    frac : float
        Fraction of alignment to subsample.
    boot_dir : str
        Directory with existing replicate trees ('.tree' or '.tre' files).
        If empty or None, replicates are calculated instead.
    output_dir : str
        Directory for bootstrap trees.

    Returns
    -------
    str
        Path of the tree decorated with bootstrap support values.
    """

    assert(model in ['wag', 'lg', 'jtt'])
    assert(base_type in ['nt', 'prot'])

    self.model = model
    self.gamma = gamma
    self.base_type = base_type
    self.frac = frac

    rep_tree_files = []
    if not boot_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates; range() is used rather than the
        # Python 2-only xrange() so the method runs on Python 2 and 3
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))
    else:
        # reuse previously computed replicate trees
        for f in os.listdir(boot_dir):
            if f.endswith(('.tree', '.tre')):
                rep_tree_files.append(os.path.join(boot_dir, f))
        self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

    # calculate support values
    self.logger.info('Calculating bootstrap support values.')
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, input_tree, msa_file, marker_info_file, mask_file, perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    The first line of the marker file is read and discarded. Support
    values are calculated either from replicates generated here (when no
    jackknife directory is given) or from previously computed replicate
    trees found in the jackknife directory.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution; 'wag' or 'jtt'.
    jk_dir : str
        Directory with existing replicate trees ('.tree' or '.tre' files).
        If empty or None, replicates are calculated instead.
    output_dir : str
        Output directory for jackknife trees.

    Returns
    -------
    str
        Path of the tree decorated with jackknife support values.

    Raises
    ------
    SystemExit
        If the alignment length differs from the masked marker length.
    """

    assert (model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # first line is discarded (presumably a header)
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask; the context manager guarantees the handle is closed
        with open(mask_file) as mask_fh:
            mask = mask_fh.readline().strip()

        # marker lengths after discounting positions flagged '0' in the mask
        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # next(iter(...)) works with Python 3 dictionary views; an empty
        # alignment is treated as having zero length
        first_seq = next(iter(self.msa.values()), '')
        if len(first_seq) != total_mask_len:
            self.logger.error('Length of MSA does not meet length of mask.')
            # non-zero status so callers can detect the failure
            sys.exit(1)

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        # reuse previously computed replicate trees
        for f in os.listdir(jk_dir):
            if f.endswith(('.tree', '.tre')):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(
        output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, input_tree, msa_file, marker_info_file, mask_file, perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    The first line of the marker file is read and discarded. Support
    values are calculated either from replicates generated here (when no
    jackknife directory is given) or from previously computed replicate
    trees found in the jackknife directory.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution; 'wag' or 'jtt'.
    jk_dir : str
        Directory with existing replicate trees ('.tree' or '.tre' files).
        If empty or None, replicates are calculated instead.
    output_dir : str
        Output directory for jackknife trees.

    Returns
    -------
    str
        Path of the tree decorated with jackknife support values.

    Raises
    ------
    SystemExit
        If the alignment length differs from the masked marker length.
    """

    assert(model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # first line is discarded (presumably a header)
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask; the context manager guarantees the handle is closed
        with open(mask_file) as mask_fh:
            mask = mask_fh.readline().strip()

        # marker lengths after discounting positions flagged '0' in the mask
        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # next(iter(...)) works with Python 3 dictionary views; an empty
        # alignment is treated as having zero length
        first_seq = next(iter(self.msa.values()), '')
        if len(first_seq) != total_mask_len:
            self.logger.error('Length of MSA does not meet length of mask.')
            # non-zero status so callers can detect the failure
            sys.exit(1)

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        # reuse previously computed replicate trees
        for f in os.listdir(jk_dir):
            if f.endswith(('.tree', '.tre')):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree