def parallel_run(self, msa_files, seq_type, model_str, gamma, output_dir, cpus):
    """Infer a tree from each multiple sequence alignment using FastTree in parallel.

    Parameters
    ----------
    msa_files : iterable
        Fasta files containing multiple sequence alignments; one tree is
        inferred per file by a worker process.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'
        (case-insensitive).
    model_str : str
        Model of evolution: 'wag', 'lg', 'jtt', or 'gtr' (case-insensitive).
    gamma : bool
        Indicates if the GAMMA model should be used.
    output_dir : str
        Prefix for all output files.
    cpus : int
        Number of cpus to use.
    """

    # validate inputs before storing them on the instance
    assert (seq_type.upper() in ['NT', 'PROT'])
    assert (model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR'])

    self.output_dir = output_dir
    self.seq_type = seq_type
    self.model = model_str
    self.gamma = gamma

    # each worker infers the tree for one alignment file; no consumer or
    # progress callback is used
    parallel = Parallel(cpus)
    parallel.run(self._parallel_infer_tree, None, msa_files, None)
def generate_metadata(self, gtdb_genome_path_file):
    """Generate metadata for each genome listed in a GTDB genome path file.

    Parameters
    ----------
    gtdb_genome_path_file : str
        Tab-separated file with genome id and genome directory path per line.
    """

    self.starttime = datetime.datetime.utcnow().replace(microsecond=0)

    input_files = []
    countr = 0
    # context manager ensures the file handle is released (the original
    # left the file opened by the for-loop expression unclosed)
    with open(gtdb_genome_path_file) as f:
        for line in f:
            countr += 1
            statusStr = '{} lines read.'.format(countr)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            line_split = line.strip().split('\t')
            gid = line_split[0]
            gpath = line_split[1]
            assembly_id = os.path.basename(os.path.normpath(gpath))

            genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
            gff_file = os.path.join(gpath, 'prodigal', gid + '_protein.gff')
            input_files.append([genome_file, gff_file])

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=self.cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def run(self, gene_files, critical_value, output_dir):
    """Calculate dinucleotide usage over a set of genomes.

    Parameters
    ----------
    gene_files : list
        Fasta files containing called genes in nucleotide space.
    critical_value : float
        Critical value used to define a deviant gene (i.e., potential LGT event).
    output_dir : str
        Directory to store results.
    """

    self.output_dir = output_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    self.critical_value = critical_value

    self.logger.info('Calculating dinucleotide usage for each genome.')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    worker_pool.run(self._producer, None, gene_files, progress_cb)
def run(self, input_tree, msa_file, num_replicates, model, base_type, frac, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    output_dir : str
        Directory for bootstrap trees.
    """

    assert model in ['wag', 'lg', 'jtt']
    assert base_type in ['nt', 'prot']

    self.model = model
    self.base_type = base_type
    self.frac = frac

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # infer a tree from each bootstrapped alignment
    self.logger.info('Calculating bootstrap replicates:')
    worker_pool = Parallel(self.cpus)
    worker_pool.run(self._producer, None, range(num_replicates), self._progress)

    # decorate the original tree with support values
    rep_tree_files = [os.path.join(self.replicate_dir,
                                   'bootstrap_tree.r_' + str(rep) + '.tree')
                      for rep in range(num_replicates)]

    output_tree = os.path.join(
        output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma, num_replicates, output_dir, cpus):
    """Perform non-parametric bootstrapping.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'.
    model_str : str
        Specified either the 'wag' or 'jtt' model.
    gamma : bool
        Indicates if GAMMA model should be used.
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.
    """

    assert seq_type.upper() in ['NT', 'PROT']
    assert model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR']

    self.output_dir = output_dir
    self.seq_type = seq_type
    self.model = model_str
    self.gamma = gamma
    self.msa = seq_io.read(msa_file)

    # infer a tree from each bootstrapped alignment
    replicate_numbers = list(range(num_replicates))
    worker_pool = Parallel(cpus)
    worker_pool.run(self._bootstrap, None, replicate_numbers, None)

    # decorate the input tree with support values
    rep_tree_files = [os.path.join(self.output_dir, 'rep_%d' % rep, 'bootstrap.tree')
                      for rep in replicate_numbers]

    tree_name = os.path.splitext(os.path.basename(input_tree))[0]
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, gene_files):
    """Calculate codon usage over a set of genomes.

    Parameters
    ----------
    gene_files : list
        Fasta files containing called genes in nucleotide space.

    Returns
    -------
    dict of dict : d[genome_id][codon] -> count
        Codon usage of each genome.
    set
        Set with all identified codons.
    dict of dict : d[genome_id][codon] -> length
        Mean length of genes for each stop codon.
    """

    self.logger.info('Calculating codon usage for each genome.')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    results = worker_pool.run(self._producer,
                              self._consumer,
                              gene_files,
                              progress_cb)

    return (results.genome_codon_usage,
            results.codon_set,
            results.mean_gene_length)
def run(self, gene_files):
    """Calculate amino acid usage over a set of genomes.

    Parameters
    ----------
    gene_files : list
        Fasta files containing called genes.

    Returns
    -------
    dict of dict : dict[genome_id][aa] -> count
        Amino acid usage of each genome.
    set
        Set with all identified amino acids.
    """

    self.logger.info('Calculating amino acid usage for each genome:')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    results = worker_pool.run(self._producer,
                              self._consumer,
                              gene_files,
                              progress_cb)

    return results.genome_aa_usage, results.aa_set
def run(self, cluster_file, genome_dir_file, output_prefix, cpus=38):
    """Calculate gANI and AF between cluster representatives and members.

    Fixes: Python 2 `print` statement replaced with the print() function,
    input files are read via context managers so handles are released, and
    the previously hard-coded CPU count is exposed as a backward-compatible
    parameter (default preserves the old value of 38).

    Parameters
    ----------
    cluster_file : str
        Tab-separated file; column 1 is the representative genome id and,
        when 4 columns are present, column 4 is a comma-separated list of
        member genome ids.
    genome_dir_file : str
        Tab-separated file mapping genome ids to genome directories.
    output_prefix : str
        Prefix for the '.ani.tsv' and '.ani_summary.tsv' output files.
    cpus : int
        Number of cpus to use (default 38, matching previous behaviour).
    """

    output_dir = os.path.dirname(output_prefix)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # get path to nucleotide gene file of each genome
    gene_files = {}
    with open(genome_dir_file) as f:
        for line in f:
            line_split = line.strip().split('\t')
            genome_id = line_split[0]
            genome_path = line_split[1]
            genome_dir_id = os.path.basename(os.path.normpath(genome_path))
            gene_files[genome_id] = os.path.join(line_split[1], genome_dir_id + '_protein.fna')

    print('Read path for %d genomes.' % len(gene_files))

    # process all clusters
    fout = open(output_prefix + '.ani.tsv', 'w')
    fout_summary = open(output_prefix + '.ani_summary.tsv', 'w')
    try:
        with open(cluster_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                rep_genome = line_split[0]
                rep_gene_file = gene_files[rep_genome]

                if len(line_split) == 4:
                    data_items = []
                    genome_ids = line_split[3].split(',')
                    for genome_id in genome_ids:
                        gene_file = gene_files[genome_id]
                        data_items.append((rep_gene_file, gene_file, genome_id))

                    parallel = Parallel(cpus=cpus)
                    results = parallel.run(self._producer, self._consumer, data_items, self._progress)

                    gANIs = []
                    AFs = []
                    for r in results:
                        genome_id, gANI, AF = r
                        fout.write('%s\t%s\t%.3f\t%.3f\n' % (rep_genome, genome_id, gANI, AF))
                        gANIs.append(gANI)
                        AFs.append(AF)

                    fout_summary.write('%s\t%.3f\t%.4f\t%.4f\t%.3f\t%.4f\t%.4f\n' % (
                        rep_genome,
                        mean(gANIs), std(gANIs), min(gANIs),
                        mean(AFs), std(AFs), min(AFs)))

                    fout.flush()
                    fout_summary.flush()
    finally:
        # output handles are closed even if a cluster fails mid-run
        fout.close()
        fout_summary.close()
def run(self, genome_files):
    """Calculate kmer usage over a set of genomes.

    Parameters
    ----------
    genome_files : list
        Fasta files containing genomic sequences in nucleotide space.

    Returns
    -------
    dict of dict : d[genome_id][kmer] -> count
        Kmer usage of each genome.
    set
        Set with all identified kmers.
    """

    self.logger.info('Calculating kmer usage for each genome.')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    results = worker_pool.run(self._producer,
                              self._consumer,
                              genome_files,
                              progress_cb)

    return results.genome_kmer_usage, results.kmer_set
def run(self, genome_files):
    """Calculate kmer usage over a set of genomes.

    Parameters
    ----------
    genome_files : list
        Fasta files containing genomic sequences in nucleotide space.

    Returns
    -------
    dict of dict : d[genome_id][kmer] -> count
        Kmer usage of each genome.
    set
        Set with all identified kmers.
    """

    self.logger.info('Calculating kmer usage for each genome.')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    counts = worker_pool.run(self._producer,
                             self._consumer,
                             genome_files,
                             progress_cb)

    return counts, self.signatures.canonical_order()
def run(self, ncbi_genome_dir, user_genome_dir, cpus):
    """Create metadata by parsing assembly stats files.

    Fix: Python 2 `print` statements replaced with the print() function
    for Python 3 compatibility.

    Parameters
    ----------
    ncbi_genome_dir : str
        Root directory with 'archaea' and 'bacteria' NCBI assembly trees.
    user_genome_dir : str
        Directory with user genomes, or 'NONE' to skip.
    cpus : int
        Number of cpus to use.
    """

    input_files = []

    # generate metadata for NCBI assemblies
    print('Reading NCBI assembly directories.')
    processed_assemblies = defaultdict(list)
    for domain in ['archaea', 'bacteria']:
        domain_dir = os.path.join(ncbi_genome_dir, domain)
        for species_dir in os.listdir(domain_dir):
            full_species_dir = os.path.join(domain_dir, species_dir)
            for assembly_dir in os.listdir(full_species_dir):
                # accession is the prefix up to the second underscore group
                accession = assembly_dir[0:assembly_dir.find('_', 4)]

                # skip accessions already seen in another species directory
                processed_assemblies[accession].append(species_dir)
                if len(processed_assemblies[accession]) >= 2:
                    continue

                full_assembly_dir = os.path.join(full_species_dir, assembly_dir)
                genome_file = os.path.join(full_assembly_dir, assembly_dir + '_genomic.fna')
                gff_file = os.path.join(full_assembly_dir, 'prodigal', accession + '_protein.gff')
                input_files.append([genome_file, gff_file])

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)
                genome_file = os.path.join(full_genome_dir, genome_id + '_genomic.fna')
                gff_file = os.path.join(full_genome_dir, genome_id + '_protein.gff')
                input_files.append([genome_file, gff_file])

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def run(self, ncbi_genome_dir, user_genome_dir, cpus):
    """Create metadata by parsing assembly stats files.

    Fix: Python 2 `print` statements replaced with the print() function
    for Python 3 compatibility.

    Parameters
    ----------
    ncbi_genome_dir : str
        Root directory with 'archaea' and 'bacteria' NCBI assembly trees.
    user_genome_dir : str
        Directory with user genomes, or 'NONE' to skip.
    cpus : int
        Number of cpus to use.
    """

    input_files = []

    # generate metadata for NCBI assemblies
    print('Reading NCBI assembly directories.')
    processed_assemblies = defaultdict(list)
    for domain in ['archaea', 'bacteria']:
        domain_dir = os.path.join(ncbi_genome_dir, domain)
        for species_dir in os.listdir(domain_dir):
            full_species_dir = os.path.join(domain_dir, species_dir)
            for assembly_dir in os.listdir(full_species_dir):
                # accession is the prefix up to the second underscore group
                accession = assembly_dir[0:assembly_dir.find('_', 4)]

                # skip accessions already seen in another species directory
                processed_assemblies[accession].append(species_dir)
                if len(processed_assemblies[accession]) >= 2:
                    continue

                full_assembly_dir = os.path.join(full_species_dir, assembly_dir)
                genome_file = os.path.join(full_assembly_dir, assembly_dir + '_genomic.fna')
                gff_file = os.path.join(full_assembly_dir, 'prodigal', accession + '_protein.gff')
                input_files.append([genome_file, gff_file])

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)
                genome_file = os.path.join(full_genome_dir, genome_id + '_genomic.fna')
                gff_file = os.path.join(full_genome_dir, genome_id + '_protein.gff')
                input_files.append([genome_file, gff_file])

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def run(self, cluster_file, genome_dir_file, output_prefix, cpus=38):
    """Calculate gANI and AF between cluster representatives and members.

    Fixes: Python 2 `print` statement replaced with the print() function,
    input files are read via context managers so handles are released, and
    the previously hard-coded CPU count is exposed as a backward-compatible
    parameter (default preserves the old value of 38).

    Parameters
    ----------
    cluster_file : str
        Tab-separated file; column 1 is the representative genome id and,
        when 4 columns are present, column 4 is a comma-separated list of
        member genome ids.
    genome_dir_file : str
        Tab-separated file mapping genome ids to genome directories.
    output_prefix : str
        Prefix for the '.ani.tsv' and '.ani_summary.tsv' output files.
    cpus : int
        Number of cpus to use (default 38, matching previous behaviour).
    """

    output_dir = os.path.dirname(output_prefix)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # get path to nucleotide gene file of each genome
    gene_files = {}
    with open(genome_dir_file) as f:
        for line in f:
            line_split = line.strip().split('\t')
            genome_id = line_split[0]
            genome_path = line_split[1]
            genome_dir_id = os.path.basename(os.path.normpath(genome_path))
            gene_files[genome_id] = os.path.join(
                line_split[1], genome_dir_id + '_protein.fna')

    print('Read path for %d genomes.' % len(gene_files))

    # process all clusters
    fout = open(output_prefix + '.ani.tsv', 'w')
    fout_summary = open(output_prefix + '.ani_summary.tsv', 'w')
    try:
        with open(cluster_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                rep_genome = line_split[0]
                rep_gene_file = gene_files[rep_genome]

                if len(line_split) == 4:
                    data_items = []
                    genome_ids = line_split[3].split(',')
                    for genome_id in genome_ids:
                        gene_file = gene_files[genome_id]
                        data_items.append((rep_gene_file, gene_file, genome_id))

                    parallel = Parallel(cpus=cpus)
                    results = parallel.run(self._producer, self._consumer, data_items, self._progress)

                    gANIs = []
                    AFs = []
                    for r in results:
                        genome_id, gANI, AF = r
                        fout.write('%s\t%s\t%.3f\t%.3f\n' % (rep_genome, genome_id, gANI, AF))
                        gANIs.append(gANI)
                        AFs.append(AF)

                    fout_summary.write('%s\t%.3f\t%.4f\t%.4f\t%.3f\t%.4f\t%.4f\n' % (
                        rep_genome,
                        mean(gANIs), std(gANIs), min(gANIs),
                        mean(AFs), std(AFs), min(AFs)))

                    fout.flush()
                    fout_summary.flush()
    finally:
        # output handles are closed even if a cluster fails mid-run
        fout.close()
        fout_summary.close()
def run(self, gene_files, output_dir):
    """Calculate codon usage over genes with a set of genomes.

    Parameters
    ----------
    gene_files : list
        Fasta files containing called genes in nucleotide space.
    output_dir : str
        Directory to store results.
    """

    self.output_dir = output_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    self.logger.info(' Calculating codon usage for each genome.')

    worker_pool = Parallel(self.cpus)
    worker_pool.run(self._producer, None, gene_files, self._progress)
def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
    """Perform non-parametric bootstrapping.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    model_str : str
        Specified either the 'WAG' or 'LG' model.
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.
    """

    # seqmagick is required to subsample the alignment
    check_on_path('seqmagick')

    assert model_str.upper() in ['WAG', 'LG']

    self.output_dir = output_dir
    self.model = model_str
    self.msa = seq_io.read(msa_file)

    # infer a tree from each bootstrapped alignment
    replicate_numbers = list(range(num_replicates))
    worker_pool = Parallel(cpus)
    worker_pool.run(self._bootstrap, None, replicate_numbers, None)

    # decorate the input tree with support values
    rep_tree_files = [os.path.join(output_dir, 'rep_%d' % rep, 'RAxML_bestTree.support')
                      for rep in replicate_numbers]

    tree_name = os.path.splitext(os.path.basename(input_tree))[0]
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
    """Perform non-parametric bootstrapping.

    Fix: Python 2-only `xrange` replaced with `range` for Python 3
    compatibility (identical iteration behaviour).

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'.
    model_str : str
        Specified either the 'wag' or 'jtt' model.
    num_replicates : int
        Number of replicates to perform.
    output_tree : str
        Output file containing tree with bootstrap values.
    cpus : int
        Number of cpus to use.
    """

    assert seq_type in ['nt', 'prot']
    assert model_str in ['wag', 'jtt']

    # replicates are written to a temporary directory removed at the end
    self.replicate_dir = tempfile.mkdtemp()

    self.seq_type = seq_type
    self.model = model_str
    self.msa = seq_io.read(msa_file)

    # calculate replicates
    parallel = Parallel(cpus)
    parallel.run(self._bootstrap, None, range(num_replicates), None)

    # calculate support values
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap.tree.' + str(rep_index) + '.tre'))

    bootstrap_support(input_tree, rep_tree_files, output_tree)

    shutil.rmtree(self.replicate_dir)
def run(self, gene_files, output_dir):
    """Calculate codon usage over genes with a set of genomes.

    Parameters
    ----------
    gene_files : list
        Fasta files containing called genes in nucleotide space.
    output_dir : str
        Directory to store results.
    """

    self.output_dir = output_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    self.logger.info('Calculating codon usage for each genome.')

    # suppress per-item progress reporting when the logger is silenced
    progress_cb = None if self.logger.is_silent else self._progress

    worker_pool = Parallel(self.cpus)
    worker_pool.run(self._producer, None, gene_files, progress_cb)
def run(self, aa_gene_files, evalue, output_dir):
    """Apply reciprocal blast to all pairs of genomes in parallel.

    Fixes: `self.cpus / len(...)` used true division, which yields a float
    CPU count under Python 3 — replaced with floor division; Python 2-only
    `xrange` replaced with `range`; guard added against an empty input list
    (previously a ZeroDivisionError).

    Parameters
    ----------
    aa_gene_files : list of str
        Amino acid fasta files to process via reciprocal blast.
    evalue : float
        E-value threshold used by blast.
    output_dir : str
        Directory to store blast results.
    """

    self.evalue = evalue
    self.output_dir = output_dir

    # set CPUs per producer process (integer; floor division)
    self.producer_cpus = 1
    if aa_gene_files and self.cpus > len(aa_gene_files):
        self.producer_cpus = self.cpus // len(aa_gene_files)

    # create the blast databases
    # NOTE(review): comment in the original said "in serial" but a
    # Parallel pool is used — presumably DB creation is parallel; verify.
    self.logger.info(' Creating blast databases:')
    parallel = Parallel(self.cpus)
    parallel.run(self._producer_db, None, aa_gene_files, self._progress)

    # perform reciprocal blast between all genome pairs (including self-pairs)
    self.logger.info('')
    self.logger.info(' Identifying hits between all pairs of genomes:')
    genome_pairs = []
    for i in range(0, len(aa_gene_files)):
        for j in range(i, len(aa_gene_files)):
            genome_pairs.append((aa_gene_files[i], aa_gene_files[j]))

    parallel.run(self._producer_blast, None, genome_pairs, self._progress)
def run(self, ncbi_genome_dir, user_genome_dir, cpus):
    """Create metadata by parsing assembly stats files.

    Fix: Python 2 `print onethird_species_dir` statement replaced with the
    print() function; commented-out debug prints removed.

    Parameters
    ----------
    ncbi_genome_dir : str
        Root directory with 'refseq/GCF' and 'genbank/GCA' trees, or 'NONE'.
    user_genome_dir : str
        Directory with user genomes, or 'NONE' to skip.
    cpus : int
        Number of cpus to use.
    """

    input_files = []

    # generate metadata for NCBI assemblies
    if ncbi_genome_dir != 'NONE':
        print('Reading NCBI assembly directories.')
        processed_assemblies = defaultdict(list)
        rfq_dir = os.path.join(ncbi_genome_dir, 'refseq', 'GCF')
        gbk_dir = os.path.join(ncbi_genome_dir, 'genbank', 'GCA')
        for input_dir in (gbk_dir, rfq_dir):
            # assemblies are nested three levels deep by accession digits
            for first_three in os.listdir(input_dir):
                onethird_species_dir = os.path.join(input_dir, first_three)
                print(onethird_species_dir)
                if os.path.isfile(onethird_species_dir):
                    continue
                for second_three in os.listdir(onethird_species_dir):
                    twothird_species_dir = os.path.join(
                        onethird_species_dir, second_three)
                    if os.path.isfile(twothird_species_dir):
                        continue
                    for third_three in os.listdir(twothird_species_dir):
                        threethird_species_dir = os.path.join(
                            twothird_species_dir, third_three)
                        if os.path.isfile(threethird_species_dir):
                            continue
                        for complete_name in os.listdir(threethird_species_dir):
                            assembly_dir = os.path.join(
                                threethird_species_dir, complete_name)
                            if os.path.isfile(assembly_dir):
                                continue

                            accession = complete_name[0:complete_name.find('_', 4)]

                            # skip accessions already processed elsewhere
                            processed_assemblies[accession].append(assembly_dir)
                            if len(processed_assemblies[accession]) >= 2:
                                continue

                            ssu_file = os.path.join(
                                assembly_dir, self.silva_output_dir, 'ssu.fna')
                            if os.path.exists(ssu_file):
                                genome_file = os.path.join(
                                    assembly_dir, complete_name + '_genomic.fna')
                                input_files.append((genome_file, ssu_file))

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)
                ssu_file = os.path.join(
                    full_genome_dir, self.silva_output_dir, 'ssu.fna')
                if os.path.exists(ssu_file):
                    genome_file = os.path.join(
                        full_genome_dir, genome_id + '_genomic.fna')
                    input_files.append((genome_file, ssu_file))

    print('Identified %d genomes to process.' % len(input_files))

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def run(self, genome_ids, gene_dir, blast_dir, per_iden_threshold, per_aln_len_threshold, write_shared_genes, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Fix: Python 2-only `xrange` replaced with `range` for Python 3
    compatibility; summary file handle closed via context manager.

    Parameters
    ----------
    genome_ids : list of str
        Unique ids of genomes to process.
    gene_dir : str
        Directory with amino acid genes in fasta format.
    blast_dir : str
        Directory with reciprocal blast between genome pairs.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    write_shared_genes : boolean
        Flag indicating if shared genes should be written to file.
    output_dir : str
        Directory to store AAI results.
    """

    self.gene_dir = gene_dir
    self.blast_dir = blast_dir

    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.write_shared_genes = write_shared_genes
    self.output_dir = output_dir

    shared_genes_dir = os.path.join(output_dir, self.shared_genes)
    make_sure_path_exists(shared_genes_dir)
    self.shared_genes_dir = shared_genes_dir

    # calculate length of genes in each genome
    self.logger.info(' Calculating length of genes in each genome.')
    self.gene_lengths = {}
    gene_files = []
    for gene_file in os.listdir(gene_dir):
        gene_file = os.path.join(gene_dir, gene_file)
        gene_files.append(gene_file)
        self.gene_lengths.update(seq_io.seq_lengths(gene_file))

    # get byte offset of hits from each genome
    self.logger.info('')
    self.logger.info(' Indexing blast hits.')
    self.blast_table = os.path.join(self.blast_dir, self.blast_table_file)
    self.offset_table = self._genome_offsets(self.blast_table)

    # calculate AAI between each pair of genomes in parallel
    self.logger.info('')
    self.logger.info(' Calculating amino acid identity between all pairs of genomes:')
    genome_pairs = []
    for i in range(0, len(gene_files)):
        for j in range(i + 1, len(gene_files)):
            genome_pairs.append((gene_files[i], gene_files[j]))

    if len(genome_pairs) == 0:
        self.logger.warning(' [Warning] No genome pairs identified.')
        return

    parallel = Parallel(self.cpus)
    consumer_data = parallel.run(self._producer, self._consumer, genome_pairs, self._progress)

    # write results for each genome pair
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    with open(aai_summay_file, 'w') as fout:
        fout.write('Genome Id A\tGenes in A\tGenome Id B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\n')
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\n' % data)

    self.logger.info('')
    self.logger.info(' Summary of AAI between genomes: %s' % aai_summay_file)
def run(self, query_gene_file, target_gene_file, sorted_hit_table, evalue_threshold, per_iden_threshold, per_aln_len_threshold, keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Fixes: pair count used true division (`/ 2`), which produces a float
    under Python 3 — replaced with floor division; `seq[-1]` raised
    IndexError on empty sequences — replaced with `str.endswith`.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or None if performing
        a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    tuple
        (path to AAI summary file, path to concatenated RBH file or None)
    """

    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}
    self.query_gene_count = defaultdict(int)
    query_genomes = set()
    for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
        # endswith is safe on empty sequences, unlike seq[-1]
        if seq.endswith('*'):
            self.gene_lengths[seq_id] = len(seq) - 1
        else:
            self.gene_lengths[seq_id] = len(seq)

        genome_id = seq_id[0:seq_id.find('~')]
        self.query_gene_count[genome_id] += 1
        query_genomes.add(genome_id)

    self.target_gene_count = defaultdict(int)
    target_genomes = set()
    if target_gene_file:
        for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
            if seq.endswith('*'):
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            genome_id = seq_id[0:seq_id.find('~')]
            self.target_gene_count[genome_id] += 1
            target_genomes.add(genome_id)
    else:
        self.target_gene_count = self.query_gene_count

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info('Calculating AAI between %d query and %d target genomes:' % (
            len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between target genomes;
        # floor division keeps the pair count an integer under Python 3
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    genome_id_lists = []
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    for i in range(0, len(query_genomes)):
        genome_idI = query_genomes[i]

        if target_genomes:
            genome_id_list = target_genomes
        else:
            genome_id_list = []
            for j in range(i + 1, len(query_genomes)):
                genome_id_list.append(query_genomes[j])

        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)
    progress_func = self._progress
    if self.logger.is_silent:
        progress_func = None
    consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    fout = open(aai_summay_file, 'w')
    fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')
    for data in consumer_data:
        fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)
    fout.close()

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = []
        for genome_id in query_genomes:
            rbh_files.append(os.path.join(self.output_dir, genome_id + '.rbh.tsv'))

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        for f in rbh_files:
            os.remove(f)

    return aai_summay_file, rbh_output_file
def run(self, input_tree, msa_file, num_replicates, model, gamma, base_type, frac, boot_dir, output_dir):
    """Bootstrap multiple sequence alignment.

    Fix: Python 2-only `xrange` replaced with `range` for Python 3
    compatibility (identical iteration behaviour).

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    gamma : bool
        Indicates if GAMMA model should be used.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    boot_dir : str or None
        Directory with pre-computed bootstrap trees; if falsy, replicates
        are computed here.
    output_dir : str
        Directory for bootstrap trees.

    Returns
    -------
    str
        Path to the support-decorated tree.
    """

    assert model in ['wag', 'lg', 'jtt']
    assert base_type in ['nt', 'prot']

    self.model = model
    self.gamma = gamma
    self.base_type = base_type
    self.frac = frac

    rep_tree_files = []
    if not boot_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir,
                                               'bootstrap_tree.r_' + str(rep_index) + '.tree'))
    else:
        # reuse previously computed bootstrap trees
        for f in os.listdir(boot_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(boot_dir, f))
        self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

    # calculate support values
    self.logger.info('Calculating bootstrap support values.')
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, db, cpus):
    """Create metadata by parsing assembly stats files.

    Fix: Python 2 `print` statement replaced with the print() function
    for Python 3 compatibility.

    Parameters
    ----------
    rna_gene : str
        rRNA gene to process: 'ssu', 'lsu_23S', or 'lsu_5S'.
    ncbi_genome_dir : str
        Root directory with 'archaea' and 'bacteria' trees, or 'NONE'.
    user_genome_dir : str
        Directory with user genomes, or 'NONE' to skip.
    db : str
        Reference database identifier; only 'SILVA' is supported.
    cpus : int
        Number of cpus to use.
    """

    self.rna_gene = rna_gene

    if db == 'SILVA':
        # Silva info
        if rna_gene == 'ssu':
            self.db = self.silva_ssu_ref_file
            self.taxonomy = self.silva_ssu_taxonomy_file
        elif rna_gene == 'lsu_23S':
            self.db = self.silva_lsu_ref_file
            self.taxonomy = self.silva_lsu_taxonomy_file
        elif rna_gene == 'lsu_5S':
            print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')

        self.output_dir = self.silva_output_dir
    else:
        print('Unrecognized database: %s' % db)
        sys.exit(-1)

    input_files = []

    # generate metadata for NCBI assemblies
    if ncbi_genome_dir != 'NONE':
        print('Reading NCBI assembly directories.')
        processed_assemblies = defaultdict(list)
        for domain in ['archaea', 'bacteria']:
            domain_dir = os.path.join(ncbi_genome_dir, domain)
            if not os.path.exists(domain_dir):
                continue

            for species_dir in os.listdir(domain_dir):
                full_species_dir = os.path.join(domain_dir, species_dir)
                for assembly_dir in os.listdir(full_species_dir):
                    accession = assembly_dir[0:assembly_dir.find('_', 4)]

                    # skip accessions already seen in another species dir
                    processed_assemblies[accession].append(species_dir)
                    if len(processed_assemblies[accession]) >= 2:
                        continue

                    full_assembly_dir = os.path.join(
                        full_species_dir, assembly_dir)

                    # skip genomes with existing HMM results
                    hmm_results_file = os.path.join(
                        full_assembly_dir, self.output_dir, rna_gene + '.hmm_summary.tsv')
                    if os.path.exists(hmm_results_file):
                        continue

                    genome_file = os.path.join(
                        full_assembly_dir, assembly_dir + '_genomic.fna')
                    input_files.append(genome_file)

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)

                hmm_results_file = os.path.join(
                    full_genome_dir, self.output_dir, rna_gene + '.hmm_summary.tsv')
                if os.path.exists(hmm_results_file):
                    continue

                genome_file = os.path.join(full_genome_dir, genome_id + '_genomic.fna')
                input_files.append(genome_file)

    print('Identified %d genomes to process.' % len(input_files))

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, ssu_db, cpus):
    """Identify rRNA genes in NCBI and user genomes and generate metadata.

    Parameters
    ----------
    rna_gene : str
        rRNA gene to process ('ssu', 'lsu_23S', or 'lsu_5S').
    ncbi_genome_dir : str
        Root directory of NCBI assemblies, or 'NONE' to skip.
    user_genome_dir : str
        Root directory of user genomes, or 'NONE' to skip.
    ssu_db : str
        Reference database to curate against ('GG' or 'SILVA').
    cpus : int
        Number of CPUs to use for parallel processing.
    """

    self.rna_gene = rna_gene

    if ssu_db == 'GG':
        # Greengenes data files and desired output
        if rna_gene == 'ssu':
            self.db = '/srv/db/gg/2013_08/gg_13_8_otus/rep_set/99_otus.fasta'
            self.taxonomy = '/srv/db/gg/2013_08/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt'
            self.output_dir = 'ssu_gg'
        elif rna_gene == 'lsu_23S':
            # GG has no LSU reference; nothing to do
            print('There is no 23S LSU database for GG.')
            return
        elif rna_gene == 'lsu_5S':
            return
    elif ssu_db == 'SILVA':
        # Silva info
        if rna_gene == 'ssu':
            self.db = '/srv/whitlam/bio/db/silva/123.1/SILVA_123.1_SSURef_Nr99_tax_silva.fasta'
            self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.ssu.tsv'
            self.output_dir = 'rna_silva'
        elif rna_gene == 'lsu_23S':
            self.db = '/srv/db/silva/123.1/SILVA_123.1_LSURef_tax_silva.fasta'
            self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.lsu.tsv'
            self.output_dir = 'rna_silva'
        elif rna_gene == 'lsu_5S':
            # 5S sequences are identified for QC only; no curated reference database
            print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')
            self.output_dir = 'lsu_5S'

    input_files = []

    # generate metadata for NCBI assemblies
    if ncbi_genome_dir != 'NONE':
        print('Reading NCBI assembly directories.')
        processed_assemblies = defaultdict(list)
        for domain in ['archaea', 'bacteria']:
            domain_dir = os.path.join(ncbi_genome_dir, domain)
            if not os.path.exists(domain_dir):
                continue

            for species_dir in os.listdir(domain_dir):
                full_species_dir = os.path.join(domain_dir, species_dir)
                for assembly_dir in os.listdir(full_species_dir):
                    # accession is the prefix up to the first '_' after 'GCA'/'GCF'
                    accession = assembly_dir[0:assembly_dir.find('_', 4)]
                    processed_assemblies[accession].append(species_dir)

                    # skip accessions seen under more than one species directory
                    if len(processed_assemblies[accession]) >= 2:
                        continue

                    full_assembly_dir = os.path.join(
                        full_species_dir, assembly_dir)
                    #if os.path.exists(os.path.join(full_assembly_dir, self.output_dir)):
                    #    continue
                    genome_file = os.path.join(
                        full_assembly_dir, assembly_dir + '_genomic.fna')
                    input_files.append(genome_file)

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)

                # skip genomes already processed (output directory exists)
                if os.path.exists(
                        os.path.join(full_genome_dir, self.output_dir)):
                    continue

                genome_file = os.path.join(full_genome_dir,
                                           genome_id + '_genomic.fna')
                input_files.append(genome_file)

    print('Identified %d genomes to process.' % len(input_files))

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    if len(input_files) > 0:
        parallel.run(self._producer, None, input_files, self._progress)
def run(self, input_tree, msa_file, marker_info_file, mask_file,
        perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description
        and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    jk_dir : str
        Directory with pre-computed jackknife trees; if falsy,
        replicates are computed here.
    output_dir : str
        Output directory for jackknife trees.

    Returns
    -------
    str
        Path to tree annotated with jackknife support values.
    """

    assert (model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # skip header
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask and compute per-marker lengths after filtering
        mask = open(mask_file).readline().strip()

        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' %
                         total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # next(iter(...)) instead of .values()[0]: works on both
        # Python 2 lists and Python 3 dict views
        if len(next(iter(self.msa.values()))) != total_mask_len:
            self.logger.error(
                'Length of MSA does not meet length of mask.')
            sys.exit()

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates),
                     self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' %
                         num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir,
                             'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        # use pre-computed replicate trees
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' %
                         len(rep_tree_files))

    output_tree = os.path.join(
        output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, genome_files, output_dir, called_genes=False, translation_table=None, meta=False, closed_ends=False): """Call genes with Prodigal. Call genes with prodigal and store the results in the specified output directory. For convenience, the called_gene flag can be used to indicate genes have previously been called and simply need to be copied to the specified output directory. Parameters ---------- genome_files : list of str Nucleotide fasta files to call genes on. called_genes : boolean Flag indicating if genes are already called. translation_table : int Specifies desired translation table, use None to automatically select between tables 4 and 11. meta : boolean Flag indicating if prodigal should call genes with the metagenomics procedure. closed_ends : boolean If True, do not allow genes to run off edges (throws -c flag). output_dir : str Directory to store called genes. Returns ------- d[genome_id] -> namedtuple(best_translation_table coding_density_4 coding_density_11) Summary statistics of called genes for each genome. """ self.called_genes = called_genes self.translation_table = translation_table self.meta = meta self.closed_ends = closed_ends self.output_dir = output_dir make_sure_path_exists(self.output_dir) progress_func = None if self.verbose: file_type = 'genomes' self.progress_str = ' Finished processing %d of %d (%.2f%%) genomes.' if meta: file_type = 'scaffolds' if len(genome_files): file_type = ntpath.basename(genome_files[0]) self.progress_str = ' Finished processing %d of %d (%.2f%%) files.' self.logger.info('Identifying genes within %s: ' % file_type) progress_func = self._progress parallel = Parallel(self.cpus) summary_stats = parallel.run(self._producer, self._consumer, genome_files, progress_func) return summary_stats
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or None
        if performing a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    tuple
        (path to AAI summary table, path to concatenated RBH file or None)
    """

    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}
    self.query_gene_count = defaultdict(int)
    query_genomes = set()
    for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
        # ignore trailing stop codon when computing gene length
        if seq[-1] == '*':
            self.gene_lengths[seq_id] = len(seq) - 1
        else:
            self.gene_lengths[seq_id] = len(seq)

        # gene ids have the form <genome id>~<gene id>
        genome_id = seq_id[0:seq_id.find('~')]
        self.query_gene_count[genome_id] += 1
        query_genomes.add(genome_id)

    self.target_gene_count = defaultdict(int)
    target_genomes = set()
    if target_gene_file:
        for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            genome_id = seq_id[0:seq_id.find('~')]
            self.target_gene_count[genome_id] += 1
            target_genomes.add(genome_id)
    else:
        # reciprocal calculation: targets are the queries
        self.target_gene_count = self.query_gene_count

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info(
            'Calculating AAI between %d query and %d target genomes:' %
            (len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between query genomes;
        # // keeps the count an integer under Python 3 true division
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info(
            'Calculating AAI between all %d pairs of genomes:' %
            self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    genome_id_lists = []
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    for i in range(0, len(query_genomes)):
        genome_idI = query_genomes[i]

        if target_genomes:
            genome_id_list = target_genomes
        else:
            # only consider genomes after i to avoid duplicate pairs
            genome_id_list = []
            for j in range(i + 1, len(query_genomes)):
                genome_idJ = query_genomes[j]
                genome_id_list.append(genome_idJ)

        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)

    progress_func = self._progress
    if self.logger.is_silent:
        progress_func = None
    consumer_data = parallel.run(self._producer, self._consumer,
                                 genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summary_file = os.path.join(output_dir, 'aai_summary.tsv')
    with open(aai_summary_file, 'w') as fout:
        fout.write(
            'Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n'
        )
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = []
        for genome_id in query_genomes:
            rbh_files.append(
                os.path.join(self.output_dir, genome_id + '.rbh.tsv'))

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        # per-genome files are no longer needed once concatenated
        for f in rbh_files:
            os.remove(f)

    return aai_summary_file, rbh_output_file
def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, ssu_db, cpus):
    """Identify rRNA genes in NCBI and user genomes and generate metadata.

    Parameters
    ----------
    rna_gene : str
        rRNA gene to process ('ssu' or 'lsu_23S').
    ncbi_genome_dir : str
        Root directory of NCBI assemblies.
    user_genome_dir : str
        Root directory of user genomes, or 'NONE' to skip.
    ssu_db : str
        Reference database to curate against ('GG' or 'SILVA').
    cpus : int
        Number of CPUs to use for parallel processing.
    """

    self.rna_gene = rna_gene

    if ssu_db == 'GG':
        # Greengenes data files and desired output
        if rna_gene == 'ssu':
            self.db = '/srv/db/gg/2013_08/gg_13_8_otus/rep_set/99_otus.fasta'
            self.taxonomy = '/srv/db/gg/2013_08/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt'
            self.output_dir = 'ssu_gg'
        else:
            # GG only provides an SSU reference
            print('There is no LSU database for GG.')
            sys.exit()
    elif ssu_db == 'SILVA':
        # Silva info
        if rna_gene == 'ssu':
            self.db = '/srv/whitlam/bio/db/silva/123.1/SILVA_123.1_SSURef_Nr99_tax_silva.fasta'
            self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.ssu.tsv'
        elif rna_gene == 'lsu_23S':
            self.db = '/srv/db/silva/123.1/SILVA_123.1_LSURef_tax_silva.fasta'
            self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.lsu.tsv'

        # both SILVA genes share a common output directory
        self.output_dir = 'rna_silva'

    input_files = []

    # generate metadata for NCBI assemblies
    print('Reading NCBI assembly directories.')
    processed_assemblies = defaultdict(list)
    for domain in ['archaea', 'bacteria']:
        domain_dir = os.path.join(ncbi_genome_dir, domain)
        if not os.path.exists(domain_dir):
            continue

        for species_dir in os.listdir(domain_dir):
            full_species_dir = os.path.join(domain_dir, species_dir)
            for assembly_dir in os.listdir(full_species_dir):
                # accession is the prefix up to the first '_' after 'GCA'/'GCF'
                accession = assembly_dir[0:assembly_dir.find('_', 4)]
                processed_assemblies[accession].append(species_dir)

                # skip accessions seen under more than one species directory
                if len(processed_assemblies[accession]) >= 2:
                    continue

                full_assembly_dir = os.path.join(full_species_dir,
                                                 assembly_dir)
                #if os.path.exists(os.path.join(full_assembly_dir, self.output_dir)):
                #    continue
                genome_file = os.path.join(full_assembly_dir,
                                           assembly_dir + '_genomic.fna')
                input_files.append(genome_file)

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)
                #if os.path.exists(os.path.join(full_genome_dir, self.output_dir)):
                #    continue
                genome_file = os.path.join(full_genome_dir,
                                           genome_id + '_genomic.fna')
                input_files.append(genome_file)

    print('Identified %d genomes to process.' % len(input_files))

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, db, cpus):
    """Identify rRNA genes in NCBI and user genomes and generate metadata.

    NCBI assemblies are expected under the sharded layout
    <root>/refseq/GCF/XXX/XXX/XXX/<assembly>/ and
    <root>/genbank/GCA/XXX/XXX/XXX/<assembly>/.

    Parameters
    ----------
    rna_gene : str
        rRNA gene to process ('ssu', 'lsu_23S', or 'lsu_5S').
    ncbi_genome_dir : str
        Root directory of NCBI assemblies, or 'NONE' to skip.
    user_genome_dir : str
        Root directory of user genomes, or 'NONE' to skip.
    db : str
        Reference database to curate against (only 'SILVA' is recognized).
    cpus : int
        Number of CPUs to use for parallel processing.
    """

    self.rna_gene = rna_gene

    if db == 'SILVA':
        # Silva info
        if rna_gene == 'ssu':
            self.db = self.silva_ssu_ref_file
            self.taxonomy = self.silva_ssu_taxonomy_file
        elif rna_gene == 'lsu_23S':
            self.db = self.silva_lsu_ref_file
            self.taxonomy = self.silva_lsu_taxonomy_file
        elif rna_gene == 'lsu_5S':
            # 5S sequences are identified for QC only; no curated reference database
            print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')

        # all SILVA results share a common output directory
        self.output_dir = self.silva_output_dir
    else:
        print('Unrecognized database: %s' % db)
        sys.exit(-1)

    input_files = []

    # generate metadata for NCBI assemblies
    if ncbi_genome_dir != 'NONE':
        print('Reading NCBI assembly directories.')
        processed_assemblies = defaultdict(list)
        rfq_dir = os.path.join(ncbi_genome_dir, 'refseq', 'GCF')
        gbk_dir = os.path.join(ncbi_genome_dir, 'genbank', 'GCA')
        for input_dir in (rfq_dir, gbk_dir):
            # walk the three-level accession-digit sharding
            for first_three in os.listdir(input_dir):
                onethird_species_dir = os.path.join(input_dir, first_three)
                print(onethird_species_dir)
                if os.path.isfile(onethird_species_dir):
                    continue
                for second_three in os.listdir(onethird_species_dir):
                    twothird_species_dir = os.path.join(
                        onethird_species_dir, second_three)
                    # print(twothird_species_dir)
                    if os.path.isfile(twothird_species_dir):
                        continue
                    for third_three in os.listdir(twothird_species_dir):
                        threethird_species_dir = os.path.join(
                            twothird_species_dir, third_three)
                        # print(threethird_species_dir)
                        if os.path.isfile(threethird_species_dir):
                            continue
                        for complete_name in os.listdir(
                                threethird_species_dir):
                            assembly_dir = os.path.join(
                                threethird_species_dir, complete_name)
                            if os.path.isfile(assembly_dir):
                                continue

                            # accession is the prefix up to the first '_'
                            # after 'GCA'/'GCF'
                            accession = complete_name[0:complete_name.find('_', 4)]
                            processed_assemblies[accession].append(
                                assembly_dir)

                            # skip accessions seen more than once
                            if len(processed_assemblies[accession]) >= 2:
                                continue

                            # skip assemblies that already have HMM results
                            hmm_results_file = os.path.join(
                                assembly_dir, self.output_dir,
                                rna_gene + '.hmm_summary.tsv')
                            if os.path.exists(hmm_results_file):
                                continue

                            genome_file = os.path.join(
                                assembly_dir,
                                complete_name + '_genomic.fna')
                            input_files.append(genome_file)

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
            full_user_dir = os.path.join(user_genome_dir, user_id)
            if not os.path.isdir(full_user_dir):
                continue

            for genome_id in os.listdir(full_user_dir):
                full_genome_dir = os.path.join(full_user_dir, genome_id)

                # skip genomes that already have HMM results
                hmm_results_file = os.path.join(
                    full_genome_dir, self.output_dir,
                    rna_gene + '.hmm_summary.tsv')
                if os.path.exists(hmm_results_file):
                    continue

                genome_file = os.path.join(full_genome_dir,
                                           genome_id + '_genomic.fna')
                input_files.append(genome_file)

    print('Identified %d genomes to process.' % len(input_files))

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer, None, input_files, self._progress)
def run(self, input_tree, msa_file, marker_info_file, mask_file,
        perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description
        and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    jk_dir : str
        Directory with pre-computed jackknife trees; if falsy,
        replicates are computed here.
    output_dir : str
        Output directory for jackknife trees.

    Returns
    -------
    str
        Path to tree annotated with jackknife support values.
    """

    assert(model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # skip header
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask and compute per-marker lengths after filtering
        mask = open(mask_file).readline().strip()

        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # next(iter(...)) instead of .values()[0]: works on both
        # Python 2 lists and Python 3 dict views
        if len(next(iter(self.msa.values()))) != total_mask_len:
            self.logger.error('Length of MSA does not meet length of mask.')
            sys.exit()

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir,
                                               'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        # use pre-computed replicate trees
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(output_dir,
                               remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree