class Clusterer:
    '''
    Cluster identical reads and later expand per-cluster annotations back
    out to every read in each cluster.

    cluster() records what it clustered in self.seq_library, and
    uncluster_annotations() reads that record, so cluster() must be called
    first on the same instance.
    '''

    def __init__(self):
        self.clust = Deduplicator()
        self.seqio = SequenceIO()
        # Maps each clustered FASTA path written by cluster() to a dict of
        # {representative read name: list of reads in that cluster}.
        self.seq_library = {}
        self.orfm_regex = OrfM.regular_expression()

    def _strip_orfm_suffix(self, read_name):
        '''
        Return the first regex group of read_name when it matches
        self.orfm_regex, otherwise return read_name unchanged.
        '''
        match = self.orfm_regex.match(read_name)
        return match.groups(0)[0] if match else read_name

    def uncluster_annotations(self, input_annotations, reverse_pipe):
        '''
        Update the annotations hash provided by pplacer to include all
        representatives within each cluster

        Parameters
        ----------
        input_annotations : hash
            Classifications for each representative sequence of the clusters.
            each key being the sequence name, and the entry being the taxonomy
            string as a list. Keyed at the top level by the basename of each
            clustered FASTA file.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.

        Returns
        -------
        output_annotations : hash
            An updated version of the above, which includes all reads from
            each cluster

        Raises
        ------
        KeyError
            If a clustered file has no entry in input_annotations, or a
            classified representative is not a known cluster.
        '''
        output_annotations = {}
        for placed_alignment_file_path, clusters in self.seq_library.items():
            if reverse_pipe and placed_alignment_file_path.endswith(
                    "_reverse_clustered.fa"):
                continue
            placed_alignment_file = os.path.basename(
                placed_alignment_file_path)
            cluster_classifications = input_annotations[placed_alignment_file]

            if reverse_pipe:
                placed_alignment_base = placed_alignment_file.replace(
                    '_forward_clustered.fa', '')
                # Re-key the clusters exactly once per file. Doing this inside
                # the loop below would strip already-stripped names a second
                # time (and repeat O(n) work for every representative).
                clusters = {self._strip_orfm_suffix(key): item
                            for key, item in clusters.items()}
            else:
                placed_alignment_base = placed_alignment_file.replace(
                    '_clustered.fa', '')

            expanded = {}
            for rep_read_name, rep_read_taxonomy in cluster_classifications.items():
                for read in clusters[rep_read_name]:
                    expanded[read.name] = rep_read_taxonomy
            output_annotations[placed_alignment_base] = expanded

        return output_annotations

    def cluster(self, input_fasta_list, reverse_pipe):
        '''
        cluster - Clusters reads at 100% identity level and writes them to
        file. Resets the input_fasta variable as the FASTA file containing
        the clusters.

        Parameters
        ----------
        input_fasta_list : list
            list of strings, each a path to input fasta files to be clustered.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.
            Not used by this method.

        Returns
        -------
        output_fasta_list : list
            list of strings, each a path to the output fasta file to which
            clusters were written to.
        '''
        output_fasta_list = []
        for input_fasta in input_fasta_list:
            output_path = input_fasta.replace('_hits.aln.fa', '_clustered.fa')
            logging.debug('Clustering reads')
            if os.path.exists(input_fasta):
                reads = self.seqio.read_fasta_file(input_fasta)
                logging.debug('Found %i reads', len(reads))
                clusters = self.clust.deduplicate(reads)
                logging.debug('Clustered to %s groups', len(clusters))
                logging.debug(
                    'Writing representative sequences of each cluster to: %s',
                    output_path)
            else:
                logging.debug("Found no reads to be clustered")
                clusters = []

            # The first sequence of each cluster is written as its
            # representative (all members are the same anyway). An empty
            # output file is still written when there is nothing to cluster.
            self.seqio.write_fasta_file([x[0] for x in clusters], output_path)
            self.seq_library[output_path] = {
                cluster[0].name: cluster for cluster in clusters}
            output_fasta_list.append(output_path)

        return output_fasta_list
def main(self, **kwargs):
    '''
    Build a GraftM package (gpkg) from input sequences and/or an alignment.

    The steps, in order: read a taxonomy definition, check for duplicate
    sequence names, align the sequences and build the alignment HMM,
    repeatedly drop insufficiently aligned sequences and re-align, build the
    search HMM if none was given, check every sequence has a taxonomy,
    deduplicate the alignment, build or clean the tree, create the reference
    package, create the diamond database, compile the gpkg, clean up and
    finally test the package.

    Parameters
    ----------
    All parameters are keyword arguments. Any keyword not listed here raises
    an Exception.

    alignment : str
        Path to an input alignment file. Default None.
    sequences : str
        Path to an input sequences file. Default None.
    taxonomy : str
        Path to a taxonomy file read with GreenGenesTaxonomy. Default None.
    rerooted_tree, unrooted_tree, rerooted_annotated_tree : str
        Paths to newick trees. Default None; a tree is built if none given.
    tree_log : str
        Path to a tree log file to use instead of generating one.
    prefix : str
        Output gpkg path. Default "<locus name>.gpkg".
    hmm : str
        Path to a user supplied alignment HMM. Default None.
    search_hmm_files : list
        Paths to search HMMs. Default None; one is created when absent.
    min_aligned_percent : float
        Cutoff passed to self._check_reads_hit. Default 0.01.
    taxtastic_taxonomy, taxtastic_seqinfo : str
        Paths to taxtastic taxonomy and seqinfo files. Default None.
    force : bool
        Delete an already existing output gpkg. Default False.
    graftm_package
        Accepted but not used by this method. Default False.
    dereplication_level
        Passed to self._create_search_hmm. Default False.
    threads : int
        Passed to the alignment and search HMM steps. Default 5.

    Raises
    ------
    Exception
        On unexpected keyword arguments, an existing output path without
        force, no taxonomy source, duplicate sequence names, fewer than 4
        sequences left after filtering, or sequences lacking a taxonomy.
    '''
    alignment = kwargs.pop('alignment', None)
    sequences = kwargs.pop('sequences', None)
    taxonomy = kwargs.pop('taxonomy', None)
    rerooted_tree = kwargs.pop('rerooted_tree', None)
    unrooted_tree = kwargs.pop('unrooted_tree', None)
    tree_log = kwargs.pop('tree_log', None)
    prefix = kwargs.pop('prefix', None)
    rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
    user_hmm = kwargs.pop('hmm', None)
    search_hmm_files = kwargs.pop('search_hmm_files', None)
    min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
    taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
    taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
    force_overwrite = kwargs.pop('force', False)
    # Still popped so that callers passing it are not rejected below.
    kwargs.pop('graftm_package', False)
    dereplication_level = kwargs.pop('dereplication_level', False)
    threads = kwargs.pop('threads', 5)

    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    seqio = SequenceIO()
    locus_name = (os.path.basename(sequences).split('.')[0]
                  if sequences
                  else os.path.basename(alignment).split('.')[0])
    tmp = tempdir.TempDir()
    base = os.path.join(tmp.name, locus_name)
    removed_sequence_names = []
    # Open NamedTemporaryFile objects. They must stay referenced until the
    # package is compiled, because closing (or garbage collecting) one
    # deletes the file behind the path handed to the helpers.
    tempfiles_to_close = []

    def new_tempfile(suffix):
        '''Create a tracked temporary file and return its path.'''
        handle = tempfile.NamedTemporaryFile(prefix='graftm', suffix=suffix)
        tempfiles_to_close.append(handle)
        return handle.name

    def find_insufficiently_aligned(alignment_path):
        '''Run the read-hit check on alignment_path, closing the file after.'''
        with open(alignment_path) as alignment_fh:
            return self._check_reads_hit(alignment_fh, min_aligned_percent)

    output_gpkg_path = prefix if prefix else "%s.gpkg" % locus_name

    if os.path.exists(output_gpkg_path):
        if force_overwrite:
            logging.warning("Deleting previous directory %s", output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        else:
            raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
    logging.info("Building gpkg for %s", output_gpkg_path)

    # Read in taxonomy somehow
    gtns = Getaxnseq()
    if rerooted_annotated_tree:
        logging.info("Building seqinfo and taxonomy file from input annotated tree")
        taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
            Tree.get(path=rerooted_annotated_tree, schema='newick'))
    elif taxonomy:
        logging.info("Building seqinfo and taxonomy file from input taxonomy")
        taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
    elif taxtastic_seqinfo and taxtastic_taxonomy:
        logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
        with open(taxtastic_taxonomy) as taxonomy_fh, \
                open(taxtastic_seqinfo) as seqinfo_fh:
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo(
                taxonomy_fh, seqinfo_fh)
    else:
        raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

    # Check for duplicates
    logging.info("Checking for duplicate sequences")
    dup = self._check_for_duplicate_sequence_names(sequences)
    if dup:
        raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)

    try:
        output_alignment = new_tempfile('.aln.faa')
        align_hmm = user_hmm if user_hmm else new_tempfile('_align.hmm')

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm, output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment, threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = find_insufficiently_aligned(output_alignment)

        # Drop poorly aligned sequences and re-align until none remain.
        while len(insufficiently_aligned_sequences) > 0:
            logging.warning("One or more alignments do not span > %.2f %% of HMM",
                            min_aligned_percent * 100)
            for s in insufficiently_aligned_sequences:
                logging.warning("Insufficient alignment of %s, not including this sequence", s)

            sequences2 = new_tempfile('.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                alignment2 = new_tempfile('.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                # NOTE(review): the nesting of this loop under `if alignment:`
                # was reconstructed from whitespace-collapsed source - confirm.
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            "Sequence %s in provided alignment does not meet the "
                            "--min_aligned_percent cutoff. This sequence will be "
                            "removed from the tree in the final GraftM package. If "
                            "you are sure these sequences are correct, turn off the "
                            "--min_aligned_percent cutoff, provide it with a 0 "
                            "(e.g. --min_aligned_percent 0) ", name)
                    removed_sequence_names.append(name)

            logging.info("After removing %i insufficiently aligned sequences, left with %i sequences",
                         len(insufficiently_aligned_sequences), num_sequences)
            if num_sequences < 4:
                raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)

            logging.info("Reconstructing the alignment and HMM from remaining sequences")
            output_alignment = new_tempfile('.aln.faa')
            if not user_hmm:
                align_hmm = new_tempfile('.hmm')
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment, threads)
            logging.info("Checking for incorrect or fragmented reads")
            insufficiently_aligned_sequences = find_insufficiently_aligned(output_alignment)

        if not search_hmm_files:
            search_hmm = new_tempfile('_search.hmm')
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = [s.name for s in aligned_sequence_objects
                       if s.name not in taxonomy_definition]
        if unannotated:
            for s in unannotated:
                logging.error("Unable to find sequence '%s' in the taxonomy definition", s)
            raise Exception("All sequences must be assigned a taxonomy, cannot continue")

        logging.debug("Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        # NOTE(review): the result of lca_taxonomy is not used further down;
        # the call is retained in case it validates its input - confirm.
        dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)
        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences",
                     len(aligned_sequence_objects) - len(deduplicated_arrays),
                     len(deduplicated_arrays))

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s", rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s",
                              rerooted_annotated_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                # Unreachable given the enclosing condition.
                raise Exception("Programming error")

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            removed_sequence_names.extend(
                s.name for group in deduplicated_arrays for s in group[1:])
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s", tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file = new_tempfile('.tree_log')
                tre_file = new_tempfile('.tree')
                with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                                 tre_file, log_file, ptype, self.fasttree)

        # Create tax and seqinfo .csv files
        # A set, because it is only used for membership tests below.
        taxonomy_to_keep = {group[0].name for group in deduplicated_arrays
                            if group[0]}
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info("Building seqinfo and taxonomy file from input annotated tree")
                taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info("Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            else:
                raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

            taxonomy_definition = {x: taxonomy_definition[x]
                                   for x in taxonomy_definition
                                   if x in taxonomy_to_keep}
            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax, seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg, no_reroot)

        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")
        GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm, diamondb,
                                      max_range, sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
    finally:
        # Also runs on failure so temporary files never outlive the call.
        for tf in tempfiles_to_close:
            tf.close()

    # Test out the gpkg just to be sure.
    #
    # TODO: Use graftM through internal means rather than via extern. This
    # requires some refactoring so that graft() can be called easily with
    # sane defaults.
    logging.info("Testing gpkg package works")
    self._test_package(output_gpkg_path)

    logging.info("Finished\n")
class Clusterer:
    '''
    Cluster identical reads and later expand per-cluster annotations back
    out to every read in each cluster.

    cluster() records what it clustered in self.seq_library, and
    uncluster_annotations() reads that record, so cluster() must be called
    first on the same instance.
    '''

    def __init__(self):
        self.clust = Deduplicator()
        self.seqio = SequenceIO()
        # Maps each clustered FASTA path written by cluster() to a dict of
        # {representative read name: list of reads in that cluster}.
        self.seq_library = {}
        self.orfm_regex = OrfM.regular_expression()

    def _strip_orfm_suffix(self, read_name):
        '''
        Return the first regex group of read_name when it matches
        self.orfm_regex, otherwise return read_name unchanged.
        '''
        match = self.orfm_regex.match(read_name)
        return match.groups(0)[0] if match else read_name

    def uncluster_annotations(self, input_annotations, reverse_pipe):
        '''
        Update the annotations hash provided by pplacer to include all
        representatives within each cluster

        Parameters
        ----------
        input_annotations : hash
            Classifications for each representative sequence of the clusters.
            each key being the sequence name, and the entry being the taxonomy
            string as a list. Keyed at the top level by the basename of each
            clustered FASTA file.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.

        Returns
        -------
        output_annotations : hash
            An updated version of the above, which includes all reads from
            each cluster

        Raises
        ------
        KeyError
            If a clustered file has no entry in input_annotations, or a
            classified representative is not a known cluster.
        '''
        output_annotations = {}
        # items() rather than iteritems(): the latter does not exist on
        # Python 3 dictionaries, while items() works on both major versions.
        for placed_alignment_file_path, clusters in self.seq_library.items():
            if reverse_pipe and placed_alignment_file_path.endswith("_reverse_clustered.fa"):
                continue
            placed_alignment_file = os.path.basename(placed_alignment_file_path)
            cluster_classifications = input_annotations[placed_alignment_file]

            if reverse_pipe:
                placed_alignment_base = placed_alignment_file.replace('_forward_clustered.fa', '')
                # Re-key the clusters exactly once per file. Doing this inside
                # the loop below would strip already-stripped names a second
                # time (and repeat O(n) work for every representative).
                clusters = dict((self._strip_orfm_suffix(key), item)
                                for key, item in clusters.items())
            else:
                placed_alignment_base = placed_alignment_file.replace('_clustered.fa', '')

            per_read_taxonomy = {}
            for rep_read_name, rep_read_taxonomy in cluster_classifications.items():
                for read in clusters[rep_read_name]:
                    per_read_taxonomy[read.name] = rep_read_taxonomy
            output_annotations[placed_alignment_base] = per_read_taxonomy

        return output_annotations

    def cluster(self, input_fasta_list, reverse_pipe):
        '''
        cluster - Clusters reads at 100% identity level and writes them to
        file. Resets the input_fasta variable as the FASTA file containing
        the clusters.

        Parameters
        ----------
        input_fasta_list : list
            list of strings, each a path to input fasta files to be clustered.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.
            Not used by this method.

        Returns
        -------
        output_fasta_list : list
            list of strings, each a path to the output fasta file to which
            clusters were written to.
        '''
        output_fasta_list = []
        for input_fasta in input_fasta_list:
            output_path = input_fasta.replace('_hits.aln.fa', '_clustered.fa')
            cluster_dict = {}
            logging.debug('Clustering reads')
            if os.path.exists(input_fasta):
                reads = self.seqio.read_fasta_file(input_fasta)
                logging.debug('Found %i reads', len(reads))
                clusters = self.clust.deduplicate(reads)
                logging.debug('Clustered to %s groups', len(clusters))
                logging.debug('Writing representative sequences of each cluster to: %s',
                              output_path)
            else:
                logging.debug("Found no reads to be clustered")
                clusters = []

            # The first sequence of each cluster is written as its
            # representative (all members are the same anyway). An empty
            # output file is still written when there is nothing to cluster.
            self.seqio.write_fasta_file([x[0] for x in clusters], output_path)
            for cluster in clusters:
                cluster_dict[cluster[0].name] = cluster
            self.seq_library[output_path] = cluster_dict
            output_fasta_list.append(output_path)

        return output_fasta_list
def main(self, **kwargs):
    '''
    Build a GraftM package (gpkg) from input sequences and/or an alignment.

    The steps, in order: read a taxonomy definition, check for duplicate
    sequence names, align the sequences and build the alignment HMM,
    repeatedly drop insufficiently aligned sequences and re-align, build the
    search HMM if none was given, check every sequence has a taxonomy,
    deduplicate the alignment, build or clean the tree, create the reference
    package, create the diamond database, compile the gpkg, clean up and
    finally test the package.

    Parameters
    ----------
    All parameters are keyword arguments. Any keyword not listed here raises
    an Exception.

    alignment : str
        Path to an input alignment file. Default None.
    sequences : str
        Path to an input sequences file. Default None.
    taxonomy : str
        Path to a taxonomy file read with GreenGenesTaxonomy. Default None.
    rerooted_tree, unrooted_tree, rerooted_annotated_tree : str
        Paths to newick trees. Default None; a tree is built if none given.
    tree_log : str
        Path to a tree log file to use instead of generating one.
    prefix : str
        Output gpkg path. Default "<locus name>.gpkg".
    hmm : str
        Path to a user supplied alignment HMM. Default None.
    search_hmm_files : list
        Paths to search HMMs. Default None; one is created when absent.
    min_aligned_percent : float
        Cutoff passed to self._check_reads_hit. Default 0.01.
    taxtastic_taxonomy, taxtastic_seqinfo : str
        Paths to taxtastic taxonomy and seqinfo files. Default None.
    force : bool
        Delete an already existing output gpkg. Default False.
    graftm_package
        Accepted but not used by this method. Default False.
    dereplication_level
        Passed to self._create_search_hmm. Default False.
    threads : int
        Passed to the alignment and search HMM steps. Default 5.

    Raises
    ------
    Exception
        On unexpected keyword arguments, an existing output path without
        force, no taxonomy source, duplicate sequence names, fewer than 4
        sequences left after filtering, or sequences lacking a taxonomy.
    '''
    alignment = kwargs.pop('alignment', None)
    sequences = kwargs.pop('sequences', None)
    taxonomy = kwargs.pop('taxonomy', None)
    rerooted_tree = kwargs.pop('rerooted_tree', None)
    unrooted_tree = kwargs.pop('unrooted_tree', None)
    tree_log = kwargs.pop('tree_log', None)
    prefix = kwargs.pop('prefix', None)
    rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
    user_hmm = kwargs.pop('hmm', None)
    search_hmm_files = kwargs.pop('search_hmm_files', None)
    min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
    taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
    taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
    force_overwrite = kwargs.pop('force', False)
    # Still popped so that callers passing it are not rejected below.
    kwargs.pop('graftm_package', False)
    dereplication_level = kwargs.pop('dereplication_level', False)
    threads = kwargs.pop('threads', 5)

    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    seqio = SequenceIO()
    locus_name = (os.path.basename(sequences).split('.')[0]
                  if sequences
                  else os.path.basename(alignment).split('.')[0])
    tmp = tempdir.TempDir()
    base = os.path.join(tmp.name, locus_name)
    removed_sequence_names = []
    # Open NamedTemporaryFile objects. Taking only `.name` from an unreferenced
    # NamedTemporaryFile lets the object be collected, which deletes the file
    # and frees the name for reuse; keeping the handles here reserves each
    # path until the package has been compiled.
    tempfiles_to_close = []

    def new_tempfile(suffix):
        '''Create a tracked temporary file and return its path.'''
        handle = tempfile.NamedTemporaryFile(prefix='graftm', suffix=suffix)
        tempfiles_to_close.append(handle)
        return handle.name

    def find_insufficiently_aligned(alignment_path):
        '''Run the read-hit check on alignment_path, closing the file after.'''
        with open(alignment_path) as alignment_fh:
            return self._check_reads_hit(alignment_fh, min_aligned_percent)

    output_gpkg_path = prefix if prefix else "%s.gpkg" % locus_name

    if os.path.exists(output_gpkg_path):
        if force_overwrite:
            logging.warning("Deleting previous directory %s", output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        else:
            raise Exception(
                "Cowardly refusing to overwrite gpkg to already existing %s"
                % output_gpkg_path)
    logging.info("Building gpkg for %s", output_gpkg_path)

    # Read in taxonomy somehow
    gtns = Getaxnseq()
    if rerooted_annotated_tree:
        logging.info(
            "Building seqinfo and taxonomy file from input annotated tree")
        taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
            Tree.get(path=rerooted_annotated_tree, schema='newick'))
    elif taxonomy:
        logging.info(
            "Building seqinfo and taxonomy file from input taxonomy")
        taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
    elif taxtastic_seqinfo and taxtastic_taxonomy:
        logging.info(
            "Reading taxonomy from taxtastic taxonomy and seqinfo files")
        with open(taxtastic_taxonomy) as taxonomy_fh, \
                open(taxtastic_seqinfo) as seqinfo_fh:
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo(
                taxonomy_fh, seqinfo_fh)
    else:
        raise Exception(
            "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
        )

    # Check for duplicates
    logging.info("Checking for duplicate sequences")
    dup = self._check_for_duplicate_sequence_names(sequences)
    if dup:
        raise Exception(
            "Found duplicate sequence name '%s' in sequences input file" % dup)

    try:
        output_alignment = new_tempfile('.aln.faa')
        align_hmm = user_hmm if user_hmm else new_tempfile('_align.hmm')

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = find_insufficiently_aligned(
            output_alignment)

        # Drop poorly aligned sequences and re-align until none remain.
        while len(insufficiently_aligned_sequences) > 0:
            logging.warning(
                "One or more alignments do not span > %.2f %% of HMM",
                min_aligned_percent * 100)
            for s in insufficiently_aligned_sequences:
                logging.warning(
                    "Insufficient alignment of %s, not including this sequence",
                    s)

            sequences2 = new_tempfile('.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                alignment2 = new_tempfile('.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                # NOTE(review): the nesting of this loop under `if alignment:`
                # was reconstructed from whitespace-collapsed source - confirm.
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            "Sequence %s in provided alignment does not meet the "
                            "--min_aligned_percent cutoff. This sequence will be "
                            "removed from the tree in the final GraftM package. If "
                            "you are sure these sequences are correct, turn off the "
                            "--min_aligned_percent cutoff, provide it with a 0 "
                            "(e.g. --min_aligned_percent 0) ", name)
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences",
                len(insufficiently_aligned_sequences), num_sequences)
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)

            logging.info(
                "Reconstructing the alignment and HMM from remaining sequences")
            output_alignment = new_tempfile('.aln.faa')
            if not user_hmm:
                align_hmm = new_tempfile('.hmm')
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)
            logging.info("Checking for incorrect or fragmented reads")
            insufficiently_aligned_sequences = find_insufficiently_aligned(
                output_alignment)

        if not search_hmm_files:
            search_hmm = new_tempfile('_search.hmm')
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = [s.name for s in aligned_sequence_objects
                       if s.name not in taxonomy_definition]
        if unannotated:
            for s in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition",
                    s)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        # NOTE(review): the result of lca_taxonomy is not used further down;
        # the call is retained in case it validates its input - confirm.
        dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)
        logging.info(
            "Removed %i sequences as duplicates, leaving %i non-identical sequences",
            len(aligned_sequence_objects) - len(deduplicated_arrays),
            len(deduplicated_arrays))

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s",
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s",
                              rerooted_annotated_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                # Unreachable given the enclosing condition.
                raise Exception("Programming error")

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            removed_sequence_names.extend(
                s.name for group in deduplicated_arrays for s in group[1:])
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s", tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file = new_tempfile('.tree_log')
                tre_file = new_tempfile('.tree')
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)

        # Create tax and seqinfo .csv files
        # A set, because it is only used for membership tests below.
        taxonomy_to_keep = {group[0].name for group in deduplicated_arrays
                            if group[0]}
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy file from input annotated tree"
                )
                taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
                )

            taxonomy_definition = {x: taxonomy_definition[x]
                                   for x in taxonomy_definition
                                   if x in taxonomy_to_keep}
            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax, seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)

        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")
        GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm,
                                      diamondb, max_range, sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
    finally:
        # Also runs on failure so temporary files never outlive the call.
        for tf in tempfiles_to_close:
            tf.close()

    # Test out the gpkg just to be sure.
    #
    # TODO: Use graftM through internal means rather than via extern. This
    # requires some refactoring so that graft() can be called easily with
    # sane defaults.
    logging.info("Testing gpkg package works")
    self._test_package(output_gpkg_path)

    logging.info("Finished\n")