def test_hello_world(self):
    """Run `graftM update` via the command line and verify the updated
    package gains exactly one sequence with the expected taxonomy."""
    with tempdir.in_tempdir():
        with tempfile.NamedTemporaryFile() as fasta, \
             tempfile.NamedTemporaryFile() as tax:
            # Stage the extra sequence and its taxonomy in temp files.
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()
            tax.write(Tests.extra_mcra_taxonomy)
            tax.flush()

            original_gpkg = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            extern.run(
                "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" % (
                    path_to_script, original_gpkg, fasta.name, tax.name,
                    'updated.gpkg'))

            before = GraftMPackage.acquire(original_gpkg)
            after = GraftMPackage.acquire('updated.gpkg')
            before_tax = before.taxonomy_hash()
            after_tax = after.taxonomy_hash()

            # Exactly one new taxonomy entry, carrying the expected lineage.
            self.assertEqual(len(before_tax) + 1, len(after_tax))
            self.assertEqual(
                ['mcrA', 'Euryarchaeota_mcrA', 'Methanofastidiosa'],
                after_tax['KYC55281.1'])
            # A pre-existing entry is untouched by the update.
            self.assertEqual(before_tax['638165755'], after_tax['638165755'])

            # The unaligned sequence database also grows by exactly one.
            seqio = SequenceIO()
            self.assertEqual(
                len(seqio.read_fasta_file(before.unaligned_sequence_database_path())) + 1,
                len(seqio.read_fasta_file(after.unaligned_sequence_database_path())))
def test_autodecorate(self):
    """Exercise Update.update() directly (no CLI) and confirm the extra
    sequence is added and auto-decorated with a taxonomy."""
    with tempdir.in_tempdir():
        with tempfile.NamedTemporaryFile() as fasta:
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()

            original_gpkg = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            Update(prerequisites).update(
                input_sequence_path=fasta.name,
                input_graftm_package_path=original_gpkg,
                output_graftm_package_path='updated.gpkg')

            before = GraftMPackage.acquire(original_gpkg)
            after = GraftMPackage.acquire('updated.gpkg')
            before_tax = before.taxonomy_hash()
            after_tax = after.taxonomy_hash()

            # hard-code 11 because of
            # https://github.com/geronimp/graftM/issues/204
            self.assertEqual(11, len(after_tax))
            self.assertEqual(
                ['mcrA', 'Euryarchaeota_mcrA', 'Methanomicrobia'],
                after_tax['KYC55281.1'])
            self.assertEqual(before_tax['638165755'], after_tax['638165755'])

            seqio = SequenceIO()
            self.assertEqual(
                len(seqio.read_fasta_file(before.unaligned_sequence_database_path())) + 1,
                len(seqio.read_fasta_file(after.unaligned_sequence_database_path())))
def _assign_taxonomy(self, extracted_reads, assignment_method):
    '''Run GraftM over each package's extracted reads to assign taxonomy.

    Parameters
    ----------
    extracted_reads:
        object providing each_package_wise(), yielding
        (singlem_package, readsets) pairs; each readset carries
        .sequences and .sample_name, and gets .tmpfile_basename set here
    assignment_method: str
        passed through as graftM's --assignment_method value

    Returns
    -------
    SingleMPipeTaxonomicAssignmentResult wrapping the directory that holds
    one graftM output folder per package.
    '''
    graftm_align_directory_base = os.path.join(self._working_directory, 'graftm_aligns')
    os.mkdir(graftm_align_directory_base)
    commands = []
    all_tmp_files = []
    # Run each one at a time serially so that the number of threads is
    # respected, to save RAM as one DB needs to be loaded at once, and so
    # fewer open files are needed, so that the open file count limit is
    # eased.
    for singlem_package, readsets in extracted_reads.each_package_wise():
        tmp_files = []
        for readset in readsets:
            if len(readset.sequences) > 0:
                # One temporary FASTA per non-empty readset; it must stay
                # open until after run_many() so the file still exists.
                tmp = tempfile.NamedTemporaryFile(prefix='singlem.%s' % readset.sample_name, suffix=".fasta")
                # Record basename (remove .fasta) so that the graftm output
                # file is recorded for later on in pipe.
                tmpbase = os.path.basename(tmp.name[:-6])
                readset.tmpfile_basename = tmpbase
                seqio = SequenceIO()
                seqio.write_fasta(readset.sequences, tmp)
                tmp.flush()
                tmp_files.append(tmp)
        if len(tmp_files) > 0:
            # One graftM invocation per package, covering all samples at once.
            tmpnames = list([tg.name for tg in tmp_files])
            cmd = "%s "\
                "--threads %i "\
                "--forward %s "\
                "--graftm_package %s "\
                "--output_directory %s/%s "\
                "--max_samples_for_krona 0 "\
                "--assignment_method %s" % (
                    self._graftm_command_prefix(singlem_package.is_protein_package()),
                    self._num_threads,
                    ' '.join(tmpnames),
                    singlem_package.graftm_package_path(),
                    graftm_align_directory_base,
                    singlem_package.graftm_package_basename(),
                    assignment_method)
            commands.append(cmd)
            all_tmp_files.append(tmp_files)
    # num_threads=1 keeps execution serial (see comment above); graftM
    # parallelises internally via its own --threads flag.
    extern.run_many(commands, num_threads=1)
    # Closing the NamedTemporaryFiles removes them from disk.
    for tmp_files in all_tmp_files:
        [t.close() for t in tmp_files]
    logging.info("Finished running taxonomic assignment with GraftM")
    return SingleMPipeTaxonomicAssignmentResult(
        graftm_align_directory_base)
def _align_sequences(self, input_sequences_path, output_alignment_path, threads):
    '''Align the sequences in input_sequences_path with mafft, writing the
    alignment to output_alignment_path.

    Parameters
    ----------
    input_sequences_path: str
        path to input sequences in fasta format
    output_alignment_path: str
        path to output alignment path
    threads: str
        number of threads to use

    Returns
    -------
    Nothing
    '''
    logging.debug("Aligning sequences using mafft")
    cmd = "mafft --anysymbol --thread %s --auto /dev/stdin > %s" % (
        threads, output_alignment_path)
    fasta_lines = []
    with open(input_sequences_path) as in_fh:
        for record_name, record_seq, _ in SequenceIO().each(in_fh):
            # Do not include * characters in the HMM, as this means tree
            # insertion fails.
            fasta_lines.extend(('>%s' % record_name,
                                record_seq.replace('*', '')))
    extern.run(cmd, stdin="\n".join(fasta_lines))
def test_hello_world(self):
    """Build two diamond databases (one without the decoy sequence, one
    with it) and check DecoyFilter keeps only the non-decoy sequence."""
    with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as real_fasta, \
         tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as combined_fasta:
        real_fasta.write(self.eg1)
        real_fasta.flush()
        extern.run("diamond makedb --in %s --db %s.dmnd" %\
                   (real_fasta.name, real_fasta.name))

        combined_fasta.write(self.eg1)
        combined_fasta.write(self.eg2)
        combined_fasta.flush()
        extern.run("diamond makedb --in %s --db %s.dmnd" %\
                   (combined_fasta.name, combined_fasta.name))

        with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as scratch_fasta, \
             tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as output_fasta:
            # scratch_fasta is written but its name is never passed on.
            scratch_fasta.write(self.eg1)
            scratch_fasta.flush()

            filterer = DecoyFilter(Diamond(combined_fasta.name + ".dmnd"),
                                   Diamond(real_fasta.name + ".dmnd"))
            self.assertEqual(True,
                             filterer.filter(real_fasta.name, output_fasta.name))

            survivors = SequenceIO().read_fasta_file(output_fasta.name)
            self.assertEqual(1, len(survivors))
            self.assertEqual("PROKKA_03952", survivors[0].name)

        # clean up
        os.remove(real_fasta.name + ".dmnd")
        os.remove(combined_fasta.name + ".dmnd")
def extract_and_read(self, reads_to_extract, database_fasta_file):
    '''Extract the reads_to_extract from the database_fasta_file and return
    them.

    Parameters
    ----------
    reads_to_extract: Iterable of str
        IDs of reads to be extracted
    database_fasta_file: str
        path the fasta file that containing the reads

    Returns
    -------
    An array of graftm.sequence_io.Sequence objects

    Raises
    ------
    Exception
        if the fxtract command exits with a non-zero status
    '''
    cmd = "fxtract -XH -f /dev/stdin '%s'" % database_fasta_file
    # universal_newlines=True makes the pipes text-mode: communicate() then
    # accepts the str of read names and returns str output. Without it,
    # Python 3 requires bytes on stdin and the bytes stdout would crash
    # StringIO() below.
    process = subprocess.Popen(["bash", "-c", cmd],
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    output, error = process.communicate('\n'.join(reads_to_extract))
    if process.returncode != 0:
        raise Exception(
            "Extraction command '%s' failed with exitstatus %i" % (
                cmd, process.returncode))
    seqs = []
    for name, seq, _ in SequenceIO().each(StringIO(output)):
        seqs.append(Sequence(name, seq))
    return seqs
def _test_package(self, package_path):
    '''Sanity-check a GraftM package by running `graftM graft` on a small
    subset of its own sequences with default parameters (i.e. pplacer).
    If the run fails, an error is raised.

    Parameters
    ----------
    package_path: str
        path to graftm_package to be tested
    '''
    pkg = GraftMPackage.acquire(package_path)
    with tempdir.TempDir() as graft_output_dir:
        # Graft only the first 10 sequences so the check stays quick.
        with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as subset_fasta:
            seqio = SequenceIO()
            with open(pkg.unaligned_sequence_database_path()) as db_fh:
                first_ten = itertools.islice(seqio.each_sequence(db_fh), 10)
                seqio.write_fasta(first_ten, subset_fasta)
            subset_fasta.flush()
            extern.run(
                "graftM graft --forward %s --graftm_package %s --output_directory %s --force" % (
                    subset_fasta.name, package_path, graft_output_dir))
def _test_package(self, package_path):
    '''Give a GraftM package a spin, and see if it works in reality with
    default parameters (i.e. pplacer). If it does not work, then raise an
    error.

    Parameters
    ----------
    package_path: str
        path to graftm_package to be tested
    '''
    pkg = GraftMPackage.acquire(package_path)
    with tempdir.TempDir() as graftM_graft_test_dir_name:
        # Take a subset of sequences for testing. mode='w' is required
        # because write_fasta() writes str and NamedTemporaryFile defaults
        # to binary mode on Python 3 (matches the sibling _test_package).
        with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as tf:
            seqio = SequenceIO()
            # Use a context manager so the database file handle is closed
            # rather than leaked.
            with open(pkg.unaligned_sequence_database_path()) as f:
                seqio.write_fasta(
                    itertools.islice(seqio.each_sequence(f), 10), tf)
            tf.flush()
            cmd = "graftM graft --forward %s --graftm_package %s --output_directory %s --force" %(
                tf.name, package_path, graftM_graft_test_dir_name)
            extern.run(cmd)
def extract_and_read(self, reads_to_extract, database_fasta_file):
    '''Extract the reads_to_extract from the database_fasta_file and return
    them.

    Parameters
    ----------
    reads_to_extract: Iterable of str
        IDs of reads to be extracted
    database_fasta_file: str
        path the fasta file that containing the reads

    Returns
    -------
    An array of graftm.sequence_io.Sequence objects'''
    cmd = ("mfqe --output-uncompressed --fasta-read-name-lists /dev/stdin "
           "--input-fasta '{}' --output-fasta-files /dev/stdout").format(
               database_fasta_file)
    # Request each sequence exactly once so mfqe does not croak on repeats.
    unique_ids = set(reads_to_extract)
    stdout = extern.run(cmd, stdin='\n'.join(unique_ids))
    return [Sequence(record_name, record_seq)
            for record_name, record_seq, _ in SequenceIO().each(StringIO(stdout))]
def __init__(self):
    # Collapses identical sequences into clusters.
    self.clust = Deduplicator()
    # Shared FASTA reader/writer.
    self.seqio = SequenceIO()
    # Maps clustered-output fasta path -> {representative name: cluster}.
    self.seq_library = {}
    # Regex obtained from the OrfM helper; presumably matches
    # OrfM-decorated read names — confirm against OrfM.regular_expression.
    self.orfm_regex = OrfM.regular_expression()
class Clusterer:
    '''Collapses identical reads into clusters before placement, and later
    expands (unclusters) per-representative classifications back to every
    member read.'''

    def __init__(self):
        # Collapses identical sequences into clusters.
        self.clust = Deduplicator()
        # Shared FASTA reader/writer.
        self.seqio = SequenceIO()
        # Maps clustered-output fasta path -> {representative name: cluster}.
        self.seq_library = {}
        self.orfm_regex = OrfM.regular_expression()

    def uncluster_annotations(self, input_annotations, reverse_pipe):
        '''
        Update the annotations hash provided by pplacer to include all
        representatives within each cluster

        Parameters
        ----------
        input_annotations : hash
            Classifications for each representative sequence of the
            clusters. each key being the sequence name, and the entry being
            the taxonomy string as a list.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being
            followed.

        Returns
        -------
        output_annotations : hash
            An updated version of the above, which includes all reads from
            each cluster
        '''
        output_annotations = {}
        for placed_alignment_file_path, clusters in self.seq_library.items():
            # In the reverse pipe, only the forward files are unclustered
            # here; the reverse-clustered files are skipped.
            if reverse_pipe and placed_alignment_file_path.endswith(
                    "_reverse_clustered.fa"):
                continue
            placed_alignment_file = os.path.basename(
                placed_alignment_file_path)
            cluster_classifications = input_annotations[placed_alignment_file]
            # Strip the clustering suffix to recover the sample base name.
            if reverse_pipe:
                placed_alignment_base = placed_alignment_file.replace(
                    '_forward_clustered.fa', '')
            else:
                placed_alignment_base = placed_alignment_file.replace(
                    '_clustered.fa', '')
            output_annotations[placed_alignment_base] = {}
            for rep_read_name, rep_read_taxonomy in cluster_classifications.items(
            ):
                if reverse_pipe:
                    # Re-key clusters with OrfM decoration stripped (where
                    # the regex matches) so representative names line up.
                    # NOTE(review): rebuilt inside the loop for every
                    # representative — appears idempotent but costly.
                    orfm_regex = OrfM.regular_expression()
                    clusters = {(orfm_regex.match(key).groups(0)[0] if orfm_regex.match(key) else key): item for key, item in iter(clusters.items())}
                # Every member of the cluster inherits the representative's
                # taxonomy.
                for read in clusters[rep_read_name]:
                    output_annotations[placed_alignment_base][
                        read.name] = rep_read_taxonomy
        return output_annotations

    def cluster(self, input_fasta_list, reverse_pipe):
        '''
        cluster - Clusters reads at 100% identity level and writes them to
        file. Resets the input_fasta variable as the FASTA file containing
        the clusters.

        Parameters
        ----------
        input_fasta_list : list
            list of strings, each a path to input fasta files to be
            clustered.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being
            followed.

        Returns
        -------
        output_fasta_list : list
            list of strings, each a path to the output fasta file to which
            clusters were written to.
        '''
        output_fasta_list = []
        for input_fasta in input_fasta_list:
            output_path = input_fasta.replace('_hits.aln.fa', '_clustered.fa')
            cluster_dict = {}
            logging.debug('Clustering reads')
            if os.path.exists(input_fasta):
                reads = self.seqio.read_fasta_file(
                    input_fasta)  # Read in FASTA records
                logging.debug('Found %i reads' %
                              len(reads))  # Report number found
                clusters = self.clust.deduplicate(
                    reads)  # Cluster redundant sequences
                logging.debug('Clustered to %s groups' %
                              len(clusters))  # Report number of clusters
                logging.debug(
                    'Writing representative sequences of each cluster to: %s'
                    % output_path)  # Report the name of the file
            else:
                logging.debug("Found no reads to be clustered")
                clusters = []
            self.seqio.write_fasta_file(
                [x[0] for x in clusters], output_path
            )  # Choose the first sequence to write to file as representative (all the same anyway)
            for cluster in clusters:
                cluster_dict[cluster[
                    0].name] = cluster  # assign the cluster to the dictionary
            self.seq_library[output_path] = cluster_dict
            output_fasta_list.append(output_path)
        return output_fasta_list
class Clusterer:
    '''Collapses identical reads into clusters before placement and later
    expands (unclusters) per-representative classifications back to every
    member read.'''

    def __init__(self):
        # Collapses identical sequences into clusters.
        self.clust = Deduplicator()
        # Shared FASTA reader/writer.
        self.seqio = SequenceIO()
        # Maps clustered-output fasta path -> {representative name: cluster}.
        self.seq_library = {}
        self.orfm_regex = OrfM.regular_expression()

    def uncluster_annotations(self, input_annotations, reverse_pipe):
        '''
        Update the annotations hash provided by pplacer to include all
        representatives within each cluster

        Parameters
        ----------
        input_annotations : hash
            Classifications for each representative sequence of the
            clusters. each key being the sequence name, and the entry being
            the taxonomy string as a list.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being
            followed.

        Returns
        -------
        output_annotations : hash
            An updated version of the above, which includes all reads from
            each cluster
        '''
        output_annotations = {}
        # .items() rather than the Python-2-only .iteritems(), so this class
        # also runs under Python 3 (matches the sibling implementation).
        for placed_alignment_file_path, clusters in self.seq_library.items():
            if reverse_pipe and placed_alignment_file_path.endswith("_reverse_clustered.fa"):
                continue
            placed_alignment_file = os.path.basename(placed_alignment_file_path)
            cluster_classifications = input_annotations[placed_alignment_file]
            # Strip the clustering suffix to recover the sample base name.
            if reverse_pipe:
                placed_alignment_base = placed_alignment_file.replace('_forward_clustered.fa', '')
            else:
                placed_alignment_base = placed_alignment_file.replace('_clustered.fa', '')
            output_annotations[placed_alignment_base] = {}
            for rep_read_name, rep_read_taxonomy in cluster_classifications.items():
                if reverse_pipe:
                    # Re-key clusters with OrfM decoration stripped (where
                    # the regex matches) so representative names line up.
                    orfm_regex = OrfM.regular_expression()
                    clusters = {(orfm_regex.match(key).groups(0)[0]
                                 if orfm_regex.match(key) else key): item
                                for key, item in clusters.items()}
                # Every member of the cluster inherits the representative's
                # taxonomy.
                for read in clusters[rep_read_name]:
                    output_annotations[placed_alignment_base][read.name] = rep_read_taxonomy
        return output_annotations

    def cluster(self, input_fasta_list, reverse_pipe):
        '''
        cluster - Clusters reads at 100% identity level and writes them to
        file. Resets the input_fasta variable as the FASTA file containing
        the clusters.

        Parameters
        ----------
        input_fasta_list : list
            list of strings, each a path to input fasta files to be
            clustered.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being
            followed.

        Returns
        -------
        output_fasta_list : list
            list of strings, each a path to the output fasta file to which
            clusters were written to.
        '''
        output_fasta_list = []
        for input_fasta in input_fasta_list:
            output_path = input_fasta.replace('_hits.aln.fa', '_clustered.fa')
            cluster_dict = {}
            logging.debug('Clustering reads')
            if os.path.exists(input_fasta):
                reads = self.seqio.read_fasta_file(input_fasta)  # Read in FASTA records
                logging.debug('Found %i reads' % len(reads))  # Report number found
                clusters = self.clust.deduplicate(reads)  # Cluster redundant sequences
                logging.debug('Clustered to %s groups' % len(clusters))  # Report number of clusters
                logging.debug('Writing representative sequences of each cluster to: %s' % output_path)  # Report the name of the file
            else:
                logging.debug("Found no reads to be clustered")
                clusters = []
            # Choose the first sequence to write to file as representative
            # (all the same anyway).
            self.seqio.write_fasta_file([x[0] for x in clusters], output_path)
            for cluster in clusters:
                cluster_dict[cluster[0].name] = cluster  # assign the cluster to the dictionary
            self.seq_library[output_path] = cluster_dict
            output_fasta_list.append(output_path)
        return output_fasta_list
def generate_expand_search_database_from_contigs(self, contig_files, output_database_file, search_method):
    '''Given a collection of search_hmm_files, search the contigs in
    contig_files, and generate an HMM from the resulting hits, outputting
    it as output_database_file.

    Parameters
    ----------
    contig_files: list of str
        list of files to search
    output_database_file: str
        path to output file
    search_method: str
        "diamond" or "hmmsearch", to specify search method to use and what
        type of database to build.

    Returns
    -------
    True if genes were recovered, else False'''
    ss = SequenceSearcher(self.search_hmm_files)
    seqio = SequenceIO()
    # The diamond path needs both a diamond DB and unaligned sequences to
    # concatenate against; bail out early if either is missing.
    if search_method == self.DIAMOND_SEARCH_METHOD:
        if self.diamond_database == None or self.unaligned_sequence_database == None:
            logging.warning(
                "Cannot expand_search continue with no diamond database or unaligned sequences."
            )
            return False

    with tempfile.NamedTemporaryFile(
            prefix='graftm_expand_search_orfs') as orfs:
        logging.info("Finding expand_search hits in provided contigs..")
        for contig_file in contig_files:
            logging.debug("Finding expand_search hits in %s.." % contig_file)
            unpack = UnpackRawReads(contig_file)
            with tempfile.NamedTemporaryFile(prefix='graftm_expand_search') as \
                    hit_reads_orfs_fasta:
                # search and extract matching ORFs
                with tempfile.NamedTemporaryFile(prefix='graftm_expand_search2') as \
                        hmmsearch_output_table:
                    with tempfile.NamedTemporaryFile(prefix='graftm_expand_search3') as \
                            hit_reads_fasta:
                        ss.search_and_extract_orfs_matching_protein_database(\
                            unpack,
                            search_method,
                            self.maximum_range,
                            self.threads,
                            self.evalue,
                            self.min_orf_length,
                            None,
                            (self.diamond_database if self.diamond_database else None),
                            hmmsearch_output_table.name,
                            hit_reads_fasta.name,
                            hit_reads_orfs_fasta.name)
                # Append to the file
                # NOTE(review): the open() handle passed to copyfileobj is
                # never explicitly closed.
                shutil.copyfileobj(open(hit_reads_orfs_fasta.name), orfs)
        # Now have a fasta file of ORFs.
        # Check to make sure the file is not zero-length
        orfs.flush()
        with tempfile.NamedTemporaryFile(
                prefix="graftm_expand_search_aln") as aln:
            if search_method == self.HMM_SEARCH_METHOD:
                # Check that there is more than one sequence to align.
                if len(
                        seqio.read_fasta_file(orfs.name)
                ) <= 1:  # Just to build on this, you need to check if there is > 1 hit
                    # otherwise mafft will fail to align, causing a crash when hmmbuild is
                    # run on an empty file.
                    logging.warn(
                        "Failed to find two or more matching ORFs in the expand_search contigs"
                    )
                    return False
                # Run mafft to align them
                cmd = "mafft --auto %s >%s" % (orfs.name, aln.name)
                logging.info("Aligning expand_search hits..")
                extern.run(cmd)
                # Run hmmbuild to create an HMM
                cmd = "hmmbuild --amino %s %s >/dev/null" % (
                    output_database_file, aln.name)
                logging.info("Building HMM from expand_search hits..")
                extern.run(cmd)
            elif search_method == self.DIAMOND_SEARCH_METHOD:
                # Concatenate database with existing database
                # NOTE(review): databasefile is opened in the default binary
                # mode but written str lines read from text-mode handles —
                # likely a Python 3 TypeError; confirm before relying on
                # this branch.
                with tempfile.NamedTemporaryFile(
                        prefix="concatenated_database") as databasefile:
                    for f in [orfs.name, self.unaligned_sequence_database]:
                        for line in open(f):
                            databasefile.write(line)
                    databasefile.flush()
                    # Run diamond make to create a diamond database
                    cmd = "diamond makedb --in '%s' -d '%s'" % (
                        databasefile.name, output_database_file)
                    logging.info(
                        "Building a diamond database from expand_search hits.."
                    )
                    extern.run(cmd)
            else:
                raise Exception("Search method not recognised: %s" %
                                search_method)
                # NOTE(review): unreachable — the raise above exits first.
                return False
        return True
else: loglevel = logging.INFO logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') # Read in taxonomy logging.info("Reading taxonomy..") gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy logging.info("Read in %i taxonomies" % len(gg)) # Read in sequence logging.info("Reading sequences..") duplicates = set() sequences = {} for name, seq, _ in SequenceIO()._readfq(open(args.sequences)): if name in sequences: logging.error("Duplicate sequence name %s" % name) duplicates.add(name) else: sequences[name] = seq logging.warn("Found %i duplicated IDs" % len(duplicates)) for dup in duplicates: del sequences[dup] logging.info("Read in %i sequences" % len(sequences)) # Ensure that each sequence in the taxonomy has an associated sequence, # otherwise delete it tax_no_seq = set() for name, taxonomy in gg.items(): if name not in sequences:
def main(self, **kwargs):
    '''Build a GraftM package (gpkg) from the keyword arguments, then test
    that the finished package works by running graftM on it.

    Keyword arguments (all optional unless noted): alignment, sequences,
    taxonomy, rerooted_tree, unrooted_tree, tree_log, prefix,
    rerooted_annotated_tree, hmm, search_hmm_files, min_aligned_percent,
    taxtastic_taxonomy, taxtastic_seqinfo, force, graftm_package,
    dereplication_level, threads. At least one taxonomy source (taxonomy /
    rerooted_annotated_tree / taxtastic taxonomy+seqinfo) is required, and
    one of sequences/alignment must be given.

    Raises
    ------
    Exception
        on unexpected kwargs, duplicate sequence names, missing taxonomy,
        a pre-existing output path without force, unannotated sequences,
        or too few sequences remaining after quality filtering.
    '''
    # Unpack all supported keyword arguments; anything left over is a
    # caller error.
    alignment = kwargs.pop('alignment',None)
    sequences = kwargs.pop('sequences',None)
    taxonomy = kwargs.pop('taxonomy',None)
    rerooted_tree = kwargs.pop('rerooted_tree',None)
    unrooted_tree = kwargs.pop('unrooted_tree',None)
    tree_log = kwargs.pop('tree_log', None)
    prefix = kwargs.pop('prefix', None)
    rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
    user_hmm = kwargs.pop('hmm', None)
    search_hmm_files = kwargs.pop('search_hmm_files',None)
    min_aligned_percent = kwargs.pop('min_aligned_percent',0.01)
    taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
    taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
    force_overwrite = kwargs.pop('force',False)
    graftm_package = kwargs.pop('graftm_package',False)
    dereplication_level = kwargs.pop('dereplication_level',False)
    threads = kwargs.pop('threads',5)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    seqio = SequenceIO()
    # The locus name (and default output name) is the first dot-separated
    # part of the input file's basename.
    locus_name = (os.path.basename(sequences).split('.')[0]
                  if sequences
                  else os.path.basename(alignment).split('.')[0])
    tmp = tempdir.TempDir()
    base = os.path.join(tmp.name, locus_name)
    # Non-empty sentinel; replaced by the real check further below.
    insufficiently_aligned_sequences = [None]
    removed_sequence_names = []
    # Temp files are collected here and closed (deleted) at the very end.
    tempfiles_to_close = []

    if prefix:
        output_gpkg_path = prefix
    else:
        output_gpkg_path = "%s.gpkg" % locus_name

    if os.path.exists(output_gpkg_path):
        if force_overwrite:
            logging.warn("Deleting previous directory %s" % output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        else:
            raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
    logging.info("Building gpkg for %s" % output_gpkg_path)

    # Read in taxonomy somehow
    gtns = Getaxnseq()
    if rerooted_annotated_tree:
        logging.info("Building seqinfo and taxonomy file from input annotated tree")
        taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
            Tree.get(path=rerooted_annotated_tree, schema='newick'))
    elif taxonomy:
        logging.info("Building seqinfo and taxonomy file from input taxonomy")
        taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
    elif taxtastic_seqinfo and taxtastic_taxonomy:
        logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
        taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
            (open(taxtastic_taxonomy),
             open(taxtastic_seqinfo))
    else:
        raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

    # Check for duplicates
    logging.info("Checking for duplicate sequences")
    dup = self._check_for_duplicate_sequence_names(sequences)
    if dup:
        raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)

    output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
    tempfiles_to_close.append(output_alignment_fh)
    output_alignment = output_alignment_fh.name
    if user_hmm:
        align_hmm = user_hmm
    else:
        align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_align.hmm')
        tempfiles_to_close.append(align_hmm_fh)
        align_hmm = align_hmm_fh.name

    if alignment:
        dup = self._check_for_duplicate_sequence_names(alignment)
        if dup:
            raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
        ptype = self._get_hmm_from_alignment(alignment,
                                             align_hmm,
                                             output_alignment)
    else:
        logging.info("Aligning sequences to create aligned FASTA file")
        ptype, output_alignment = self._align_and_create_hmm(sequences, alignment, user_hmm,
                                                             align_hmm, output_alignment, threads)

    logging.info("Checking for incorrect or fragmented reads")
    insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                             min_aligned_percent)
    # Iteratively remove sequences that align over too little of the HMM,
    # re-aligning and re-checking until none remain.
    while len(insufficiently_aligned_sequences) > 0:
        logging.warn("One or more alignments do not span > %.2f %% of HMM" % (min_aligned_percent*100))
        for s in insufficiently_aligned_sequences:
            logging.warn("Insufficient alignment of %s, not including this sequence" % s)

        sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
        tempfiles_to_close.append(sequences2_fh)
        sequences2 = sequences2_fh.name
        num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                              sequences,
                                                              sequences2)
        sequences = sequences2

        if alignment:
            alignment2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
            tempfiles_to_close.append(alignment2_fh)
            alignment2 = alignment2_fh.name
            num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                  alignment,
                                                                  alignment2)
            alignment = alignment2
            for name in insufficiently_aligned_sequences:
                if rerooted_tree or rerooted_annotated_tree:
                    logging.warning('''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) ''' % name)
                removed_sequence_names.append(name)

        logging.info("After removing %i insufficiently aligned sequences, left with %i sequences" % (len(insufficiently_aligned_sequences), num_sequences))
        if num_sequences < 4:
            raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)
        else:
            logging.info("Reconstructing the alignment and HMM from remaining sequences")
            output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
            tempfiles_to_close.append(output_alignment_fh)
            output_alignment = output_alignment_fh.name
            if not user_hmm:
                align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.hmm')
                tempfiles_to_close.append(align_hmm_fh)
                align_hmm = align_hmm_fh.name
            ptype, output_alignment= self._align_and_create_hmm(sequences, alignment, user_hmm,
                                                                align_hmm, output_alignment, threads)
            logging.info("Checking for incorrect or fragmented reads")
            insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                     min_aligned_percent)

    # Build a search HMM from the (filtered) sequences if none was given.
    if not search_hmm_files:
        search_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm',
                                                    suffix='_search.hmm')
        tempfiles_to_close.append(search_hmm_fh)
        search_hmm = search_hmm_fh.name
        self._create_search_hmm(sequences, taxonomy_definition, search_hmm, dereplication_level, threads)
        search_hmm_files = [search_hmm]

    # Make sure each sequence has been assigned a taxonomy:
    aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
    unannotated = []
    for s in aligned_sequence_objects:
        if s.name not in taxonomy_definition:
            unannotated.append(s.name)
    if len(unannotated) > 0:
        for s in unannotated:
            logging.error("Unable to find sequence '%s' in the taxonomy definition" % s)
        raise Exception("All sequences must be assigned a taxonomy, cannot continue")

    logging.debug("Looking for non-standard characters in aligned sequences")
    self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

    # Deduplicate sequences - pplacer cannot handle these
    logging.info("Deduplicating sequences")
    dedup = Deduplicator()
    deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
    deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
    deduplicated_taxonomy_hash = {}
    for i, tax in enumerate(deduplicated_taxonomy):
        deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
    deduplicated_alignment_file = base+"_deduplicated_aligned.fasta"
    seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                           deduplicated_alignment_file)
    logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                 % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                    len(deduplicated_arrays)))

    # Get corresponding unaligned sequences
    # NOTE(review): the loop variable 'list' shadows the builtin; kept
    # byte-identical here.
    filtered_names=[]
    for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
        for seq in list:
            filtered_names.append(seq.name)
    # NOTE(review): filtered_names and sequences2 are not visibly used
    # again within this method — confirm against the full file before
    # removing.
    sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
    tempfiles_to_close.append(sequences2_fh)
    sequences2 = sequences2_fh.name

    # Create tree unless one was provided
    if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
        logging.debug("No tree provided")
        logging.info("Building tree")
        log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                              base, ptype,
                                              self.fasttree)
        no_reroot = False
    else:
        if rerooted_tree:
            logging.debug("Found unannotated pre-rerooted tree file %s" % rerooted_tree)
            tre_file=rerooted_tree
            no_reroot = True
        elif rerooted_annotated_tree:
            logging.debug("Found annotated pre-rerooted tree file %s" % rerooted_tree)
            tre_file=rerooted_annotated_tree
            no_reroot = True
        elif unrooted_tree:
            logging.info("Using input unrooted tree")
            tre_file = unrooted_tree
            no_reroot = False
        else:
            raise

        # Remove any sequences from the tree that are duplicates
        cleaner = DendropyTreeCleaner()
        tree = Tree.get(path=tre_file, schema='newick')
        for group in deduplicated_arrays:
            [removed_sequence_names.append(s.name) for s in group[1:]]
        cleaner.remove_sequences(tree, removed_sequence_names)

        # Ensure there is nothing amiss now as a user-interface thing
        cleaner.match_alignment_and_tree_sequence_ids(\
            [g[0].name for g in deduplicated_arrays], tree)

        if tree_log:
            # User specified a log file, go with that
            logging.debug("Using user-specified log file %s" % tree_log)
            log_file = tree_log
        else:
            logging.info("Generating log file")
            log_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree_log', prefix='graftm')
            tempfiles_to_close.append(log_file_tempfile)
            log_file = log_file_tempfile.name
            tre_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm')
            tempfiles_to_close.append(tre_file_tempfile)
            tre_file = tre_file_tempfile.name
            with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                # Make the newick file simple (ie. un-arb it) for fasttree.
                cleaner.write_fasttree_newick(tree, f)
                f.flush()
                self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                             tre_file, log_file, ptype,
                                             self.fasttree)

    # Create tax and seqinfo .csv files
    taxonomy_to_keep=[
        seq.name for seq in
        [x for x in [x[0] for x in deduplicated_arrays] if x]
    ]
    refpkg = "%s.refpkg" % output_gpkg_path
    self.the_trash.append(refpkg)
    if taxtastic_taxonomy and taxtastic_seqinfo:
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, taxtastic_taxonomy,
                                    taxtastic_seqinfo, refpkg, no_reroot)
    else:
        gtns = Getaxnseq()
        seq = base+"_seqinfo.csv"
        tax = base+"_taxonomy.csv"
        self.the_trash += [seq, tax]
        # Re-derive the taxonomy definition for the seqinfo/taxonomy files.
        if rerooted_annotated_tree:
            logging.info("Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info("Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        else:
            raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

        # Restrict the taxonomy to the deduplicated representatives.
        taxonomy_definition = {x:taxonomy_definition[x] for x in taxonomy_definition
                               if x in taxonomy_to_keep}
        gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition,
                                              tax,
                                              seq)

        # Create the reference package
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, tax, seq, refpkg,
                                    no_reroot)
    if sequences:
        # Run diamond makedb
        logging.info("Creating diamond database")
        if ptype == Create._PROTEIN_PACKAGE_TYPE:
            cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
            extern.run(cmd)
            diamondb = '%s.dmnd' % base
        elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
            diamondb = None
        else:
            raise Exception("Programming error")
    else:
        diamondb = None

    if sequences:
        # Get range
        max_range = self._define_range(sequences)
    else:
        max_range = self._define_range(alignment)

    # Compile the gpkg
    logging.info("Compiling gpkg")
    GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm, diamondb,
                                  max_range, sequences,
                                  search_hmm_files=search_hmm_files)

    logging.info("Cleaning up")
    self._cleanup(self.the_trash)
    for tf in tempfiles_to_close:
        tf.close()

    # Test out the gpkg just to be sure.
    #
    # TODO: Use graftM through internal means rather than via extern. This
    # requires some refactoring so that graft() can be called easily with
    # sane defaults.
    logging.info("Testing gpkg package works")
    self._test_package(output_gpkg_path)

    logging.info("Finished\n")
def main(self, **kwargs):
    '''Create a GraftM package (gpkg) from unaligned sequences and/or an
    alignment, plus a taxonomy source (GreenGenes-style taxonomy file,
    annotated rerooted tree, or taxtastic taxonomy+seqinfo files).

    All parameters are passed as keyword arguments; unexpected keys raise.

    Parameters
    ----------
    alignment: str or None
        path to an aligned FASTA file; if None, `sequences` is aligned here
    sequences: str or None
        path to an unaligned FASTA file
    taxonomy: str or None
        path to a GreenGenes-style taxonomy file
    rerooted_tree, unrooted_tree, rerooted_annotated_tree: str or None
        optional newick tree inputs (at most one is used)
    tree_log: str or None
        pre-computed tree log file; generated if not given
    prefix: str or None
        output gpkg path; defaults to "<locus_name>.gpkg"
    hmm: str or None
        user-supplied alignment HMM
    search_hmm_files: list of str or None
        search HMMs; one is built from `sequences` if not given
    min_aligned_percent: float
        sequences spanning less than this fraction of the HMM are removed
    taxtastic_taxonomy, taxtastic_seqinfo: str or None
        alternative taxonomy inputs (taxtastic CSV files)
    force: bool
        overwrite an already-existing output gpkg path
    dereplication_level: int or False
        passed through to search-HMM creation
    threads: int
        number of threads for alignment/HMM/tree steps (default 5)

    Raises
    ------
    Exception
        on unexpected kwargs, duplicate sequence names, missing taxonomy,
        sequences lacking a taxonomy entry, fewer than 4 sequences remaining
        after filtering, or a pre-existing output path without force.
    '''
    alignment = kwargs.pop('alignment', None)
    sequences = kwargs.pop('sequences', None)
    taxonomy = kwargs.pop('taxonomy', None)
    rerooted_tree = kwargs.pop('rerooted_tree', None)
    unrooted_tree = kwargs.pop('unrooted_tree', None)
    tree_log = kwargs.pop('tree_log', None)
    prefix = kwargs.pop('prefix', None)
    rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
    user_hmm = kwargs.pop('hmm', None)
    search_hmm_files = kwargs.pop('search_hmm_files', None)
    min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
    taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
    taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
    force_overwrite = kwargs.pop('force', False)
    # Accepted for interface compatibility but not used in this method.
    kwargs.pop('graftm_package', False)
    dereplication_level = kwargs.pop('dereplication_level', False)
    threads = kwargs.pop('threads', 5)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    seqio = SequenceIO()
    # Locus name is taken from the input file's basename (before the first '.')
    locus_name = (os.path.basename(sequences).split('.')[0]
                  if sequences else os.path.basename(alignment).split('.')[0])
    tmp = tempdir.TempDir()
    base = os.path.join(tmp.name, locus_name)
    removed_sequence_names = []

    if prefix:
        output_gpkg_path = prefix
    else:
        output_gpkg_path = "%s.gpkg" % locus_name

    if os.path.exists(output_gpkg_path):
        if force_overwrite:
            logging.warning("Deleting previous directory %s" % output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        else:
            raise Exception(
                "Cowardly refusing to overwrite gpkg to already existing %s" %
                output_gpkg_path)
    logging.info("Building gpkg for %s" % output_gpkg_path)

    # Read in taxonomy somehow
    gtns = Getaxnseq()
    if rerooted_annotated_tree:
        logging.info(
            "Building seqinfo and taxonomy file from input annotated tree")
        taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
            Tree.get(path=rerooted_annotated_tree, schema='newick'))
    elif taxonomy:
        logging.info(
            "Building seqinfo and taxonomy file from input taxonomy")
        taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
    elif taxtastic_seqinfo and taxtastic_taxonomy:
        logging.info(
            "Reading taxonomy from taxtastic taxonomy and seqinfo files")
        # Close the handles once parsed (previously they were leaked).
        with open(taxtastic_taxonomy) as tax_io, \
             open(taxtastic_seqinfo) as seqinfo_io:
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo(
                tax_io, seqinfo_io)
    else:
        raise Exception(
            "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
        )

    # Check for duplicates
    logging.info("Checking for duplicate sequences")
    dup = self._check_for_duplicate_sequence_names(sequences)
    if dup:
        raise Exception(
            "Found duplicate sequence name '%s' in sequences input file" % dup)
    output_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                   suffix='.aln.faa').name
    align_hmm = (user_hmm if user_hmm else tempfile.NamedTemporaryFile(
        prefix='graftm', suffix='_align.hmm').name)
    if alignment:
        dup = self._check_for_duplicate_sequence_names(alignment)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in alignment input file" %
                dup)
        ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                             output_alignment)
    else:
        logging.info("Aligning sequences to create aligned FASTA file")
        ptype, output_alignment = self._align_and_create_hmm(
            sequences, alignment, user_hmm, align_hmm, output_alignment,
            threads)

    logging.info("Checking for incorrect or fragmented reads")
    with open(output_alignment) as aln_io:
        insufficiently_aligned_sequences = self._check_reads_hit(
            aln_io, min_aligned_percent)
    # Iterate until no sequence falls below the alignment-coverage cutoff,
    # rebuilding the alignment and HMM each round.
    while len(insufficiently_aligned_sequences) > 0:
        logging.warning(
            "One or more alignments do not span > %.2f %% of HMM" %
            (min_aligned_percent * 100))
        for s in insufficiently_aligned_sequences:
            logging.warning(
                "Insufficient alignment of %s, not including this sequence" %
                s)

        _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')
        num_sequences = self._remove_sequences_from_alignment(
            insufficiently_aligned_sequences, sequences, sequences2)
        sequences = sequences2

        if alignment:
            _, alignment2 = tempfile.mkstemp(prefix='graftm',
                                             suffix='.aln.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, alignment, alignment2)
            alignment = alignment2

        for name in insufficiently_aligned_sequences:
            if rerooted_tree or rerooted_annotated_tree:
                logging.warning(
                    '''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) ''' % name)
            removed_sequence_names.append(name)

        logging.info(
            "After removing %i insufficiently aligned sequences, left with %i sequences"
            % (len(insufficiently_aligned_sequences), num_sequences))
        if num_sequences < 4:
            raise Exception(
                "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                % num_sequences)
        else:
            logging.info(
                "Reconstructing the alignment and HMM from remaining sequences"
            )
            output_alignment = tempfile.NamedTemporaryFile(
                prefix='graftm', suffix='.aln.faa').name
            if not user_hmm:
                align_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                        suffix='.hmm').name
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)
            logging.info("Checking for incorrect or fragmented reads")
            with open(output_alignment) as aln_io:
                insufficiently_aligned_sequences = self._check_reads_hit(
                    aln_io, min_aligned_percent)

    if not search_hmm_files:
        search_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                 suffix='_search.hmm').name
        self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                dereplication_level, threads)
        search_hmm_files = [search_hmm]

    # Make sure each sequence has been assigned a taxonomy:
    aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
    unannotated = [s.name for s in aligned_sequence_objects
                   if s.name not in taxonomy_definition]
    if len(unannotated) > 0:
        for s in unannotated:
            logging.error(
                "Unable to find sequence '%s' in the taxonomy definition" % s)
        raise Exception(
            "All sequences must be assigned a taxonomy, cannot continue")

    logging.debug(
        "Looking for non-standard characters in aligned sequences")
    self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

    # Deduplicate sequences - pplacer cannot handle these
    logging.info("Deduplicating sequences")
    dedup = Deduplicator()
    deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
    deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                               taxonomy_definition)
    deduplicated_taxonomy_hash = {}
    for i, tax in enumerate(deduplicated_taxonomy):
        deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
    deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
    seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                           deduplicated_alignment_file)

    logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"
                 % ((len(aligned_sequence_objects) - len(deduplicated_arrays)),
                    len(deduplicated_arrays)))

    # Create tree unless one was provided
    if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
        logging.debug("No tree provided")
        logging.info("Building tree")
        log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                              base, ptype, self.fasttree)
        no_reroot = False
    else:
        if rerooted_tree:
            logging.debug("Found unannotated pre-rerooted tree file %s" %
                          rerooted_tree)
            tre_file = rerooted_tree
            no_reroot = True
        elif rerooted_annotated_tree:
            # BUGFIX: previously logged rerooted_tree (None in this branch).
            logging.debug("Found annotated pre-rerooted tree file %s" %
                          rerooted_annotated_tree)
            tre_file = rerooted_annotated_tree
            no_reroot = True
        elif unrooted_tree:
            logging.info("Using input unrooted tree")
            tre_file = unrooted_tree
            no_reroot = False
        else:
            # Unreachable: the enclosing else guarantees one tree was given.
            # (A bare `raise` here would itself raise RuntimeError.)
            raise Exception("Programming error: unexpected tree input state")

        # Remove any sequences from the tree that are duplicates
        cleaner = DendropyTreeCleaner()
        tree = Tree.get(path=tre_file, schema='newick')
        for group in deduplicated_arrays:
            for s in group[1:]:
                removed_sequence_names.append(s.name)
        cleaner.remove_sequences(tree, removed_sequence_names)

        # Ensure there is nothing amiss now as a user-interface thing
        cleaner.match_alignment_and_tree_sequence_ids(
            [g[0].name for g in deduplicated_arrays], tree)

        if tree_log:
            # User specified a log file, go with that
            logging.debug("Using user-specified log file %s" % tree_log)
            log_file = tree_log
        else:
            logging.info("Generating log file")
            log_file_tempfile = tempfile.NamedTemporaryFile(
                suffix='.tree_log', prefix='graftm')
            log_file = log_file_tempfile.name
            tre_file_tempfile = tempfile.NamedTemporaryFile(
                suffix='.tree', prefix='graftm')
            tre_file = tre_file_tempfile.name
            with tempfile.NamedTemporaryFile(suffix='.tree',
                                             prefix='graftm') as f:
                # Make the newick file simple (ie. un-arb it) for fasttree.
                cleaner.write_fasttree_newick(tree, f)
                f.flush()
                self._generate_tree_log_file(f.name,
                                             deduplicated_alignment_file,
                                             tre_file, log_file, ptype,
                                             self.fasttree)

    # Create tax and seqinfo .csv files
    taxonomy_to_keep = [
        seq.name
        for seq in [x for x in [x[0] for x in deduplicated_arrays] if x]
    ]
    refpkg = "%s.refpkg" % output_gpkg_path
    self.the_trash.append(refpkg)
    if taxtastic_taxonomy and taxtastic_seqinfo:
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, taxtastic_taxonomy,
                                    taxtastic_seqinfo, refpkg, no_reroot)
    else:
        gtns = Getaxnseq()
        seq = base + "_seqinfo.csv"
        tax = base + "_taxonomy.csv"
        self.the_trash += [seq, tax]
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree"
            )
            taxonomy_definition = TaxonomyExtractor(
            ).taxonomy_from_annotated_tree(
                Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        else:
            raise Exception(
                "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Keep only taxonomy entries for the representative sequences.
        taxonomy_definition = {
            x: taxonomy_definition[x]
            for x in taxonomy_definition if x in taxonomy_to_keep
        }

        gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax, seq)

        # Create the reference package
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, tax, seq, refpkg,
                                    no_reroot)
    if sequences:
        # Run diamond makedb
        logging.info("Creating diamond database")
        if ptype == Create._PROTEIN_PACKAGE_TYPE:
            cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
            extern.run(cmd)
            diamondb = '%s.dmnd' % base
        elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
            diamondb = None
        else:
            raise Exception("Programming error")
    else:
        diamondb = None

    if sequences:
        # Get range
        max_range = self._define_range(sequences)
    else:
        max_range = self._define_range(alignment)

    # Compile the gpkg
    logging.info("Compiling gpkg")
    GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm,
                                  diamondb, max_range, sequences,
                                  search_hmm_files=search_hmm_files)

    logging.info("Cleaning up")
    self._cleanup(self.the_trash)

    # Test out the gpkg just to be sure.
    #
    # TODO: Use graftM through internal means rather than via extern. This
    # requires some refactoring so that graft() can be called easily with
    # sane defaults.
    logging.info("Testing gpkg package works")
    self._test_package(output_gpkg_path)

    logging.info("Finished\n")
def _assign_taxonomy_with_diamond(self, base_list, db_search_results,
                                  graftm_package, graftm_files,
                                  diamond_performance_parameters):
    '''Run diamond to assign taxonomy

    Parameters
    ----------
    base_list: list of str
        list of sequence block names
    db_search_results: list of DBSearchResult
        the result of running hmmsearches
    graftm_package: GraftMPackage object
        Diamond is run against this database
    graftm_files: GraftMFiles object
        Result files are written here
    diamond_performance_parameters : str
        extra args for DIAMOND

    Returns
    -------
    dict
        maps each base_list entry to a dict of read name to taxonomy
        (a list of strings starting with 'Root'); reads with no diamond
        hit map to ['Root'] alone, and blocks with no hit FASTA map to
        an empty dict.
    '''
    runner = Diamond(graftm_package.diamond_database_path(),
                     self.args.threads, self.args.evalue)
    # Close the taxtastic handles once parsed (previously they were leaked).
    with open(graftm_package.taxtastic_taxonomy_path()) as tax_io, \
         open(graftm_package.taxtastic_seqinfo_path()) as seqinfo_io:
        taxonomy_definition = Getaxnseq().read_taxtastic_taxonomy_and_seqinfo(
            tax_io, seqinfo_io)
    results = {}

    # For each of the search results,
    for i, search_result in enumerate(db_search_results):
        if search_result.hit_fasta() is None:
            sequence_id_to_taxonomy = {}
        else:
            sequence_id_to_hit = {}
            # Run diamond
            logging.debug("Running diamond on %s" % search_result.hit_fasta())
            diamond_result = runner.run(
                search_result.hit_fasta(),
                UnpackRawReads.PROTEIN_SEQUENCE_TYPE,
                daa_file_basename=graftm_files.
                diamond_assignment_output_basename(base_list[i]),
                extra_args=diamond_performance_parameters)
            for res in diamond_result.each([
                    SequenceSearchResult.QUERY_ID_FIELD,
                    SequenceSearchResult.HIT_ID_FIELD
            ]):
                if res[0] in sequence_id_to_hit:
                    # do not accept duplicates
                    if sequence_id_to_hit[res[0]] != res[1]:
                        raise Exception(
                            "Diamond unexpectedly gave two hits for a single query sequence for %s"
                            % res[0])
                else:
                    sequence_id_to_hit[res[0]] = res[1]

            # Extract taxonomy of the best hit, and add in the no hits
            sequence_id_to_taxonomy = {}
            # Each item is a sequence record (renamed from the misleading
            # 'seqio' — it is not a SequenceIO instance).
            for hit_seq in SequenceIO().read_fasta_file(
                    search_result.hit_fasta()):
                name = hit_seq.name
                if name in sequence_id_to_hit:
                    # Add Root; to be in line with pplacer assignment method
                    sequence_id_to_taxonomy[name] = [
                        'Root'
                    ] + taxonomy_definition[sequence_id_to_hit[name]]
                else:
                    # picked up in the initial search (by hmmsearch, say), but diamond misses it
                    sequence_id_to_taxonomy[name] = ['Root']

        results[base_list[i]] = sequence_id_to_taxonomy
    return results