def test__guess_sequence_type(self):
    '''Check the type guessed by _guess_sequence_type_from_string for a
    handful of fixed strings.'''
    reader = UnpackRawReads(None)
    # (expected type, input string) pairs, checked in order.
    cases = (
        ('aminoacid', 'P' * 10),
        ('aminoacid', 'P' * 10 + 'T' * 89),
        ('nucleotide', 'P' * 10 + 'T' * 90),
        # only look at the first 300bp
        ('nucleotide', 'A' * 300 + 'E' * 999),
        # lowercase
        ('nucleotide', 'a' * 10 + 'T' * 89),
    )
    for expected, sequence in cases:
        self.assertEqual(
            expected, reader._guess_sequence_type_from_string(sequence))
def test__guess_sequence_type(self):
    '''Exercise _guess_sequence_type_from_string on short, mixed, long and
    lowercase inputs and check the reported sequence type.'''
    guess = UnpackRawReads(None)._guess_sequence_type_from_string

    prolines = 'P' * 10
    self.assertEqual('aminoacid', guess(prolines))

    # 10 P followed by 89 T is reported as amino acid, but with 90 T the
    # same prefix is reported as nucleotide.
    self.assertEqual('aminoacid', guess(prolines + 'T' * 89))
    self.assertEqual('nucleotide', guess(prolines + 'T' * 90))

    # only look at the first 300bp
    long_sequence = 'A' * 300 + 'E' * 999
    self.assertEqual('nucleotide', guess(long_sequence))

    # lowercase
    lowercase_start = 'a' * 10 + 'T' * 89
    self.assertEqual('nucleotide', guess(lowercase_start))
def test_stars(self):
    '''A run of 'P' characters followed by a '*' is guessed as amino acid.'''
    sequence = "".join(['P' * 10, "*"])
    guessed = UnpackRawReads(None)._guess_sequence_type_from_string(sequence)
    self.assertEqual('aminoacid', guessed)
def generate_expand_search_database_from_contigs(self, contig_files,
                                                 output_database_file,
                                                 search_method):
    '''Search the contigs in contig_files with self.search_hmm_files (and
    self.diamond_database when set), collect the matching ORFs, and build
    a new search database from them at output_database_file.

    For self.HMM_SEARCH_METHOD the ORFs are aligned with mafft and an HMM
    is built with hmmbuild. For self.DIAMOND_SEARCH_METHOD the ORFs are
    concatenated with self.unaligned_sequence_database and a diamond
    database is built with "diamond makedb".

    Parameters
    ----------
    contig_files: list of str
        list of files to search
    output_database_file: str
        path to output file
    search_method: str
        "diamond" or "hmmsearch", to specify search method to use and what
        type of database to build (compared against
        self.DIAMOND_SEARCH_METHOD and self.HMM_SEARCH_METHOD).

    Returns
    -------
    True if a database was built. False if the diamond method was requested
    without a diamond database or unaligned sequences, or if the HMM method
    found fewer than two matching ORFs.

    Raises
    ------
    Exception
        if search_method is neither of the recognised methods. Note that
        this is only detected after the contigs have been searched.'''
    ss = SequenceSearcher(self.search_hmm_files)
    seqio = SequenceIO()

    if search_method == self.DIAMOND_SEARCH_METHOD:
        if self.diamond_database is None or \
                self.unaligned_sequence_database is None:
            logging.warning(
                "Cannot expand_search continue with no diamond database or unaligned sequences."
            )
            return False

    with tempfile.NamedTemporaryFile(
            prefix='graftm_expand_search_orfs') as orfs:
        logging.info("Finding expand_search hits in provided contigs..")
        for contig_file in contig_files:
            logging.debug("Finding expand_search hits in %s.." % contig_file)
            unpack = UnpackRawReads(contig_file)

            with tempfile.NamedTemporaryFile(
                    prefix='graftm_expand_search') as hit_reads_orfs_fasta:
                # search and extract matching ORFs
                with tempfile.NamedTemporaryFile(
                        prefix='graftm_expand_search2') as hmmsearch_output_table:
                    with tempfile.NamedTemporaryFile(
                            prefix='graftm_expand_search3') as hit_reads_fasta:
                        ss.search_and_extract_orfs_matching_protein_database(
                            unpack,
                            search_method,
                            self.maximum_range,
                            self.threads,
                            self.evalue,
                            self.min_orf_length,
                            None,
                            (self.diamond_database
                             if self.diamond_database else None),
                            hmmsearch_output_table.name,
                            hit_reads_fasta.name,
                            hit_reads_orfs_fasta.name)

                # Append to the file. The source is read in binary mode
                # because NamedTemporaryFile defaults to a binary handle,
                # and it is closed promptly rather than leaked.
                with open(hit_reads_orfs_fasta.name, 'rb') as hits:
                    shutil.copyfileobj(hits, orfs)

        # Now have a fasta file of ORFs. Flush so that the external
        # programs and readers below see everything written so far.
        orfs.flush()

        with tempfile.NamedTemporaryFile(
                prefix="graftm_expand_search_aln") as aln:
            if search_method == self.HMM_SEARCH_METHOD:
                # Check that there is more than one sequence to align,
                # otherwise mafft will fail to align, causing a crash when
                # hmmbuild is run on an empty file.
                if len(seqio.read_fasta_file(orfs.name)) <= 1:
                    logging.warning(
                        "Failed to find two or more matching ORFs in the expand_search contigs"
                    )
                    return False

                # Run mafft to align them
                cmd = "mafft --auto %s >%s" % (orfs.name, aln.name)
                logging.info("Aligning expand_search hits..")
                extern.run(cmd)

                # Run hmmbuild to create an HMM
                cmd = "hmmbuild --amino %s %s >/dev/null" % (
                    output_database_file, aln.name)
                logging.info("Building HMM from expand_search hits..")
                extern.run(cmd)

            elif search_method == self.DIAMOND_SEARCH_METHOD:
                # Concatenate database with existing database
                with tempfile.NamedTemporaryFile(
                        prefix="concatenated_database") as databasefile:
                    for path in [orfs.name, self.unaligned_sequence_database]:
                        # Binary copy into the binary temporary file; each
                        # source handle is closed as soon as it is copied.
                        with open(path, 'rb') as source:
                            shutil.copyfileobj(source, databasefile)
                    databasefile.flush()

                    # Run diamond make to create a diamond database
                    cmd = "diamond makedb --in '%s' -d '%s'" % (
                        databasefile.name, output_database_file)
                    logging.info(
                        "Building a diamond database from expand_search hits.."
                    )
                    extern.run(cmd)

            else:
                raise Exception("Search method not recognised: %s" %
                                search_method)

    return True
def test_stars(self):
    '''Ten 'P' characters with a trailing '*' should be reported as
    'aminoacid'.'''
    urr = UnpackRawReads(None)
    with_star = '%s*' % ('P' * 10)
    observed = urr._guess_sequence_type_from_string(with_star)
    self.assertEqual('aminoacid', observed)
def graft(self):
    '''Run the graft pipeline as configured by self.args.

    Stages, in order:
      1. work out maximum_range and the diamond database to search with,
         from the GraftM package if one was given, otherwise from the
         command line arguments;
      2. create the output directory and choose the pipeline type
         (self.args.type) and alignment filter minimum;
      3. optionally build an expand_search database from contigs;
      4. search each input file (self.sequence_pair_list), optionally
         filter decoys, and align the hits when pplacer is the assignment
         method;
      5. write the search OTU table;
      6. optionally merge forward/reverse alignments;
      7. assign taxonomy (pplacer or diamond) and call self.summarise.

    Side effects: mutates self.args (search_method, reverse, type, evalue),
    sets self.gmf, writes into self.args.output_directory, and calls exit()
    when --search_only / --search_and_align_only stop the pipeline early,
    when no ORFs are found, or when no hits are found.

    Raises
    ------
    Exception
        if a diamond-based search method is selected without a usable
        diamond database, if merge_reads is set but self.args has no
        'reverse' attribute, or if the assignment method is unexpected.'''
    # The Graft pipeline:
    # Searches for reads using hmmer, and places them in phylogenetic
    # trees to derive a community structure.
    if self.args.graftm_package:
        gpkg = GraftMPackage.acquire(self.args.graftm_package)
    else:
        gpkg = None

    REVERSE_PIPE = bool(self.args.reverse)
    INTERLEAVED = bool(self.args.interleaved)
    base_list = []
    seqs_list = []
    search_results = []
    hit_read_count_list = []
    db_search_results = []

    if gpkg:
        maximum_range = gpkg.maximum_range()

        if self.args.search_diamond_file:
            # An explicit diamond file forces the diamond search method.
            self.args.search_method = self.hk.DIAMOND_SEARCH_METHOD
            diamond_db = self.args.search_diamond_file[0]
        else:
            diamond_db = gpkg.diamond_database_path()
            if self.args.search_method == self.hk.DIAMOND_SEARCH_METHOD:
                if not diamond_db:
                    # Adjacent literals rather than backslash continuations
                    # inside the string, so that source indentation does
                    # not leak into the logged message.
                    logging.error(
                        "%s search method selected, but no diamond database specified. "
                        "Please either provide a gpkg to the --graftm_package flag, or a diamond "
                        "database to the --search_diamond_file flag."
                        % self.args.search_method)
                    raise Exception()
    else:
        # Get the maximum range, if none exists, make one from the HMM profile
        if self.args.maximum_range:
            maximum_range = self.args.maximum_range
        else:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                if not self.args.search_only:
                    maximum_range = self.hk.get_maximum_range(
                        self.args.aln_hmm_file)
                else:
                    logging.debug(
                        "Running search only pipeline. maximum_range not configured."
                    )
                    maximum_range = None
            else:
                logging.warning(
                    'Cannot determine maximum range when using %s pipeline and with no GraftM package specified'
                    % self.args.search_method)
                logging.warning(
                    'Setting maximum_range to None (linked hits will not be detected)'
                )
                maximum_range = None

        if self.args.search_diamond_file:
            # NOTE(review): the gpkg branch above takes element [0] of
            # search_diamond_file whereas this branch passes the value
            # through unchanged - confirm which is intended.
            diamond_db = self.args.search_diamond_file
        else:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                diamond_db = None
            else:
                message = (
                    "%s search method selected, but no gpkg or diamond database selected"
                    % self.args.search_method)
                logging.error(message)
                # Fail here, as the gpkg branch does. Otherwise diamond_db
                # is left unbound and the pipeline dies later with an
                # unrelated UnboundLocalError.
                raise Exception(message)

    if self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
        if self.args.reverse:
            logging.warning(
                "--reverse reads specified with --assignment_method diamond. Reverse reads will be ignored."
            )
            self.args.reverse = None

    # If merge reads is specified, check that there are reverse reads to merge with
    if self.args.merge_reads and not hasattr(self.args, 'reverse'):
        raise Exception("Programming error")

    # Set the output directory if not specified and create that directory
    logging.debug('Creating working directory: %s' %
                  self.args.output_directory)
    self.hk.make_working_directory(self.args.output_directory,
                                   self.args.force)

    # Set pipeline and evalue by checking HMM format
    if self.args.search_only:
        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            hmm_type, hmm_tc = self.hk.setpipe(
                self.args.search_hmm_files[0])
            logging.debug("HMM type: %s Trusted Cutoff: %s" %
                          (hmm_type, hmm_tc))
    else:
        hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
        logging.debug("HMM type: %s Trusted Cutoff: %s" %
                      (hmm_type, hmm_tc))

    if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
        setattr(self.args, 'type', hmm_type)
        if hmm_tc:
            setattr(self.args, 'evalue', '--cut_tc')
    else:
        setattr(self.args, 'type', self.PIPELINE_AA)

    if self.args.filter_minimum is not None:
        filter_minimum = self.args.filter_minimum
    else:
        if self.args.type == self.PIPELINE_NT:
            filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES
        else:
            filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES

    # Generate expand_search database if required
    if self.args.expand_search_contigs:
        if self.args.graftm_package:
            pkg = GraftMPackage.acquire(self.args.graftm_package)
        else:
            pkg = None
        boots = ExpandSearcher(search_hmm_files=self.args.search_hmm_files,
                               maximum_range=self.args.maximum_range,
                               threads=self.args.threads,
                               evalue=self.args.evalue,
                               min_orf_length=self.args.min_orf_length,
                               graftm_package=pkg)

        # this is a hack, it should really use GraftMFiles but that class isn't currently flexible enough
        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            new_database = os.path.join(self.args.output_directory,
                                        "expand_search.hmm")
        else:
            new_database = os.path.join(self.args.output_directory,
                                        "expand_search")

        if boots.generate_expand_search_database_from_contigs(
                self.args.expand_search_contigs, new_database,
                self.args.search_method):
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                self.ss.search_hmm.append(new_database)
            else:
                diamond_db = new_database

    first_search_method = self.args.search_method
    if self.args.decoy_database:
        decoy_filter = DecoyFilter(
            Diamond(diamond_db, threads=self.args.threads),
            Diamond(self.args.decoy_database, threads=self.args.threads))
        doing_decoy_search = True
    elif self.args.search_method == self.hk.HMMSEARCH_AND_DIAMOND_SEARCH_METHOD:
        decoy_filter = DecoyFilter(
            Diamond(diamond_db, threads=self.args.threads))
        doing_decoy_search = True
        # The hmmsearch pass runs first; diamond is applied afterwards via
        # the decoy filter.
        first_search_method = self.hk.HMMSEARCH_SEARCH_METHOD
    else:
        doing_decoy_search = False

    # For each pair (or single file passed to GraftM)
    logging.debug('Working with %i file(s)' % len(self.sequence_pair_list))
    for pair in self.sequence_pair_list:
        # Guess the sequence file type, if not already specified to GraftM
        unpack = UnpackRawReads(pair[0], self.args.input_sequence_type,
                                INTERLEAVED)

        # Set the basename, and make an entry to the summary table.
        base = unpack.basename()
        pair_direction = ['forward', 'reverse']
        logging.info("Working on %s" % base)

        # Make the working base subdirectory
        self.hk.make_working_directory(
            os.path.join(self.args.output_directory, base),
            self.args.force)

        # for each of the paired end read files
        for read_file in pair:
            unpack = UnpackRawReads(read_file,
                                    self.args.input_sequence_type,
                                    INTERLEAVED)
            if read_file is None:
                # placeholder for interleaved (second file is None)
                continue

            if not os.path.isfile(read_file):  # Check file exists
                logging.info('%s does not exist! Skipping this file..' %
                             read_file)
                continue

            # Set the output file_name
            if len(pair) == 2:
                direction = 'interleaved' if pair[1] is None \
                    else pair_direction.pop(0)
                logging.info("Working on %s reads" % direction)
                self.gmf = GraftMFiles(base, self.args.output_directory,
                                       direction)
                self.hk.make_working_directory(
                    os.path.join(self.args.output_directory, base,
                                 direction), self.args.force)
            else:
                direction = False
                self.gmf = GraftMFiles(base, self.args.output_directory,
                                       direction)

            if self.args.type == self.PIPELINE_AA:
                logging.debug("Running protein pipeline")
                try:
                    search_time, (
                        result,
                        complement_information) = self.ss.aa_db_search(
                            self.gmf,
                            base,
                            unpack,
                            first_search_method,
                            maximum_range,
                            self.args.threads,
                            self.args.evalue,
                            self.args.min_orf_length,
                            self.args.restrict_read_length,
                            diamond_db,
                            self.args.diamond_performance_parameters,
                        )
                except NoInputSequencesException as e:
                    logging.error(
                        "No sufficiently long open reading frames were found, indicating"
                        " either the input sequences are too short or the min orf length"
                        " cutoff is too high. Cannot continue sorry. Alternatively, there"
                        " is something amiss with the installation of OrfM. The specific"
                        " command that failed was: %s" % e.command)
                    exit(Run.NO_ORFS_EXITSTATUS)

            # Or the DNA pipeline
            elif self.args.type == self.PIPELINE_NT:
                logging.debug("Running nucleotide pipeline")
                search_time, (
                    result,
                    complement_information) = self.ss.nt_db_search(
                        self.gmf, base, unpack, self.args.euk_check,
                        self.args.search_method, maximum_range,
                        self.args.threads, self.args.evalue)

            reads_detected = True
            if not result.hit_fasta() or os.path.getsize(
                    result.hit_fasta()) == 0:
                logging.info('No reads found in %s' % base)
                reads_detected = False

            if self.args.search_only:
                db_search_results.append(result)
                base_list.append(base)
                continue

            # Filter out decoys if specified
            if reads_detected and doing_decoy_search:
                # The temporary file is only used to reserve a unique name;
                # it is deleted when the with-block exits and the path is
                # then handed to the decoy filter as its output.
                with tempfile.NamedTemporaryFile(prefix="graftm_decoy",
                                                 suffix='.fa') as f:
                    tmpname = f.name
                any_remaining = decoy_filter.filter(
                    result.hit_fasta(), tmpname)
                if any_remaining:
                    shutil.move(tmpname, result.hit_fasta())
                else:
                    # No hits remain after decoy filtering.
                    os.remove(result.hit_fasta())
                    continue

            if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
                logging.info(
                    'aligning reads to reference package database')
                hit_aligned_reads = self.gmf.aligned_fasta_output_path(
                    base)

                if reads_detected:
                    aln_time, aln_result = self.ss.align(
                        result.hit_fasta(), hit_aligned_reads,
                        complement_information, self.args.type,
                        filter_minimum)
                else:
                    aln_time = 'n/a'
                # If all were filtered out, or there just was none..
                if not os.path.exists(hit_aligned_reads):
                    with open(hit_aligned_reads, 'w'):
                        pass  # just touch the file, nothing else
                seqs_list.append(hit_aligned_reads)

            db_search_results.append(result)
            base_list.append(base)
            search_results.append(result.search_result)
            hit_read_count_list.append(result.hit_count)

    # Write summary table
    srchtw = SearchTableWriter()
    srchtw.build_search_otu_table(
        [x.search_objects for x in db_search_results], base_list,
        self.gmf.search_otu_table())

    if self.args.search_only:
        logging.info(
            'Stopping before alignment and taxonomic assignment phase\n')
        exit(0)

    if self.args.merge_reads:  # not run when diamond is the assignment mode- enforced by argparse grokking
        logging.debug("Running merge reads output")
        if self.args.interleaved:
            fwd_seqs = seqs_list
            rev_seqs = []
        else:
            # Entries alternate forward, reverse; keep one base per pair.
            base_list = base_list[0::2]
            fwd_seqs = seqs_list[0::2]
            rev_seqs = seqs_list[1::2]
        merged_output = [
            GraftMFiles(merged_base, self.args.output_directory,
                        False).aligned_fasta_output_path(merged_base)
            for merged_base in base_list
        ]
        logging.debug("merged reads to %s", merged_output)
        self.ss.merge_forev_aln(fwd_seqs, rev_seqs, merged_output)
        seqs_list = merged_output
        REVERSE_PIPE = False
    elif REVERSE_PIPE:
        base_list = base_list[0::2]

    # Leave the pipeline if search only was specified
    if self.args.search_and_align_only:
        logging.info('Stopping before taxonomic assignment phase\n')
        exit(0)
    elif not any(base_list):
        logging.error(
            'No hits in any of the provided files. Cannot continue with no reads to assign taxonomy to.\n'
        )
        exit(0)
    self.gmf = GraftMFiles('', self.args.output_directory, False)

    if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
        clusterer = Clusterer()
        # Classification steps
        seqs_list = clusterer.cluster(seqs_list, REVERSE_PIPE)
        logging.info("Placing reads into phylogenetic tree")
        taxonomic_assignment_time, assignments = self.p.place(
            REVERSE_PIPE, seqs_list, self.args.resolve_placements,
            self.gmf, self.args, result.slash_endings,
            gpkg.taxtastic_taxonomy_path(), clusterer)
        assignments = clusterer.uncluster_annotations(
            assignments, REVERSE_PIPE)

    elif self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
        logging.info("Assigning taxonomy with diamond")
        taxonomic_assignment_time, assignments = \
            self._assign_taxonomy_with_diamond(
                base_list, db_search_results, gpkg, self.gmf,
                self.args.diamond_performance_parameters)
        aln_time = 'n/a'
    else:
        # Report the attribute that was actually tested above.
        raise Exception("Unexpected assignment method encountered: %s" %
                        self.args.assignment_method)

    self.summarise(base_list, assignments, REVERSE_PIPE,
                   [search_time, aln_time, taxonomic_assignment_time],
                   hit_read_count_list, self.args.max_samples_for_krona)