def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    if params['chimera_detection_method'] == 'blast_fragments':
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['reference_seqs_fp'],
                                           output_dir=working_dir)
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db
    elif params['chimera_detection_method'] == 'ChimeraSlayer':
        # copy the reference files to working dir
        # ChimeraSlayer creates an index file of the ref and
        # will crash without write permission in the ref seqs dir
        aligned_reference_seqs_fp = params['aligned_reference_seqs_fp']
        _, new_ref_filename = split(aligned_reference_seqs_fp)
        copy(aligned_reference_seqs_fp, working_dir)
        aligned_reference_seqs_fp = working_dir + "/" + new_ref_filename

        self.files_to_remove.append(aligned_reference_seqs_fp)
        params['aligned_reference_seqs_fp'] = aligned_reference_seqs_fp

        # if given, also copy the unaligned ref db
        reference_seqs_fp = params['reference_seqs_fp']
        if reference_seqs_fp:
            _, new_ref_filename = split(reference_seqs_fp)
            copy(reference_seqs_fp, working_dir)
            reference_seqs_fp = working_dir + "/" + new_ref_filename
        else:
            # otherwise create it
            reference_seqs_fp = write_degapped_fasta_to_file(
                parse_fasta(open(aligned_reference_seqs_fp)),
                tmp_dir=working_dir)
        # delete it afterwards
        self.files_to_remove.append(reference_seqs_fp)
        params['reference_seqs_fp'] = reference_seqs_fp

        # build blast db of reference, otherwise ChimeraSlayer will do it
        # and parallel jobs clash
        _, db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
        self.files_to_remove += db_files_to_remove

        # make the index file globally
        # Reason: ChimeraSlayer first checks to see if the index file is
        # there. If not it tries to create it. This can lead to race
        # condition if several parallel jobs try to create it at the same
        # time.
        make_cidx_file(aligned_reference_seqs_fp)
        self.files_to_remove.append(aligned_reference_seqs_fp + ".cidx")
    else:
        raise ValueError("Unrecognized chimera detection method '%s'." %
                         params['chimera_detection_method'])
def _precommand_initiation(self, input_fp, output_dir, working_dir, params): if not params["blast_db"]: # Build the blast database from the reference_seqs_fp -- all procs # will then access one db rather than create one per proc. blast_db, db_files_to_remove = build_blast_db_from_fasta_path(params["reference_seqs_fp"]) self.files_to_remove += db_files_to_remove params["blast_db"] = blast_db
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    if not params['blast_db']:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['refseqs_fp'])
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db
def _precommand_initiation(
        self, input_fp, output_dir, working_dir, params):
    if params['refseqs_path']:
        # Build the blast database from the refseqs_path -- all procs
        # will then access one db rather than create one per proc.
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['refseqs_path'])
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db
def _precommand_initiation(
        self, input_fp, output_dir, working_dir, params):
    if not params['blast_db']:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['template_fp'],
                                           output_dir=get_qiime_temp_dir())
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db

    if params['min_length'] < 0:
        params['min_length'] = compute_min_alignment_length(
            open(input_fp, 'U'))
def __init__(self, params):
    """Return new BlastFragmentsChimeraChecker object with specified params.
    """
    _params = {'max_e_value': 1e-30,
               'min_pct_id': 0.90,
               'num_fragments': 3,
               'taxonomy_depth': 4}
    _params.update(params)

    try:
        id_to_taxonomy_fp = params['id_to_taxonomy_fp']
    except KeyError:
        raise ValueError(
            "id_to_taxonomy_filepath must be provided to %s" % self.Name)

    # Create the blast database if it hasn't been provided
    if 'blast_db' not in params or params['blast_db'] is None:
        try:
            reference_seqs_fp = params['reference_seqs_fp']
        except KeyError:
            raise ValueError(
                "refseqs_fp or blast_db must be provided to %s" % self.Name)
        blast_db, self._db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
    else:
        blast_db = params['blast_db']
        self._db_files_to_remove = []

    self._taxon_assigner = BlastTaxonAssigner(
        {'blast_db': blast_db,
         'id_to_taxonomy_filepath': id_to_taxonomy_fp,
         'Max E value': _params['max_e_value'],
         'Min percent identity': _params['min_pct_id']
         })

    ChimeraChecker.__init__(self, _params)
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path works with alignment as input
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path convenience function works as expected
    """
    blast_db, db_files = \
        build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)
    expected_db_files = set([self.in_seqs1_fp + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def __call__(self, seq_path=None, seqs=None, result_path=None,
             log_path=None):
    """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
    """
    assert seq_path or seqs, \
        "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # assign the blast database, either as a pre-existing database
    # specified as self.Params['blast_db'] or by creating a
    # temporary database from the sequence file specified
    # as self.Params['reference_seqs_filepath']
    try:
        blast_db = self.Params['blast_db']
    except KeyError:
        # build a temporary blast_db
        reference_seqs_path = self.Params['reference_seqs_filepath']
        refseqs_dir, refseqs_name = os.path.split(reference_seqs_path)
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_path)

    # build the mapping of sequence identifier
    # (wrt to the blast db seqs) to taxonomy
    id_to_taxonomy_map = self._parse_id_to_taxonomy_file(
        open(self.Params['id_to_taxonomy_filepath'], 'U'))

    # Iterate over the input self.SeqsPerBlastRun seqs at a time.
    # There are two competing issues here when dealing with very large
    # inputs. If all sequences are read in at once, the containing object
    # can be very large, causing the system to page. On the other hand,
    # in such cases it would be very slow to treat each sequence
    # individually, since blast requires a filepath. Each call would
    # therefore involve writing a single sequence to file, opening/closing
    # and removing the file. To balance this, sequences are read in and
    # blasted in chunks of self.SeqsPerBlastRun (default: 1000) at a time.
    # This appears to solve the problem with the largest sets I've worked
    # with so far.
    if seq_path:
        # Get a seq iterator
        seqs = parse_fasta(open(seq_path))
    # Build objects to keep track of the current set of sequences to be
    # blasted, and the results (i.e., seq_id -> (taxonomy, quality score)
    # mapping)
    current_seqs = []
    result = {}

    # Iterate over the (seq_id, seq) pairs
    for seq_id, seq in seqs:
        # append the current seq_id,seq to list of seqs to be blasted
        current_seqs.append((seq_id, seq))

        # When there are 1000 in the list, blast them
        if len(current_seqs) == self.SeqsPerBlastRun:
            # update the result object
            result.update(self._seqs_to_taxonomy(
                current_seqs, blast_db, id_to_taxonomy_map))
            # reset the list of seqs to be blasted
            current_seqs = []
    # Assign taxonomy to the remaining sequences
    result.update(self._seqs_to_taxonomy(
        current_seqs, blast_db, id_to_taxonomy_map))
    # End iteration over the input self.SeqsPerBlastRun seqs at a time.

    # Write log data if we have a path (while the logger can handle
    # being called if we are not logging, some of these steps are slow).
    if log_path is not None:
        num_inspected = len(result)
        logger.info('Number of sequences inspected: %s' % num_inspected)
        num_null_hits = [r[1] for r in result.values()].count(None)
        logger.info('Number with no blast hits: %s' % num_null_hits)

    if result_path:
        # if the user provided a result_path, write the
        # results to file
        of = open(result_path, 'w')
        for seq_id, (lineage, confidence, blast_hit_id) in result.items():
            of.write('%s\t%s\t%s\t%s\n' %
                     (seq_id, lineage, confidence, blast_hit_id))
        of.close()
        result = None
        logger.info('Result path: %s' % result_path)
    else:
        # Returning the data as a dict, so no modification to result
        # is necessary.
        pass

        # if no result_path was provided, return the data as a dict
        logger.info('Result path: None, returned as dict.')

    # clean-up temp blastdb files, if a temp blastdb was created
    if 'reference_seqs_filepath' in self.Params:
        map(remove, db_files_to_remove)

    # return the result
    return result
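# Illustrative sketch (not one of the snippets above): every
# _precommand_initiation variant follows the same pattern -- build a single
# BLAST database up front so parallel workers share it, keep track of the
# files formatdb creates, and remove them once the run finishes.  The import
# path, the helper name 'run_with_shared_blast_db', and the 'analysis_fn'
# callback below are assumptions for illustration only.

from qiime.util import build_blast_db_from_fasta_path, remove_files  # assumed import path


def run_with_shared_blast_db(reference_seqs_fp, analysis_fn):
    """Build one BLAST db, run an analysis against it, then clean up."""
    blast_db, db_files_to_remove = \
        build_blast_db_from_fasta_path(reference_seqs_fp)
    try:
        # all workers/calls reuse the same database path instead of
        # each building a private copy
        return analysis_fn(blast_db)
    finally:
        # remove the formatdb artifacts (.nhr, .nin, .nsq, ...) afterwards
        remove_files(db_files_to_remove)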