def test_init(self):
    """NastLogger.__init__ should keep the given log path in Filename"""
    # Constructed without a path, Filename is expected to be None.
    logger_without_path = NastLogger()
    self.assertEqual(logger_without_path.Filename, None)

    # Constructed with a path, Filename is expected to be that same path.
    logger_with_path = NastLogger(self.filename)
    self.assertEqual(logger_with_path.Filename, self.filename)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in ``seq_path`` against the template alignment.

    Parameters
    ----------
    seq_path : str
        Path to the FASTA file holding the candidate sequences.
    result_path : str, optional
        When given, the aligned sequences are written to this path as
        FASTA and ``None`` is returned.
    log_path : str, optional
        Path passed to ``NastLogger``.
    failure_path : str, optional
        When given, the sequences reported as failed by ``pynast_seqs``
        are written to this path as FASTA.

    Returns
    -------
    Alignment or None
        The aligned sequences when ``result_path`` is ``None``, otherwise
        ``None``.
    """
    # The candidate parser is handed to pynast_seqs directly, so the input
    # handle has to stay open until that call has returned.
    # NOTE(review): the 'U' open mode was removed in Python 3.11.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # Re-wrap the sequences returned by pynast_seqs as DNASequence objects
    # so they can be collected into SequenceCollection / Alignment.
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None

    return pynast_aligned
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in ``seq_path`` against the template alignment.

    Parameters
    ----------
    seq_path : str
        Path to the FASTA file holding the candidate sequences.
    result_path : str, optional
        When given, the aligned sequences are written to this path as
        FASTA and ``None`` is returned.
    log_path : str, optional
        Path passed to ``NastLogger``.
    failure_path : str, optional
        When given, the sequences reported as failed by ``pynast_seqs``
        are written to this path as FASTA.

    Returns
    -------
    The result of ``LoadSeqs`` on the aligned sequences when
    ``result_path`` is ``None`` (or ``{}`` if ``LoadSeqs`` raises
    ``ValueError``); ``None`` when ``result_path`` is given.

    Raises
    ------
    KeyError
        If loading the template alignment raises ``KeyError``; re-raised
        with a message naming the offending character.
    """
    # The candidate parser is handed to pynast_seqs directly, so the input
    # handle has to stay open until that call has returned.
    # NOTE(review): the 'U' open mode was removed in Python 3.11.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        try:
            template_alignment = LoadSeqs(data=template_records,
                                          moltype=DNA,
                                          aligned=DenseAlignment)
        except KeyError as e:
            raise KeyError(
                'Only ACGT-. characters can be contained in template alignments.' +
                ' The offending character was: %s' % e)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            for seq in pynast_failed:
                fail_file.write(seq.toFasta())
                fail_file.write('\n')

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            for seq in pynast_aligned:
                result_file.write(seq.toFasta())
                result_file.write('\n')
        return None

    try:
        return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
    except ValueError:
        # Deliberate fallback kept from the original implementation.
        return {}
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Run PyNAST on the candidate sequences stored at ``seq_path``.

    Parameters
    ----------
    seq_path : str
        Path to the FASTA file holding the candidate sequences.
    result_path : str, optional
        When given, the aligned sequences are written to this path as
        FASTA and ``None`` is returned.
    log_path : str, optional
        Path passed to ``NastLogger``.
    failure_path : str, optional
        When given, the sequences reported as failed by ``pynast_seqs``
        are written to this path as FASTA.

    Returns
    -------
    Alignment or None
        The aligned sequences when ``result_path`` is ``None``, otherwise
        ``None``.
    """
    template_alignment_fp = self.Params['template_filepath']

    # Keep the candidate file open for as long as pynast_seqs may still be
    # pulling records from the parser; close it right afterwards.
    # NOTE(review): the 'U' open mode was removed in Python 3.11.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # Convert the sequences returned by pynast_seqs to DNASequence objects
    # before building the SequenceCollection / Alignment containers.
    failed_seqs = [DNASequence(str(seq), id=seq.Name)
                   for seq in pynast_failed]
    pynast_failed = SequenceCollection(failed_seqs)

    aligned_seqs = [DNASequence(str(seq), id=seq.Name)
                    for seq in pynast_aligned]
    pynast_aligned = Alignment(aligned_seqs)

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is None:
        return pynast_aligned

    with open(result_path, 'w') as result_file:
        result_file.write(pynast_aligned.to_fasta())
    return None
def test_record(self):
    """NastLogger.record should write tab-separated values to log file"""
    logger = NastLogger(self.filename)
    logger.record('hello', 'world')

    # The context manager closes the log even if a read fails.
    with open(self.filename, 'r') as f:
        f.readline()  # skip the first line (the header); only the message is checked
        obs_message = f.readline()

    self.assertEqual(obs_message, 'hello\tworld\n')
def test_header(self):
    """The first line of a new NastLogger log file should be the column header"""
    # One literal per column; adjacent literals concatenate to the full line.
    expected_first_line = (
        'candidate sequence ID\t'
        'candidate nucleotide count\t'
        'errors\t'
        'template ID\t'
        'BLAST percent identity to template\t'
        'candidate nucleotide count post-NAST\n')

    # Keep the logger bound while the file is read back, as before.
    logger = NastLogger(self.filename)

    log_file = open(self.filename, 'r')
    first_line = log_file.readline()
    log_file.close()

    self.assertEqual(first_line, expected_first_line)
class PyNastAligner(Aligner):
    """Aligner that runs ``pynast_seqs`` against a template alignment."""

    Name = 'PyNastAligner'

    def __init__(self, params):
        """Return new PyNastAligner object with specified params.

        ``params`` is merged over the defaults below, so any key supplied
        by the caller overrides the default value.
        """
        _params = {
            'min_pct': 75.0,
            'min_len': 150,
            'blast_db': None,
            'template_filepath': None,
            'pairwise_alignment_method': 'blast',
            'Application': 'PyNAST',
            'Algorithm': 'NAST',
        }
        _params.update(params)
        Aligner.__init__(self, _params)

    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Align the sequences in ``seq_path`` against the template.

        seq_path: path to the FASTA file of candidate sequences
        result_path: if given, aligned sequences are written here as
         FASTA and None is returned
        log_path: path passed to NastLogger
        failure_path: if given, the sequences reported as failed by
         pynast_seqs are written here as FASTA

        Returns None when result_path is given; otherwise the result of
         LoadSeqs on the aligned sequences, or {} if LoadSeqs raises
         ValueError.

        Raises KeyError (with a message naming the offending character)
         if loading the template alignment raises KeyError.
        """
        # The candidate parser is handed to pynast_seqs directly, so the
        # input handle has to stay open until that call has returned.
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = MinimalFastaParser(seq_file)

            # load template sequences, replacing '.' characters with '-'
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in MinimalFastaParser(template_file)]
            try:
                template_alignment = LoadSeqs(data=template_records,
                                              moltype=DNA,
                                              aligned=DenseAlignment)
            except KeyError as e:
                raise KeyError(
                    'Only ACGT-. characters can be contained in template alignments.' +
                    ' The offending character was: %s' % e)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger)

        logger.record(str(self))

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                for seq in pynast_failed:
                    fail_file.write(seq.toFasta())
                    fail_file.write('\n')

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                for seq in pynast_aligned:
                    result_file.write(seq.toFasta())
                    result_file.write('\n')
            return None

        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            # Deliberate fallback kept from the original implementation.
            return {}
def ipynast_seqs(candidate_sequences, template_alignment,
                 max_hits=30, min_pct=75.0, min_len=1000,
                 align_unaligned_seqs_f=None, log_fp=None, logger=None,
                 temp_dir=get_pynast_temp_dir(), **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment
     step, as (sequence, exit status) tuples.

    Status values can be:
       0 : indicates a successful alignment, in which case the sequence
            will be aligned
       1 : indicates unsuccessful sequence search, in which case the
            sequence will be unaligned
       2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

    All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
        Maximum number of uclust hits to return (not referenced in the
        body of this function)
    min_pct
        minimum % identity for best database match
    min_len
        minimum length of match for alignment
    align_unaligned_seqs_f
        Function to align sequences. Must be of the form:
         align_unaligned_seqs(seqs, moltype, params=None)
         see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
        Optional path to log file
    logger
        Optional NastLogger object, takes precedence over log_fp
    temp_dir
        Directory in which the temporary fasta files are created; also
        passed to the uclust search as its tmp_dir

    Raises ValueError if a candidate sequence contains gap characters and
     IOError if a template alignment filepath cannot be opened.

    The temporary fasta files are removed when the iterator finishes, fails
     or is closed before being exhausted.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    try:
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            candidate_sequences = MinimalFastaParser(candidate_source_f)
        else:
            candidate_source_f = None

        # sequence list provided for candidate sequence -- write
        # the seqs to a temp file to pass to uclust. This is done in all
        # cases to convert the sequences to uppercase in case they're not
        # already. The bad handling of upper versus lower-cased sequences
        # is a uclust issue.
        # Note that delete = False here because we don't want these to
        # be deleted when they are closed (since we need to pass
        # the filepaths around after we write and close them). The files
        # are deleted explicitly in the finally clause of this function.
        try:
            candidate_fasta_f = NamedTemporaryFile(prefix='pynast_candidate',
                                                   suffix='.fasta',
                                                   dir=temp_dir,
                                                   delete=False)
            candidate_fasta_filepath = candidate_fasta_f.name
            files_to_remove.append(candidate_fasta_filepath)
            try:
                for seq_id, seq in candidate_sequences:
                    candidate_fasta_f.write(
                        '>%s\n%s\n' % (seq_id, str(seq).upper()))
            finally:
                candidate_fasta_f.close()
        finally:
            if candidate_source_f is not None:
                candidate_source_f.close()

        # degap the template alignment for the sequence searching step and
        # write it to file. See above comment about delete=False
        template_fasta_f = NamedTemporaryFile(prefix='pynast_template',
                                              suffix='.fasta',
                                              dir=temp_dir,
                                              delete=False)
        template_fasta_filepath = template_fasta_f.name
        files_to_remove.append(template_fasta_filepath)
        try:
            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError(
                        "Cannot open specified filepath: %s"
                        % template_alignment)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments
                with template_alignment_f:
                    template_alignment = {}
                    for seq_id, seq in MinimalFastaParser(
                            template_alignment_f):
                        template_alignment[seq_id] = seq
                        seq = Sequence(seq=seq, moltype=DNA)
                        template_fasta_f.write(
                            '>%s\n%s\n' % (seq_id, seq.degap()))
            else:
                # the template alignment was received as an alignment
                # object
                template_fasta_f.write(template_alignment.degap().toFasta())
        finally:
            template_fasta_f.close()

        # Set up logging.  NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        # convert the percentage to the fraction passed as percent_ID
        min_pct /= 100.

        # get the alignment iterator
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct,
            enable_rev_strand_matching=True,
            tmp_dir=temp_dir)

        try:
            current_result = next(pw_alignment_iterator)
        except StopIteration:
            current_result = None

        with open(candidate_fasta_filepath) as candidate_fasta_f:
            for seq_id, seq in MinimalFastaParser(candidate_fasta_f):
                seq_len = len(seq)
                if '-' in seq:
                    # stop the search; the temporary files are removed by
                    # the finally clause below
                    pw_alignment_iterator.close()
                    raise ValueError(
                        "Candidate sequence contains gaps. "
                        "This is not supported.")

                if current_result is not None:
                    candidate_seq_id, template_seq_id, \
                        pw_aligned_candidate, pw_aligned_template, \
                        pct_identity = current_result

                # The search results are matched to the candidates in
                # file order by comparing the first whitespace-delimited
                # token of the identifiers.
                if (not current_result or
                        seq_id.split()[0] != candidate_seq_id.split()[0]):
                    # a suitable match was not found - don't align the
                    # sequence

                    # log the failure
                    logger.record(
                        seq_id,    # input sequence identifier
                        seq_len,   # input sequence length
                        "No search results.")

                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 1
                    continue

                # this sequence was aligned
                if align_unaligned_seqs_f:
                    # if an alternate pairwise aligner was specified,
                    # unalign and re-align the sequences.
                    pw_aligned_template, pw_aligned_candidate = \
                        align_two_seqs(pw_aligned_template.replace('-', ''),
                                       pw_aligned_candidate.replace('-', ''),
                                       align_unaligned_seqs_f)

                # Cast the pairwise alignments to DNA sequence objects
                pw_aligned_candidate = DNA.makeSequence(
                    pw_aligned_candidate, Name=candidate_seq_id)
                pw_aligned_template = DNA.makeSequence(
                    pw_aligned_template, Name=template_seq_id)

                # Remove any terminal gaps that were introduced into the
                # template sequence
                pw_aligned_candidate, pw_aligned_template = \
                    remove_template_terminal_gaps(
                        pw_aligned_candidate, pw_aligned_template)
                candidate_seq_id = pw_aligned_candidate.Name

                # get the aligned template sequence from the template
                # alignment (alignment object first, dict fallback when
                # the template was read from a filepath)
                try:
                    template_aligned_seq = \
                        template_alignment.getGappedSeq(template_seq_id)
                except AttributeError:
                    template_aligned_seq = Sequence(
                        seq=template_alignment[template_seq_id],
                        moltype=DNA)

                # reintroduce the gap spacing from the template alignment
                pw_aligned_template, pw_aligned_candidate, new_gaps = \
                    reintroduce_template_spacing(template_aligned_seq,
                                                 pw_aligned_template,
                                                 pw_aligned_candidate)

                # delete any new gaps that were introduced during the
                # pairwise alignment step
                pw_aligned_template, pw_aligned_candidate = \
                    adjust_alignment(pw_aligned_template,
                                     pw_aligned_candidate,
                                     new_gaps)

                # reintroduce any terminal gaps that were present in the
                # template
                result = introduce_terminal_gaps(template_aligned_seq,
                                                 pw_aligned_template,
                                                 pw_aligned_candidate)

                unaligned_length = len(result.degap())
                if unaligned_length < min_len:
                    # alignment is too short - log this as a failure,
                    # reporting the length that was actually tested
                    error = ("Alignment does not meet minimum length "
                             "requirement for alignment (%d < %d)"
                             % (unaligned_length, min_len))
                    logger.record(
                        seq_id,    # input sequence identifier
                        seq_len,   # input sequence length
                        error)

                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 2
                else:
                    # log the alignment
                    logger.record(
                        seq_id,                  # input sequence identifier
                        seq_len,                 # input sequence length
                        '',                      # Errors
                        template_seq_id,         # best template match id
                        '%3.2f' % pct_identity,  # pct id to template
                        unaligned_length,        # post alignment length
                    )

                    # yield the aligned sequence and success code
                    yield DNA.makeSequence(result, Name=candidate_seq_id), 0

                # get the next alignment
                try:
                    current_result = next(pw_alignment_iterator)
                except StopIteration:
                    # end of the input fasta file indicates completion,
                    # not end of the aligned sequences; current_result
                    # is intentionally left unchanged here
                    pass
    finally:
        # clean-up temporary files; this also runs when the generator is
        # closed early or an error is raised
        remove_files(files_to_remove, error_on_missing=False)
def ipynast_seqs(candidate_sequences, template_alignment,
                 max_hits=30, min_pct=75.0, min_len=1000,
                 align_unaligned_seqs_f=None, log_fp=None, logger=None,
                 **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment
     step, as (sequence, exit status) tuples.

    Status values can be:
       0 : indicates a successful alignment, in which case the sequence
            will be aligned
       1 : indicates unsuccessful sequence search, in which case the
            sequence will be unaligned
       2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

    All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
        Maximum number of uclust hits to return (not referenced in the
        body of this function)
    min_pct
        minimum % identity for best database match
    min_len
        minimum length of match for alignment
    align_unaligned_seqs_f
        Function to align sequences. Must be of the form:
         align_unaligned_seqs(seqs, moltype, params=None)
         see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
        Optional path to log file
    logger
        Optional NastLogger object, takes precedence over log_fp

    Raises ValueError if a candidate sequence contains gap characters and
     IOError if a template alignment filepath cannot be opened.

    The temporary fasta files are removed when the iterator finishes, fails
     or is closed before being exhausted.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    try:
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            candidate_sequences = MinimalFastaParser(candidate_source_f)
        else:
            candidate_source_f = None

        # sequence list provided for candidate sequence -- write
        # the seqs to a temp file to pass to uclust. This is done in all
        # cases to convert the sequences to uppercase in case they're not
        # already. The bad handling of upper versus lower-cased sequences
        # is a uclust issue.
        try:
            candidate_fasta_filepath = get_tmp_filename(
                prefix='pynast_candidate', suffix='.fasta')
            # registered before opening so a partially written file is
            # still cleaned up; missing files are ignored on removal
            files_to_remove.append(candidate_fasta_filepath)
            with open(candidate_fasta_filepath, 'w') as candidate_fasta_f:
                for seq_id, seq in candidate_sequences:
                    candidate_fasta_f.write(
                        '>%s\n%s\n' % (seq_id, str(seq).upper()))
        finally:
            if candidate_source_f is not None:
                candidate_source_f.close()

        # degap the template alignment for the sequence searching step and
        # write it to file
        template_fasta_filepath = get_tmp_filename(
            prefix='pynast_template', suffix='.fasta')
        files_to_remove.append(template_fasta_filepath)
        with open(template_fasta_filepath, 'w') as template_fasta_f:
            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError(
                        "Cannot open specified filepath: %s"
                        % template_alignment)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments
                with template_alignment_f:
                    template_alignment = {}
                    for seq_id, seq in MinimalFastaParser(
                            template_alignment_f):
                        template_alignment[seq_id] = seq
                        seq = Sequence(seq=seq, moltype=DNA)
                        template_fasta_f.write(
                            '>%s\n%s\n' % (seq_id, seq.degap()))
            else:
                # the template alignment was received as an alignment
                # object
                template_fasta_f.write(template_alignment.degap().toFasta())

        # Set up logging.  NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        # convert the percentage to the fraction passed as percent_ID
        min_pct /= 100.

        # get the alignment iterator
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct,
            enable_rev_strand_matching=True)

        try:
            current_result = next(pw_alignment_iterator)
        except StopIteration:
            current_result = None

        with open(candidate_fasta_filepath) as candidate_fasta_f:
            for seq_id, seq in MinimalFastaParser(candidate_fasta_f):
                seq_len = len(seq)
                if '-' in seq:
                    # stop the search; the temporary files are removed by
                    # the finally clause below
                    pw_alignment_iterator.close()
                    raise ValueError(
                        "Candidate sequence contains gaps. "
                        "This is not supported.")

                if current_result is not None:
                    candidate_seq_id, template_seq_id, \
                        pw_aligned_candidate, pw_aligned_template, \
                        pct_identity = current_result

                # The search results are matched to the candidates in
                # file order by comparing the first whitespace-delimited
                # token of the identifiers.
                if (not current_result or
                        seq_id.split()[0] != candidate_seq_id.split()[0]):
                    # a suitable match was not found - don't align the
                    # sequence

                    # log the failure
                    logger.record(
                        seq_id,    # input sequence identifier
                        seq_len,   # input sequence length
                        "No search results.")

                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 1
                    continue

                # this sequence was aligned
                if align_unaligned_seqs_f:
                    # if an alternate pairwise aligner was specified,
                    # unalign and re-align the sequences.
                    pw_aligned_template, pw_aligned_candidate = \
                        align_two_seqs(pw_aligned_template.replace('-', ''),
                                       pw_aligned_candidate.replace('-', ''),
                                       align_unaligned_seqs_f)

                # Cast the pairwise alignments to DNA sequence objects
                pw_aligned_candidate = DNA.makeSequence(
                    pw_aligned_candidate, Name=candidate_seq_id)
                pw_aligned_template = DNA.makeSequence(
                    pw_aligned_template, Name=template_seq_id)

                # Remove any terminal gaps that were introduced into the
                # template sequence
                pw_aligned_candidate, pw_aligned_template = \
                    remove_template_terminal_gaps(
                        pw_aligned_candidate, pw_aligned_template)
                candidate_seq_id = pw_aligned_candidate.Name

                # get the aligned template sequence from the template
                # alignment (alignment object first, dict fallback when
                # the template was read from a filepath)
                try:
                    template_aligned_seq = \
                        template_alignment.getGappedSeq(template_seq_id)
                except AttributeError:
                    template_aligned_seq = Sequence(
                        seq=template_alignment[template_seq_id],
                        moltype=DNA)

                # reintroduce the gap spacing from the template alignment
                pw_aligned_template, pw_aligned_candidate, new_gaps = \
                    reintroduce_template_spacing(template_aligned_seq,
                                                 pw_aligned_template,
                                                 pw_aligned_candidate)

                # delete any new gaps that were introduced during the
                # pairwise alignment step
                pw_aligned_template, pw_aligned_candidate = \
                    adjust_alignment(pw_aligned_template,
                                     pw_aligned_candidate,
                                     new_gaps)

                # reintroduce any terminal gaps that were present in the
                # template
                result = introduce_terminal_gaps(template_aligned_seq,
                                                 pw_aligned_template,
                                                 pw_aligned_candidate)

                unaligned_length = len(result.degap())
                if unaligned_length < min_len:
                    # alignment is too short - log this as a failure,
                    # reporting the length that was actually tested
                    error = ("Alignment does not meet minimum length "
                             "requirement for alignment (%d < %d)"
                             % (unaligned_length, min_len))
                    logger.record(
                        seq_id,    # input sequence identifier
                        seq_len,   # input sequence length
                        error)

                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 2
                else:
                    # log the alignment
                    logger.record(
                        seq_id,                  # input sequence identifier
                        seq_len,                 # input sequence length
                        '',                      # Errors
                        template_seq_id,         # best template match id
                        '%3.2f' % pct_identity,  # pct id to template
                        unaligned_length,        # post alignment length
                    )

                    # yield the aligned sequence and success code
                    yield DNA.makeSequence(result, Name=candidate_seq_id), 0

                # get the next alignment
                try:
                    current_result = next(pw_alignment_iterator)
                except StopIteration:
                    # end of the input fasta file indicates completion,
                    # not end of the aligned sequences; current_result
                    # is intentionally left unchanged here
                    pass
    finally:
        # clean-up temporary files; this also runs when the generator is
        # closed early or an error is raised
        remove_files(files_to_remove, error_on_missing=False)