Exemple #1
0
 def test_uclust_search_and_align_from_fasta_filepath(self):
     """Search-and-align yields the expected hits, with and without rev strand."""
     query_fp = self.search_align_query1_fp
     template_fp = self.search_align_template1_fp

     # Default behaviour: reverse-complement matches are allowed.
     hits_with_rev_comp = uclust_search_and_align_from_fasta_filepath(
         query_fp, template_fp)
     self.assertEqual(list(hits_with_rev_comp),
                      self.search_align_out1_expected)

     # Reverse-strand matching disabled: only the first two expected
     # records should be reported.
     hits_forward_only = uclust_search_and_align_from_fasta_filepath(
         query_fp, template_fp, enable_rev_strand_matching=False)
     self.assertEqual(list(hits_forward_only),
                      self.search_align_out1_expected[:2])
Exemple #2
0
 def test_uclust_search_and_align_from_fasta_filepath_protein(self):
     """Search-and-align yields the expected hits for the protein fixtures."""
     query_fp = self.search_align_query2_fp
     template_fp = self.search_align_template2_fp

     # Called with defaults, i.e. rev comp matches allowed.
     hits = uclust_search_and_align_from_fasta_filepath(query_fp, template_fp)
     self.assertEqual(list(hits), self.search_align_out2_expected)
Exemple #3
0
def ipynast_seqs(candidate_sequences, template_alignment,
    max_hits=30, min_pct=75.0, min_len=1000, align_unaligned_seqs_f=None,
    log_fp=None, logger=None,**kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment step,
     as (sequence, exit status) tuples.
     Status values can be:
       0 : indicates a successful alignment, in which case the sequence will
            be aligned
       1 : indicates unsuccessful sequence search, in which case the sequence
            will be unaligned
       2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

     All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
      Maximum number of uclust hits to return.
      NOTE(review): accepted for interface compatibility but not referenced
      anywhere in this function body.
    min_pct
      minimum % identity for best database match (0-100 scale; divided by
      100 before being passed to the search as percent_ID)
    min_len
      minimum length of match for alignment; compared against the degapped
      length of the final aligned sequence
    align_unaligned_seqs_f
      Function to align sequences. Must be of the form:
       align_unaligned_seqs(seqs, moltype, params=None)
       see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
      Optional path to log file
    logger
      Optional NastLogger object, takes precedence over log_fp
    kwargs
      Passed to deprecation_warning only.

    Raises
      IOError if template_alignment is a filepath that cannot be opened.
      ValueError if a candidate sequence contains gap ('-') characters.

    Because this is a generator, nothing runs until the first item is
    requested. The temporary fasta files written here are removed, and the
    search iterator is closed, whenever the generator finishes, raises, or
    is closed/garbage-collected before being exhausted.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    pw_alignment_iterator = None
    try:
        candidate_source_f = None
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            candidate_sequences = MinimalFastaParser(candidate_source_f)

        # Write the candidate seqs to a temp file to pass to uclust. This is
        # done in all cases to convert the sequences to uppercase in case
        # they're not already. The bad handling of upper versus lower-cased
        # sequences is a uclust issue.
        try:
            candidate_fasta_filepath = get_tmp_filename(
                prefix='pynast_candidate', suffix='.fasta')
            # registered before writing so a failed write is still cleaned up
            files_to_remove.append(candidate_fasta_filepath)
            with open(candidate_fasta_filepath, 'w') as candidate_fasta_f:
                for seq_id, seq in candidate_sequences:
                    candidate_fasta_f.write(
                        '>%s\n%s\n' % (seq_id, str(seq).upper()))
        finally:
            if candidate_source_f is not None:
                candidate_source_f.close()

        # degap the template alignment for the sequence searching step and
        # write it to file
        template_fasta_filepath = get_tmp_filename(
            prefix='pynast_template', suffix='.fasta')
        files_to_remove.append(template_fasta_filepath)

        if isinstance(template_alignment, str):
            # the template alignment was received as a filepath
            try:
                template_alignment_f = open(template_alignment)
            except IOError:
                raise IOError(
                    "Cannot open specified filepath: %s" % template_alignment)
            # template alignment provided as filepath -- process it
            # iteratively to handle potentially massive template_alignments.
            # The gapped sequences are kept in a dict keyed by seq_id so the
            # template spacing can be looked up later.
            gapped_template_seqs = {}
            try:
                with open(template_fasta_filepath, 'w') as template_fasta_f:
                    for seq_id, seq in MinimalFastaParser(template_alignment_f):
                        gapped_template_seqs[seq_id] = seq
                        seq = Sequence(seq=seq, moltype=DNA)
                        template_fasta_f.write(
                            '>%s\n%s\n' % (seq_id, seq.degap()))
            finally:
                template_alignment_f.close()
            template_alignment = gapped_template_seqs
        else:
            # the template alignment was received as an alignment object
            with open(template_fasta_filepath, 'w') as template_fasta_f:
                template_fasta_f.write(template_alignment.degap().toFasta())

        # Set up logging.  NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        # get the alignment iterator (percent_ID is passed as a fraction)
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
                candidate_fasta_filepath,
                template_fasta_filepath,
                percent_ID=min_pct / 100.,
                enable_rev_strand_matching=True)

        # None means "no (more) search results available"
        current_result = next(pw_alignment_iterator, None)

        with open(candidate_fasta_filepath) as candidate_fasta_f:
            for seq_id, seq in MinimalFastaParser(candidate_fasta_f):
                seq_len = len(seq)
                if '-' in seq:
                    # temporary files and the search iterator are released
                    # by the enclosing finally clause
                    raise ValueError(
                        "Candidate sequence contains gaps. This is not supported.")

                has_hit = False
                if current_result is not None:
                    (candidate_seq_id, template_seq_id, pw_aligned_candidate,
                     pw_aligned_template, pct_identity) = current_result
                    # ids are compared on their first whitespace-delimited
                    # field only
                    has_hit = \
                        seq_id.split()[0] == candidate_seq_id.split()[0]

                if not has_hit:
                    # a suitable match was not found - don't align the
                    # sequence; log the failure
                    logger.record(
                        seq_id, # input sequence identifier
                        seq_len, # input sequence length
                        "No search results.")
                    # yield the unaligned sequence and failure code. The
                    # pending search result (if any) is kept for a later
                    # candidate.
                    yield DNA.makeSequence(seq, Name=seq_id), 1
                    continue

                # this sequence was aligned
                if align_unaligned_seqs_f:
                    # if an alternate pairwise aligner was specified, unalign
                    # and re-align the sequences.
                    pw_aligned_template, pw_aligned_candidate = \
                        align_two_seqs(pw_aligned_template.replace('-', ''),
                                       pw_aligned_candidate.replace('-', ''),
                                       align_unaligned_seqs_f)

                # Cast the pairwise alignments to DNA sequence objects
                pw_aligned_candidate = DNA.makeSequence(
                    pw_aligned_candidate, Name=candidate_seq_id)
                pw_aligned_template = DNA.makeSequence(
                    pw_aligned_template, Name=template_seq_id)

                # Remove any terminal gaps that were introduced into the
                # template sequence
                pw_aligned_candidate, pw_aligned_template = \
                    remove_template_terminal_gaps(
                        pw_aligned_candidate, pw_aligned_template)
                candidate_seq_id = pw_aligned_candidate.Name

                # get the aligned template sequence from the template
                # alignment: alignment objects expose getGappedSeq, while a
                # template read from a filepath is a plain dict of strings
                try:
                    template_aligned_seq = \
                        template_alignment.getGappedSeq(template_seq_id)
                except AttributeError:
                    template_aligned_seq = Sequence(
                        seq=template_alignment[template_seq_id], moltype=DNA)

                # reintroduce the gap spacing from the template alignment
                pw_aligned_template, pw_aligned_candidate, new_gaps = \
                    reintroduce_template_spacing(
                        template_aligned_seq,
                        pw_aligned_template, pw_aligned_candidate)

                # delete any new gaps that were introduced during the
                # pairwise alignment step
                pw_aligned_template, pw_aligned_candidate = adjust_alignment(
                    pw_aligned_template, pw_aligned_candidate, new_gaps)

                # reintroduce any terminal gaps that were present in the
                # template
                result = introduce_terminal_gaps(
                    template_aligned_seq,
                    pw_aligned_template, pw_aligned_candidate)

                unaligned_length = len(result.degap())
                if unaligned_length < min_len:
                    # alignment is too short - log this as a failure,
                    # reporting the length that was actually tested
                    error = ("Alignment does not meet minimum length "
                             "requirement for alignment (%d < %d)"
                             % (unaligned_length, min_len))
                    logger.record(
                        seq_id, # input sequence identifier
                        seq_len, # input sequence length
                        error)
                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 2
                else:
                    # log the alignment
                    logger.record(
                        seq_id, # input sequence identifier
                        seq_len, # input sequence length
                        '',                  # Errors
                        template_seq_id, # best template match id
                        '%3.2f' % pct_identity, # pct id to template
                        unaligned_length, # post alignment sequence length
                        )

                    # yield the aligned sequence and success code
                    yield DNA.makeSequence(result, Name=candidate_seq_id), 0

                # get the next alignment. Running out of search results does
                # not end the loop: remaining candidates are still reported
                # (as status 1). Resetting to None avoids re-using a stale
                # result for a later candidate.
                current_result = next(pw_alignment_iterator, None)
    finally:
        # release the search iterator (if it supports closing) and clean up
        # any temporary files that were created
        if pw_alignment_iterator is not None:
            close_iterator = getattr(pw_alignment_iterator, 'close', None)
            if close_iterator is not None:
                close_iterator()
        remove_files(files_to_remove, error_on_missing=False)
Exemple #4
0
 def test_uclust_search_and_align_from_fasta_filepath_protein(self):
     """Protein fixtures: search-and-align output matches the expected records."""
     # Default arguments are used, so rev comp matches are allowed.
     observed = []
     for hit in uclust_search_and_align_from_fasta_filepath(
             self.search_align_query2_fp, self.search_align_template2_fp):
         observed.append(hit)
     expected = self.search_align_out2_expected
     self.assertEqual(observed, expected)