Esempio n. 1
0
    def test_get_representatives(self):
        """get_representatives should return the representatives as list of Sequence."""

        result = """>1: 5
ABABABA
>3: 1
BABA
>4: 1
ABABAA
>8: 2
BABBA"""
        seqs = self.data.iteritems
        mapping = self.mapping
        test_result = list(get_representatives(mapping, seqs()))
        test_result_as_fasta = "\n".join(
            map(lambda a: a.toFasta(), test_result))

        self.assertEqual(test_result_as_fasta, result)

        # another example
        mapping = {'1': ('a', 'b', 'c'), '2': ('d', 'e', 'f')}
        seqs = [('1', "ACGT"), ('2', "TAGC"), ('a', "TTTTT")]

        observed = list(get_representatives(mapping, seqs))
        expected = [
            Sequence(name=">1", seq="ACGT"),
            Sequence(name='2', seq="TAGC")
        ]
        self.assertEqual(observed, expected)
Esempio n. 2
0
 def test_hasTerminalStop(self):
     """test check for terminal stop codons"""
     # sequence ending in the stop codon TAA
     seq = Sequence(DNA, seq='ACTTAA')
     self.assertTrue(seq.hasTerminalStop())
     # sequence ending in TAT: build the sequence first, then assert on the
     # method result (previously the comparison was applied to the Sequence
     # object itself and nothing was asserted)
     seq = Sequence(DNA, seq='ACTTAT')
     self.assertFalse(seq.hasTerminalStop())
     # only sequences with length divisible by 3 should work; assertRaises
     # makes the test fail if no AssertionError is raised
     seq = Sequence(DNA, seq='ACTTA')
     self.assertRaises(AssertionError, seq.hasTerminalStop)
 def test_reversecomplement(self):
     """reverse complement of plain, gapped and ambiguous sequences"""
     cases = (
         ('ACTGTAA', 'TTACAGT'),
         # a gap character is carried through
         ('ACTG-TAA', 'TTA-CAGT'),
         # try ambiguities
         ('ACHNRTAA', 'TTAYNDGT'),
     )
     for forward, expected in cases:
         revcomp = Sequence(DNA, seq=forward).reversecomplement()
         self.assertEqual(str(revcomp), expected)
    def test_hasTerminalStop(self):
        """test check for terminal stop codons"""
        # sequence ending in the stop codon TAA
        seq = Sequence(DNA, seq='ACTTAA')
        self.assertTrue(seq.hasTerminalStop())
        # sequence ending in TAT: build the sequence first, then assert on
        # the method result (previously the comparison was applied to the
        # Sequence object itself and nothing was asserted)
        seq = Sequence(DNA, seq='ACTTAT')
        self.assertFalse(seq.hasTerminalStop())

        # for sequence not divisible by 3
        seq = Sequence(DNA, seq='ACTTA')
        # fail
        self.assertRaises(ValueError, seq.hasTerminalStop)
        # unless explicitly over-ride length issue using allow_partial
        # in which case, returns False
        self.assertFalse(seq.hasTerminalStop(allow_partial=True))
    def test_withoutTerminalStopCodon(self):
        """removing a terminal stop codon, with and without allow_partial"""
        # standard code: 'ACTTAA' loses its trailing TAA
        complete = Sequence(DNA, seq='ACTTAA')
        self.assertEqual(str(complete.withoutTerminalStopCodon()), "ACT")

        # a length not divisible by 3 is rejected ...
        partial = Sequence(DNA, seq='ACTTA')
        self.assertRaises(ValueError, partial.withoutTerminalStopCodon)
        # ... unless allow_partial is given; only the absence of an
        # exception is exercised here, the returned value is not checked
        partial.withoutTerminalStopCodon(allow_partial=True)
Esempio n. 6
0
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """writes a list of (name,seqs) to filehandle.

    name_seqs: (name,seqs) pair such as from MinimalFASTAParser
    fh: an open filehandle

    Raises ValueError if fh is None.
    """
    if fh is None:
        # call-form raise is valid on both Python 2 and Python 3
        raise ValueError("Need open file handle to write to.")

    for (name, seq) in name_seqs:
        # one FASTA record per pair, each terminated by a newline
        fh.write("%s\n" % Sequence(name=name, seq=seq).toFasta())
Esempio n. 7
0
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write every (name, seq) pair to fh as a FASTA record.

    name_seqs: iterable of (name, seq) pairs such as from parse_fasta
    fh: an open filehandle; a ValueError is raised when it is None
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for label, sequence in name_seqs:
        record = Sequence(name=label, seq=sequence)
        # each record is followed by a newline
        fh.write("%s\n" % record.toFasta())
Esempio n. 8
0
def get_representatives(mapping, seqs):
    """Yields representative seqs.

    mapping: The prefix mapping dict; its values must support len()

    seqs: iterable of (label, seq) pairs

    Only pairs whose label is a key of mapping are yielded. Each one is
    wrapped in a Sequence named "<label>: <n>", where n is
    len(mapping[label]) + 1.
    """
    for (label, seq) in seqs:
        # `in` replaces dict.has_key, which no longer exists on Python 3
        if label in mapping:
            cluster_size = len(mapping[label]) + 1
            yield Sequence(name="%s: %d" % (label, cluster_size), seq=seq)
Esempio n. 9
0
def degap_fasta_aln(seqs):
    """Yield a degapped Sequence for every record of a Fasta alignment.

    seqs: iterable of (label, seq) pairs
    """
    for label, aligned_seq in seqs:
        gapped = Sequence(moltype=DNA_with_more_gaps,
                          seq=aligned_seq,
                          name=label)
        ungapped = gapped.degap()
        # set the label explicitly on the degapped object
        ungapped.Name = label
        yield ungapped
Esempio n. 10
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory

    Writes <out_dir>/denoised_otu_map.txt and <out_dir>/denoised_all.fasta.
    If an id cannot be translated to a sample id, a message is printed and
    the process exits via exit().
    """

    # read in mapping from split_library file; only the labels are needed,
    # and they are produced lazily
    labels = (record[0] for record in MinimalFastaParser(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()
            if not otu_split:
                # blank line: nothing to write
                continue

            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            # collect the cluster members completely before extending, in
            # case sort_ids hands back the very list being iterated
            members = []
            for seq_id in ids:
                members.extend(denoiser_mapping[seq_id])
            all_ids.extend(members)
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, and join rejects None
                print("Found id in denoiser output, which was not found in split_libraries " +
                      "output FASTA file. Wrong file?")
                exit()

    # rewrite the denoised seqs with "<sample_id> <flowgram id>" labels
    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in MinimalFastaParser(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                Sequence(name=newlabel, seq=seq).toFasta() + "\n")
Esempio n. 11
0
def ipynast_seqs(candidate_sequences,
                 template_alignment,
                 max_hits=30,
                 min_pct=75.0,
                 min_len=1000,
                 align_unaligned_seqs_f=None,
                 log_fp=None,
                 logger=None,
                 temp_dir=get_pynast_temp_dir(),
                 **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment step,
     as (sequence, exit status) tuples.
     Status values can be:
       0 : indicates a successful alignment, in which case the sequence will
            be aligned
       1 : indicates unsuccessful sequence search, in which case the sequence
            will be unaligned
       2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

     All sequences are returned as DNA sequence objects.

    Because this is a generator, no work is done (and no error is raised)
    until iteration starts.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
      Maximum number of uclust hits to return
      (NOTE: not referenced in this function body)
    min_pct
      minimum % identity for best database match
    min_len
      minimum length of match for alignment
    align_unaligned_seqs_f
      Function to align sequences. Must be of the form:
       align_unaligned_seqs(seqs, moltype, params=None)
       see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
      Optional path to log file
    logger
      Optional NastLogger object, takes precedence over log_fp
    temp_dir
      Directory in which the temporary fasta files are created
    kwargs
      Only passed to deprecation_warning

    Raises IOError if a template_alignment filepath cannot be opened and
    ValueError if a candidate sequence contains a '-' character.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    try:
        candidate_source_f = None
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            candidate_sequences = MinimalFastaParser(candidate_source_f)

        # sequence list provided for candidate sequence -- write
        # the seqs to a temp file to pass to uclust. This is done in all
        # cases to convert the sequences to uppercase in case they're not
        # already. The bad handling of upper versus lower-cased sequences is
        # a uclust issue.
        # Note that delete = False here because we don't want these to
        # be deleted when they are closed (since we need to pass
        # the filepaths around after we write and close them). The files
        # are deleted explicitly in the finally clause of this function.
        try:
            candidate_fasta_f = NamedTemporaryFile(prefix='pynast_candidate',
                                                   suffix='.fasta',
                                                   dir=temp_dir,
                                                   delete=False)
            candidate_fasta_filepath = candidate_fasta_f.name
            # registered before writing so a failed write is cleaned up too
            files_to_remove.append(candidate_fasta_filepath)
            with candidate_fasta_f:
                for seq_id, seq in candidate_sequences:
                    candidate_fasta_f.write(
                        '>%s\n%s\n' % (seq_id, str(seq).upper()))
        finally:
            if candidate_source_f is not None:
                candidate_source_f.close()

        # degap the template alignment for the sequence searching step and
        # write it to file. See above comment about delete=False
        template_fasta_f = NamedTemporaryFile(prefix='pynast_template',
                                              suffix='.fasta',
                                              dir=temp_dir,
                                              delete=False)
        template_fasta_filepath = template_fasta_f.name
        files_to_remove.append(template_fasta_filepath)

        with template_fasta_f:
            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError(
                        "Cannot open specified filepath: %s"
                        % template_alignment)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments; the gapped seqs are kept in a dict
                # keyed by seq id for the lookup done per hit
                template_alignment = {}
                with template_alignment_f:
                    for seq_id, seq in MinimalFastaParser(
                            template_alignment_f):
                        template_alignment[seq_id] = seq
                        seq = Sequence(seq=seq, moltype=DNA)
                        template_fasta_f.write(
                            '>%s\n%s\n' % (seq_id, seq.degap()))
            else:
                # the template alignment was received as an alignment object
                template_fasta_f.write(template_alignment.degap().toFasta())

        # Set up logging.  NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        # get the alignment iterator; min_pct is given in percent and is
        # passed on as a fraction
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct / 100.,
            enable_rev_strand_matching=True,
            tmp_dir=temp_dir)

        try:
            current_result = next(pw_alignment_iterator)
        except StopIteration:
            current_result = None

        with open(candidate_fasta_filepath) as candidate_fasta_in:
            for seq_id, seq in MinimalFastaParser(candidate_fasta_in):
                seq_len = len(seq)
                if '-' in seq:
                    # clean-up temporary blast database files if any were
                    # created; the temp fasta files are removed in finally
                    pw_alignment_iterator.close()
                    raise ValueError(
                        "Candidate sequence contains gaps. "
                        "This is not supported.")

                if current_result:
                    (candidate_seq_id, template_seq_id, pw_aligned_candidate,
                     pw_aligned_template, pct_identity) = current_result

                # only the first whitespace-separated token of the ids is
                # compared
                if (not current_result or
                        seq_id.split()[0] != candidate_seq_id.split()[0]):
                    # a suitable match was not found - don't align the
                    # sequence; log the failure
                    logger.record(
                        seq_id,  # input sequence identifier
                        seq_len,  # input sequence length
                        "No search results.")
                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 1
                    continue

                # this sequence was aligned
                result, candidate_seq_id = _nast_fit_hit_to_template(
                    template_alignment, template_seq_id, candidate_seq_id,
                    pw_aligned_template, pw_aligned_candidate,
                    align_unaligned_seqs_f)

                unaligned_length = len(result.degap())
                if unaligned_length < min_len:
                    # alignment is too short - log this as a failure,
                    # reporting the length that was actually compared
                    error = ("Alignment does not meet minimum length "
                             "requirement for alignment (%d < %d)"
                             % (unaligned_length, min_len))
                    logger.record(
                        seq_id,  # input sequence identifier
                        seq_len,  # input sequence length
                        error)
                    # yield the unaligned sequence and failure code
                    yield DNA.makeSequence(seq, Name=seq_id), 2
                else:
                    # log the alignment
                    logger.record(
                        seq_id,  # input sequence identifier
                        seq_len,  # input sequence length
                        '',  # Errors
                        template_seq_id,  # best template match id
                        '%3.2f' % pct_identity,  # pct id to template
                        unaligned_length,  # post alignment sequence length
                    )

                    # yield the aligned sequence and success code
                    yield DNA.makeSequence(result, Name=candidate_seq_id), 0

                # get the next alignment
                try:
                    current_result = next(pw_alignment_iterator)
                except StopIteration:
                    # end of the aligned sequences, not of the input fasta
                    # file: drop the consumed hit so it cannot be matched
                    # against a later candidate
                    current_result = None
    finally:
        # clean-up temporary files, also on errors and when the consumer
        # stops iterating early
        remove_files(files_to_remove, error_on_missing=False)


def _nast_fit_hit_to_template(template_alignment, template_seq_id,
                              candidate_seq_id, pw_aligned_template,
                              pw_aligned_candidate, align_unaligned_seqs_f):
    """Fit one pairwise hit into the gap spacing of the template alignment.

    Returns (result, candidate_seq_id): result is the value returned by
    introduce_terminal_gaps, candidate_seq_id is the Name of the candidate
    as returned by remove_template_terminal_gaps.
    """
    if align_unaligned_seqs_f:
        # if an alternate pairwise aligner was specified, unalign
        # and re-align the sequences.
        pw_aligned_template, pw_aligned_candidate = align_two_seqs(
            pw_aligned_template.replace('-', ''),
            pw_aligned_candidate.replace('-', ''),
            align_unaligned_seqs_f)

    # Cast the pairwise alignments to DNA sequence objects
    pw_aligned_candidate = DNA.makeSequence(pw_aligned_candidate,
                                            Name=candidate_seq_id)
    pw_aligned_template = DNA.makeSequence(pw_aligned_template,
                                           Name=template_seq_id)

    # Remove any terminal gaps that were introduced into the template
    # sequence
    pw_aligned_candidate, pw_aligned_template = \
        remove_template_terminal_gaps(
            pw_aligned_candidate, pw_aligned_template)
    candidate_seq_id = pw_aligned_candidate.Name

    # get the aligned template sequence from the template alignment; the
    # fallback handles a template_alignment without getGappedSeq (e.g. the
    # plain dict built when a filepath was given)
    try:
        template_aligned_seq = \
            template_alignment.getGappedSeq(template_seq_id)
    except AttributeError:
        template_aligned_seq = Sequence(
            seq=template_alignment[template_seq_id], moltype=DNA)

    # reintroduce the gap spacing from the template alignment
    pw_aligned_template, pw_aligned_candidate, new_gaps = \
        reintroduce_template_spacing(
            template_aligned_seq, pw_aligned_template, pw_aligned_candidate)

    # delete any new gaps that were introduced during the
    # pairwise alignment step
    pw_aligned_template, pw_aligned_candidate = adjust_alignment(
        pw_aligned_template, pw_aligned_candidate, new_gaps)

    # reintroduce any terminal gaps that were present in the template
    result = introduce_terminal_gaps(
        template_aligned_seq, pw_aligned_template, pw_aligned_candidate)
    return result, candidate_seq_id
Esempio n. 12
0
 def test_ambig_translate(self):
     """test translating a seq containing N, ? and - characters"""
     ambiguous_dna = Sequence(DNA, 'CGNTGN???---')
     protein = ambiguous_dna.getTranslation()
     self.assertEqual(str(protein), 'RX?-')
Esempio n. 13
0
 def test_translate(self):
     """test translating an unambiguous DNA seq"""
     coding_dna = Sequence(DNA, 'ATGACGTTGCGTAGCATAGCTCGA')
     protein = coding_dna.getTranslation()
     self.assertEqual(str(protein), 'MTLRSIAR')
Esempio n. 14
0
 def setUp(self):
     """Store a DNA Sequence on self.seq."""
     dna_string = 'ATGACGTTGCGTAGCATAGCTCGA'
     self.seq = Sequence(DNA, dna_string)
Esempio n. 15
0
 def test_withoutTerminalStopCodon(self):
     """removing the terminal stop codon from a sequence"""
     # for standard code: 'ACTTAA' loses its trailing TAA
     with_stop = Sequence(DNA, seq='ACTTAA')
     trimmed = with_stop.withoutTerminalStopCodon()
     self.assertEqual(str(trimmed), "ACT")