def test_trim_stop_codons_info(self):
        """the info attribute must survive a call to trim_stop_codons"""
        # unaligned collection
        unaligned_data = {
            "seq1": "ACGTAA",
            "seq2": "ACGACG",
            "seq3": "ACGCGT",
        }
        collection = SequenceCollection(
            data=unaligned_data, moltype=DNA, info={"key": "value"}
        )
        trimmed_collection = collection.trim_stop_codons()
        self.assertEqual(trimmed_collection.info["key"], "value")

        # aligned
        aligned_data = {
            "seq1": "ACGTAA",
            "seq2": "ACGTGA",
            "seq3": "ACGTAA",
        }
        alignment = ArrayAlignment(
            data=aligned_data, moltype=DNA, info={"key": "value"}
        )
        trimmed_alignment = alignment.trim_stop_codons()
        self.assertEqual(trimmed_alignment.info["key"], "value")
Example #2
0
def clustal_from_alignment(aln, interleave_len=None):
    """Returns a string in Clustal format.

    Parameters
    ----------
    aln
        an Alignment-like object (anything exposing ``RowOrder``) or a dict
        mapping sequence names to sequences. A falsy value (None, empty
        dict) produces an empty string.
    interleave_len
        sequence line width. When None, each sequence is written on a
        single line. Must be a positive number when given.

    Raises
    ------
    ValueError
        if ``interleave_len`` is not positive, or if the sequences are not
        all the same length.
    """
    if not aln:
        return ""

    # A zero or negative width would never advance the interleave loop
    # below, so reject it before doing any work.
    if interleave_len is not None and interleave_len <= 0:
        raise ValueError("interleave_len must be a positive integer.")

    # get seq output order: honour the alignment's own ordering when it has
    # one, otherwise fall back to sorted names
    try:
        order = aln.RowOrder
    except AttributeError:
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)

    if seqs.is_ragged():
        raise ValueError(
            "Sequences in alignment are not all the same length." +
            "Cannot generate Clustal format.")

    aln_len = seqs.seq_len

    # Pad every label to the longest label plus a 4 space gutter.
    max_spaces = max(len(label) for label in seqs.names) + 4

    # Get ordered seqs
    ordered_seqs = [seqs.named_seqs[label] for label in order]

    clustal_list = ["CLUSTAL\n"]
    if interleave_len is None:
        # one line per sequence followed by a single blank line
        clustal_list.extend(
            "%s%s%s" % (name, " " * (max_spaces - len(name)), seq)
            for name, seq in zip(order, ordered_seqs)
        )
        clustal_list.append("")
    else:
        # one chunk of interleave_len columns per pass, each chunk followed
        # by a blank line
        for start in range(0, aln_len, interleave_len):
            stop = start + interleave_len
            clustal_list.extend(
                "%s%s%s" % (name, " " * (max_spaces - len(name)), seq[start:stop])
                for name, seq in zip(order, ordered_seqs)
            )
            clustal_list.append("")

    return "\n".join(clustal_list)
Example #3
0
 def test_reverse_complement_info(self):
     """the info attribute must survive reverse complementing"""
     gapped_dna = {
         "seq1": "--ACGT--GT---",
         "seq2": "TTACGTA-GT---",
         "seq3": "--ACGTA-GCC--",
     }
     # alignment with gaps
     rc_alignment = ArrayAlignment(
         data=gapped_dna, moltype=DNA, info={"key": "value"}
     ).rc()
     self.assertEqual(rc_alignment.info["key"], "value")
     # check collection, with gaps
     rc_collection = SequenceCollection(
         data=gapped_dna, moltype=DNA, info={"key": "value"}
     ).rc()
     self.assertEqual(rc_collection.info["key"], "value")
Example #4
0
def make_unaligned_seqs(
    data, moltype=None, label_to_name=None, info=None, source=None, **kw
):
    """Initialize an unaligned collection of sequences.

    Parameters
    ----------
    data
        sequences
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'
    label_to_name
        function for converting original name into another name.
    info
        a dict from which to make an info object. It is copied, so the
        caller's dict is left unmodified.
    source
        origins of this data, defaults to 'unknown'
    **kw
        other keyword arguments passed to SequenceCollection. The values of
        the special keys 'constructor_kw' and 'kw', when present, are
        merged into the keyword arguments.

    Returns
    -------
    A SequenceCollection whose info has a 'source' entry.
    """

    if moltype is not None:
        moltype = get_moltype(moltype)

    info = info or {}
    # flatten nested keyword dicts into the top-level keyword arguments
    for nested_key in ("constructor_kw", "kw"):
        kw.update(kw.pop(nested_key, None) or {})
    assert isinstance(info, dict), "info must be a dict"

    # work on a shallow copy so recording the source does not leak back
    # into the dict supplied by the caller
    info = info.copy()
    info["source"] = source or "unknown"

    return SequenceCollection(
        data=data, moltype=moltype, label_to_name=label_to_name, info=info, **kw
    )
Example #5
0
    def get_translatable(self, seqs):
        """returns the translatable sequences from seqs.

        translation errors are stored in the info object

        Each sequence is degapped, placed into the frame reported by
        best_frame (reverse complemented when that frame is negative) and
        truncated to a whole number of codons. Sequences for which this
        raises a ValueError are skipped and recorded as [name, message].

        Returns
        -------
        A SequenceCollection of the in-frame sequences with the error
        records stored under info["translation_errors"], or a NotCompleted
        instance describing the errors when no sequence succeeded.
        """
        seqs = seqs.degap()
        if self._moltype and self._moltype != seqs.moltype:
            seqs = seqs.to_moltype(self._moltype)

        translatable = []
        error_log = []
        for seq in seqs.seqs:
            try:
                frame = best_frame(seq, self._gc, allow_rc=self._allow_rc)
                if frame < 0:
                    # negative frame indicates the reverse strand
                    seq = seq.rc()
                    frame *= -1
                frame -= 1  # returned from best frame as 1, 2, 3
                # drop any trailing bases that do not form a whole codon
                num_codons = (len(seq) - frame) // 3
                seq = seq[frame:frame + (num_codons * 3)]
                if self._trim_terminal_stop:
                    seq = seq.trim_stop_codon(gc=self._gc)
                translatable.append([seq.name, seq])
            except ValueError as msg:
                # TODO handle case where incomplete at end OR beginning
                # plus case where is divisible by 3 but not in frame
                # if not divisible by 3, then calc remainder as len(seq) % 3
                # try translating new[remainder:] and new[:-remainder]
                error_log.append([seq.name, msg.args[0]])

        if translatable:
            translatable = SequenceCollection(data=translatable,
                                              moltype=self._moltype,
                                              info=seqs.info)
            translatable.info["translation_errors"] = error_log
        else:
            # error_log holds [name, message] pairs, which str.join cannot
            # consume directly, so render each pair as text first
            message = "; ".join(
                "%s: %s" % (name, error) for name, error in error_log
            )
            translatable = NotCompleted("FALSE",
                                        self,
                                        message,
                                        source=seqs)

        return translatable
Example #6
0
 def get_seq_collection(self, feature_types=None, where_feature=None):
     """returns a SequenceCollection instance of the unaligned sequences

     When feature_types is given, each member contributes its annotated
     sequence for those features, otherwise its plain sequence. Members
     whose sequence is None are left out.
     """

     def fetch(member):
         if not feature_types:
             return member.seq
         return member.get_annotated_seq(feature_types, where_feature)

     candidates = (fetch(member) for member in self.members)
     named = [(seq.name, seq) for seq in candidates if seq is not None]
     return SequenceCollection(data=named, moltype=DNA)