def test_trim_stop_codons_info(self):
    """the info attribute must survive trim_stop_codons"""
    # unaligned: only seq1 ends in a stop codon
    unaligned_data = {"seq1": "ACGTAA", "seq2": "ACGACG", "seq3": "ACGCGT"}
    trimmed_coll = SequenceCollection(
        data=unaligned_data, moltype=DNA, info={"key": "value"}
    ).trim_stop_codons()
    self.assertEqual(trimmed_coll.info["key"], "value")

    # aligned: every sequence ends in a stop codon
    aligned_data = {"seq1": "ACGTAA", "seq2": "ACGTGA", "seq3": "ACGTAA"}
    trimmed_aln = ArrayAlignment(
        data=aligned_data, moltype=DNA, info={"key": "value"}
    ).trim_stop_codons()
    self.assertEqual(trimmed_aln.info["key"], "value")
def clustal_from_alignment(aln, interleave_len=None):
    """Returns a string in Clustal format.

    Parameters
    ----------
    aln
        can be an Alignment object or a dict.
    interleave_len
        sequence line width. Only available if sequences are aligned.
        Must be a positive integer when given; None writes each sequence
        on a single line.

    Returns
    -------
    The Clustal formatted text, or "" if aln is empty.

    Raises
    ------
    ValueError
        if interleave_len is not positive, or if the sequences are not
        all the same length.
    """
    if not aln:
        return ""

    # a non-positive width never advances the interleave loop below, which
    # would otherwise spin forever while growing the output list
    if interleave_len is not None and interleave_len <= 0:
        raise ValueError(
            "interleave_len must be a positive integer, got %r" % (interleave_len,)
        )

    # get seq output order: prefer the order recorded on the object, fall
    # back to sorted names for inputs (e.g. dicts) without that attribute
    try:
        order = aln.RowOrder
    except AttributeError:
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)
    if seqs.is_ragged():
        raise ValueError(
            "Sequences in alignment are not all the same length. "
            "Cannot generate Clustal format."
        )

    aln_len = seqs.seq_len
    # pad every label out to the longest label plus four spaces
    max_spaces = max(len(label) for label in seqs.names) + 4
    # get ordered seqs
    ordered_seqs = [seqs.named_seqs[label] for label in order]

    clustal_list = ["CLUSTAL\n"]
    if interleave_len is not None:
        # one block of rows per window of interleave_len columns, each
        # block followed by an empty line
        for start in range(0, aln_len, interleave_len):
            stop = start + interleave_len
            clustal_list.extend(
                "%s%s%s" % (name, " " * (max_spaces - len(name)), seq[start:stop])
                for name, seq in zip(order, ordered_seqs)
            )
            clustal_list.append("")
    else:
        clustal_list.extend(
            "%s%s%s" % (name, " " * (max_spaces - len(name)), seq)
            for name, seq in zip(order, ordered_seqs)
        )
        clustal_list.append("")

    return "\n".join(clustal_list)
def test_reverse_complement_info(self):
    """the info attribute must survive reverse complementing"""
    gapped = {
        "seq1": "--ACGT--GT---",
        "seq2": "TTACGTA-GT---",
        "seq3": "--ACGTA-GCC--",
    }
    # same gapped data, first as an alignment and then as a collection
    for container in (ArrayAlignment, SequenceCollection):
        original = container(data=gapped, moltype=DNA, info={"key": "value"})
        reverse_complemented = original.rc()
        self.assertEqual(reverse_complemented.info["key"], "value")
def make_unaligned_seqs(
    data, moltype=None, label_to_name=None, info=None, source=None, **kw
):
    """Initialize an unaligned collection of sequences.

    Parameters
    ----------
    data
        sequences
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'
    label_to_name
        function for converting original name into another name.
    info
        a dict from which to make an info object
    source
        origins of this data. When not provided, a "source" already
        present in info is kept, otherwise defaults to 'unknown'
    **kw
        other keyword arguments passed to SequenceCollection. If
        "constructor_kw" or "kw" are present, their values (dicts) are
        merged into these keyword arguments.

    Notes
    -----
    A non-empty info dict supplied by the caller is updated in place with
    the "source" key.
    """
    if moltype is not None:
        moltype = get_moltype(moltype)

    info = info or {}
    # flatten nested keyword dicts into the top-level keyword arguments
    for nested_key in ("constructor_kw", "kw"):
        nested_kw = kw.pop(nested_key, None) or {}
        kw.update(nested_kw)

    assert isinstance(info, dict), "info must be a dict"
    # an explicit source wins; otherwise keep any source already recorded
    # in info rather than clobbering it with the default
    info["source"] = source or info.get("source") or "unknown"

    return SequenceCollection(
        data=data, moltype=moltype, label_to_name=label_to_name, info=info, **kw
    )
def get_translatable(self, seqs):
    """Returns the translatable sequences from seqs.

    Translation errors are stored in the info object.

    Parameters
    ----------
    seqs
        a sequence collection; it is degapped and, if this instance has a
        moltype that differs from the collection's, converted to it.

    Returns
    -------
    A SequenceCollection of the sequences for which a frame was found,
    each sliced to whole codons from that frame, with
    info["translation_errors"] holding [name, message] pairs for the
    sequences that failed. If no sequence succeeded, a NotCompleted
    instance whose message lists the failures.
    """
    seqs = seqs.degap()
    if self._moltype and self._moltype != seqs.moltype:
        seqs = seqs.to_moltype(self._moltype)

    translatable = []
    error_log = []
    for seq in seqs.seqs:
        try:
            frame = best_frame(seq, self._gc, allow_rc=self._allow_rc)
            if frame < 0:
                # negative frame: work on the reverse complement instead
                seq = seq.rc()
                frame *= -1
            frame -= 1  # returned from best frame as 1, 2, 3
            num_codons = (len(seq) - frame) // 3
            seq = seq[frame : frame + (num_codons * 3)]
            if self._trim_terminal_stop:
                seq = seq.trim_stop_codon(gc=self._gc)
            translatable.append([seq.name, seq])
        except ValueError as msg:
            # TODO handle case where incomplete at end OR beginning
            # plus case where is divisible by 3 but not in frame
            # if not divisible by 3, then calc remainder as len(seq) % 3
            # try translating new[remainder:] and new[:-remainder]
            reason = msg.args[0] if msg.args else str(msg)
            error_log.append([seq.name, reason])

    if translatable:
        translatable = SequenceCollection(
            data=translatable, moltype=self._moltype, info=seqs.info
        )
        translatable.info["translation_errors"] = error_log
        return translatable

    # error_log holds [name, message] pairs, which str.join cannot consume
    # directly, so render each pair as text first
    message = "; ".join("%s: %s" % (name, reason) for name, reason in error_log)
    return NotCompleted("FALSE", self, message, source=seqs)
def get_seq_collection(self, feature_types=None, where_feature=None):
    """Build a SequenceCollection of the members' unaligned sequences.

    When feature_types is truthy each member's annotated sequence is
    requested (passing where_feature through), otherwise its plain seq
    attribute is used. Members that yield None are left out.
    """

    def _member_seq(member):
        if feature_types:
            return member.get_annotated_seq(feature_types, where_feature)
        return member.seq

    candidates = (_member_seq(member) for member in self.members)
    named = [(found.name, found) for found in candidates if found is not None]
    return SequenceCollection(data=named, moltype=DNA)