def _build_logodata(options):
    """Load the input selected by ``options`` and build a LogoData from it.

    Input stream selection:
      * ``options.fin`` is used when it is set;
      * otherwise standard input is read fully into a ``StringIO`` when
        ``options.upload`` is None;
      * otherwise ``options.upload`` is opened via ``_from_URL_fileopen``.

    The stream is first parsed as a transfac matrix. If that raises
    ``ValueError`` the stream is handed to ``read_seq_data`` using
    ``options.input_parser.read`` -- unless ``options.input_parser`` is the
    string "transfac", in which case the original error is re-raised.

    ``options.reverse``, ``options.complement`` and ``options.revcomp`` are
    then applied to the matrix or to the sequences before the prior is
    parsed and the LogoData is constructed.

    Returns:
        The object produced by ``LogoData.from_counts`` (matrix input) or
        ``LogoData.from_seqs`` (sequence input).

    Raises:
        ValueError: if both ``options.fin`` and ``options.upload`` are set;
            if the transfac parse fails while "transfac" is the requested
            parser; if ``options.ignore_lower_case`` is set for matrix
            input; or if complementing is requested for sequences whose
            alphabet is not accepted by ``nucleic_alphabet.alphabetic``.
    """
    source = options.fin
    upload = options.upload

    # The two input options are mutually exclusive.
    if source is not None and upload is not None:
        raise ValueError(
            "error: options --fin and --upload are incompatible")

    if source is None:
        if upload is None:
            source = StringIO(sys.stdin.read())
        else:
            from . import _from_URL_fileopen
            source = _from_URL_fileopen(upload)

    try:
        # Try reading data in transfac format first.
        from corebio.matrix import Motif
        motif = Motif.read_transfac(source, alphabet=options.alphabet)
    except ValueError as motif_err:
        # Failed reading Motif, try reading as multiple sequence data.
        if options.input_parser == "transfac":
            # Passing transfac as a str instead of a parser is a bit of an
            # ugly kludge.
            raise motif_err
        seqs = read_seq_data(source,
                             options.input_parser.read,
                             alphabet=options.alphabet,
                             ignore_lower_case=options.ignore_lower_case)
        is_matrix = False
    else:
        is_matrix = True

    if is_matrix:
        if options.ignore_lower_case:
            raise ValueError(
                "error: option --ignore-lower-case incompatible with matrix input"
            )
        if options.reverse or options.revcomp:
            motif.reverse()
        if options.complement or options.revcomp:
            motif.complement()

        prior = parse_prior(options.composition, motif.alphabet,
                            options.weight)
        return LogoData.from_counts(motif.alphabet, motif, prior)

    # Sequence input.
    if options.reverse or options.revcomp:
        reversed_seqs = [s.reverse() for s in seqs]
        seqs = SeqList(reversed_seqs, seqs.alphabet)

    if options.complement or options.revcomp:
        if not nucleic_alphabet.alphabetic(seqs.alphabet):
            raise ValueError('non-nucleic sequence cannot be complemented')
        # Temporarily label the list with nucleic_alphabet so each sequence
        # is complemented against it, then put the original alphabet back
        # on the resulting list.
        original_alphabet = seqs.alphabet
        seqs.alphabet = nucleic_alphabet
        complemented = [Seq(s, seqs.alphabet).complement() for s in seqs]
        seqs = SeqList(complemented, seqs.alphabet)
        seqs.alphabet = original_alphabet

    prior = parse_prior(options.composition, seqs.alphabet, options.weight)
    return LogoData.from_seqs(seqs, prior)
def complement(self):
    """Complement nucleic acid sequence.

    Works in place: the object is temporarily labelled with the
    complemented alphabet, reindexed back into the original alphabet
    order, and the reindexed array is stored on ``self.array``. The
    original alphabet labelling is restored afterwards.

    Returns:
        None.

    Raises:
        Whatever ``Seq.complement``, ``Alphabet`` or ``self.reindex`` raise.
        If ``self.reindex`` fails, ``self.alphabets`` is still reset to the
        original alphabet so the object is not left mislabelled.
    """
    from corebio.seq import Seq, Alphabet

    alphabet = self.alphabet
    complement_alphabet = Alphabet(Seq(alphabet, alphabet).complement())

    # Swap the labels, then reindex so the data lines up with the original
    # alphabet order again.
    self.alphabets = (None, complement_alphabet)
    try:
        m = self.reindex(alphabet)
    finally:
        # Always restore the original labelling, even if reindex() raised.
        self.alphabets = (None, alphabet)
    self.array = m.array
def read(fin, alphabet=None):
    """Extract sequence data from a nexus file.

    Args:
        fin: the input handed to ``Nexus``.
        alphabet: alphabet assigned to every sequence. When None, each
            sequence takes the ``alphabet`` attribute of its own matrix
            entry.

    Returns:
        A ``SeqList`` holding one ``Seq`` per taxon label, in the order of
        ``taxlabels``, each named with ``safename(label)``.

    Raises:
        ValueError: if the parsed file yields no sequences.
    """
    nexus = Nexus(fin)

    records = []
    for label in nexus.taxlabels:
        seq_name = safename(label)
        row = nexus.matrix[label]
        seq_alphabet = row.alphabet if alphabet is None else alphabet
        records.append(Seq(row, name=seq_name, alphabet=seq_alphabet))

    if not records:
        # Something went terribly wrong.
        raise ValueError("Cannot parse file")
    return SeqList(records)
def back_translate(self, seq):
    """Convert protein back into coding DNA.

    Every character of ``str(seq)`` is looked up in ``self.back_table``
    and the results are concatenated.

    Args:
        -- seq - A polypeptide sequence.

    Returns :
        -- Seq - A DNA sequence
    """
    # TODO: Optimize
    # TODO: Sanity check alphabet.
    lookup = self.back_table
    residues = str(seq)
    coding = ''.join(lookup[residue] for residue in residues)
    return Seq(coding, dna_alphabet)
def translate(self, seq, frame=0):
    """Translate a DNA sequence to a polypeptide using full
    IUPAC ambiguities in DNA/RNA and amino acid codes.

    Codons are read from ``str(seq)`` three characters at a time starting
    at index ``frame``. Each codon is upper-cased before it is looked up
    in ``self.table``. Trailing characters that do not form a complete
    codon are ignored.

    Args:
        -- seq - The sequence to translate.
        -- frame - Index of the first character of the first codon.

    Returns :
        -- Seq - A polypeptide sequence
    """
    # TODO: Optimize.
    # TODO: Sanity check alphabet.
    text = str(seq)
    codon_table = self.table
    last_start = len(text) - 2
    residues = [codon_table[text[start:start + 3].upper()]
                for start in range(frame, last_start, 3)]
    return Seq(''.join(residues), protein_alphabet)
>>> from corebio.secstruc import * >>> record = dssp.DsspRecord( open('test_corebio/data/1crn.dssp') ) >>> record.secondary() ' EE SSHHHHHHHHHHHTTT HHHHHHHHS EE SSS GGG ' >>> fa_reduce_secstruc_to_ehl(record.secondary()) 'LEELLLHHHHHHHHHHHLLLLLHHHHHHHHLLEELLLLLLLLLLLL' """ __all__ = ['dssp', 'stride','secstruc_alphabet','secstruc_ehl_alphabet', 'fa_reduce_secstruc_to_ehl', 'ehl_reduce_secstruc_to_ehl'] from corebio.seq import Alphabet, Seq from corebio.transform import Transform # ------------------- SECONDARY STRUCTURE ALPHABETS ------------------- secstruc_alphabet = Alphabet("HGIEBbTSC _-L?X") secstruc_ehl_alphabet = Alphabet("EHLX") fa_reduce_secstruc_to_ehl = \ Transform( Seq("HGIEBbTSC _-L?X", secstruc_alphabet), Seq("HLLELLLLLLLLLXX", secstruc_ehl_alphabet) ) ehl_reduce_secstruc_to_ehl = \ Transform( Seq("HGIEBbTSC _-L?X", secstruc_alphabet), Seq("HHHEEELLLLLLLXX", secstruc_ehl_alphabet) )
def secondary(self):
    """Return the secondary structure of the protein as a Seq object.

    The ``secstruc`` value of each entry in ``self.residues`` is
    concatenated in order and wrapped in a ``Seq`` over
    ``stride_alphabet``.
    """
    states = [residue.secstruc for residue in self.residues]
    structure = ''.join(states)
    return Seq(structure, stride_alphabet)
def primary(self):
    """Return the protein primary sequence as a Seq object.

    The ``aa`` value of each entry in ``self.residues`` is concatenated in
    order and wrapped in a ``Seq`` over ``protein_alphabet``.
    """
    amino_acids = [residue.aa for residue in self.residues]
    sequence = ''.join(amino_acids)
    return Seq(sequence, protein_alphabet)
ignore_lower_case = options.ignore_lower_case) if motif_flag : if options.ignore_lower_case: raise ValueError("error: option --ignore-lower-case incompatible with matrix input") if options.reverse: motif.reverse() if options.complement: motif.complement() prior = parse_prior( options.composition,motif.alphabet, options.weight) data = LogoData.from_counts(motif.alphabet, motif, prior) else : if options.reverse: seqs = SeqList([s.reverse() for s in seqs], seqs.alphabet) if options.complement : seqs= SeqList( [Seq(s,seqs.alphabet).complement() for s in seqs], seqs.alphabet) prior = parse_prior( options.composition,seqs.alphabet, options.weight) data = LogoData.from_seqs(seqs, prior) return data def _build_logoformat( logodata, opts) : """ Extract and process relevant option values and return a LogoFormat object."""
def iterseq(fin, alphabet=None):
    """Yield a single empty ``Seq`` and stop.

    Neither argument's content is read. ``fin`` only has to be non-None,
    which is asserted when iteration starts. ``alphabet`` is accepted but
    not used.
    """
    assert fin is not None
    yield Seq('')
self.description = description def __call__(self, seq): """Translate sequence.""" if not self.source.alphabet.alphabetic(seq): raise ValueError("Incompatible alphabets") s = str.translate(seq, self.table) cls = self.target.__class__ return cls(s, self.target.alphabet, seq.name, seq.description) # End class Translation # FIXME: Test, document, add to seq. dna_complement = Transform( Seq("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", dna_alphabet), Seq("TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~", dna_alphabet), ) def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'): """ Mask low complexity regions in protein sequences. Uses the method of Seg [1] by Wootton & Federhen [2] to divide a sequence into regions of high and low complexity. The sequence is divided into overlapping windows. Low complexity windows either have a sequence entropy less than the trigger complexity, or have an entropy less than the extension complexity and neighbor other low-complexity windows. The sequence within a low complexity region is replaced with the mask character (default 'X'), and the masked alphabetic sequence is returned.