Example #1
0
def _build_logodata(options):
    """Read the input selected by ``options`` and build a ``LogoData``.

    The input stream is ``options.fin`` when given, otherwise the URL in
    ``options.upload``, otherwise everything available on standard input.
    The stream is first parsed as a transfac matrix; if that raises
    ``ValueError`` the data is parsed as multiple sequence data using
    ``options.input_parser`` instead.

    Raises:
        ValueError: if both --fin and --upload are supplied; if the input
            parser is the string "transfac" and the matrix cannot be read;
            if --ignore-lower-case is combined with matrix input; or if a
            complement is requested for a non-nucleic sequence alphabet.
    """
    source = options.fin

    if options.upload is not None:
        if source is not None:
            raise ValueError(
                "error: options --fin and --upload are incompatible")
        from . import _from_URL_fileopen
        source = _from_URL_fileopen(options.upload)
    elif source is None:
        source = StringIO(sys.stdin.read())

    try:
        # Matrix (transfac) input is attempted before sequence input.
        from corebio.matrix import Motif
        motif = Motif.read_transfac(source, alphabet=options.alphabet)
    except ValueError as motif_err:
        # "transfac" is passed as a plain string rather than a parser
        # module, so there is no sequence parser to fall back on.
        if options.input_parser == "transfac":
            raise motif_err
        is_matrix = False
        seqs = read_seq_data(source,
                             options.input_parser.read,
                             alphabet=options.alphabet,
                             ignore_lower_case=options.ignore_lower_case)
    else:
        is_matrix = True

    if is_matrix:
        if options.ignore_lower_case:
            raise ValueError(
                "error: option --ignore-lower-case incompatible with matrix input"
            )
        # --revcomp implies both a reverse and a complement.
        if options.reverse or options.revcomp:
            motif.reverse()
        if options.complement or options.revcomp:
            motif.complement()

        matrix_prior = parse_prior(options.composition, motif.alphabet,
                                   options.weight)
        return LogoData.from_counts(motif.alphabet, motif, matrix_prior)

    if options.reverse or options.revcomp:
        flipped = [s.reverse() for s in seqs]
        seqs = SeqList(flipped, seqs.alphabet)

    if options.complement or options.revcomp:
        if not nucleic_alphabet.alphabetic(seqs.alphabet):
            raise ValueError('non-nucleic sequence cannot be complemented')
        # Complement under the nucleic alphabet, then put the caller's
        # alphabet back on the resulting list.
        saved_alphabet = seqs.alphabet
        seqs.alphabet = nucleic_alphabet
        complemented = [Seq(s, seqs.alphabet).complement() for s in seqs]
        seqs = SeqList(complemented, seqs.alphabet)
        seqs.alphabet = saved_alphabet

    seq_prior = parse_prior(options.composition, seqs.alphabet,
                            options.weight)
    return LogoData.from_seqs(seqs, seq_prior)
Example #2
0
    def complement(self):
        """Complement nucleic acid sequence.

        Works in place: the column alphabet is temporarily replaced by the
        complement of the current alphabet, the data is reindexed back to
        the original alphabet order, and the reindexed array is stored on
        this object.
        """
        from corebio.seq import Seq, Alphabet
        original = self.alphabet
        complemented_letters = Seq(original, original).complement()
        self.alphabets = (None, Alphabet(complemented_letters))

        reindexed = self.reindex(original)
        # Restore the original alphabet before adopting the reordered data.
        self.alphabets = (None, original)
        self.array = reindexed.array
Example #3
0
def read(fin, alphabet=None):
    """Parse a nexus file and return its sequences as a ``SeqList``.

    Each taxon label becomes one ``Seq`` named with ``safename(taxon)``.
    When ``alphabet`` is None the alphabet of the matrix row is used,
    otherwise the supplied alphabet is applied to every sequence.

    Raises:
        ValueError: if the file yields no sequences.
    """
    nexus = Nexus(fin)

    def to_seq(taxon):
        # Build the Seq for a single taxon label.
        label = safename(taxon)
        row = nexus.matrix[taxon]
        chosen = row.alphabet if alphabet is None else alphabet
        return Seq(row, name=label, alphabet=chosen)

    seqs = [to_seq(taxon) for taxon in nexus.taxlabels]

    if not seqs:
        # No taxa were found, so the file was not usable nexus data.
        raise ValueError("Cannot parse file")

    return SeqList(seqs)
Example #4
0
 def back_translate(self, seq):
     """Convert a protein sequence back into coding DNA.

     Every character of ``seq`` is looked up in ``self.back_table`` and
     the results are concatenated.

     Args:
     -- seq - A polypeptide sequence.

     Returns :
     -- Seq - A DNA sequence
     """
     # TODO: Optimize.
     # TODO: Sanity check the alphabet of the input.
     codon_for = self.back_table
     dna = ''.join(codon_for[residue] for residue in str(seq))
     return Seq(dna, dna_alphabet)
Example #5
0
 def translate(self, seq, frame=0):
     """Translate a DNA sequence to a polypeptide using full
     IUPAC ambiguities in DNA/RNA and amino acid codes.

     Codons are read in steps of three starting at offset ``frame``,
     upper-cased, and looked up in ``self.table``. Trailing characters
     that do not form a complete codon are dropped.

     Returns :
     -- Seq - A polypeptide sequence
     """
     # TODO: Optimize.
     # TODO: Sanity check the alphabet of the input.
     bases = str(seq)
     codon_table = self.table
     starts = range(frame, len(bases) - 2, 3)
     residues = [codon_table[bases[i:i + 3].upper()] for i in starts]
     return Seq(''.join(residues), protein_alphabet)
Example #6
0
>>> from corebio.secstruc import *
>>> record = dssp.DsspRecord( open('test_corebio/data/1crn.dssp') )
>>> record.secondary()
' EE SSHHHHHHHHHHHTTT  HHHHHHHHS EE SSS   GGG  '
>>> fa_reduce_secstruc_to_ehl(record.secondary())
'LEELLLHHHHHHHHHHHLLLLLHHHHHHHHLLEELLLLLLLLLLLL' 

""" 

__all__ = ['dssp', 'stride','secstruc_alphabet','secstruc_ehl_alphabet', 
    'fa_reduce_secstruc_to_ehl', 'ehl_reduce_secstruc_to_ehl']

from corebio.seq import Alphabet, Seq
from corebio.transform import Transform

# ------------------- SECONDARY STRUCTURE ALPHABETS -------------------
# Full set of secondary structure codes accepted by this module
# (includes a space character), and the reduced four-letter E/H/L/X set.
secstruc_alphabet = Alphabet("HGIEBbTSC _-L?X")
secstruc_ehl_alphabet = Alphabet("EHLX")

# The two strings handed to each Transform below have the same length and
# are read position by position: the code at index i of the first string
# corresponds to the code at index i of the second.
# NOTE(review): the pairing is presumed to be positional; Transform's
# constructor is not visible here -- confirm.

# Reduction in which only H stays H and only E stays E; '?' and 'X' become
# 'X' and every other code becomes 'L'.
fa_reduce_secstruc_to_ehl = \
    Transform(  Seq("HGIEBbTSC _-L?X", secstruc_alphabet),
                  Seq("HLLELLLLLLLLLXX", secstruc_ehl_alphabet) )

# Reduction in which H, G and I become 'H'; E, B and b become 'E';
# '?' and 'X' become 'X'; and the remaining codes become 'L'.
ehl_reduce_secstruc_to_ehl = \
    Transform( Seq("HGIEBbTSC _-L?X", secstruc_alphabet),
                 Seq("HHHEEELLLLLLLXX", secstruc_ehl_alphabet) )
                 
                 
                 
Example #7
0
 def secondary(self):
     """Return the secondary structure of the protein as a Seq object.

     The ``secstruc`` code of every entry in ``self.residues`` is
     concatenated in order.
     """
     codes = ''.join(residue.secstruc for residue in self.residues)
     return Seq(codes, stride_alphabet)
Example #8
0
 def primary(self):
     """Return the protein primary sequence as a Seq object.

     The ``aa`` code of every entry in ``self.residues`` is concatenated
     in order.
     """
     letters = ''.join(residue.aa for residue in self.residues)
     return Seq(letters, protein_alphabet)
Example #9
0
            ignore_lower_case = options.ignore_lower_case)   

    if motif_flag :
        if options.ignore_lower_case:
            raise ValueError("error: option --ignore-lower-case incompatible with matrix input")
        if options.reverse: motif.reverse()
        if options.complement: motif.complement()

        prior = parse_prior( options.composition,motif.alphabet, options.weight)
        data = LogoData.from_counts(motif.alphabet, motif, prior)
    else :
        if options.reverse: 
            seqs = SeqList([s.reverse() for s in seqs], seqs.alphabet)
        
        if options.complement :
            seqs= SeqList( [Seq(s,seqs.alphabet).complement() for s in seqs], seqs.alphabet)

        prior = parse_prior( options.composition,seqs.alphabet, options.weight)
        data = LogoData.from_seqs(seqs, prior)




    return data
     
 
             
def _build_logoformat( logodata, opts) :
    """ Extract and process relevant option values and return a 
    LogoFormat object.""" 
Example #10
0
def iterseq(fin, alphabet=None):
    """Null sequence iterator: yield one empty ``Seq`` and stop.

    ``fin`` must not be None; this is checked by an assertion when the
    generator is first advanced. ``alphabet`` is accepted for interface
    compatibility and is not used.
    """
    assert fin is not None
    yield Seq('')
Example #11
0
        self.description = description

    def __call__(self, seq):
        """Translate sequence.

        Applies ``self.table`` to ``seq`` with ``str.translate`` and wraps
        the result in the same class as ``self.target``, using the target
        alphabet and carrying over the name and description of ``seq``.

        Raises:
            ValueError: if ``seq`` is not alphabetic under the source
                alphabet.
        """
        source_alphabet = self.source.alphabet
        if not source_alphabet.alphabetic(seq):
            raise ValueError("Incompatible alphabets")
        translated = str.translate(seq, self.table)
        target = self.target
        result_type = target.__class__
        return result_type(translated, target.alphabet, seq.name,
                           seq.description)


# End class Translation

# FIXME: Test, document, add to seq.
# Complement table for nucleotide codes. The two strings have the same
# length and line up position by position: A<->T, C<->G, R<->Y, K<->M,
# B<->V, D<->H (upper and lower case), U/u are paired with A/a, and
# S, W, N, X, x, '-', '?', '.', '~' are paired with themselves.
# NOTE(review): the pairing is presumed to be positional; Transform's
# constructor is not visible here -- confirm.
dna_complement = Transform(
    Seq("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", dna_alphabet),
    Seq("TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~", dna_alphabet),
)


def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'):
    """ Mask low complexity regions in protein sequences.
    
    Uses the method of Seg [1] by Wootton & Federhen [2] to divide a sequence   
    into regions of high and low complexity. The sequence is divided into
    overlapping windows. Low complexity windows either have a sequence entropy
    less than the trigger complexity, or have an entropy less than the extension    
    complexity and neighbor other low-complexity windows. The sequence within   
    a low complexity region is replaced with the mask character (default 'X'), 
    and the masked alphabetic sequence is returned.