Example #1
0
class SeqLoader(MultiprocessingBase, FilenameParser):
    """Load sequence records from files through ``SeqIO.parse``.

    All loading helpers are classmethods, so they can be used without
    creating an instance.
    """

    fasta_re = re.compile(r'.*\.f(asta|as|a|aa|fn|rn|na)$')

    # Schema name -> regex matched against a file name.
    # NOTE(review): presumably consumed by ``guess_schema`` (not defined in
    # this class; assumed to come from FilenameParser) -- confirm.
    schemas = {
        'fasta': fasta_re,
        'gb': re.compile(r'.*\.gb(ff|k)$'),
        'embl': re.compile(r'.*\.embl$'),
    }

    def __init__(self, abort_event):
        """Forward ``abort_event`` to the parent initializer."""
        super(SeqLoader, self).__init__(abort_event)

    # Upper-cased letter sets used by ``guess_alphabet``.
    dna_letters = set(ambiguous_dna_letters.upper())
    # NOTE(review): rna_letters is defined but never consulted in this class;
    # confirm whether RNA input should be detected separately.
    rna_letters = set(ambiguous_rna_letters.upper())
    protein_letters = set(extended_protein_letters.upper())

    @classmethod
    def guess_alphabet(cls, seq):
        """Guess an alphabet from the first 10 letters of ``seq``.

        ``seq`` must support slicing, ``.upper()`` and iteration over its
        letters.

        Returns ``generic_dna`` when every sampled letter is a DNA letter,
        otherwise ``generic_protein`` when every sampled letter is a protein
        letter, otherwise ``generic_alphabet``.  An empty input yields
        ``generic_dna`` because the empty set is a subset of any set.
        """
        letters = set(seq[:10].upper())  # use just first 10 letters
        if letters <= cls.dna_letters:
            return generic_dna
        if letters <= cls.protein_letters:
            return generic_protein
        return generic_alphabet

    @classmethod
    def correct_alphabet(cls, seq):
        """Assign a guessed alphabet to ``seq.seq`` and return ``seq``.

        Despite its name, ``seq`` is expected to be a record-like object
        exposing a ``.seq`` attribute (``load_file`` passes the records
        produced by ``SeqIO.parse``).
        """
        cls._set_alphabet(seq, cls.guess_alphabet(seq))
        return seq

    @classmethod
    def _set_alphabet(cls, rec, abc):
        """Set ``rec.seq.alphabet`` to ``abc`` and return ``rec``."""
        rec.seq.alphabet = abc
        return rec

    @classmethod
    def load_file(cls, filename, schema=None, guess_alphabet=False):
        """Parse ``filename`` and return its records as a list.

        Args:
            filename: path of the file to read.
            schema: format name handed to ``SeqIO.parse``; when falsy it is
                obtained from ``cls.guess_schema(filename)``.
            guess_alphabet: when true, each record is passed through
                ``correct_alphabet`` before being returned.

        Returns:
            A list of records, or ``None`` when the file does not exist or
            parsing raises (a message is printed in both cases).
        """
        if not os.path.isfile(filename):
            # Single-argument print(...) behaves identically on Python 2 and 3.
            print('No such file: %s' % filename)
            return None
        if not schema:
            schema = cls.guess_schema(filename)
        try:
            if guess_alphabet:
                recs = [cls.correct_alphabet(rec)
                        for rec in SeqIO.parse(filename, schema)]
            else:
                recs = list(SeqIO.parse(filename, schema))
            return recs
        except Exception as e:
            # Deliberate best-effort: report the failure and signal it with
            # None instead of propagating to the caller.
            print('Unable to parse %s as %s\n%s' % (filename, schema, str(e)))
            return None
 def test_fastq_dna(self):
     """Round-trip a simple FASTQ record holding ambiguous DNA letters.

     The record is parsed and written back, first with the sequence in
     upper case and then in lower case; the output must equal the input.
     """
     # One quality character per letter, counting up from chr(33) ("!").
     quality = "".join(chr(33 + offset)
                       for offset in range(len(ambiguous_dna_letters)))
     # Upper case first, then lower case.
     for letters in (ambiguous_dna_letters.upper(),
                     ambiguous_dna_letters.lower()):
         data = "@%s\n%s\n+\n%s\n" % ("id descr goes here", letters, quality)
         handle = StringIO()
         records = SeqIO.parse(StringIO(data), "fastq")
         self.assertEqual(1, SeqIO.write(records, handle, "fastq"))
         self.assertEqual(data, handle.getvalue())
Example #3
0
 def test_fastq_dna(self):
     """Check that ambiguous-DNA FASTQ text survives a parse/write cycle.

     Runs the cycle twice: with the sequence line upper-cased, then with it
     lower-cased.  Exactly one record must be written each time and the
     written text must match the input text.
     """
     template = "@%s\n%s\n+\n%s\n"
     title = "id descr goes here"
     # Quality line: consecutive characters from chr(33), one per letter.
     quals = "".join(chr(33+q) for q in range(len(ambiguous_dna_letters)))
     sequences = [ambiguous_dna_letters.upper(),  # first in upper case
                  ambiguous_dna_letters.lower()]  # then in lower case
     for sequence in sequences:
         data = template % (title, sequence, quals)
         handle = StringIO("")
         written = SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                               handle, "fastq")
         self.assertEqual(1, written)
         self.assertEqual(data, handle.getvalue())