class SeqLoader(MultiprocessingBase, FilenameParser):
    """Load sequence records from files, optionally guessing their alphabet.

    All loading helpers are classmethods, so they can be used without
    creating an instance.
    """

    # File-name patterns keyed by format name.
    # NOTE(review): presumably consulted by guess_schema(), which is not
    # defined in this class body -- confirm against FilenameParser.
    fasta_re = re.compile(r'.*\.f(asta|as|a|aa|fn|rn|na)$')
    schemas = {
        'fasta': fasta_re,
        'gb': re.compile(r'.*\.gb(ff|k)$'),
        'embl': re.compile(r'.*\.embl$'),
    }

    def __init__(self, abort_event):
        """Forward abort_event to the base-class initialiser."""
        super(SeqLoader, self).__init__(abort_event)

    # Upper-cased letter sets used by guess_alphabet().
    dna_letters = set(ambiguous_dna_letters.upper())
    # rna_letters is not consulted by any method of this class.
    rna_letters = set(ambiguous_rna_letters.upper())
    protein_letters = set(extended_protein_letters.upper())

    @classmethod
    def guess_alphabet(cls, seq):
        """Guess the alphabet of seq from its first 10 letters.

        DNA is tested before protein, so a prefix made only of letters
        found in both sets is reported as DNA. An empty prefix also
        yields generic_dna, because an empty set is a subset of any set.

        Returns generic_dna, generic_protein or generic_alphabet.
        """
        letters = set(seq[:10].upper())  # use just first 10 letters
        if letters <= cls.dna_letters:
            return generic_dna
        if letters <= cls.protein_letters:
            return generic_protein
        return generic_alphabet

    @classmethod
    def correct_alphabet(cls, seq):
        """Set the guessed alphabet on seq in place and return seq.

        Despite its name, seq must expose a .seq attribute, because
        _set_alphabet() assigns seq.seq.alphabet.
        """
        cls._set_alphabet(seq, cls.guess_alphabet(seq))
        return seq

    @classmethod
    def _set_alphabet(cls, rec, abc):
        """Assign abc to rec.seq.alphabet and return rec."""
        rec.seq.alphabet = abc
        return rec

    @classmethod
    def load_file(cls, filename, schema=None, guess_alphabet=False):
        """Parse filename and return its records as a list.

        filename       -- path of the file to read.
        schema         -- format name given to SeqIO.parse; when falsy it
                          is obtained from cls.guess_schema(filename).
        guess_alphabet -- if true, every record is passed through
                          correct_alphabet() while loading.

        Returns a list of records, or None (after printing a message)
        when the file does not exist or parsing raises an exception.
        """
        # A parenthesised single-argument print parses on Python 2 and 3.
        if not os.path.isfile(filename):
            print('No such file: %s' % filename)
            return None
        if not schema:
            schema = cls.guess_schema(filename)
        try:
            records = SeqIO.parse(filename, schema)
            if guess_alphabet:
                return [cls.correct_alphabet(rec) for rec in records]
            return list(records)
        # Deliberately broad: any parsing failure is reported and mapped
        # to None rather than propagated to the caller.
        except Exception as e:
            print('Unable to parse %s as %s\n%s' % (filename, schema, str(e)))
            return None
def test_fastq_dna(self):
    """Read and write back simple example with ambiguous DNA

    A single FASTQ record is parsed and written back, first with the
    sequence in upper case and then in lower case. Each time exactly one
    record must be written and the output must equal the input text.
    """
    title = "id descr goes here"
    # One quality character per sequence letter, counting up from chr(33).
    quality = "".join(chr(33+q) for q in range(len(ambiguous_dna_letters)))
    # First in upper case, now in lower case...
    for letters in (ambiguous_dna_letters.upper(),
                    ambiguous_dna_letters.lower()):
        data = "@%s\n%s\n+\n%s\n" % (title, letters, quality)
        handle = StringIO()
        records = SeqIO.parse(StringIO(data), "fastq")
        self.assertEqual(1, SeqIO.write(records, handle, "fastq"))
        self.assertEqual(data, handle.getvalue())
def test_fastq_dna(self):
    """Read and write back simple example with ambiguous DNA

    The same one-record FASTQ text is round-tripped twice: with the
    sequence letters in upper case, then in lower case. The write must
    report one record and reproduce the input exactly.
    """
    upper_case = ambiguous_dna_letters.upper()
    lower_case = ambiguous_dna_letters.lower()
    # Quality string has the same length as the sequence, from chr(33) up.
    scores = "".join(chr(33+q) for q in range(len(ambiguous_dna_letters)))
    for sequence in (upper_case, lower_case):
        data = "@%s\n%s\n+\n%s\n" % ("id descr goes here", sequence, scores)
        handle = StringIO("")
        written = SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                              handle, "fastq")
        self.assertEqual(1, written)
        self.assertEqual(data, handle.getvalue())