def _get_tile_info(self):
    """Collect tile numbers and x/y coordinates from the first reads.

    At most ``self.max_sample`` records of ``self.filename`` are used. The
    name of each record is parsed with ``Identifier`` and the resulting
    ``info`` mapping must provide the ``x_coordinate``, ``y_coordinate``
    and ``tile_number`` keys.

    :return: dictionary with three lists of identical length: ``'x'`` and
        ``'y'`` (coordinates converted to ``float``) and ``'tiles'`` (tile
        numbers exactly as found in the parsed identifier).
    """
    identifiers = []
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            # Stop as soon as the sample is complete: the remaining records
            # are never used, so parsing the rest of the file is wasted work.
            if i >= self.max_sample:
                break
            identifiers.append(Identifier(record.name).info)

    tiles = {}
    tiles['x'] = [float(this['x_coordinate']) for this in identifiers]
    tiles['y'] = [float(this['y_coordinate']) for this in identifiers]
    tiles['tiles'] = [this['tile_number'] for this in identifiers]
    return tiles
def _get_qualities(self):
    """Return the quality values of the first ``self.max_sample`` reads.

    Every quality character of a read is converted to an integer with
    ``ord(char) - 33`` (presumably a Phred+33 encoding -- confirm).

    :return: list of lists of integers, one inner list per sampled read.
    """
    from sequana import logger

    logger.info("Extracting qualities")

    sampled = []
    with FastqReader(self.filename) as reader:
        for index, read in enumerate(reader):
            # Guard clause: leave the loop once enough reads were collected.
            if index >= self.max_sample:
                break
            sampled.append([ord(char) - 33 for char in read.qualities])
    return sampled
def test_context_manager(self):
    """Check which party closes the underlying file.

    A handle opened by the caller must stay open while ``openseq`` reads
    from it and is only closed by the caller's own ``with`` block. A
    ``FastqReader`` built from a path keeps ``_file`` open inside its
    ``with`` block and has ``_file`` set to ``None`` afterwards.
    """
    path = "tests/data/simple.fastq"

    # Caller-owned handle: consuming it through openseq() must not close it.
    with open(path) as handle:
        assert not handle.closed
        list(openseq(handle))
        assert not handle.closed
    assert handle.closed

    # Reader-owned file: open while inside the block, released on exit.
    with FastqReader(path) as reader:
        kept = reader
        assert not reader._file.closed
        list(reader)
        assert not reader._file.closed
    assert kept._file is None
def _get_info(self):
    """Populates the data structures for plotting.

    Will be called on request. Every record of ``self.filename`` is read
    once and the following attributes are set:

    * ``lengths``: numpy array of size ``self.N`` holding the read lengths
    * ``gc_list``: GC percentage of each read
    * ``gc_content``: mean of ``gc_list``
    * ``minimum`` / ``maximum``: smallest / largest value in ``lengths``
    * ``qualities``, ``mean_qualities``, ``sequences``: per-read data,
      stored only for the first ``self.max_sample`` reads
    * ``stats``: dictionary with the ``A``/``C``/``G``/``T``/``N`` counts,
      ``total_bp`` (sum of those five counts), ``mean_length`` and
      ``mean_quality`` (plus the empty placeholder lists ``qualities``,
      ``mean_qualities`` and ``sequences``)

    A zero-length read contributes ``0.0`` as mean quality and GC
    percentage instead of raising ``ZeroDivisionError``. When there is
    nothing to average (no reads or no quality values) the corresponding
    statistic is set to ``0`` / ``0.0``.
    """
    stats = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
    stats["qualities"] = []
    stats["mean_qualities"] = []
    stats["mean_length"] = 0
    stats["sequences"] = []

    # FIXME this self.N takes time in the constructor
    # do we need it ?
    self.lengths = np.empty(self.N)
    self.gc_list = []
    total_length = 0
    # number of occurrences of each quality character over all reads
    C = defaultdict(int)
    if self.verbose:
        pb = Progress(self.N)

    sequences = []
    mean_qualities = []
    qualities = []
    # could use multiprocessing
    # FastxFile has shown some errors while handling gzip files
    # created with zlib (e.g. from atropos). This is now replaced
    # by the Atropos FastqReader for now.
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            sequence = record.sequence
            N = len(sequence)
            self.lengths[i] = N

            # we cannot store all qualities and sequences reads, so
            # just max_sample are stored:
            if i < self.max_sample:
                quality = [ord(x) - 33 for x in record.qualities]
                # an empty read has no mean quality; use 0.0 rather than
                # dividing by zero
                mean_qualities.append(sum(quality) / N if N else 0.0)
                qualities.append(quality)
                sequences.append(sequence)

            # store count of all qualities
            for k in record.qualities:
                C[k] += 1

            GG = sequence.count('G')
            CC = sequence.count('C')
            self.gc_list.append((GG + CC) / float(N) * 100 if N else 0.0)

            # not using a counter, or loop speed up the code
            stats["A"] += sequence.count("A")
            stats["C"] += CC
            stats["G"] += GG
            stats["T"] += sequence.count("T")
            stats["N"] += sequence.count("N")

            total_length += N

            if self.verbose:
                pb.animate(i + 1)

    # other data
    self.qualities = qualities
    self.mean_qualities = mean_qualities
    if self.lengths.size:
        self.minimum = int(self.lengths.min())
        self.maximum = int(self.lengths.max())
    else:
        # min()/max() of an empty array raise ValueError
        self.minimum = self.maximum = 0
    self.sequences = sequences
    self.gc_content = np.mean(self.gc_list) if self.gc_list else 0.0

    stats['mean_length'] = total_length / float(self.N) if self.N else 0.0
    stats['total_bp'] = (stats['A'] + stats['C'] + stats['G'] +
                         stats["T"] + stats['N'])

    # Average over the number of quality values actually counted. total_bp
    # only covers upper-case A/C/G/T/N, so dividing by it over-estimates
    # the mean as soon as a read contains any other symbol. Both
    # denominators are equal when every base is one of those five.
    n_qualities = sum(C.values())
    quality_sum = sum((ord(k) - 33) * v for k, v in C.items())
    stats['mean_quality'] = quality_sum / n_qualities if n_qualities else 0.0
    self.stats = stats
def test_alphabet(self):
    """Read ``bad_bases.fq`` with the DNA alphabet and check two sequences.

    The first two reads must come back as ``ACGNGGACT`` and ``CGGACNNNC``
    (presumably symbols outside the alphabet are turned into ``N`` --
    confirm against the reader).
    """
    path = "tests/data/bad_bases.fq"
    expected_sequences = ('ACGNGGACT', 'CGGACNNNC')

    with FastqReader(path, alphabet=ALPHABETS['dna']) as reader:
        records = list(reader)
        for position, expected in enumerate(expected_sequences):
            assert records[position].sequence == expected
def test_fastq_incomplete(self):
    """A record made of only a name line and ``ACGT+`` must raise FormatError."""
    truncated = StringIO("@name\nACGT+\n")
    with raises(FormatError):
        with FastqReader(truncated) as reader:
            # Exhaust the reader so that the input is actually parsed.
            list(reader)
def test_fastq_wrongformat(self):
    """Reading ``tests/data/withplus.fastq`` must raise FormatError."""
    with raises(FormatError):
        with FastqReader("tests/data/withplus.fastq") as reader:
            # Exhaust the reader so that the whole file is parsed.
            list(reader)
def test_fastqreader_dos(self):
    """Reads from ``dos.fastq`` must equal the reads from ``small.fastq``.

    NOTE(review): ``dos.fastq`` presumably holds the same records with
    DOS (CRLF) line endings -- confirm against the test data.
    """
    loaded = []
    for path in ("tests/data/dos.fastq", "tests/data/small.fastq"):
        with FastqReader(path) as reader:
            loaded.append(list(reader))

    from_dos, from_unix = loaded
    assert from_dos == from_unix
def test_fastqreader(self):
    """Reads parsed from ``simple.fastq`` must equal ``simple_fastq``."""
    path = "tests/data/simple.fastq"
    with FastqReader(path) as reader:
        parsed = list(reader)
    assert parsed == simple_fastq