def read_fastq(fastq_file, maxcnt):
    """Read at most ``maxcnt`` entries from a FASTQ file and summarise them.

    Args:
        fastq_file: Passed unchanged to ``GenericFastqFileReader``.
        maxcnt: Maximum number of entries to read and keep.

    Returns:
        A two-element list ``[entries, stats]``.

        ``entries`` holds the objects returned by ``read_entry()``, in the
        order they were read. Each one is indexed here with the ``'seq'``
        and ``'quality'`` keys.

        ``stats`` is a dict with these keys:
            qmin, qmax: smallest / largest ``ord()`` value over all quality
                characters seen, or ``None`` when none were seen.
            lenmin, lenmax: shortest / longest ``len(entry['seq'])``; both
                are 0 when no entries were read.
            readcount: number of entries kept.
            basecount: sum of ``len(entry['seq'])`` over the kept entries.
    """
    reader = GenericFastqFileReader(fastq_file)
    entries = []
    quality_codes = set()
    try:
        # Check the limit *before* reading. The previous `ecnt > maxcnt`
        # test ran after the read, so it kept maxcnt + 1 entries and also
        # consumed (and dropped) one more entry from the reader.
        while len(entries) < maxcnt:
            entry = reader.read_entry()
            if not entry:
                break
            entries.append(entry)
            quality_codes.update(ord(ch) for ch in entry['quality'])
    finally:
        # Release the reader even if reading or parsing an entry fails.
        reader.close()

    lengths = [len(entry['seq']) for entry in entries]

    stats = {}
    # min()/max() raise ValueError on an empty collection, so guard the
    # "no reads" / "no quality characters" cases explicitly.
    stats['qmin'] = min(quality_codes) if quality_codes else None
    stats['qmax'] = max(quality_codes) if quality_codes else None
    stats['lenmin'] = min(lengths) if lengths else 0
    stats['lenmax'] = max(lengths) if lengths else 0
    stats['readcount'] = len(entries)
    stats['basecount'] = sum(lengths)
    return [entries, stats]
def _collect_uniquely_named_reads(reader):
    """Drain ``reader`` into a name -> sequence dict; exit on a repeated name.

    ``reader`` must provide ``read_entry()`` returning a falsy value at end
    of input, and otherwise an object indexable by ``'name'`` and ``'seq'``.
    """
    reads = {}
    try:
        while True:
            entry = reader.read_entry()
            if not entry:
                break
            name = entry['name']
            if name in reads:
                sys.stderr.write("ERROR observed reads must be uniquely named\n")
                # Exit with a non-zero status so callers and shell pipelines
                # see the failure; a bare sys.exit() would report success.
                sys.exit(1)
            reads[name] = entry['seq']
    finally:
        # The FASTQ reader is known to have close(); the FASTA reader's
        # interface is not visible here, so close only if it is offered.
        close = getattr(reader, 'close', None)
        if close is not None:
            close()
    return reads


def check_for_uniquely_named_reads(args):
    """Load the reads named by ``args`` and verify their names are unique.

    Args:
        args: Object with ``fastq_reads`` and ``fasta_reads`` attributes.
            ``fastq_reads`` takes precedence when both are truthy.

    Returns:
        A dict mapping each read name to its sequence. The dict is empty
        when neither attribute is truthy.

    Raises:
        SystemExit: with status 1, after writing an error to stderr, when
            two entries share the same name.
    """
    if args.fastq_reads:
        return _collect_uniquely_named_reads(
            GenericFastqFileReader(args.fastq_reads))
    if args.fasta_reads:
        return _collect_uniquely_named_reads(
            GenericFastaFileReader(args.fasta_reads))
    return {}