def parse_fast5_chunk(dn, cs, is_upper=False):
    """Yield reads from the ``.fast5`` files of a directory in size-bounded chunks.

    Every top-level entry whose key starts with ``read_`` in every
    ``*.fast5`` file directly under ``dn`` is converted to FASTQ text through
    ``lq_nanopore`` and stored as ``[name, sequence, quality]``: ``name`` is
    the first space-delimited token of FASTQ line 1, ``sequence`` is line 2
    and ``quality`` is line 4.

    Args:
        dn: Directory to scan (its immediate entries only).
        cs: Chunk threshold, compared against the summed ``sys.getsizeof`` of
            the name, sequence and quality strings gathered since the last
            yield.
        is_upper: When true, the stored sequence is upper-cased.

    Yields:
        ``(reads, n_seqs, n_bases)`` tuples. ``reads`` only contains the
        records collected since the previous yield, while ``n_seqs`` and
        ``n_bases`` are running totals over everything read so far. A last
        tuple is always yielded, even when its ``reads`` list is empty.
    """
    batch = []
    batch_bytes = 0
    seq_total = 0
    base_total = 0

    for entry in os.listdir(dn):
        if not entry.endswith(".fast5"):
            continue

        # NOTE(review): the handle is never closed here; whether it needs to
        # be depends on what lq_nanopore.open_fast5 returns - confirm.
        handle = lq_nanopore.open_fast5(os.path.join(dn, entry))

        for key in lq_nanopore.list_toplevel(handle):
            if not key.startswith('read_'):
                continue

            lines = lq_nanopore.get_fastq_from_multi_fast5(handle, key).splitlines()
            name = lines[0].split(" ")[0]
            sequence = lines[1]
            quality = lines[3]

            batch.append([name, sequence.upper() if is_upper else sequence, quality])

            # The size estimate and base count use the sequence as read,
            # regardless of the upper-casing applied to the stored copy.
            batch_bytes += sum(sys.getsizeof(field) for field in (name, sequence, quality))
            base_total += len(sequence)
            seq_total += 1

            if batch_bytes >= cs:
                yield (batch, seq_total, base_total)
                batch, batch_bytes = [], 0

    # Trailing (possibly empty) chunk.
    yield (batch, seq_total, base_total)
def guess_format(fn):
    """Guess the format of a sequencing input from its type and leading bytes.

    A directory is treated as a fast5 folder: only the first ``*.fast5``
    entry returned by ``os.listdir`` is inspected. A regular file is
    classified by its first four bytes; gzip-compressed files are opened with
    ``gzip`` so that compressed BAM can be told apart from other gzipped
    input, which is handed to ``__guess_sam_fastx``.

    NOTE(review): a second ``guess_format`` is defined later in this file; if
    both stay at module level the later one is what callers get - confirm
    which definition is intended.

    Args:
        fn: Path to a file or to a directory containing fast5 files.

    Returns:
        int: ``0`` for BAM (uncompressed or gzip-compressed); ``4`` for a
        directory whose first ``.fast5`` file has no ``/UniqueGlobalKey``
        entry; ``-1`` when that entry is present, when no ``.fast5`` file is
        found, or when the file cannot be opened or read; otherwise the value
        returned by ``__guess_sam_fastx``.
    """
    # assume fast5 is given in a dir.
    if os.path.isdir(fn):
        logger.info(
            "not a file but a direcory %s is given. looking for fast5 files.."
            % fn)
        for f in os.listdir(fn):
            if not f.endswith(".fast5"):
                continue
            # NOTE(review): this handle is never closed; whether it needs to
            # be depends on what lq_nanopore.open_fast5 returns - confirm.
            f5 = lq_nanopore.open_fast5(os.path.join(fn, f))
            if '/UniqueGlobalKey' in f5:
                logger.error(
                    "single read fast5 is included? it's not supported for sampleqc."
                )
                return -1
            return 4
        logger.error("no fast5 is found.")
        return -1

    # Bail out on I/O failure instead of continuing with unbound names.
    try:
        fh = open(fn, 'rb')
    except OSError:
        logger.error("cannot open %s" % fn)
        return -1
    with fh:
        try:
            magic = fh.read(4)
        except OSError:
            logger.error("cannot read %s" % fn)
            return -1

    # pybam and/or biopython way. The magic must be compared as bytes: the
    # file is opened in binary mode, so a str literal would never match.
    if magic == b'BAM\x01':
        logger.debug("%s is an uncompressed BAM." % fn)
        return 0

    # YF memo: 1f 8b 08 04 code can exist in fq.gz either, so only the
    # two-byte gzip magic is checked here (as a prefix) and the decompressed
    # payload decides between BAM and sam/fastx.
    if magic.startswith(b'\x1f\x8b'):
        try:
            with gzip.open(fn, 'rb') as gz:
                head = gz.read(4)
        except (OSError, EOFError):
            # Looks like gzip but the stream is truncated or invalid.
            logger.error("cannot read %s" % fn)
            return -1
        if head == b'BAM\x01':
            logger.debug("%s is a compressed BAM." % fn)
            return 0
        return __guess_sam_fastx(fn, isgzip=True)

    return __guess_sam_fastx(fn, isgzip=False)
def guess_format(fn):
    """Guess the format of a sequencing input from its type and content.

    A directory is treated as a fast5 folder: only the first ``*.fast5``
    entry returned by ``os.listdir`` is inspected. A regular file is first
    checked for a BAM / BGZF-style magic number and otherwise scanned line by
    line as text to tell SAM, FASTQ and FASTA apart.

    NOTE(review): another ``guess_format`` is defined earlier in this file;
    being the later definition, this one is what callers get if both stay at
    module level - confirm which definition is intended.

    NOTE(review): any file starting with ``1f 8b 08 04`` is reported as BAM
    without looking at its decompressed content, and files with other gzip
    headers fall through to the text scan - confirm this is acceptable.

    Args:
        fn: Path to a file or to a directory containing fast5 files.

    Returns:
        int: ``0`` BAM, ``1`` SAM, ``2`` FASTQ, ``3`` FASTA, ``4`` directory
        whose first ``.fast5`` file has no ``/UniqueGlobalKey`` entry, ``-1``
        for anything else (unsupported fast5, no fast5 found, unreadable or
        unrecognised file).
    """
    # assume fast5 is given in a dir.
    if os.path.isdir(fn):
        logger.info(
            "not a file but a direcory %s is given. looking for fast5 files.."
            % fn)
        for f in os.listdir(fn):
            if not f.endswith(".fast5"):
                continue
            # NOTE(review): this handle is never closed; whether it needs to
            # be depends on what lq_nanopore.open_fast5 returns - confirm.
            f5 = lq_nanopore.open_fast5(os.path.join(fn, f))
            if '/UniqueGlobalKey' in f5:
                logger.error(
                    "single read fast5 is included? it's not supported for sampleqc."
                )
                return -1
            return 4
        logger.error("no fast5 is found.")
        return -1

    # Bail out on I/O failure instead of continuing with unbound names.
    try:
        fh = open(fn, 'rb')
    except OSError:
        logger.error("cannot open %s" % fn)
        return -1
    with fh:
        try:
            magic = fh.read(4)
        except OSError:
            logger.error("cannot read %s" % fn)
            return -1

    # pybam and/or biopython way. Compared as bytes because the file is read
    # in binary mode; the handle is already closed by the ``with`` above.
    if magic == b'BAM\x01':
        return 0
    if magic == b'\x1f\x8b\x08\x04':
        # compressed bam
        return 0

    try:
        fh = open(fn, 'r')
    except OSError:
        logger.error("cannot open %s" % fn)
        return -1

    # assume sam, fastx
    at_line_cnt = 0  # number of consecutive leading '@' lines seen so far
    with fh:
        try:
            for line in fh:
                if line[0] == '@':
                    at_line_cnt += 1
                    continue

                # NOTE(review): only lines with exactly 11 tab-separated
                # fields count as SAM records; records carrying optional tag
                # columns would not match - confirm whether that is intended.
                is_sam_record = len(line.split("\t")) == 11

                if at_line_cnt > 0:
                    # Several '@' lines form a SAM header; a single '@' line
                    # is either a one-line SAM header or a FASTQ read name.
                    if at_line_cnt > 1 or is_sam_record:
                        return 1
                    return 2

                if line[0] == '>':
                    # fasta
                    return 3

                if is_sam_record:
                    # header-less sam
                    return 1
        except UnicodeDecodeError:
            # Binary content that is neither BAM nor text.
            logger.error("cannot read %s" % fn)
            return -1

    # something else
    return -1