def _report_sizes(sizes, prefix=""):
    """Print summary length statistics for a list of read lengths.

    Emits total reads/base pairs, mean, median, min/max and N25/N50/N75,
    one TSV line each.  Every line is prepended with ``prefix`` (e.g.
    ``"twodirections\t"``) so the same logic serves both the per-category
    report of ``--full_tsv`` and the plain aggregate report.
    """
    if len(sizes) > 0:
        print("%stotal reads\t%d" % (prefix, len(sizes)))
        print("%stotal base pairs\t%d" % (prefix, sum(sizes)))
        print("%smean\t%.2f" % (prefix, stat.mean(sizes)))
        print("%smedian\t%d" % (prefix, stat.median(sizes)))
        print("%smin\t%d" % (prefix, min(sizes)))
        print("%smax\t%d" % (prefix, max(sizes)))
        nxvalues = stat.NX(sizes, [25, 50, 75])
        print("%sN25\t%d" % (prefix, nxvalues[25]))
        print("%sN50\t%d" % (prefix, nxvalues[50]))
        print("%sN75\t%d" % (prefix, nxvalues[75]))
    else:
        logger.warning("No valid sequences observed.\n")


def run(parser, args):
    """Report read-length statistics for a set of FAST5 files.

    With ``--full_tsv``, statistics are broken down per base-call category
    (template / complement / 2D, plus a ``2D_hq`` pseudo-category for
    high-quality 2D reads).  Otherwise a single aggregate report for the
    reads selected by ``args.type`` is printed.
    """
    if args.full_tsv:
        files = 0
        basecalled_files = 0
        stats = defaultdict(list)
        for fast5 in Fast5File.Fast5FileSet(args.files):
            files += 1
            fas = fast5.get_fastas_dict()
            if len(fas) > 0:
                basecalled_files += 1
            for (category, fa) in iteritems(fas):
                if fa is not None:
                    stats[category].append(len(fa.seq))
                    # track high-quality 2D reads as their own pseudo-category
                    if category == 'twodirections' and fast5.is_high_quality():
                        stats['2D_hq'].append(len(fa.seq))
            fast5.close()
        print("files\ttotal reads\t%d" % (files))
        print("files\ttotal base-called reads\t%d" % (basecalled_files))
        for category in sorted(stats.keys()):
            _report_sizes(stats[category], prefix=category + "\t")
    else:
        sizes = []
        for fast5 in Fast5File.Fast5FileSet(args.files, group=args.group):
            fas = fast5.get_fastas(args.type)
            sizes.extend([len(fa.seq) for fa in fas if fa is not None])
            fast5.close()
        _report_sizes(sizes)
def run(parser, args):
    """Print a TSV of per-read event counts.

    One row per FAST5 read: start time, channel, read number, and the
    number of template and complement events (0 when a strand has none).
    """
    print(
        "start_time\tchannel_number\tread_number\ttemplate_events\tcomplement_events"
    )
    for fast5 in Fast5File.Fast5FileSet(args.files):
        start = fast5.get_start_time()
        channel = fast5.get_channel_number()
        read_no = fast5.get_read_number()
        tmpl = fast5.get_template_events()
        tmpl_len = 0 if tmpl is None else len(tmpl)
        comp = fast5.get_complement_events()
        comp_len = 0 if comp is None else len(comp)
        print("%s\t%s\t%s\t%s\t%s" %
              (start, channel, read_no, tmpl_len, comp_len))
        fast5.close()
def run(parser, args):
    """Print one TSV row of timing information per FAST5 read.

    Reads with no recorded start time are skipped with a warning; reads
    without a base-called FASTQ record report a length of 0.
    """
    print('\t'.join(['channel', 'filename', 'read_length', 'exp_starttime',
                     'unix_timestamp', 'duration', 'unix_timestamp_end',
                     'iso_timestamp', 'day', 'hour', 'minute']))
    for fast5 in Fast5File.Fast5FileSet(args.files):
        if not fast5.is_open:
            continue
        fq = fast5.get_fastq()
        start_time = fast5.get_start_time()
        if start_time is None:
            logger.warning("No start time for %s!" % (fast5.filename))
            fast5.close()
            continue
        read_length = len(fq.seq) if fq is not None else 0
        when = localtime(start_time)
        fields = [
            str(fast5.get_channel_number()),
            fast5.filename,
            str(read_length),
            str(fast5.get_exp_start_time()),
            str(start_time),
            str(fast5.get_duration()),
            str(fast5.get_end_time()),
            strftime('%Y-%m-%dT%H:%M:%S%z', when),
            strftime('%d', when),
            strftime('%H', when),
            strftime('%M', when),
        ]
        print("\t".join(fields))
        fast5.close()
def run(parser, args):
    """Print per-pore activity (channel, start time, duration) as TSV and
    optionally plot pore performance (read count or total bp per pore).

    Fix: the original called ``len(fq.seq)`` unconditionally, so any read
    without a base-called FASTQ record (``fq is None``) crashed with an
    AttributeError.  Such reads now still count toward the read total but
    contribute 0 bp, matching the None-guard used elsewhere in this file.
    """
    tot_reads_per_pore = Counter()
    tot_bp_per_pore = Counter()
    print("\t".join(['channel_number', 'start_time', 'duration']))
    for fast5 in Fast5File.Fast5FileSet(args.files):
        if fast5.is_open:
            fq = fast5.get_fastq()
            start_time = fast5.get_start_time()
            if start_time is None:
                logger.warning("No start time for %s!" % (fast5.filename))
                fast5.close()
                continue
            pore_id = fast5.get_channel_number()
            tot_reads_per_pore[int(pore_id)] += 1
            # guard: not every read carries a base-called sequence
            if fq is not None:
                tot_bp_per_pore[int(pore_id)] += len(fq.seq)
            print("\t".join(
                [str(pore_id), str(start_time), str(fast5.get_duration())]))
            fast5.close()
    if args.plot_type == 'read_count':
        plot_performance(parser, args, tot_reads_per_pore)
    elif args.plot_type == 'total_bp':
        plot_performance(parser, args, tot_bp_per_pore)
def run(parser, args):
    """Collect (start_time, read_length) pairs from a set of FAST5 files
    and plot a collector's curve of yield over time.

    Fix: with no valid reads the original crashed — unpacking
    ``zip(*sorted(zip([], [])))`` raises ValueError.  We now warn and
    return early instead.
    """
    start_times = []
    read_lengths = []
    files_processed = 0
    for fast5 in Fast5File.Fast5FileSet(args.files):
        if fast5.is_open:
            fq = fast5.get_fastq()
            start_time = fast5.get_start_time()
            if start_time is None:
                logger.warning("No start time for %s!" % (fast5.filename))
                fast5.close()
                continue
            start_times.append(start_time)
            # reads with no base-called FASTQ contribute zero length
            read_lengths.append(len(fq.seq) if fq is not None else 0)
            fast5.close()
            files_processed += 1
            if files_processed % 100 == 0:
                logger.info("%d files processed." % files_processed)
    if not start_times:
        logger.warning("No valid sequences observed.\n")
        return
    # sort both series by start time, keeping the pairs aligned
    start_times, read_lengths = (list(t) for t in zip(
        *sorted(zip(start_times, read_lengths))))
    plot_collectors_curve(args, start_times, read_lengths)
def run(parser, args):
    """Print FAST5 metadata as TSV.

    With ``--read``, dumps every per-read metadata record; otherwise
    prints ASIC id/temperature and heatsink temperature per file.

    Fixes in the ``--read`` branch:
    * ``["filename"] + metadata_dict.keys()`` is a TypeError on Python 3
      (list + dict_keys) — the keys are now materialized with ``list()``.
    * ``header`` was only assigned for the first file (``i == 0``); if that
      file had no metadata records, later files hit a NameError.  The
      header is now emitted once, on the first record actually seen.
    * The original re-printed the header for every record of the first
      file and never called ``fast5.close()`` — both corrected, matching
      the else branch.
    """
    if args.read:
        header = None
        for fast5 in Fast5File.Fast5FileSet(args.files):
            for metadata_dict in fast5.read_metadata:
                if header is None:
                    header = list(metadata_dict.keys())
                    print("\t".join(["filename"] + header))
                print("\t".join([fast5.filename] +
                                [str(metadata_dict[k]) for k in header]))
            fast5.close()
    else:
        print("asic_id\tasic_temp\theatsink_temp")
        for fast5 in Fast5File.Fast5FileSet(args.files):
            asic_temp = fast5.get_asic_temp()
            asic_id = fast5.get_asic_id()
            heatsink_temp = fast5.get_heatsink_temp()
            # the HDF5 attributes come back as bytes; decode for clean output
            print("%s\t%s\t%s" % (asic_id.decode(), asic_temp.decode(),
                                  heatsink_temp.decode()))
            fast5.close()
def run(parser, args):
    """Print the distribution of FASTQ quality scores across all reads.

    One TSV row per observed quality value: the quality character, its
    Phred score, its count, the total nucleotide count, and its fraction.

    Fix: the original iterated ``qual_count`` in insertion order, so the
    output order depended on which quality value was first encountered.
    Rows are now emitted in sorted score order, deterministic and
    consistent with the nucleotide-distribution command in this file.
    """
    qual_count = Counter()
    total_nucs = 0
    for fast5 in Fast5File.Fast5FileSet(args.files):
        fq = fast5.get_fastq()
        if fq is not None:
            for q in fq.qual:
                qual_count[ord(q) - 33] += 1  # Phred+33 decoding
                total_nucs += 1
        fast5.close()
    for q in sorted(qual_count):
        print('\t'.join(str(s) for s in [
            chr(q + 33), q, qual_count[q], total_nucs,
            float(qual_count[q]) / float(total_nucs)]))
def run(parser, args):
    """Generate a squiggle plot for each FAST5 read.

    The first read is held back until we know whether a second one
    exists: plotting more than one read requires ``--saveas``, since
    interactive display only makes sense for a single read.
    """
    fast5_set = Fast5File.Fast5FileSet(args.files)
    pending = next(fast5_set)
    for fast5 in fast5_set:
        # a second read appeared: refuse interactive mode
        if args.saveas is None:
            sys.exit("""Please use --saveas when plotting"""
                     """ multiple FAST5 files as input.\n""")
        if pending is not None:
            do_plot_squiggle(args, pending)
            pending = None
        do_plot_squiggle(args, fast5)
    # single-read case: the held-back first read was never plotted
    if pending is not None:
        do_plot_squiggle(args, pending)
def run(parser, args):
    """Extract FASTA sequences from FAST5 files, applying the time,
    quality, and length filters given on the command line.

    Each passing record is written to stdout via ``print(fa)``.
    """
    for fast5 in Fast5File.Fast5FileSet(args.files, args.group):
        # Optional wall-clock window: drop reads starting before
        # --start-time or ending after --end-time.
        if args.start_time or args.end_time:
            read_start_time = fast5.get_start_time()
            read_end_time = fast5.get_end_time()
            if args.start_time and args.start_time > read_start_time:
                fast5.close()
                continue
            if args.end_time and args.end_time < read_end_time:
                fast5.close()
                continue
        fas = fast5.get_fastas(args.type)
        # high quality 2D: means there are more nanopore events on the
        # complement strand than on the template strand. We also
        # require there to be a 2D base-called sequence from Metrichor.
        if args.high_quality:
            if (fast5.get_complement_events_count() <= \
                fast5.get_template_events_count()) or not fast5.has_2D():
                fast5.close()
                continue
        # normal quality 2D: means there are fewer (or equal) nanopore
        # events on the complement strand than on the template strand.
        # We also require there to be a 2D base-called sequence from Metrichor.
        if args.normal_quality:
            if (fast5.get_complement_events_count() > \
                fast5.get_template_events_count()) or not fast5.has_2D():
                fast5.close()
                continue
        # Length filter: a max_length of 0 disables the upper bound.
        for fa in fas:
            if fa is None or \
               len(fa.seq) < args.min_length or \
               (len(fa.seq) > args.max_length and \
                args.max_length > 0):
                continue
            print(fa)
        fast5.close()
def run(parser, args):
    """Report the nucleotide composition across all base-called reads.

    One TSV row per nucleotide (sorted): the base, its count, the total
    nucleotide count, and the base's fraction of the total.
    """
    nuc_count = Counter()
    total_nucs = 0
    for fast5 in Fast5File.Fast5FileSet(args.files):
        fq = fast5.get_fastq()
        if fq is not None:
            for base in fq.seq:
                nuc_count[base] += 1
                total_nucs += 1
        fast5.close()
    for base in sorted(nuc_count):
        count = nuc_count[base]
        fraction = float(count) / float(total_nucs)
        print('\t'.join(
            str(field) for field in [base, count, total_nucs, fraction]))