Example #1
0
def run(parser, args):
    """Print read-length summary statistics for a set of FAST5 files.

    With ``--full-tsv`` a per-category report (template, complement,
    twodirections, high-quality 2D) is emitted; otherwise a single summary
    over the read type requested via ``--type``.
    """
    if args.full_tsv:
        files = 0
        basecalled_files = 0
        stats = defaultdict(list)
        for fast5 in Fast5File.Fast5FileSet(args.files):
            files += 1
            fas = fast5.get_fastas_dict()
            if len(fas) > 0:
                basecalled_files += 1
            for (category, fa) in iteritems(fas):
                if fa is not None:
                    stats[category].append(len(fa.seq))
                    # track high-quality 2D reads under a dedicated key
                    if category == 'twodirections':
                        if fast5.is_high_quality():
                            stats['2D_hq'].append(len(fa.seq))
            # close each file as it is processed; previously the close sat
            # outside the loop, so only the last file was ever closed and
            # every other handle leaked. The stray "starting" debug print
            # that corrupted the TSV output was also removed.
            fast5.close()

        print("files\ttotal reads\t%d" % (files))
        print("files\ttotal base-called reads\t%d" % (basecalled_files))
        for category in sorted(stats.keys()):
            sizes = stats[category]

            if len(sizes) > 0:
                print("%s\ttotal reads\t%d" % (category, len(sizes)))
                print("%s\ttotal base pairs\t%d" % (category, sum(sizes)))
                print("%s\tmean\t%.2f" % (category, stat.mean(sizes)))
                print("%s\tmedian\t%d" % (category, stat.median(sizes)))
                print("%s\tmin\t%d" % (category, min(sizes)))
                print("%s\tmax\t%d" % (category, max(sizes)))
                nxvalues = stat.NX(sizes, [25, 50, 75])
                print("%s\tN25\t%d" % (category, nxvalues[25]))
                print("%s\tN50\t%d" % (category, nxvalues[50]))
                print("%s\tN75\t%d" % (category, nxvalues[75]))
            else:
                logger.warning("No valid sequences observed.\n")
    else:
        sizes = []
        for fast5 in Fast5File.Fast5FileSet(args.files, group=args.group):
            fas = fast5.get_fastas(args.type)
            sizes.extend([len(fa.seq) for fa in fas if fa is not None])
            fast5.close()

        if len(sizes) > 0:
            print("total reads\t%d" % (len(sizes)))
            print("total base pairs\t%d" % (sum(sizes)))
            print("mean\t%.2f" % (stat.mean(sizes)))
            print("median\t%d" % (stat.median(sizes)))
            print("min\t%d" % (min(sizes)))
            print("max\t%d" % (max(sizes)))
            nxvalues = stat.NX(sizes, [25, 50, 75])
            print("N25\t%d" % (nxvalues[25]))
            print("N50\t%d" % (nxvalues[50]))
            print("N75\t%d" % (nxvalues[75]))
        else:
            logger.warning("No valid sequences observed.\n")
Example #2
0
def run(parser, args):
    """Emit a TSV with one row per read: start time, channel and read
    numbers, and the template/complement event counts."""
    print(
        "start_time\tchannel_number\tread_number\ttemplate_events\tcomplement_events"
    )

    for fast5 in Fast5File.Fast5FileSet(args.files):
        start_time = fast5.get_start_time()
        channel_number = fast5.get_channel_number()
        read_number = fast5.get_read_number()

        # a missing event table is reported as a zero count
        events = fast5.get_template_events()
        n_template = len(events) if events is not None else 0

        events = fast5.get_complement_events()
        n_complement = len(events) if events is not None else 0

        print("%s\t%s\t%s\t%s\t%s" % (start_time, channel_number,
                                      read_number, n_template, n_complement))

        fast5.close()
Example #3
0
def run(parser, args):
    """Print one tab-separated timing row per readable FAST5 file."""
    columns = ['channel', 'filename', 'read_length',
               'exp_starttime', 'unix_timestamp', 'duration',
               'unix_timestamp_end', 'iso_timestamp', 'day',
               'hour', 'minute']
    print('\t'.join(columns))

    for fast5 in Fast5File.Fast5FileSet(args.files):
        if not fast5.is_open:
            continue

        fq = fast5.get_fastq()

        start_time = fast5.get_start_time()
        if start_time is None:
            # cannot report timing without a start time; skip this file
            logger.warning("No start time for %s!" % (fast5.filename))
            fast5.close()
            continue

        # reads with no base-called sequence are reported as length 0
        read_length = len(fq.seq) if fq is not None else 0

        lt = localtime(start_time)
        row = [str(fast5.get_channel_number()),
               fast5.filename,
               str(read_length),
               str(fast5.get_exp_start_time()),
               str(start_time),
               str(fast5.get_duration()),
               str(fast5.get_end_time()),
               strftime('%Y-%m-%dT%H:%M:%S%z', lt),
               strftime('%d', lt),
               strftime('%H', lt),
               strftime('%M', lt)]
        print("\t".join(row))
        fast5.close()
Example #4
0
def run(parser, args):
    """Report per-read channel, start time and duration as a TSV, and
    optionally plot per-pore performance (read count or total bp)."""
    tot_reads_per_pore = Counter()
    tot_bp_per_pore = Counter()

    print("\t".join(['channel_number', 'start_time', 'duration']))
    for fast5 in Fast5File.Fast5FileSet(args.files):
        if fast5.is_open:
            fq = fast5.get_fastq()

            start_time = fast5.get_start_time()
            if start_time is None:
                logger.warning("No start time for %s!" % (fast5.filename))
                fast5.close()
                continue

            pore_id = fast5.get_channel_number()
            tot_reads_per_pore[int(pore_id)] += 1
            # fq is None for files with no base-called sequence (the other
            # subcommands guard for this); count 0 bp rather than crashing.
            if fq is not None:
                tot_bp_per_pore[int(pore_id)] += len(fq.seq)

            print("\t".join(
                [str(pore_id),
                 str(start_time),
                 str(fast5.get_duration())]))
            fast5.close()

    if args.plot_type == 'read_count':
        plot_performance(parser, args, tot_reads_per_pore)
    elif args.plot_type == 'total_bp':
        plot_performance(parser, args, tot_bp_per_pore)
Example #5
0
def run(parser, args):
    """Plot a collector's curve of cumulative yield over time.

    Collects (start_time, read_length) pairs from every readable FAST5
    file, sorts them chronologically, then hands them to
    plot_collectors_curve.
    """
    start_times = []
    read_lengths = []
    files_processed = 0
    for fast5 in Fast5File.Fast5FileSet(args.files):
        if fast5.is_open:
            fq = fast5.get_fastq()

            start_time = fast5.get_start_time()
            if start_time is None:
                logger.warning("No start time for %s!" % (fast5.filename))
                fast5.close()
                continue

            start_times.append(start_time)
            # reads with no base-called sequence still count, with length 0
            if fq is not None:
                read_lengths.append(len(fq.seq))
            else:
                read_lengths.append(0)
            fast5.close()

        files_processed += 1
        if files_processed % 100 == 0:
            logger.info("%d files processed." % files_processed)

    # guard: zip(*...) over an empty list raises ValueError on unpack
    if not start_times:
        logger.warning("No valid sequences observed.\n")
        return

    # sort the data by start time
    start_times, read_lengths = (list(t) for t in zip(
        *sorted(zip(start_times, read_lengths))))
    plot_collectors_curve(args, start_times, read_lengths)
Example #6
0
def run(parser, args):
    """Print FAST5 metadata: per-read metadata rows with --read, otherwise
    asic id/temperature and heatsink temperature per file."""
    if args.read:
        header = None
        for fast5 in Fast5File.Fast5FileSet(args.files):
            for metadata_dict in fast5.read_metadata:
                if header is None:
                    # materialize the keys: list + dict_keys is a TypeError
                    # in Python 3, and the old `i == 0` test both reprinted
                    # the header for every dict of the first file and left
                    # `header` unbound when that file had no metadata.
                    header = list(metadata_dict.keys())
                    print("\t".join(["filename"] + header))
                print("\t".join([fast5.filename] +
                                [str(metadata_dict[k]) for k in header]))
            # close files in this branch too (previously leaked)
            fast5.close()
    else:
        print("asic_id\tasic_temp\theatsink_temp")
        for fast5 in Fast5File.Fast5FileSet(args.files):

            asic_temp = fast5.get_asic_temp()
            asic_id = fast5.get_asic_id()
            heatsink_temp = fast5.get_heatsink_temp()

            print(
                "%s\t%s\t%s" %
                (asic_id.decode(), asic_temp.decode(), heatsink_temp.decode()))

            fast5.close()
Example #7
0
def run(parser, args):
    """Tabulate the Phred quality-score distribution across all reads.

    One row per observed score: character, numeric score, count, total
    bases, and the fraction of total bases with that score.
    """
    qual_count = Counter()
    total_nucs = 0

    for fast5 in Fast5File.Fast5FileSet(args.files):
        fq = fast5.get_fastq()
        if fq is not None:
            for q in fq.qual:
                # Sanger/Phred+33 encoding
                qual_count[ord(q) - 33] += 1
                total_nucs += 1
        fast5.close()

    # iterate in sorted order for deterministic output, consistent with
    # the nucleotide-composition subcommand (which sorts its keys)
    for q in sorted(qual_count):
        print('\t'.join(str(s) for s in [chr(q + 33), q, qual_count[q],
                                         total_nucs,
                                         float(qual_count[q]) / float(total_nucs)]))
Example #8
0
def run(parser, args):
    """Plot squiggle curves for the given FAST5 files.

    When more than one file is given, --saveas is required (interactive
    display only supports a single plot).
    """
    fast5_set = Fast5File.Fast5FileSet(args.files)

    # use the next() builtin rather than calling __next__() directly
    first_fast5 = next(fast5_set)
    for fast5 in fast5_set:
        # only create a squiggle plot for multiple reads if saving to file.
        if args.saveas is None:
            sys.exit("""Please use --saveas when plotting"""
                     """ multiple FAST5 files as input.\n""")
        # plot the deferred first file once we know --saveas was given
        if first_fast5 is not None:
            do_plot_squiggle(args, first_fast5)
            first_fast5 = None
        do_plot_squiggle(args, fast5)

    # single-file input: plot it without requiring --saveas
    if first_fast5 is not None:
        do_plot_squiggle(args, first_fast5)
Example #9
0
def run(parser, args):
    """Print FASTA records from the given FAST5 files, applying optional
    time-window, 2D-quality and length filters."""
    for fast5 in Fast5File.Fast5FileSet(args.files, args.group):
        # time-window filter: drop reads outside [start_time, end_time]
        if args.start_time or args.end_time:
            read_start_time = fast5.get_start_time()
            read_end_time = fast5.get_end_time()
            if args.start_time and args.start_time > read_start_time:
                fast5.close()
                continue
            if args.end_time and args.end_time < read_end_time:
                fast5.close()
                continue

        fas = fast5.get_fastas(args.type)

        # high quality 2D: more nanopore events on the complement strand
        # than on the template strand, and a 2D base-called sequence from
        # Metrichor must be present.
        if args.high_quality:
            if (fast5.get_complement_events_count() <=
                    fast5.get_template_events_count() or
                    not fast5.has_2D()):
                fast5.close()
                continue

        # normal quality 2D: complement events less than or equal to
        # template events, and a 2D base-called sequence must be present.
        if args.normal_quality:
            if (fast5.get_complement_events_count() >
                    fast5.get_template_events_count() or
                    not fast5.has_2D()):
                fast5.close()
                continue

        for fa in fas:
            if fa is None:
                continue
            seq_len = len(fa.seq)
            if seq_len < args.min_length:
                continue
            # max_length of 0 means "no upper bound"
            if args.max_length > 0 and seq_len > args.max_length:
                continue
            print(fa)

        fast5.close()
Example #10
0
def run(parser, args):
    """Tabulate nucleotide composition across all reads.

    One row per symbol: symbol, count, total bases, fraction of total.
    """
    nuc_count = Counter()
    total_nucs = 0

    for fast5 in Fast5File.Fast5FileSet(args.files):
        fq = fast5.get_fastq()
        if fq is not None:
            # Counter.update over a string counts each character
            nuc_count.update(fq.seq)
            total_nucs += len(fq.seq)
        fast5.close()

    for nuc in sorted(nuc_count):
        fraction = float(nuc_count[nuc]) / float(total_nucs)
        row = [nuc, nuc_count[nuc], total_nucs, fraction]
        print('\t'.join(str(field) for field in row))