def extract_per_read_stats(input_file, output_file):
    """Dump every per-read statistic from a Tombo per-read stats file to
    *output_file* as tab-separated values.

    Columns written: chrm, pos, strand, read_id, stat — one row per
    read/position record. Exits the process if *input_file* is missing.
    """
    if not os.path.isfile(input_file):
        sys.exit('"{}" is not a valid file'.format(input_file))

    stats_handle = tombo_stats.PerReadStats(input_file)
    with open(output_file, 'w') as out_stream:
        # Header row.
        out_stream.write('{}\t{}\t{}\t{}\t{}\n'.format(
            'chrm', 'pos', 'strand', 'read_id', 'stat'))
        # blocks_index maps (chromosome, strand) -> {block start: block name};
        # only the start coordinates are needed to rebuild each region.
        for (contig, direction), block_starts in stats_handle.blocks_index.items():
            for block_start in block_starts:
                # NOTE(review): `intervalData` is assumed to be in scope
                # (e.g. imported from tombo.tombo_helper elsewhere in the
                # file) — confirm.
                region = intervalData(
                    contig, block_start,
                    block_start + stats_handle.region_size, direction)
                per_read = stats_handle.get_region_per_read_stats(region)
                for position, value, read_name in per_read:
                    out_stream.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        contig, position, direction, read_name, value))
from tombo import tombo_helper, tombo_stats
import numpy as np
import pandas as pd

"""
extract_tombo_per_read_results.py

Author: Zaka Yuen, JCSMR, ANU
Created on June 2020

Access the per-read statistics files using the template from here:
<https://nanoporetech.github.io/tombo/tombo.html#python-api-examples>

This script is to:
    - extract per-read statistics into a text format using Tombo python API
"""

# specify region of interest (plus strand) below:
reg_data_plus = tombo_helper.intervalData(chrm='NC_000913.3',
                                          start=412305,
                                          end=4584088,
                                          strand="+")
# specify region of interest (minus strand) below:
reg_data_minus = tombo_helper.intervalData(chrm='NC_000913.3',
                                           start=412305,
                                           end=4584088,
                                           strand="-")

# Plus strand
# NOTE(review): `snakemake` is injected by Snakemake's `script:` directive;
# input[0] is expected to be the .tombo.per_read_stats file — confirm in
# the Snakefile.
sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0])
reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_plus)
# Shift positions by one — presumably converting Tombo's 0-based
# coordinates to 1-based for downstream tools; confirm against consumers.
reg_per_read_stats_plus['pos'] = reg_per_read_stats_plus['pos'] + 1
plus = pd.DataFrame(reg_per_read_stats_plus)
plus["strand"] = "+"
# NOTE(review): this snippet begins mid-function — the `def` line of the
# chrom-sizes parser (where `infile` and `result` are created) is outside
# this view. Indentation below reconstructs the visible tail.
    for row in infile:
        # Each line is expected to be "<chrom>\t<size>".
        tmp = row.strip().split("\t")
        result[tmp[0]] = int(tmp[1])
    infile.close()
    return result


# Chromosome name -> chromosome length, parsed from the chrom.sizes file.
chromSizes = parseChromSizesFile(chromSizesInfile)

outfile = open(outfileName, "w")
per_read_stats = tombo_stats.PerReadStats(perReadStatsInfile)

for chrm in chromSizes:
    ####save plus strand
    int_data = tombo_helper.intervalData(chrm=chrm,
                                         start=1,
                                         end=chromSizes[chrm],
                                         strand='+')
    reg_per_read_stats_plus = per_read_stats.get_region_per_read_stats(
        int_data)
    # Presumably guards against a non-array return (e.g. no reads covering
    # this chromosome/strand) — confirm against the Tombo API.
    if isinstance(reg_per_read_stats_plus, np.ndarray):
        """
        Structure of each cpg is as:
        (50214, 2.95450765, "b'3526811b-6958-49f8-b78c-a205c1b5fc6e'")
        """
        for cpg in reg_per_read_stats_plus:
            # Columns: chrom, start, end (same position twice), read id
            # (the repr's b'...' wrapper stripped via [2:-1]), stat, strand.
            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                chrm, cpg[0], cpg[0], cpg[2][2:-1], cpg[1], "+"))
    ####save minus strand
    # NOTE(review): snippet is truncated mid-call below — the remaining
    # arguments and the minus-strand handling are outside this view.
    int_data = tombo_helper.intervalData(chrm=chrm,
                                         start=1,
Access the per-read statistics files using the template from here:
<https://nanoporetech.github.io/tombo/tombo.html#python-api-examples>

This script is to:
    - extract per-read statistics into a text format using Tombo python API
"""
# NOTE(review): the opening of the docstring above, and this script's
# imports (tombo_helper, tombo_stats, pd), are outside this view.

###################################################
######## specify region of interest below: ########
chromosome = 'NC_000913.3'
start_position = 412305
end_position = 4584088
###################################################
###################################################

# Region covering the configured interval on the plus strand.
reg_data_plus = tombo_helper.intervalData(chrm=chromosome,
                                          start=start_position,
                                          end=end_position,
                                          strand="+")
# specify region of interest (minus strand) below:
reg_data_minus = tombo_helper.intervalData(chrm=chromosome,
                                           start=start_position,
                                           end=end_position,
                                           strand="-")

# Plus strand
# NOTE(review): `snakemake` is injected by Snakemake's `script:` directive;
# input[0] should be the .tombo.per_read_stats file — confirm in the
# Snakefile.
sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0])
reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_plus)
# Shift positions by one — presumably 0-based to 1-based conversion for
# downstream tools; confirm.
reg_per_read_stats_plus['pos'] = reg_per_read_stats_plus['pos'] + 1
plus = pd.DataFrame(reg_per_read_stats_plus)
plus["strand"] = "+"
# Tombo Python API walkthrough: pull per-read stats for a region, then set
# up re-squiggle inputs (aligner + signal model) for a single fast5 read.
# NOTE(review): all paths below are hard-coded absolute paths — this only
# runs on the original author's machine as written.
from tombo import tombo_helper, tombo_stats, resquiggle
import h5py, mappy

# specify region of interest
reg_data = tombo_helper.intervalData(chrm='chr20', start=10000, end=10100,
                                     strand='+')

# parse Tombo index from previously re-squiggled set of reads
reads_index = tombo_helper.TomboReads([
    '/home/mookse/workspace/DeepSimulator/fast5',
])

# extract reads that overlap this interval and then extract base signal
# levels from 10 randomly selected reads
reg_base_levels = reg_data.add_reads(reads_index).get_base_levels(
    num_reads=10)

sample_per_read_stats = tombo_stats.PerReadStats(
    'test_stats.alt_model.5mC.tombo.per_read_stats')
# reg_per_read_stats contains a numpy array containing per-read stats
# over all reads covering the region of interest
reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats(reg_data)

# set read values
fast5_fn, reference_fn = '/home/mookse/workspace/DeepSimulator/fast5/signal_0_d1986e9e-afed-49d6-9b1a-dc997e107dfb.fast5', '/home/mookse/workspace/DeepSimulator/test_samples/adapter.fa'
# NOTE(review): the HDF5 handle is never closed in the visible code —
# consider a context manager if nothing downstream needs it open.
fast5_data = h5py.File(fast5_fn, 'r')
seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data)

# prep aligner, signal model and parameters
# best_n=1 keeps only the single best mapping per read.
aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1)
std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
into a 2D table with a row for every read and a column for every position,
then write that table to a CSV file at output_path'''
    # NOTE(review): this snippet begins mid-docstring — the enclosing
    # function's `def` line (recarray_to_csv, taking `recarray` and
    # `output_path`) is outside this view, as are the `os`/`cli`/`pd`
    # imports used below.
    # The three operations below that involve 'stat_level' are just to delete
    # extraneous labelling information from the table before we export to CSV
    (pd.DataFrame(recarray).set_index(['read_id', 'pos']).rename_axis(
        'stat_level', axis=1).unstack('pos').stack('stat_level').reset_index(
            'stat_level', drop=True).to_csv(output_path))


if __name__ == '__main__':
    from tombo import tombo_helper, tombo_stats

    args = cli.parser.parse_args()
    # Refuse to clobber an existing output file unless --overwrite is given.
    # NOTE(review): `assert` is stripped under `python -O`; an explicit
    # raise/SystemExit would make this check unconditional.
    ERRMESS = ("The file {} already exists. Consider using the --overwrite " +
               " option.").format(args.output_path)
    assert args.overwrite or not os.path.exists(args.output_path), ERRMESS
    reg = tombo_helper.intervalData(
        chrm=args.chromosome,
        start=args.start,
        end=args.end,
        strand=args.strand,
    )
    prs_recarray = (tombo_stats.PerReadStats(
        args.prs_path).get_region_per_read_stats(reg))
    recarray_to_csv(prs_recarray, args.output_path)