def extract_per_read_stats(input_file, output_file):
    """Write every per-read statistic stored in *input_file* to *output_file*
    as tab-separated values with columns: chrm, pos, strand, read_id, stat.

    Exits the process with an error message when *input_file* does not exist.
    """
    if not os.path.isfile(input_file):
        sys.exit('"{}" is not a valid file'.format(input_file))
    stats_handle = tombo_stats.PerReadStats(input_file)
    region_len = stats_handle.region_size
    with open(output_file, 'w') as tsv_out:
        tsv_out.write('chrm\tpos\tstrand\tread_id\tstat\n')
        # Walk every (chromosome, strand) pair, then every stored stats
        # block on that strand; each block covers region_len positions.
        for (chrom, strand), strand_blocks in stats_handle.blocks_index.items():
            for block_start in strand_blocks:
                region = intervalData(
                    chrom, block_start, block_start + region_len, strand)
                per_read = stats_handle.get_region_per_read_stats(region)
                for position, stat_val, read_name in per_read:
                    tsv_out.write(f'{chrom}\t{position}\t{strand}'
                                  f'\t{read_name}\t{stat_val}\n')
""" # specify region of interest (plus strand) below: reg_data_plus = tombo_helper.intervalData(chrm='NC_000913.3', start=412305, end=4584088, strand="+") # specify region of interest (minus strand) below: reg_data_minus = tombo_helper.intervalData(chrm='NC_000913.3', start=412305, end=4584088, strand="-") # Plus strand sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0]) reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats( reg_data_plus) reg_per_read_stats_plus['pos'] = reg_per_read_stats_plus['pos'] + 1 plus = pd.DataFrame(reg_per_read_stats_plus) plus["strand"] = "+" # Minus strand reg_per_read_stats_minus = sample_per_read_stats.get_region_per_read_stats( reg_data_minus) reg_per_read_stats_minus['pos'] = reg_per_read_stats_minus['pos'] + 1 minus = pd.DataFrame(reg_per_read_stats_minus) minus["strand"] = "-" # Combine the results final = pd.concat([plus, minus])
# specify region of interest reg_data = tombo_helper.intervalData(chrm='chr20', start=10000, end=10100, strand='+') # parse Tombo index from previously re-squiggled set of reads reads_index = tombo_helper.TomboReads([ '/home/mookse/workspace/DeepSimulator/fast5', ]) # extract reads that overlap this interval and then extract base signal # levels from 10 randomly selected reads reg_base_levels = reg_data.add_reads(reads_index).get_base_levels(num_reads=10) sample_per_read_stats = tombo_stats.PerReadStats( 'test_stats.alt_model.5mC.tombo.per_read_stats') # reg_per_read_stats contains a numpy array containing per-read stats # over all reads covering the region of interest reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats(reg_data) # set read values fast5_fn, reference_fn = '/home/mookse/workspace/DeepSimulator/fast5/signal_0_d1986e9e-afed-49d6-9b1a-dc997e107dfb.fast5', '/home/mookse/workspace/DeepSimulator/test_samples/adapter.fa' fast5_data = h5py.File(fast5_fn, 'r') seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data) # prep aligner, signal model and parameters aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1) std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type) rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type) # extract data from FAST5
def parseChromSizesFile(infileName): result = {} infile = open(infileName, 'r') for row in infile: tmp = row.strip().split("\t") result[tmp[0]] = int(tmp[1]) infile.close() return result chromSizes = parseChromSizesFile(chromSizesInfile) outfile = open(outfileName, "w") per_read_stats = tombo_stats.PerReadStats(perReadStatsInfile) for chrm in chromSizes: ####save plus strand int_data = tombo_helper.intervalData(chrm=chrm, start=1, end=chromSizes[chrm], strand='+') reg_per_read_stats_plus = per_read_stats.get_region_per_read_stats( int_data) if isinstance(reg_per_read_stats_plus, np.ndarray): """ Structure of each cpg is as: (50214, 2.95450765, "b'3526811b-6958-49f8-b78c-a205c1b5fc6e'") """ for cpg in reg_per_read_stats_plus:
    into a 2D table with a row for every read and a column for every position,
    then write that table to a CSV file at output_path'''
    # The three operations below that involve 'stat_level' are just to delete
    # extraneous labelling information from the table before we export to CSV.
    # Pipeline: index rows by (read_id, pos) -> label the remaining column
    # axis 'stat_level' -> pivot pos into columns (unstack) -> fold
    # 'stat_level' back into the row index (stack) -> drop that synthetic
    # index level before writing.
    (pd.DataFrame(recarray).set_index(['read_id', 'pos']).rename_axis(
        'stat_level', axis=1).unstack('pos').stack('stat_level').reset_index(
        'stat_level', drop=True).to_csv(output_path))


if __name__ == '__main__':
    # tombo is imported only when run as a script, not at module import time.
    from tombo import tombo_helper, tombo_stats
    args = cli.parser.parse_args()
    # NOTE(review): the '+' concatenation yields a double space before
    # 'option.' in the message -- runtime string left unchanged here.
    ERRMESS = ("The file {} already exists. Consider using the --overwrite " +
               " option.").format(args.output_path)
    # NOTE(review): 'assert' is stripped under 'python -O'; raising
    # SystemExit(ERRMESS) would be a safer input-validation check.
    assert args.overwrite or not os.path.exists(args.output_path), ERRMESS
    # Region of interest built from the CLI arguments.
    reg = tombo_helper.intervalData(
        chrm=args.chromosome,
        start=args.start,
        end=args.end,
        strand=args.strand,
    )
    # Pull the per-read stats record array for that region, then export.
    prs_recarray = (tombo_stats.PerReadStats(
        args.prs_path).get_region_per_read_stats(reg))
    recarray_to_csv(prs_recarray, args.output_path)