def extract_per_read_stats(input_file, output_file):
    """Dump per-read statistics to tab-separated values"""
    if not os.path.isfile(input_file):
        sys.exit('"{}" is not a valid file'.format(input_file))

    pr_stats = tombo_stats.PerReadStats(input_file)

    with open(output_file, 'w') as tsv_out:
        # Header row, then one row per (position, read) statistic.
        tsv_out.write('{}\t{}\t{}\t{}\t{}\n'.format(
            'chrm', 'pos', 'strand', 'read_id', 'stat'))
        # blocks_index maps (chromosome, strand) -> {block start -> block name};
        # only the start coordinates are needed to query each region.
        for (chrm, strand), cs_blocks in pr_stats.blocks_index.items():
            for start in cs_blocks:
                region = intervalData(
                    chrm, start, start + pr_stats.region_size, strand)
                for pos, stat, read_id in pr_stats.get_region_per_read_stats(
                        region):
                    tsv_out.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        chrm, pos, strand, read_id, stat))
# Example #2

# Region of interest on the plus strand:
reg_data_plus = tombo_helper.intervalData(
    chrm='NC_000913.3', start=412305, end=4584088, strand="+")

# Same region of interest on the minus strand:
reg_data_minus = tombo_helper.intervalData(
    chrm='NC_000913.3', start=412305, end=4584088, strand="-")

# Per-read statistics file comes from the snakemake rule's first input.
sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0])

# Plus strand: pull per-read stats, shift positions from 0- to 1-based,
# and tabulate with a strand column.
reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_plus)
reg_per_read_stats_plus['pos'] += 1
plus = pd.DataFrame(reg_per_read_stats_plus)
plus["strand"] = "+"

# Minus strand: identical treatment.
reg_per_read_stats_minus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_minus)
reg_per_read_stats_minus['pos'] += 1
minus = pd.DataFrame(reg_per_read_stats_minus)
minus["strand"] = "-"

# Combine both strands into a single table.
final = pd.concat([plus, minus])
# Example #3
# specify region of interest
reg_data = tombo_helper.intervalData(chrm='chr20',
                                     start=10000,
                                     end=10100,
                                     strand='+')

# parse Tombo index from previously re-squiggled set of reads
reads_index = tombo_helper.TomboReads([
    '/home/mookse/workspace/DeepSimulator/fast5',
])
# extract reads that overlap this interval and then extract base signal
# levels from 10 randomly selected reads
reg_base_levels = reg_data.add_reads(reads_index).get_base_levels(num_reads=10)

# per-read statistics file (the filename suggests 5mC alternative-model
# output, but that is not verifiable from this snippet)
sample_per_read_stats = tombo_stats.PerReadStats(
    'test_stats.alt_model.5mC.tombo.per_read_stats')
# reg_per_read_stats contains a numpy array containing per-read stats
# over all reads covering the region of interest
reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats(reg_data)

# set read values: one FAST5 signal file plus its reference FASTA
fast5_fn, reference_fn = '/home/mookse/workspace/DeepSimulator/fast5/signal_0_d1986e9e-afed-49d6-9b1a-dc997e107dfb.fast5', '/home/mookse/workspace/DeepSimulator/test_samples/adapter.fa'
# NOTE(review): fast5_data is opened read-only and never closed in the
# visible portion of this snippet; the truncated continuation below
# presumably keeps reading from it, so a `with` block cannot be added here.
fast5_data = h5py.File(fast5_fn, 'r')
seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data)

# prep aligner, signal model and parameters
aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1)
std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type)

# extract data from FAST5

def parseChromSizesFile(infileName):
    """Parse a tab-separated chrom.sizes file into a dict.

    Each data line is expected to hold "<chrom><TAB><size>"; sizes are
    converted to int.  Blank or single-field lines (e.g. a trailing
    newline at end of file) are skipped instead of raising IndexError.

    Args:
        infileName: path to the chrom.sizes file.

    Returns:
        dict mapping chromosome name (str) -> chromosome length (int).
    """
    result = {}
    # `with` guarantees the handle is closed even if int() raises.
    with open(infileName, 'r') as infile:
        for row in infile:
            fields = row.strip().split("\t")
            if len(fields) < 2:
                continue  # skip blank/malformed lines
            result[fields[0]] = int(fields[1])
    return result


# chromosome name -> chromosome length, from the chrom.sizes file
chromSizes = parseChromSizesFile(chromSizesInfile)
# NOTE(review): outfile is opened here but the write/close code is in the
# truncated part of this snippet
outfile = open(outfileName, "w")

per_read_stats = tombo_stats.PerReadStats(perReadStatsInfile)

for chrm in chromSizes:
    ####save plus strand
    # Query the whole chromosome on the plus strand.
    int_data = tombo_helper.intervalData(chrm=chrm,
                                         start=1,
                                         end=chromSizes[chrm],
                                         strand='+')
    reg_per_read_stats_plus = per_read_stats.get_region_per_read_stats(
        int_data)
    # NOTE(review): the isinstance guard implies the call can return
    # something other than an ndarray (presumably None when no reads cover
    # the region) — confirm against the tombo API
    if isinstance(reg_per_read_stats_plus, np.ndarray):
        """
        Structure of each cpg is as:
         (50214, 2.95450765, "b'3526811b-6958-49f8-b78c-a205c1b5fc6e'")
        """
        # NOTE(review): the body of this loop was lost when the snippet was
        # extracted; per the record structure shown above, each `cpg` is a
        # (pos, stat, read_id) tuple
        for cpg in reg_per_read_stats_plus:
# Example #5
    into a 2D table with a row for every read and a column for every position,
    then write that table to a CSV file at output_path'''

    # The three operations below that involve 'stat_level' are just to delete
    # extraneous labelling information from the table before we export to CSV.
    # NOTE(review): the enclosing `def` line is not visible here; judging from
    # the call at the bottom of the file the signature is
    # recarray_to_csv(recarray, output_path).  The chain pivots the recarray:
    # index by (read_id, pos), unstack 'pos' into columns, then stack/reset
    # the synthetic 'stat_level' axis away before writing the CSV.
    (pd.DataFrame(recarray).set_index(['read_id', 'pos']).rename_axis(
        'stat_level', axis=1).unstack('pos').stack('stat_level').reset_index(
            'stat_level', drop=True).to_csv(output_path))


if __name__ == '__main__':
    from tombo import tombo_helper, tombo_stats

    args = cli.parser.parse_args()

    # Refuse to clobber an existing output file unless --overwrite was given.
    # An explicit check is used instead of `assert`, which would be stripped
    # (and the guard silently lost) under `python -O`; the doubled space in
    # the old message ("--overwrite  option") is also fixed.
    if not args.overwrite and os.path.exists(args.output_path):
        sys.exit('The file {} already exists. Consider using the '
                 '--overwrite option.'.format(args.output_path))

    # Region of interest, taken straight from the CLI arguments.
    reg = tombo_helper.intervalData(
        chrm=args.chromosome,
        start=args.start,
        end=args.end,
        strand=args.strand,
    )

    # Per-read statistics restricted to the requested region.
    prs_recarray = (tombo_stats.PerReadStats(
        args.prs_path).get_region_per_read_stats(reg))

    recarray_to_csv(prs_recarray, args.output_path)