コード例 #1
0
def extract_per_read_stats(input_file, output_file):
    """Write every per-read statistic in a Tombo stats file as TSV.

    Args:
        input_file: Path to a per-read statistics file; it is opened with
            ``tombo_stats.PerReadStats``.
        output_file: Path of the tab-separated text file to create. It is
            opened in ``'w'`` mode, so existing content is replaced.

    The output starts with a ``chrm, pos, strand, read_id, stat`` header row
    followed by one row per statistic.

    Exits the interpreter through ``sys.exit`` with an explanatory message
    when ``input_file`` is not an existing regular file.
    """
    if not os.path.isfile(input_file):
        sys.exit('"{}" is not a valid file'.format(input_file))

    # One template serves both the header row and every data row.
    row_template = '{}\t{}\t{}\t{}\t{}\n'
    per_read = tombo_stats.PerReadStats(input_file)

    with open(output_file, 'w') as tsv:
        tsv.write(row_template.format(
            'chrm', 'pos', 'strand', 'read_id', 'stat'))
        # The block index is keyed by (chromosome, strand); each value maps
        # a block start position to a block name, which is not needed here.
        for (chrm, strand), blocks in per_read.blocks_index.items():
            for block_start, _ in blocks.items():
                # Query one region_size-wide window beginning at the block.
                region = intervalData(
                    chrm, block_start, block_start + per_read.region_size,
                    strand)
                records = per_read.get_region_per_read_stats(region)
                for pos, stat, read_id in records:
                    tsv.write(row_template.format(
                        chrm, pos, strand, read_id, stat))
コード例 #2
0
from tombo import tombo_helper, tombo_stats
import numpy as np
import pandas as pd
"""
	extract_tombo_per_read_results.py
	Author: Zaka Yuen JCSMR, ANU
	Created on June 2020
	Access the per-read statistics files using the template from here:
	<https://nanoporetech.github.io/tombo/tombo.html#python-api-examples>
	This script is to:
	-extract per-read statistics into a text format using Tombo python API
"""

# Region of interest on the plus strand (edit chrm/start/end as needed).
reg_data_plus = tombo_helper.intervalData(
    chrm='NC_000913.3', start=412305, end=4584088, strand="+")

# Same window on the minus strand; only the strand flag differs.
reg_data_minus = tombo_helper.intervalData(
    chrm='NC_000913.3', start=412305, end=4584088, strand="-")

# Plus strand: open the per-read statistics file supplied by Snakemake and
# pull every per-read statistic that falls inside the plus-strand window.
sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0])
reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_plus)
# Shift every position up by one.
# NOTE(review): presumably converts 0-based to 1-based coordinates - confirm.
reg_per_read_stats_plus['pos'] += 1
# Tabulate the records and tag each row with its strand.
plus = pd.DataFrame(reg_per_read_stats_plus).assign(strand="+")
コード例 #3
0
    for row in infile:
        tmp = row.strip().split("\t")
        result[tmp[0]] = int(tmp[1])
    infile.close()
    return result


# Driver: dump per-read statistics for every chromosome to a tab-separated
# file, one strand at a time.
# chromSizes is indexed by chromosome name below to obtain the region end.
# NOTE(review): presumably a {name: int length} dict - confirm against
# parseChromSizesFile.
chromSizes = parseChromSizesFile(chromSizesInfile)
# NOTE(review): opened without a context manager; no close() is visible in
# this part of the script - confirm it is closed after the loop.
outfile = open(outfileName, "w")

per_read_stats = tombo_stats.PerReadStats(perReadStatsInfile)

for chrm in chromSizes:
    # ---- plus strand ----
    # Request the whole chromosome: position 1 through its recorded size.
    int_data = tombo_helper.intervalData(chrm=chrm,
                                         start=1,
                                         end=chromSizes[chrm],
                                         strand='+')
    reg_per_read_stats_plus = per_read_stats.get_region_per_read_stats(
        int_data)
    # Only write when an array came back; any other return value is skipped.
    # NOTE(review): presumably a non-array result means "no statistics for
    # this region" - confirm against the Tombo API.
    if isinstance(reg_per_read_stats_plus, np.ndarray):
        """
        Structure of each cpg is as:
         (50214, 2.95450765, "b'3526811b-6958-49f8-b78c-a205c1b5fc6e'")
        """
        for cpg in reg_per_read_stats_plus:
            # Six columns: chrm, position (written twice), read id,
            # statistic, strand. The [2:-1] slice drops the first two
            # characters and the last one of the read-id field.
            # NOTE(review): per the sample record above this strips a
            # literal b'...' wrapper; if the field is not such a string the
            # slice would remove real id characters - confirm.
            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                chrm, cpg[0], cpg[0], cpg[2][2:-1], cpg[1], "+"))

    ####save minus strand
    int_data = tombo_helper.intervalData(chrm=chrm,
                                         start=1,
コード例 #4
0
	Access the per-read statistics files using the template from here:
	<https://nanoporetech.github.io/tombo/tombo.html#python-api-examples>
	This script is to:
	-extract per-read statistics into a text format using Tombo python API
"""

# --- Region of interest: edit these three values --------------------------
chromosome = 'NC_000913.3'
start_position = 412305
end_position = 4584088
# ---------------------------------------------------------------------------

# The same window is requested on each strand; only `strand` differs.
reg_data_plus = tombo_helper.intervalData(
    chrm=chromosome, start=start_position, end=end_position, strand="+")
reg_data_minus = tombo_helper.intervalData(
    chrm=chromosome, start=start_position, end=end_position, strand="-")

# Plus strand: open the per-read statistics file supplied by Snakemake and
# pull every per-read statistic that falls inside the plus-strand window.
sample_per_read_stats = tombo_stats.PerReadStats(snakemake.input[0])
reg_per_read_stats_plus = sample_per_read_stats.get_region_per_read_stats(
    reg_data_plus)
# Shift every position up by one.
# NOTE(review): presumably converts 0-based to 1-based coordinates - confirm.
reg_per_read_stats_plus['pos'] += 1
# Tabulate the records and tag each row with its strand.
plus = pd.DataFrame(reg_per_read_stats_plus).assign(strand="+")
コード例 #5
0
from tombo import tombo_helper, tombo_stats, resquiggle
import h5py, mappy

# Region of interest.
reg_data = tombo_helper.intervalData(
    chrm='chr20', start=10000, end=10100, strand='+')

# Tombo index parsed from a previously re-squiggled set of reads.
reads_index = tombo_helper.TomboReads(
    ['/home/mookse/workspace/DeepSimulator/fast5'])

# Attach the indexed reads to the interval, then request base signal levels
# with num_reads=10 (the upstream example describes the reads as randomly
# selected).
reg_base_levels = (
    reg_data
    .add_reads(reads_index)
    .get_base_levels(num_reads=10)
)

# Per-read statistics for the same interval, taken from a stats file.
sample_per_read_stats = tombo_stats.PerReadStats(
    'test_stats.alt_model.5mC.tombo.per_read_stats')
reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats(reg_data)

# Inputs for a single read: the FAST5 file and the reference to align to.
fast5_fn = '/home/mookse/workspace/DeepSimulator/fast5/signal_0_d1986e9e-afed-49d6-9b1a-dc997e107dfb.fast5'
reference_fn = '/home/mookse/workspace/DeepSimulator/test_samples/adapter.fa'
fast5_data = h5py.File(fast5_fn, 'r')
seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data)

# Aligner and signal model, both configured from the values read above.
aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1)
std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
コード例 #6
0
ファイル: prsconv2.py プロジェクト: Chris-Kimmel/prsconv2
    into a 2D table with a row for every read and a column for every position,
    then write that table to a CSV file at output_path'''

    # The three operations below that involve 'stat_level' are just to delete
    # extraneous labelling information from the table before we export to CSV
    (pd.DataFrame(recarray).set_index(['read_id', 'pos']).rename_axis(
        'stat_level', axis=1).unstack('pos').stack('stat_level').reset_index(
            'stat_level', drop=True).to_csv(output_path))


if __name__ == '__main__':
    # Tombo is imported only when the file is run as a script.
    from tombo import tombo_helper, tombo_stats

    args = cli.parser.parse_args()

    # Refuse to clobber an existing output file unless --overwrite was given.
    # An explicit check is used instead of `assert`: assertions are stripped
    # under `python -O`, which would let the file be overwritten silently.
    ERRMESS = ("The file {} already exists. Consider using the --overwrite "
               "option.").format(args.output_path)
    if not args.overwrite and os.path.exists(args.output_path):
        # Exits with the message on stderr and a non-zero status.
        raise SystemExit(ERRMESS)

    # Region to extract, taken straight from the command-line arguments.
    reg = tombo_helper.intervalData(
        chrm=args.chromosome,
        start=args.start,
        end=args.end,
        strand=args.strand,
    )

    # Load the per-read statistics file and pull the records for the region.
    prs_recarray = (tombo_stats.PerReadStats(
        args.prs_path).get_region_per_read_stats(reg))

    # Pivot the records into a read-by-position table and write it as CSV.
    recarray_to_csv(prs_recarray, args.output_path)