# Read the event and state tables of channel 50 from the bulk FAST5 file.
# The file is opened read-only and closed as soon as both datasets have been
# copied into memory (``[()]`` reads a whole dataset).
with h5py.File(
        '/media/mookse/DATA1/minion_data/bulk/mookse_Veriton_X4650G_20180613_FAH54029_MN21778_sequencing_run_RNA3_G4_false_79563.fast5',
        'r') as fast5:
    events = fast5['IntermediateData']['Channel_50']['Events'][()]
    states = fast5['StateData']['Channel_50']['States'][()]

data = extract_adapter_events(events, states)

print('Adapter matches:', len(data))

#tombo_model = '/home/mookse/anaconda3/pkgs/ont-tombo-1.4-py36r341h24bf2e0_0/lib/python3.6/site-packages/tombo/tombo_models/tombo.DNA.model'
tombo_model = '/home/mookse/anaconda3/pkgs/ont-tombo-1.5-py36r341h24bf2e0_0/lib/python3.6/site-packages/tombo/tombo_models/tombo.DNA.model'
# NOTE: despite the name this is the sequence itself, not a file name
reference_fn = 'GGCTTCTTCTTGCTCTTAGGTAGTAGGTTC'

instance = tombo_stats.TomboModel(tombo_model)
# debugging aid: list everything callable on the model class
print([
    func for func in dir(tombo_stats.TomboModel)
    if callable(getattr(tombo_stats.TomboModel, func))
])

std_model = instance.get_exp_levels_from_seq(reference_fn, rev_strand=False)

# Stretch every expected level over 48 samples and pad both ends with 2000
# zeros. std_model[0] is presumably the array of expected means - confirm
# against the Tombo API.
model_new = np.repeat(std_model[0], 48, axis=0)
model_new = np.concatenate((np.zeros(2000), model_new, np.zeros(2000)),
                           axis=None)

# np.rint returns a float, but the subplot grid needs an integer row count
ax = plt.subplot(int(np.rint((len(data) + 1) / 2)), 2, 1)
plt.plot(np.arange(0, len(model_new)), model_new, 'r')

#data = np.array(data)
# Example #2
0
def process_fast5(fast5, ref, rna=True, sensitive=False):
    """Align and resquiggle one Fast5 file and store the result as a BAM.

    Alignments are read from minimap2, resquiggled and written to
    ``<fast5>.bam`` (sorted and indexed) with these extra tags:

    - ``bs``: flattened start/end coordinates of the aligned (exonic) blocks
    - ``si``: mean signal intensity per base (from ``get_norm_mean``)
    - ``dt``: dwell time per base, capped at 255 so it fits one byte
    - ``tr``: trace value for the reference base, exonic positions only

    A ``<fast5>.bam.json`` report holding the number of alignments seen and
    a count per skip reason is written next to the BAM.

    Args:
        fast5: path of the Fast5 file to process.
        ref: path of the reference FASTA.
        rna: if true use the RNA sample type and spliced alignment,
            otherwise the DNA sample type and unspliced alignment.
        sensitive: passed through to ``minimap2_proc``.

    Returns:
        Path of the BAM file. If that file already exists it is returned
        straight away and nothing is recomputed.
    """
    outfn = "%s.bam"%fast5 #.d2r
    if os.path.isfile(outfn):
        return outfn
    # Open the reference once up front so an unreadable FASTA fails early
    # (pysam is documented to build a missing .fai index on open); the
    # handle itself is not needed afterwards.
    faidx = pysam.FastaFile(ref)
    faidx.close()
    # load model & its parameters
    if rna:
        seq_samp_type = tombo_helper.seqSampleType('RNA', True)
    else:
        seq_samp_type = tombo_helper.seqSampleType('DNA', False)
    spliced = bool(rna)
    rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type)
    std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
    # i stays 0 if no alignment is produced; errors maps skip reason -> count
    i, errors = 0, {}
    # prep aligner
    aligner = minimap2_proc(ref, fast5, sensitive=sensitive, spliced=spliced)
    sam = pysam.AlignmentFile(aligner.stdout)
    # unsorted alignments with features go to a temporary BAM, which is
    # removed again even if resquiggling or sorting fails
    tmp = tempfile.NamedTemporaryFile(delete=False); tmp.close()
    try:
        bam_unsorted = pysam.AlignmentFile(tmp.name, "wb", header=sam.header)
        try:
            for i, (res, err) in enumerate(
                    resquiggle_reads(fast5, sam, ref, seq_samp_type, std_ref, rsqgl_params), 1):
                if not i%100:
                    sys.stderr.write(" %s - %s reads skipped: %s \r"%(i, sum(errors.values()), str(errors)))
                if not res:
                    errors[err] = errors.get(err, 0) + 1
                    continue
                # get pysam alignment object & exonic blocks
                a, blocks = res.a, res.align_info.blocks
                # get signal intensity means
                si = get_norm_mean(res.raw_signal, res.segs)
                # report (but still store) reads whose signal length does
                # not match the total length of the exonic blocks
                if len(si) != sum(e-s for s, e in blocks): #a.reference_length:
                    region = "%s:%s-%s"%(a.reference_name, a.reference_start, a.reference_end)
                    print(a.qname, region, sam.lengths[a.reference_id], a.reference_length, len(si), blocks)
                # get dwell times capped at 255 to fit uint8 (1 byte per base)
                dt = res.segs[1:]-res.segs[:-1]
                dt[dt>255] = 255
                # get reference-aligned base probabilities: tr (ref base)
                tr = get_trace_for_reference_bases(a, res.read, rna)
                if a.is_reverse:
                    si, dt = si[::-1], dt[::-1]
                # and finally set tags matching refseq
                ## but if alignment reaches seq end the end signal/probs will be wrong!
                ## same at exon-intron boundaries
                a.set_tag("bs", array("i", np.array(blocks).flatten()))
                a.set_tag("si", array("f", si))
                a.set_tag("dt", array("B", dt))
                # tr is indexed relative to the alignment start; keep only
                # the positions that fall inside exonic blocks
                exonic_pos = np.concatenate([np.arange(s, e) for s, e in blocks])
                tr = tr[exonic_pos-a.reference_start]
                a.set_tag("tr", array("B", tr))
                # store read alignment with additional info
                bam_unsorted.write(a)
        finally:
            bam_unsorted.close()
        # sort & index
        pysam.sort("-o", outfn, tmp.name)
        pysam.index(outfn)
    finally:
        os.unlink(tmp.name)
    # write error report
    errors["Alignements"] = i # store number of alignements
    with open('%s.json'%outfn, 'w') as f:
        f.write(json.dumps(errors))
    return outfn
# Example #3
0
# Base levels for the region, requesting num_reads=10.
# reg_data and reads_index are expected to exist already at this point.
reg_base_levels = (
    reg_data
    .add_reads(reads_index)
    .get_base_levels(num_reads=10)
)

# Per-read statistics of all reads covering the region of interest.
sample_per_read_stats = tombo_stats.PerReadStats(
    'test_stats.alt_model.5mC.tombo.per_read_stats')
reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats(reg_data)

# Input read and the reference it is mapped against.
fast5_fn = '/home/mookse/workspace/DeepSimulator/fast5/signal_0_d1986e9e-afed-49d6-9b1a-dc997e107dfb.fast5'
reference_fn = '/home/mookse/workspace/DeepSimulator/test_samples/adapter.fa'
fast5_data = h5py.File(fast5_fn, mode='r')
seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data)

# Aligner, signal model and resquiggle parameters for that sample type.
aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1)
std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type)

# Map the read, then attach its raw signal (reversed when the sample type
# says the signal is stored in reverse).
map_results = resquiggle.map_read(fast5_data, aligner, std_ref)
all_raw_signal = tombo_helper.get_raw_read_slot(fast5_data)['Signal'][:]
all_raw_signal = all_raw_signal[::-1] if seq_samp_type.rev_sig else all_raw_signal
map_results = map_results._replace(raw_signal=all_raw_signal)

# Full re-squiggle of the mapped read.
rsqgl_results = resquiggle.resquiggle_read(
    map_results, std_ref, rsqgl_params, all_raw_signal=all_raw_signal)
def process_h5(fast5_data, aligner):
    """Resquiggle a single read and return its per-base signal means.

    The read is always treated as DNA (``seqSampleType("DNA", False)``),
    regardless of what the FAST5 file itself contains.

    Args:
        fast5_data: open FAST5 (HDF5) handle of the read.
        aligner: aligner handed to ``resquiggle.map_read`` (presumably a
            ``mappy.Aligner`` - confirm against callers).

    Returns:
        Tuple ``(event_data, rsqgl_res)``. ``event_data`` is a structured
        array with the fields ``norm_mean`` (float16) and ``base`` (``S1``)
        pairing each signal mean with a base of ``rsqgl_res.genome_seq``;
        ``rsqgl_res`` is the result of ``resquiggle.resquiggle_read``.

    Raises:
        ValueError: if the mapped sequence contains anything other than
            A, C, G or T (either case).
    """
    # Impose DNA; the second field is presumably rev_sig (it is read as
    # ``seq_samp_type.rev_sig`` below) - TODO confirm
    seq_samp_type = tombo_helper.seqSampleType("DNA", False)

    # signal model and parameters
    std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type)
    rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type)

    # extract data from FAST5
    # (should be modified at this point to insert the sequence from fasta)
    map_results = resquiggle.map_read(fast5_data, aligner, std_ref)

    # sorted so the error message is deterministic
    non_canonical = sorted(set(map_results.genome_seq) - set("acgtACGT"))
    if non_canonical:
        raise ValueError("Found non canonical bases (%s)" % str(non_canonical))

    all_raw_signal = tombo_helper.get_raw_read_slot(fast5_data)['Signal'][:]
    if seq_samp_type.rev_sig:
        all_raw_signal = all_raw_signal[::-1]
    map_results = map_results._replace(raw_signal=all_raw_signal)

    # Run the full re-squiggle; if it fails, retry once with the
    # "save bandwidth" parameter set. A failure of the retry propagates.
    try:
        rsqgl_res = resquiggle.resquiggle_read(map_results,
                                               std_ref,
                                               rsqgl_params,
                                               all_raw_signal=all_raw_signal,
                                               outlier_thresh=5)
    except Exception:
        cprs = tombo_stats.load_resquiggle_parameters(seq_samp_type,
                                                      use_save_bandwidth=True)
        rsqgl_res = resquiggle.resquiggle_read(map_results,
                                               std_ref,
                                               cprs,
                                               all_raw_signal=all_raw_signal,
                                               outlier_thresh=5)

    # NOTE: no iterative re-scaling is done here (i.e. the resquiggle is not
    # repeated while rsqgl_res.norm_params_changed).
    norm_means = tombo_helper.c_new_means(rsqgl_res.raw_signal, rsqgl_res.segs)

    event_data = np.array(list(
        zip(np.array(norm_means, dtype=np.float16),
            list(rsqgl_res.genome_seq))),
                          dtype=[(str('norm_mean'), np.float16),
                                 (str('base'), 'S1')])

    return event_data, rsqgl_res
# Example #5
0
from tombo import tombo_helper, tombo_stats, resquiggle
import h5py, mappy
import matplotlib.pyplot as plt
import numpy as np

# Tombo DNA model file from the local ont-tombo 1.5 conda package.
tombo_model_reg = '/home/mookse/anaconda3/pkgs/ont-tombo-1.5-py36r341h24bf2e0_0/lib/python3.6/site-packages/tombo/tombo_models/tombo.DNA.model'

# Sequence whose expected signal levels get plotted (a literal sequence,
# not a file name, despite the variable name).
reference_fn = 'GGCTTCTTCTTGCTCTTAGGTAGTAGGTTC'

instance1 = tombo_stats.TomboModel(tombo_model_reg)
std_model = instance1.get_exp_levels_from_seq(reference_fn, rev_strand=False)

# Repeat every entry of std_model[0] ten times so each level spans 10 samples.
model_new = np.repeat(std_model[0], repeats=10, axis=0)

plt.plot(np.arange(len(model_new)), model_new)
plt.show()