forked from mpkocher/pbsmrtpipe-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_report_inc_multiple_mapped_ssets.py
124 lines (95 loc) · 4.23 KB
/
generate_report_inc_multiple_mapped_ssets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
import sys
import logging
import csv
import random
import pickle
sys.path.append('/mnt/usmp-data3/scratch/Labs/Kristofor/python/mhpbb')
import milhouseBAM as mh
from pbcommand.models import FileTypes
from pbcommand.cli import registry_builder, registry_runner
from pbcore.io import openDataSet
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Namespace passed to registry_builder below. The task option id read in
# run_rtc ('pbsmrtpipe_examples.task_options.alpha') starts with this value.
NAMESPACE = "pbsmrtpipe_examples"
# The 'Driver' exe needs to be on your path. The first arg will be the path
# to the resolved tool contract.
#
# Note: when the tool contract is emitted, 'run-rtc'
# will automatically be added to the driver.
#
# When this commandline tool is invoked, it will be of the form:
# comparative_plots.py run-rtc /path/to/resolved-tool-contract.py
# NOTE(review): the driver registered below is "plot_multiple_mapped_ssets.py",
# not comparative_plots.py -- confirm which name is intended.
registry = registry_builder(
    NAMESPACE, "plot_multiple_mapped_ssets.py")
def _get_dset_paths(input_file):
    """
    Read the dataset paths listed in the first column of a CSV file.

    Rows are skipped when they are blank, when their first field is empty,
    or when their first field starts with '#' (a commented line, such as a
    header). Any columns after the first are ignored.

    :param input_file: path to the CSV file to read
    :return: list of first-column values, in file order
    """
    log.info("Attempting to open input CSV")
    # The csv module expects a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(input_file, 'r', newline='')
    else:
        csvfile = open(input_file, 'rb')
    with csvfile:
        # `row and row[0]` guards against blank lines and empty first fields,
        # both of which would otherwise raise IndexError.
        return [row[0] for row in csv.reader(csvfile)
                if row and row[0] and not row[0].startswith('#')]
def _subsample_alignments(mapped_subreadset, num=1000):
    """
    Return a random sample, without replacement, of up to `num` alignments.

    random.sample raises ValueError when asked for more items than the
    population holds, so the sample size is capped at the population size.
    A dataset with fewer than `num` alignments is therefore returned in full,
    in random order.

    :param mapped_subreadset: sized, indexable collection of alignments
    :param num: maximum number of alignments to draw
    :return: list of sampled alignments
    """
    sample_size = min(num, len(mapped_subreadset))
    return random.sample(mapped_subreadset, sample_size)
def _getKPIs(mapped_sset, subsampled_mapped_sset):
    """
    Collect the KPIs of one mapped sset into a dictionary of lists.

    Each alignment in `subsampled_mapped_sset` contributes one entry to every
    per-alignment list. Insertions, deletions and mismatches are stored as
    rates (count divided by read length), and accuracy is one minus the
    combined error rate. 'total nreads' is the length of the full
    `mapped_sset`, not of the subsample.
    """
    metric_names = ('holenumber', 'readlength', 'templatespan', 'insertions',
                    'deletions', 'mismatches', 'accuracy', 'IPD')
    data = {}
    for metric in metric_names:
        data[metric] = []

    for aln in subsampled_mapped_sset:
        hole_number = aln.HoleNumber
        read_length = float(aln.readEnd - aln.readStart)
        template_span = float(aln.referenceEnd - aln.referenceStart)
        n_ins, n_del, n_mm = aln.nIns, aln.nDel, aln.nMM

        data['holenumber'].append(hole_number)
        data['readlength'].append(read_length)
        data['templatespan'].append(template_span)
        data['insertions'].append(float(n_ins) / read_length)
        data['deletions'].append(float(n_del) / read_length)
        data['mismatches'].append(float(n_mm) / read_length)
        data['accuracy'].append(1 - (n_ins + n_del + n_mm) / read_length)
        data['IPD'].append(aln.IPD())

    data['total nreads'] = len(mapped_sset)
    return data
def _example_main(input_file, output_file, **kwargs):
    """
    Gather KPIs for every dataset listed in a CSV and pickle the result.

    This func should be imported from your python package.
    This should have *no* dependency on the pbcommand IO, such as the RTC/TC models.

    :param input_file: CSV file whose first column lists dataset paths
    :param output_file: path the pickled {dataset path: KPI dict} mapping is
        written to
    :param kwargs: extra options; they are only logged here
    :return: 0
    """
    # This is just for test purposes
    log.info("Running example main with {i} {o} kw:{k}".format(i=input_file,
                                                                o=output_file,
                                                                k=kwargs))
    # Open input CSV. Store absolute path of each alignment set.
    dset_paths = _get_dset_paths(input_file)
    dsets_kpis = {}
    for f in dset_paths:
        dset = openDataSet(f)
        # KPIs are computed on a random subsample; the full dataset is only
        # used for its total read count.
        subsampled_dset = _subsample_alignments(dset)
        dsets_kpis[f] = _getKPIs(dset, subsampled_dset)
    # Write inside a context manager so the pickle is flushed and the file
    # handle closed before returning, rather than left to garbage collection.
    with open(output_file, 'wb') as pickle_file:
        pickle.dump(dsets_kpis, pickle_file)
    return 0
@registry("dev_plot_multiple_mapped_ssets", "0.2.2", (FileTypes.CSV, ), (FileTypes.PICKLE, ), nproc=1, options=dict(alpha=1234))
def run_rtc(rtc):
    """
    Example Task for grabbing data from multiple mapped ssets. Single input CSV contains path to each mapped sset.
    Takes that CSV file as input and writes a pickle file holding the KPIs gathered from each mapped sset.
    """
    # The above docstring will be used as the Task/ToolContract Description,
    # so it describes the task's files (CSV in, pickle out) rather than this
    # function's Python signature.
    #
    # rtc is the resolved tool contract. This function reads its task options,
    # nproc, first input file and first output file, and returns the exit code
    # produced by _example_main.
    log.info("Got RTC task options {t}".format(t=rtc.task.options))
    log.info("Got nproc {n}".format(n=rtc.task.nproc))
    # The Task options are now accessible via global identifier
    alpha = rtc.task.options['pbsmrtpipe_examples.task_options.alpha']
    return _example_main(rtc.task.input_files[0], rtc.task.output_files[0], nproc=rtc.task.nproc, alpha=alpha)
# Script entry point: hand the command-line arguments (minus the program
# name) to the registry runner and exit with the code it returns.
if __name__ == '__main__':
    exit_code = registry_runner(registry, sys.argv[1:])
    sys.exit(exit_code)