forked from CGATOxford/UMI-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_umi.py
289 lines (210 loc) · 9.05 KB
/
extract_umi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
'''
cgat_script_template.py - template for CGAT scripts
====================================================
:Author:
:Release: $Id$
:Date: |today|
:Tags: Python
Purpose
-------
Extract UMI barcode from a read and add it to the read name, leaving
any sample barcode in place. Can deal with paired end reads and UMIs
split across the paired ends
Options
-------
--read2-in, --read2-out
Optionally a file with read pairs can be provided. The UMI will
be added to the read2 read name as well. Assumes that reads
are in the same order. read2-out specifies the output file.
--split-barcode
By default the UMI is assumed to be on the first read. Use this
option if the UMI is contained on both reads and specify the
pattern of the barcode/UMI on the second read using the option
``--bc-pattern2``
--bc-pattern
Use this option to specify the format of the UMI/barcode. Use Ns to
represent the random positions and Xs to indicate the bc positions.
Bases with Ns will be extracted and added to the read name. Remaining
bases, marked with an X will be reattached to the read.
E.g. If the pattern is NNXXNN,
Then the read:
@HISEQ:87:00000000 read1
AAGGTTGCTGATTGGATGGGCTAG
DA1AEBFGGCG01DFH00B1FF0B
+
will become:
@HISEQ:87:00000000_AATT read1
GGGCTGATTGGATGGGCTAG
1AFGGCG01DFH00B1FF0B
+
--bc-pattern2
Use this option to specify the format of the UMI/barcode for
the second read pair if required. If --bc-pattern2 is not
supplied, this defaults to the same pattern as --bc-pattern
--3prime
By default the barcode is assumed to be on the 5' end of the read, but
use this option to sepecify that it is on the 3' end instead
-L
Specify a log file to retain logging information and final statistics
--supress-stats
Supress logging of summary statistics
Usage:
------
For single ended reads
python extract_umi.py --bc-pattern=[PATTERN] -L extract.log [OPTIONS]
reads from stdin and outputs to stdout.
For paired end reads:
python extract_umi.py --bc-pattern=[PATTERN] --read2-in=[FASTQIN] --read2-out=[FASTQOUT] -L extract.log [OPTIONS]
reads end one from stdin and end two from FASTQIN and outputs end one to stdin
and end two to FASTQOUT.
Command line options
--------------------
'''
import sys
import CGAT.Experiment as E
import CGAT.Fastq as Fastq
import CGAT.IOTools as IOTools
def addUMItoIdentifier(read, UMI):
'''extract the identifier from a read and append the UMI before
the first space'''
read_id = read.identifier.split(" ")
read_id[0] = read_id[0] + "_" + UMI
identifier = " ".join(read_id)
return identifier
class Extractor:
''' A functor that extracts the UMI and the barcode from the read(s),
adds it to the identifier and pastes back on the sample barcode'''
bc_count = {}
def _extract_5prime(self, sequence, read=1):
if read == 1:
return (sequence[:self.pattern_length],
sequence[self.pattern_length:])
elif read == 2:
return (sequence[:self.pattern_length2],
sequence[self.pattern_length2:])
def _extract_3prime(self, sequence, read=1):
if read == 1:
return (sequence[-self.pattern_length:],
sequence[:-self.pattern_length])
if read == 2:
return (sequence[-self.pattern_length2:],
sequence[:-self.pattern_length2])
def _joiner_5prime(self, sequence, sample):
return sample + sequence
def _joiner_3prime(self, read, sequence, sample):
return sequence + sample
def __init__(self, pattern, pattern2, prime3=False):
if prime3:
self.extract = self._extract_3prime
self.joiner = self._joiner_3prime
else:
self.extract = self._extract_5prime
self.joiner = self._joiner_5prime
self.pattern_length = len(pattern)
self.umi_bases = [x for x in range(len(pattern)) if pattern[x] is "N"]
self.bc_bases = [x for x in range(len(pattern)) if pattern[x] is "X"]
self.split = False
if pattern2:
print pattern2
self.pattern_length2 = len(pattern2)
self.split = True
self.umi_bases2 = [x for x in range(len(pattern2))
if pattern2[x] is "N"]
self.bc_bases2 = [x for x in range(len(pattern2))
if pattern2[x] is "X"]
def __call__(self, read1, read2=None):
bc1, sequence1 = self.extract(read1.seq)
bc_qual1, seq_qual1 = self.extract(read1.quals)
umi1 = "".join([bc1[x] for x in self.umi_bases])
sample1 = "".join([bc1[x] for x in self.bc_bases])
sample_qual1 = "".join([bc_qual1[x] for x in self.bc_bases])
if self.split:
bc2, sequence2 = self.extract(read2.seq, read=2)
bc_qual2, seq_qual2 = self.extract(read2.quals, read=2)
umi2 = "".join([bc2[x] for x in self.umi_bases2])
sample2 = "".join([bc2[x] for x in self.bc_bases2])
sample_qual2 = "".join([bc_qual2[x] for x in self.bc_bases2])
umi = umi1 + umi2
bc = "_".join((bc1, bc2))
sample = "_".join((sample1, sample2))
read2.seq = self.joiner(sequence2, sample2)
read2.quals = self.joiner(seq_qual2, sample_qual2)
else:
umi = umi1
bc = bc1
sample = sample1
id = (bc, umi, sample)
self.bc_count[id] = self.bc_count.get(id, 0) + 1
read1.seq = self.joiner(sequence1, sample1)
read1.quals = self.joiner(seq_qual1, sample_qual1)
read1.identifier = addUMItoIdentifier(read1, umi)
if read2 is not None:
read2.identifier = addUMItoIdentifier(read2, umi)
return (read1, read2)
else:
return read1
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("--split-barcode", dest="split", action="store_true",
help="barcode is split across read pair")
parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
help="Barcode pattern. Ns are random bases X's fixed")
parser.add_option("--bc-pattern2", dest="pattern2", type="string",
help="Barcode pattern. Ns are random bases X's fixed")
parser.add_option("--read2-in", dest="read2_in", type="string",
help="file name for read pairs")
parser.add_option("--3prime", dest="prime3", action="store_true",
help="barcode is on 3' end of read")
parser.add_option("--read2-out", dest="read2_out", type="string",
help="file to output processed paired read to")
parser.add_option("--supress-stats", dest="stats", action="store_false",
help="Suppress the writing of stats to the log")
parser.set_defaults(split=False,
pattern=None,
pattern2=None,
read2_in=None,
read2_out=None,
prime3=False,
stats=True)
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
# check options
if not options.pattern:
raise ValueError("must specify a pattern using ``--bc-pattern``")
if options.split:
if not options.read2_in:
raise ValueError("must specify a paired fastq ``--read2-in``")
if not options.pattern2:
options.pattern2 = options.pattern
if options.read2_in:
if not options.read2_out:
raise ValueError("must specify an output for the paired end "
"``--read2-out``")
# Initialise the processor
processor = Extractor(options.pattern, options.pattern2, options.prime3)
read1s = Fastq.iterate(options.stdin)
if options.read2_in is None:
for read in read1s:
options.stdout.write(str(processor(read)) + "\n")
else:
read2s = Fastq.iterate(IOTools.openFile(options.read2_in))
read2_out = IOTools.openFile(options.read2_out, "w")
for read1, read2 in zip(read1s, read2s):
new_1, new_2 = processor(read1, read2)
options.stdout.write(str(new_1) + "\n")
read2_out.write(str(new_2) + "\n")
# write footer and output benchmark information.
if options.stats:
options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
for id in processor.bc_count:
options.stdlog.write("\t".join(id+(str(processor.bc_count[id]),)) + "\n")
E.Stop()
if __name__ == "__main__":
sys.exit(main(sys.argv))