-
Notifications
You must be signed in to change notification settings - Fork 1
/
deblur_relabel_merged.py
executable file
·119 lines (89 loc) · 3.78 KB
/
deblur_relabel_merged.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
"""
deblur_relabel_merged.py
This script takes a deblurred biom table with concatenated sequences as IDs
and a merged fastq file that is the result of overlap merging of those
concatenated sequences and relabels the deblurred biom with the new, merged IDs.
Currently this is really slow due to the way the Biom collapses tables -- expect
around 1 minute per 1000 observations.
"""
from __future__ import print_function
import sys
import os
import argparse
from biom import load_table, Table
from biom.parse import biom_open
import unittest
# Command-line interface. The module docstring doubles as the program
# description, so RawDescriptionHelpFormatter is used to keep its line breaks.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter)

# The input table and the merged fastq are both needed for the script to do
# anything, so argparse rejects a call without them up front instead of
# letting it fail later with an opaque traceback.
parser.add_argument('-i', '--input_biom_fp',
                    type=str,
                    required=True,
                    help='path to deblur biom to be relabeled')

parser.add_argument('-o', '--output_biom_fp',
                    type=str,
                    help='path to output biom (default: input biom path '
                         'with its extension replaced by .merged.biom)')

parser.add_argument('-f', '--merged_fastq_fp',
                    type=str,
                    required=True,
                    help='path to merged fastq file (output of deblur_split_biom.py)')
def readfq(fp):  # this is a generator function
    """Yield ``(name, sequence, quality)`` tuples from a FASTA/FASTQ stream.

    Adapted from https://github.com/lh3/readfq/blob/master/readfq.py

    Unlike the upstream version, lines are stripped of their line terminator
    (``\\n`` or ``\\r\\n``) rather than blindly losing their last character, so
    a final line without a trailing newline keeps its last base/quality value
    and CRLF input does not leak ``\\r`` into names or sequences.

    Parameters
    ----------
    fp : iterator of str
        Line iterator (e.g. an open text file). It is consumed incrementally
        across several loops, so it must be a one-shot iterator rather than a
        re-iterable container such as a list.

    Yields
    ------
    tuple
        ``(name, sequence, quality)``. ``name`` is the header text after the
        ``>``/``@`` marker up to the first space. Multi-line sequences and
        qualities are joined. ``quality`` is ``None`` for FASTA records and
        for a FASTQ record whose quality is cut short by end of input.
    """
    last = None  # header-like line read ahead but not yet consumed
    while True:
        if not last:
            # First record, or the previous record was a complete FASTQ:
            # scan forward to the next header line.
            for line in fp:
                if line[0] in '>@':
                    last = line.rstrip('\r\n')
                    break
        if not last:
            break  # end of input, no further record
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for line in fp:  # collect sequence lines
            if line[0] in '@+>':
                # Either the '+' separator of this record or the header of
                # the next one; keep it for the decision below.
                last = line.rstrip('\r\n')
                break
            seqs.append(line.rstrip('\r\n'))
        if not last or last[0] != '+':
            # No '+' separator followed the sequence: FASTA record.
            yield name, ''.join(seqs), None
            if not last:
                break  # end of input
        else:
            # FASTQ record: read quality lines until they cover the sequence.
            # Length, not a leading character, ends the quality block because
            # quality strings may themselves start with '@' or '+'.
            seq, leng, seqs = ''.join(seqs), 0, []
            for line in fp:
                qual_line = line.rstrip('\r\n')
                seqs.append(qual_line)
                leng += len(qual_line)
                if leng >= len(seq):
                    last = None
                    yield name, seq, ''.join(seqs)
                    break
            if last:
                # Input ended before enough quality was read; emit the record
                # without quality and stop.
                yield name, seq, None
                break
def get_merged_dict(fastq):
    """Build a mapping from record name to sequence.

    ``fastq`` is any iterable of ``(name, sequence, quality)`` triples, such
    as the generator returned by ``readfq``. The quality is ignored. If a
    name occurs more than once, the sequence of the last occurrence is kept.
    """
    return {record_name: sequence for record_name, sequence, _ in fastq}
def collapse_biom_observations(input_biom, merge_dict):
    """Collapse the observations of ``input_biom`` onto their merged IDs.

    Every observation ID is looked up in ``merge_dict`` and the table's own
    ``collapse`` method is called with that lookup as the grouping function,
    ``norm=False`` and ``axis='observation'``. An observation ID that is
    missing from ``merge_dict`` raises ``KeyError`` from the lookup.

    Returns whatever ``input_biom.collapse`` returns.
    """
    def merged_id(id_, md):
        # The metadata argument is part of the callback signature but is not
        # used to decide the group.
        return merge_dict[id_]

    collapsed = input_biom.collapse(merged_id, norm=False, axis='observation')
    return collapsed
def main():
    """Relabel a deblur biom table with merged sequences and write it out.

    Parses the command line with the module-level ``parser``, loads the input
    biom table, builds a record-name -> sequence mapping from the merged
    fastq, keeps only the observations whose ID appears in that mapping,
    collapses them with ``collapse_biom_observations`` and writes the result
    with ``to_hdf5``.
    """
    opts = parser.parse_args()

    table = load_table(opts.input_biom_fp)

    # Without an explicit output path, reuse the input path with its
    # extension swapped for '.merged.biom'.
    out_fp = opts.output_biom_fp
    if out_fp is None:
        out_fp = os.path.splitext(opts.input_biom_fp)[0] + '.merged.biom'

    # readfq is a lazy generator, so the mapping has to be built while the
    # fastq handle is still open.
    with open(opts.merged_fastq_fp) as handle:
        id_to_merged = get_merged_dict(readfq(handle))

    def has_merged_read(val, id_, md):
        # Keep only observations that have an entry in the merged mapping.
        return id_ in id_to_merged

    table = table.filter(has_merged_read, axis='observation')

    relabeled = collapse_biom_observations(table, id_to_merged)

    with biom_open(out_fp, 'w') as out_handle:
        relabeled.to_hdf5(out_handle, 'deblur_relabel_merged.py')
# Run the relabelling workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()