-
Notifications
You must be signed in to change notification settings - Fork 2
/
bigsplitread.py
142 lines (111 loc) · 3.84 KB
/
bigsplitread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#! /usr/bin/env python
import sys
import os
import screed
import mapreads
# Number of bytes requested per read() call by retrieve_bytes while it
# streams the interior of a chunk (16 KiB).
READCHUNKSIZE=16*1024
def get_chunks(filename, n_chunks):
    """Split a file into contiguous, roughly equal byte ranges.

    The nominal chunk size is ``filesize // n_chunks``.  Full-size chunks
    are emitted from the start of the file; the tail is then closed off with
    either one chunk (when the bytes left over after one more full chunk
    would be smaller than 20% of the chunk size) or two chunks.  The ranges
    are half-open ``(start, stop)`` pairs that together cover the whole
    file, so the result may hold one more or one fewer range than requested.

    Parameters:
        filename: path of the file to measure (only its size is read).
        n_chunks: desired number of chunks; must be at least 1.

    Returns:
        A list of ``(start, stop)`` byte-offset tuples in ascending order.
        An empty file yields an empty list; a file with fewer bytes than
        ``n_chunks`` yields a single range covering the whole file.

    Raises:
        ValueError: if ``n_chunks`` is smaller than 1.
        OSError: propagated from ``os.path.getsize`` if the file is missing.
    """
    if n_chunks < 1:
        raise ValueError("n_chunks must be at least 1, got %r" % (n_chunks,))

    filesize = os.path.getsize(filename)
    if filesize == 0:
        return []

    chunksize = int(filesize // n_chunks)
    if chunksize == 0:
        # More chunks requested than there are bytes: a zero step cannot be
        # iterated, so hand back the whole file as one range.
        return [(0, filesize)]

    chunks = []
    start = 0
    # Emit full chunks while at least two more chunk-widths remain; the
    # final stretch is handled below so a tiny trailing sliver is merged
    # into its neighbour.  Tracking ``start`` explicitly keeps it defined
    # even when this loop never runs (n_chunks of 1 or 2).
    while start < filesize - 2 * chunksize:
        chunks.append((start, start + chunksize))
        start += chunksize

    leftover = filesize - (start + chunksize)
    # max(1, ...) also folds a zero-byte leftover into the last chunk
    # instead of producing an empty trailing range.
    if leftover < max(1, int(0.2 * chunksize)):
        chunks.append((start, filesize))
    else:
        chunks.append((start, start + chunksize))
        chunks.append((start + chunksize, filesize))

    return chunks
class ReadlineUntil(object):
    """Wrap a file object so ``readline()`` stops at a record boundary.

    Lines are passed through from the wrapped file until a line that both
    begins with ``>`` and starts at or beyond the byte offset ``until`` is
    read; for that line (and only that line) an empty string is returned
    instead, which line-oriented consumers treat as end-of-file.

    Parameters:
        fp: object providing ``readline()`` and ``tell()``.
        until: offset at or after which the next ``>`` line ends the stream.
        verbose: if true, write a trace to stderr when the stop is hit.
    """

    def __init__(self, fp, until, verbose=0):
        self.fp = fp
        self.until = until
        self.verbose = verbose

    def readline(self):
        """Return the next line, or an empty string at the stop boundary."""
        line = self.fp.readline()
        end = self.fp.tell()
        # Offset at which the line just read began.
        begin = end - len(line)

        # Compare against a marker of the same type as the line so the
        # check works for both binary and text file objects.
        marker = b'>' if isinstance(line, bytes) else u'>'
        if begin >= self.until and line.startswith(marker):
            if self.verbose:
                # sys.stderr.write is used instead of the print statement
                # so the trace works on both Python 2 and Python 3.
                sys.stderr.write('STOP AT %s %s %s >\n'
                                 % (begin, end, self.until))
                sys.stderr.write('XX %r\n' % ((line,),))
            # Empty value of the same type as the line (== "" on Python 2).
            return line[:0]

        return line
def retrieve_records(filename, start, stop, verbose=0):
    """Yield the FASTA records whose header line starts in [start, stop).

    The file is opened in binary mode and positioned at ``start``.  Lines
    are skipped until the first one beginning with ``>``; from there the
    records are parsed by ``screed.fasta.fasta_iter`` reading through a
    ``ReadlineUntil`` wrapper bounded by ``stop``.

    Parameters:
        filename: path of the FASTA file to read.
        start: byte offset at which scanning for a header begins.
        stop: byte offset bounding the chunk.
        verbose: if true, write tracing information to stderr.

    Yields:
        The record objects produced by ``screed.fasta.fasta_iter``.

    Raises:
        AssertionError: if no header line starts inside [start, stop).
    """
    # The with-block closes the file even if the consumer abandons the
    # generator early or the parser raises.
    with open(filename, 'rb') as fp:
        fp.seek(start)

        line = fp.readline()
        found = False
        # Ownership is decided by where a line STARTS (tell() - len(line)),
        # the same rule ReadlineUntil applies at the far end of the chunk.
        # Testing the line end instead would drop a header that begins
        # before ``stop`` but finishes at or after it.
        while line and fp.tell() - len(line) < stop:
            if verbose:
                sys.stderr.write('%r\n' % ((line,),))
            if line.startswith(b'>'):
                found = True
                break
            line = fp.readline()

        if not found:
            # Raised explicitly (rather than ``assert 0``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                "no FASTA header starts in bytes [%s, %s) of %s"
                % (start, stop, filename))

        newfp = ReadlineUntil(fp, stop, verbose)
        if verbose:
            sys.stderr.write('START AT: %s\n' % (fp.tell() - len(line),))

        # ``line`` (the header already consumed) is handed to the parser so
        # the first record is not lost.
        for record in screed.fasta.fasta_iter(newfp, line=line):
            yield record
def extract_reads_to_file(filename, start, stop):
    """Copy one chunk of ``filename`` into a temporary file.

    The output path is the second element of the pair returned by
    ``mapreads.get_temp_filename('readchunk.fa')``; the data written is
    everything ``retrieve_bytes(filename, start, stop)`` yields.

    Parameters:
        filename: path of the source file.
        start: byte offset where the chunk begins.
        stop: byte offset where the chunk ends.

    Returns:
        The path of the temporary file that was written.
    """
    _, tmpfile = mapreads.get_temp_filename('readchunk.fa')

    # Binary mode matches the binary read in retrieve_bytes, and the
    # with-block closes the file even if reading or writing fails midway.
    with open(tmpfile, 'wb') as fp:
        for data in retrieve_bytes(filename, start, stop):
            fp.write(data)

    return tmpfile
def retrieve_bytes(filename, start, stop, verbose=0):
    """Yield the raw bytes of the records whose header starts in [start, stop).

    The file is opened in binary mode and positioned at ``start``.  Lines
    are skipped until the first one beginning with ``>``.  That header line
    is yielded, then the file is streamed in ``READCHUNKSIZE`` blocks up to
    ``stop``, and finally whole lines are appended until the next line that
    begins with ``>`` (or end of file), so the last record is not truncated.
    If no header starts inside the range, nothing is yielded.

    Parameters:
        filename: path of the file to read.
        start: byte offset at which scanning for a header begins.
        stop: byte offset bounding the chunk.
        verbose: if true, write tracing information to stderr.

    Yields:
        Byte strings which, concatenated, form the chunk's records.
    """
    if verbose:
        sys.stderr.write('XXX %s %s %s\n' % (filename, start, stop))

    # The with-block closes the file even if the consumer stops iterating
    # early or an error interrupts the read.
    with open(filename, 'rb') as fp:
        fp.seek(start)

        line = fp.readline()
        found = False
        # A header belongs to this chunk when it STARTS before ``stop``
        # (tell() - len(line) is the line's starting offset).  Testing the
        # line end instead would silently drop a header that begins before
        # ``stop`` but finishes at or after it: the previous chunk stops in
        # front of it and the next chunk starts scanning behind it.
        while line and fp.tell() - len(line) < stop:
            if verbose:
                sys.stderr.write('%r\n' % ((line,),))
            if line.startswith(b'>'):
                found = True
                break
            line = fp.readline()

        if found:
            if verbose:
                sys.stderr.write('%s READING FROM: %s\n'
                                 % (start, fp.tell() - len(line)))
            yield line

            while fp.tell() + READCHUNKSIZE < stop:
                yield fp.read(READCHUNKSIZE)

            # The header itself may already extend past ``stop``; clamp so
            # read() is never given a negative size (which would read to
            # end of file).
            remaining = max(0, stop - fp.tell())
            tail = [fp.read(remaining)]

            # Finish the record that straddles ``stop``: keep whole lines
            # until the next header, which belongs to the following chunk.
            line = fp.readline()
            while line and not line.startswith(b'>'):
                tail.append(line)
                line = fp.readline()

            yield b''.join(tail)

            if verbose:
                sys.stderr.write('%s READING TO: %s\n'
                                 % (stop, fp.tell() - len(line)))
if __name__ == '__main__':
    # Smoke test: split a FASTA file into 8 byte ranges and stream every
    # range back to stdout.  The input path may be given as the first
    # command-line argument; the original hard-coded path is the default.
    filename = '/Users/t/dev/khmer/data/100k-filtered.fa'
    if len(sys.argv) > 1:
        filename = sys.argv[1]

    chunks = get_chunks(filename, 8)

    # Set to True to parse each chunk with retrieve_records and re-emit the
    # records, instead of copying raw bytes with retrieve_bytes.
    use_records = False

    if use_records:
        for (start, stop) in chunks:
            for record in retrieve_records(filename, start, stop):
                sys.stdout.write('>%s\n%s\n' % (record.name, record.sequence))
    else:
        # retrieve_bytes reads in binary mode, so write to the binary layer
        # of stdout where one exists (Python 3); on Python 2 sys.stdout
        # itself accepts byte strings.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        for (start, stop) in chunks:
            for data in retrieve_bytes(filename, start, stop):
                out.write(data)