forked from ctb/surely
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigsplitread.py
133 lines (102 loc) · 3.51 KB
/
bigsplitread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#! /usr/bin/env python
import sys
import os
import screed
import mapreads
# Number of bytes retrieve_bytes() requests per fp.read() call while it
# streams the interior of a byte range (16 KiB).
READCHUNKSIZE=16*1024
def get_chunks(filename, n_chunks):
    """Split a file into ``n_chunks`` contiguous byte ranges.

    Each range is a ``(start, stop)`` pair of byte offsets with ``stop``
    exclusive.  The ranges tile the file exactly: the first starts at 0,
    every range begins where the previous one ended, and the last one ends
    at the file size.  All ranges are ``filesize // n_chunks`` bytes long
    except the last, which also absorbs the remainder.

    Side effect: prints the chunk size, the file size and their integer
    ratio on stdout (diagnostic output kept from the original code).

    Parameters:
        filename: path of the file to split; only its size is read.
        n_chunks: requested number of ranges (an integer >= 1).

    Returns:
        A list of ``(start, stop)`` tuples.  It has ``n_chunks`` entries,
        unless the file holds fewer than ``n_chunks`` bytes, in which case
        one single-byte range per byte is returned (an empty list for an
        empty file).

    Raises:
        ValueError: if ``n_chunks`` is smaller than 1.
        OSError: if the size of ``filename`` cannot be determined.
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be at least 1, got %r' % (n_chunks,))

    filesize = os.path.getsize(filename)
    if filesize == 0:
        return []

    # Never hand out zero-length ranges: with fewer bytes than requested
    # chunks, fall back to one byte per range.
    n_chunks = min(int(n_chunks), filesize)
    chunksize = filesize // n_chunks
    print('%s %s %s' % (chunksize, filesize, filesize // chunksize))

    # n_chunks evenly spaced start offsets plus the end of the file; pairing
    # neighbours makes the final range run to the end of the file instead of
    # leaving a separate sliver of leftover bytes.
    bounds = [i * chunksize for i in range(n_chunks)]
    bounds.append(filesize)
    return list(zip(bounds[:-1], bounds[1:]))
class ReadlineUntil(object):
    """File-like wrapper whose readline() stops at a record boundary.

    Lines are passed through from the wrapped file until a line that both
    starts with '>' and begins at offset ``until`` or later; for that line
    an empty value is returned instead, which line-oriented readers treat
    as end of file.  A '>' line that begins before ``until`` is passed
    through even if it ends after it.

    Offsets are computed as ``fp.tell() - len(line)``, so the wrapped file
    must report positions in the same units as ``len(line)`` (for example
    a file opened in binary mode).  Lines may be bytes or text.

    Parameters:
        fp: object providing ``readline()`` and ``tell()``.
        until: offset at or after which a '>' line ends the stream.
        verbose: when true, report the stopping line on stderr.
    """

    def __init__(self, fp, until, verbose=0):
        self.fp = fp
        self.until = until
        self.verbose = verbose

    def readline(self):
        """Return the next line, or an empty value at the cut-off/EOF."""
        line = self.fp.readline()
        if not line:
            # Real end of file: hand the empty value straight back.
            return line

        line_end = self.fp.tell()
        line_begin = line_end - len(line)

        # Compare against a marker of the line's own type so that both
        # bytes and text lines are recognised.
        marker = b'>' if isinstance(line, bytes) else u'>'
        if line_begin >= self.until and line.startswith(marker):
            if self.verbose:
                sys.stderr.write('STOP AT %s %s %s %s\n'
                                 % (line_begin, line_end, self.until,
                                    line[0]))
                sys.stderr.write('XX %s\n' % ((line,),))
            # Empty value of the same type as the line that was read.
            return line[:0]
        return line
def retrieve_records(filename, start, stop, verbose=0):
    """Yield the FASTA records whose header line begins in [start, stop).

    The file is opened in binary mode and positioned at ``start``; lines
    are skipped until the first one that starts with '>' (the first line
    read may be the tail of a line that began before ``start``).  From
    there the records are parsed with ``screed.fasta.fasta_iter`` through a
    ``ReadlineUntil`` wrapper, so parsing ends at the first header line
    that begins at or after ``stop`` -- a record that starts inside the
    range is returned whole even if it extends past ``stop``.

    A range that contains no header start yields nothing.  This is normal
    for a short range that falls entirely inside one record, which the
    preceding range already covers.

    Parameters:
        filename: path of the FASTA file.
        start: byte offset at which to begin scanning.
        stop: byte offset (exclusive) bounding where records may begin.
        verbose: when true, trace the scanned lines on stderr.

    Yields:
        The record objects produced by ``screed.fasta.fasta_iter``.
    """
    fp = open(filename, 'rb')
    try:
        fp.seek(start)
        line = fp.readline()
        found = False
        while line:
            # Ownership is decided by where a line begins, so a header
            # that straddles ``stop`` still belongs to this range.
            if fp.tell() - len(line) >= stop:
                break
            if verbose:
                sys.stderr.write('%s\n' % ((line,),))
            if line.startswith(b'>'):
                found = True
                break
            line = fp.readline()

        if not found:
            return

        if verbose:
            sys.stderr.write('START AT: %s\n' % (fp.tell() - len(line),))
        newfp = ReadlineUntil(fp, stop, verbose)
        for record in screed.fasta.fasta_iter(newfp, line=line):
            yield record
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the handle is never leaked.
        fp.close()
def extract_reads_to_file(filename, start, stop):
    """Write the records of one byte range to a temporary FASTA file.

    The records come from ``retrieve_records(filename, start, stop)`` and
    are written as a '>' + name line followed by a sequence line.

    Parameters:
        filename: path of the source FASTA file.
        start: byte offset at which the range begins.
        stop: byte offset (exclusive) at which the range ends.

    Returns:
        The path of the temporary file, as obtained from
        ``mapreads.get_temp_filename('readchunk.fa')``.
    """
    # NOTE(review): the first element returned by get_temp_filename is
    # discarded; if it is an open handle/descriptor it is never closed
    # here -- confirm against mapreads.
    _, tmpfile = mapreads.get_temp_filename('readchunk.fa')

    # `with` closes the output file even if reading or writing fails.
    with open(tmpfile, 'w') as fp:
        for record in retrieve_records(filename, start, stop):
            fp.write('>%s\n%s\n' % (record.name, record.sequence))
    return tmpfile
def retrieve_bytes(filename, start, stop, verbose=0):
    """Yield the raw bytes of the records whose header begins in [start, stop).

    The file is opened in binary mode and positioned at ``start``; lines
    are skipped until the first one that starts with '>' (the first line
    read may be the tail of a line that began before ``start``).  That
    header line is yielded, then the data up to ``stop`` in pieces of at
    most READCHUNKSIZE bytes, and finally everything from ``stop`` up to
    (not including) the next line that starts with '>', so the last record
    is always complete.

    A range that contains no header start yields nothing.  This is normal
    for a short range that falls entirely inside one record, which the
    preceding range already covers.

    NOTE: if ``stop`` falls inside a line and the rest of that line happens
    to start with '>', that remainder is treated as a header.

    Parameters:
        filename: path of the FASTA file.
        start: byte offset at which to begin scanning.
        stop: byte offset (exclusive) bounding where records may begin.
        verbose: when true, trace the scanned lines on stderr.

    Yields:
        Byte strings which, concatenated, form the selected records.
    """
    fp = open(filename, 'rb')
    try:
        fp.seek(start)
        line = fp.readline()
        found = False
        while line:
            # Ownership is decided by where a line begins, so a header
            # that straddles ``stop`` still belongs to this range.
            if fp.tell() - len(line) >= stop:
                break
            if verbose:
                sys.stderr.write('%s\n' % ((line,),))
            if line.startswith(b'>'):
                found = True
                break
            line = fp.readline()

        if not found:
            return

        yield line

        while fp.tell() + READCHUNKSIZE < stop:
            block = fp.read(READCHUNKSIZE)
            if not block:
                # ``stop`` lies beyond the end of the file; without this
                # check the loop would never terminate.
                break
            yield block

        tail = []
        remaining = stop - fp.tell()
        # The header line itself may already reach past ``stop``; a
        # non-positive size must not be passed to read() (a negative size
        # would read to the end of the file).
        if remaining > 0:
            tail.append(fp.read(remaining))

        # Finish the record that is in progress at ``stop``.
        line = fp.readline()
        while line and not line.startswith(b'>'):
            tail.append(line)
            line = fp.readline()
        yield b''.join(tail)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the handle is never leaked.
        fp.close()
if __name__ == '__main__':
    # Usage: bigsplitread.py [FASTA_FILE [N_CHUNKS]]
    #
    # Splits FASTA_FILE into N_CHUNKS byte ranges and writes the contents
    # of every range to stdout.  With no arguments the original defaults
    # (the path and chunk count below) are used.
    filename = '/Users/t/dev/khmer/data/100k-filtered.fa'
    n_chunks = 8
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    if len(sys.argv) > 2:
        n_chunks = int(sys.argv[2])

    x = get_chunks(filename, n_chunks)
    sys.stderr.write('%s %s\n' % (x, os.path.getsize(filename)))

    # Set to True to print parsed records (via retrieve_records) instead
    # of the raw bytes of each range.
    dump_records = False

    if dump_records:
        for (start, stop) in x:
            for record in retrieve_records(filename, start, stop):
                sys.stdout.write('>%s\n%s\n' % (record.name, record.sequence))
    else:
        for (start, stop) in x:
            print('%s %s' % (start, stop))
            for data in retrieve_bytes(filename, start, stop):
                sys.stdout.write(data)

    # To debug a single range, call
    # retrieve_records(filename, x[-1][0], x[-1][1], verbose=1)
    # and write the records out as in the dump_records branch.