-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate.py
executable file
·80 lines (75 loc) · 3.42 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
import csv
import os
import sys
from opener import opener
import utils
from Searcher import Searcher
# Names of the quantities that are collected for every region and match rank.
variables = ['found_strings', 'evalue', 'ali_from', 'ali_to', 'vd_insertion_length', 'dj_insertion_length']

# values[var][region][imatch] is a list that starts out empty, with one list
# for each variable, each entry of utils.regions and each match rank 0, 1, 2.
values = {
    var: {
        region: {imatch: [] for imatch in range(3)}
        for region in utils.regions
    }
    for var in variables
}
# Command line: <script> <data_type> <human>
#   data_type -- 'simu' selects the simu.csv input file; any other value is used
#                as the name of the sub-directory that holds head-data.csv
#   human     -- used as a path component of both the input and output paths
# An explicit check is used instead of a bare assert so that a wrong argument
# count produces a usage message and is still caught when run with `python -O`.
if len(sys.argv) != 3:
    sys.exit('usage: %s <data_type> <human>' % sys.argv[0])
data_type = sys.argv[1]
human = sys.argv[2]

# Shell commands that write the head-data.csv files (presumably how the
# non-simulation inputs read below were generated -- TODO confirm):
# for human in A B C; do
#   datadir=data/human-beings/$human/M/data
#   bzgrep -m100 . $datadir/data.tsv.bz2 | sed 's/[ \t][ \t]*/,/g'|cut -f2 -d, |sed 's/nucleotide/seq/'> $datadir/head-data.csv
# done

naivety = 'M'
if data_type == 'simu':
    # NOTE(review): absolute, machine-specific path
    infname = '/home/dralph/Dropbox/work/recombinator/output/' + human + '/' + naivety + '/simu.csv'
else:
    infname = 'data/human-beings/' + human + '/' + naivety + '/' + data_type + '/head-data.csv'

# All output files are written below this directory.
baseoutdir = 'data/human-beings/' + human + '/' + naivety + '/' + data_type

# A single pre-joined argument prints the same text as the two-item print
# statement did, and is also valid when parsed as a function call.
print(' '.join(('opening ', infname)))
print(' '.join((' output', baseoutdir)))
# Read the input csv row by row, run a Searcher over the last 100 characters of
# each row's 'seq' column, and file the results into the `values` tables.
with opener('r')(infname) as infile:
    # NOTE(review): the result of read_germlines is not used in this block
    germlines = utils.read_germlines('../../../recombinator')
    reader = csv.DictReader(infile)
    il = 0  # number of rows read so far; nothing in this block reads it
    for il, row in enumerate(reader, 1):
        query = row['seq'][-100:]
        print(query)
        searcher = Searcher(query, debug=False, n_matches_max=5)
        order = searcher.search()
        # Every search result is recorded, and the ['v'][0] slot is simply
        # where they are kept -- the region and rank carry no meaning here.
        values['found_strings']['v'][0].append(order)
        if order != 'vjd':
            # only rows whose matches were found in this order are used below
            continue
        for region in utils.regions:
            hits = searcher.matches[region]
            # at most the first three matches of each region are recorded
            for imatch in range(min(3, len(hits))):
                hit = hits[imatch]
                if region == 'd' and imatch == 0:
                    # NOTE ali_from and ali_to are index *one* counting (!!!)
                    # characters of the query in front of the best d match
                    values['vd_insertion_length'][region][imatch].append(hit['ali_from'] - 1)
                    # characters of the query after the best d match
                    values['dj_insertion_length'][region][imatch].append(len(searcher.query_seqs[region]) - hit['ali_to'])
                for key in ('evalue', 'ali_from', 'ali_to'):
                    values[key][region][imatch].append(hit[key])
# Write every collected list to <baseoutdir>/<var>/<region>-<imatch>.txt, one
# value per line.
for region in utils.regions:
    for var in variables:
        outdir = baseoutdir + '/' + var
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        # 'found_strings' holds text; everything else is written in
        # exponent notation.
        fmt = '%s\n' if var == 'found_strings' else '%7e\n'
        # Walk the match slots that exist in `values` rather than the matches
        # of whichever Searcher happened to be created last: that object is
        # undefined when the input had no rows, and when its last row had
        # fewer matches the values collected for higher ranks were never
        # written out.
        for imatch in sorted(values[var][region]):
            with opener('w')(outdir + '/' + region + '-' + str(imatch) + '.txt') as outfile:
                for value in values[var][region][imatch]:
                    outfile.write(fmt % value)