-
Notifications
You must be signed in to change notification settings - Fork 0
/
rebase.py
114 lines (94 loc) · 4.34 KB
/
rebase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from collections import defaultdict
__author__ = 'kablag'
import fastrflp.exceptions as fexceps
import urllib
from fastrflp.models import RestrictionEnzyme as Re
from fastrflp.models import PrototypeEnzyme as Pe
from fastrflp.seq_tools import clean_re_sequence
from django.db import transaction
def _download_rebase_rows(url):
    """Download the REBASE table at *url* and return its enzyme rows.

    :param url: location of the tab-separated REs table
    :returns: list of raw row strings (all lines except the first 10 and the
        last one)
    :raises fexceps.UpdateRebaseError: if the URL is malformed or unreachable,
        or if the document has fewer than 11 lines
    """
    # A bare ``import urllib`` does not load the ``request`` / ``error``
    # submodules, so import them explicitly where they are used.
    import urllib.error
    import urllib.request

    try:
        response = urllib.request.urlopen(url)
    except (ValueError, urllib.error.URLError) as err:
        raise fexceps.UpdateRebaseError('Error in URL: %r' % url) from err
    # Decoding stays outside the ``try`` above: UnicodeDecodeError is a
    # ValueError and must not be reported as a URL problem.
    with response:
        html = response.read().decode('utf8')

    lines = html.splitlines()
    # The first 10 lines and the last line are skipped (presumably page
    # header/footer -- confirm against the REBASE format). Slicing never
    # raises, so a truncated document has to be detected explicitly.
    if len(lines) < 11:
        raise fexceps.UpdateRebaseError('Document %r is to short' % url)
    return lines[10:-1]


def _split_enzyme_row(enzyme_row):
    """Split one tab-separated table row into its six columns.

    :param enzyme_row: raw row string
    :returns: list ``[name, prototype, recognition sequence, methylation site,
        suppliers, references]``
    :raises fexceps.UpdateRebaseError: if the row does not have six columns
    """
    fields = enzyme_row.split('\t')
    if len(fields) != 6:
        raise fexceps.UpdateRebaseError('RE table format error (num of columns)')
    return fields


@transaction.atomic
def update_rebase_from_url(url='http://rebase.neb.com/rebase/link_itype2'):
    """
    Updates (or creates) PEs and REs in the database from url
    (NEB Type II format with tabs).

    The whole update runs inside a single database transaction, so any error
    leaves the database unchanged.

    Table Format
    ============
    enzyme name [tab] prototype [tab] recognition sequence with cleavage site
    [tab] methylation site and type [tab] commercial source [tab] references

    An empty prototype column means the enzyme is its own prototype.

    :param url: REs table location (default is 'http://rebase.neb.com/rebase/link_itype2')
    :returns: four lists of names:
        new PEs, updated PEs, new REs, updated REs
    :raises fexceps.UpdateRebaseError: if the URL cannot be opened, the
        document is too short, or a row has a wrong number of columns

    Example
    =======
    >> new_pes, upd_pes, new_res, upd_res = update_rebase_from_url()
    <BLANKLINE>
    Downloaded REs Entries: 3725
    New PEs: ...	Updated PEs: 0
    New REs: 3725	Updated REs: 0
    """
    enzyme_rows = _download_rebase_rows(url)

    new_pes = []
    upd_pes = []
    new_res = []
    upd_res = []

    for enzyme_row in enzyme_rows:
        # Methylation site and references are present in the table but are
        # not stored.
        (re_name, re_prototype, re_recognition_sequence,
         _methylation_site, re_suppliers, _refs) = _split_enzyme_row(enzyme_row)

        if re_prototype == '':
            re_prototype = re_name
        clean_sequence = clean_re_sequence(re_recognition_sequence)

        try:
            prototype = Pe.objects.get(name=re_prototype)
        except Pe.DoesNotExist:
            # Unknown prototype: create it together with its first enzyme.
            prototype = Pe(name=re_prototype,
                           clean_recognition_sequence=clean_sequence)
            prototype.save()
            prototype.restrictionenzyme_set.create(
                name=re_name,
                recognition_sequence=re_recognition_sequence,
                suppliers=re_suppliers)
            new_res.append(re_name)
            new_pes.append(re_prototype)
            continue

        # Known prototype: refresh its cleaned sequence if it changed.
        if prototype.clean_recognition_sequence != clean_sequence:
            prototype.clean_recognition_sequence = clean_sequence
            prototype.save()
            upd_pes.append(re_prototype)

        try:
            # Look the enzyme up within this prototype's own set so a
            # same-named enzyme under another prototype is never touched.
            enzyme = prototype.restrictionenzyme_set.get(name=re_name)
        except Re.DoesNotExist:
            prototype.restrictionenzyme_set.create(
                name=re_name,
                recognition_sequence=re_recognition_sequence,
                suppliers=re_suppliers)
            new_res.append(re_name)
            continue

        if (enzyme.recognition_sequence != re_recognition_sequence
                or enzyme.suppliers != re_suppliers):
            enzyme.recognition_sequence = re_recognition_sequence
            enzyme.suppliers = re_suppliers
            enzyme.save()
            upd_res.append(re_name)

    summary = (
        '\n'
        'Downloaded REs Entries: {}\n'
        'New PEs: {}\tUpdated PEs: {}\n'
        'New REs: {}\tUpdated REs: {}\n'
    )
    print(summary.format(len(enzyme_rows),
                         len(new_pes), len(upd_pes),
                         len(new_res), len(upd_res)))
    return new_pes, upd_pes, new_res, upd_res