-
Notifications
You must be signed in to change notification settings - Fork 0
/
hapmap_vcf_loader_arangodb.py
executable file
·166 lines (157 loc) · 5.5 KB
/
hapmap_vcf_loader_arangodb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
import fileinput
import re
import sys
import copy
from arango import ArangoClient
# from pyArango.connection import *
class hapmap_load(object):
    """Load the contents of a VCF file into ArangoDB collections.

    Variant records are written to a ``variants`` collection and every
    genotype call is appended to the ``calls`` list of the matching document
    in a ``samples`` collection.  Both collections are created when missing.
    """

    # Splits on commas that are not enclosed in double quotes.
    _HEADER_FIELD_SPLIT = re.compile(r',(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)')
    # Columns are separated by (runs of) tabs.
    _COLUMN_SPLIT = re.compile(r'\t+')
    # A genotype such as ``0/1`` (unphased) or ``0|1`` (phased).  Missing
    # calls like ``./.`` deliberately do not match and are skipped.
    _GENOTYPE = re.compile(r'^\d+(?:[|/]\d+)+$')
    _ALLELE_SEPARATOR = re.compile(r'[|/]')

    def __init__(self, logging = False):
        """Create a loader.

        Args:
            logging: when true, progress messages are printed to stdout.
        """
        self.logging = logging
        # sample names, in column order, taken from the '#CHROM ...' line
        self.table_columns = []

    def load_vcf(self, vcf_file, database):
        """Read ``vcf_file`` and store its variants and calls in ``database``.

        Args:
            vcf_file: file name (or names) handed to ``fileinput.FileInput``.
            database: database handle exposing ``collections()``,
                ``collection(name)`` and ``create_collection(name)``.

        Any exception raised while loading is caught and reported on stdout;
        nothing is re-raised and the method always returns ``None``.
        """
        # A private FileInput instance is used instead of fileinput.input()
        # so that no module-level state is left open if loading fails.
        reader = fileinput.FileInput(files=vcf_file)
        try:
            self._log("Begin load")
            # contains the information in the VCF's info field
            info_structure = {}
            # contains the information in the VCF's alt field
            alt_structure = {}
            db_samples = self._get_or_create_collection(database, 'samples')
            db_variants = self._get_or_create_collection(database, 'variants')
            # open and walk through our VCF file
            for line in reader:
                line = line.rstrip('\r\n')
                # blank lines (e.g. a trailing newline) carry no record
                if not line.strip():
                    continue
                if line[:2] == '##':
                    # file header
                    self._log("Loading header")
                    # remove the starting ## and split on the first =
                    header_line = line[2:].split('=', 1)
                    if len(header_line) < 2:
                        continue
                    if header_line[0] == "INFO":
                        self.load_header(header_line[1], info_structure)
                    elif header_line[0] == "ALT":
                        self.load_header(header_line[1], alt_structure)
                elif line[:1] == '#':
                    # sample header
                    self._log("Creating Samples")
                    # the first 9 columns are fixed VCF columns, not samples
                    self.table_columns = self._COLUMN_SPLIT.split(line)[9:]
                    for sample in self.table_columns:
                        if not db_samples.has(sample):
                            db_samples.insert({ '_key' : sample, 'calls': [] }, False, False)
                else:
                    self._log("Loading Variant")
                    variant_calls = self._COLUMN_SPLIT.split(line)
                    variant = self._build_variant(variant_calls, info_structure, alt_structure)
                    # don't overwrite an existing variant
                    if not db_variants.has(variant['_key']):
                        db_variants.insert(variant)
                    # add each call to the appropriate sample
                    for ind, call in enumerate(variant_calls[9:]):
                        variant_call = self._parse_call(variant_calls[2], call)
                        if variant_call is None:
                            continue
                        samp = db_samples.get(self.table_columns[ind])
                        samp['calls'].append(variant_call)
                        db_samples.update(samp, True)
        except Exception as e:
            # best-effort loader: report the problem instead of propagating it
            print("Could not load data, an error occured")
            print(e)
        finally:
            reader.close()

    def load_header(self, header_line, structure):
        """Parse one structured header value into ``structure``.

        ``header_line`` is the text after ``##INFO=`` / ``##ALT=``, e.g.
        ``<ID=DP,Number=1,Type=Integer,Description="Total depth">``.  The
        ``ID`` value becomes a key of ``structure``; the remaining fields are
        stored under it with lower-cased names and unquoted values.  Values
        with a single field (or none) are ignored.

        Args:
            header_line: the header value, normally wrapped in ``< >``.
            structure: dict that is updated in place.
        """
        header_line = header_line.strip()
        # drop only the enclosing < >, so brackets inside a quoted
        # description survive
        if header_line.startswith('<'):
            header_line = header_line[1:]
        if header_line.endswith('>'):
            header_line = header_line[:-1]
        # split on the commas that are outside of quotes
        fields = self._HEADER_FIELD_SPLIT.split(header_line)
        if len(fields) <= 1:
            return
        # entry currently being filled; None until an ID field was seen
        entry = None
        for field in fields:
            # only the first '=' separates key and value
            key, separator, value = field.partition('=')
            if not separator:
                continue
            if key == 'ID':
                entry = structure[value] = {}
            elif entry is not None:
                # strip one leading and one trailing double quote
                entry[key.lower()] = re.sub(r'^"|"$', '', value)

    def _log(self, message):
        """Print ``message`` when logging was enabled on construction."""
        if self.logging:
            print(message)

    @staticmethod
    def _get_or_create_collection(database, name):
        """Return the collection called ``name``, creating it when absent."""
        if any(entry['name'] == name for entry in database.collections()):
            return database.collection(name)
        return database.create_collection(name)

    @staticmethod
    def _parse_chromosome(raw):
        """Return the chromosome as an int, or unchanged for names like 'X'."""
        try:
            return int(raw)
        except ValueError:
            return raw

    @staticmethod
    def _convert_info_value(raw, raw_type):
        """Convert one INFO value according to its declared header type."""
        if raw_type in ('integer', 'double', 'float'):
            # '.' marks a missing value and cannot be converted to a number
            if raw == '.':
                return None
            return int(raw) if raw_type == 'integer' else float(raw)
        return str(raw)

    @classmethod
    def _build_variant(cls, variant_calls, info_structure, alt_structure):
        """Build the variant document from the columns of one data line."""
        variant = {
            '_key': variant_calls[2],
            # NOTE(review): the VCF spec appears to separate multiple IDs
            # with ';' - the ',' split is kept as it was; confirm.
            'names': variant_calls[2].split(','),
            'chromosome': cls._parse_chromosome(variant_calls[0]),
            'position': variant_calls[1],
            'filter': variant_calls[6],
            'reference_base': variant_calls[3],
            # symbolic alleles are written as <NAME>; unwrap each one
            'alternate_bases': [alt.strip('<>') for alt in variant_calls[4].split(',')],
            'alternate_structure': alt_structure,
            'info': {}
        }
        for info_field in variant_calls[7].split(';'):
            key, separator, raw_values = info_field.partition('=')
            # an empty or '.' INFO column carries no information
            if key in ('', '.'):
                continue
            # copy the header description; keys missing from the header
            # start from an empty description
            entry = copy.deepcopy(info_structure.get(key, {}))
            if separator:
                raw_type = entry.get('type', 'string').lower()
                # typed list, so values can be searched and compared
                entry['value'] = [
                    cls._convert_info_value(elm, raw_type)
                    for elm in raw_values.split(',')
                ]
            variant['info'][key] = entry
        return variant

    @classmethod
    def _parse_call(cls, variant_id, call):
        """Return the call document for one sample column, or None to skip."""
        # the genotype is the first ':'-separated sub-field
        genotype = call.split(':', 1)[0]
        if cls._GENOTYPE.match(genotype) is None:
            return None
        return {
            'variant': variant_id,
            # '|' separates phased alleles, '/' unphased ones
            'phased': '|' in genotype,
            'genotype': [int(allele) for allele in cls._ALLELE_SEPARATOR.split(genotype)]
        }
# Script entry: <script> <vcf file> <arango database name>
import os

if len(sys.argv) > 2:
    # Initialize the client for ArangoDB.  Every connection setting can be
    # overridden through the environment; the defaults are the values this
    # script has always used.
    # NOTE(review): the default password is a credential embedded in source -
    # prefer supplying ARANGO_PASSWORD and rotating this value.
    client = ArangoClient(
        protocol='http',
        host=os.environ.get('ARANGO_HOST', 'localhost'),
        port=int(os.environ.get('ARANGO_PORT', '8529')),
        username=os.environ.get('ARANGO_USERNAME', 'root'),
        password=os.environ.get('ARANGO_PASSWORD', 'PP0atsax'),
        enable_logging=True
    )
    # create the target database on first use
    if sys.argv[2] not in client.databases():
        client.create_database(sys.argv[2])
    # select the database
    db = client.db(sys.argv[2])
    # create our obj
    loader = hapmap_load()
    # start loading variants
    loader.load_vcf(sys.argv[1], db)
else:
    # single-argument print() form behaves the same on Python 2 and 3
    print("please specify a VCF and arango database")