-
Notifications
You must be signed in to change notification settings - Fork 2
/
processor.py
165 lines (151 loc) · 5.03 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from cursor import Cursor
from config import *
import sys
import json
import csv
import mmh3
import time
class Processor(object):
    """Upserts JSON records into the "deltadb" Elasticsearch index.

    Each record is keyed by a murmur3 hash of the fields listed in
    ``config.key_value``. Existing documents are merged (new fields are
    added, existing fields are never overwritten) and a per-source list
    of cursor numbers tracks which runs have seen each document.
    """

    def __init__(self, es, data_file, data_from):
        """
        es        -- an Elasticsearch client instance.
        data_file -- path to the input file (JSON only is supported).
        data_from -- name of the data source; used as the document field
                     that accumulates cursor numbers for this source.
        """
        self.es = es
        self.data_file = data_file
        self.data_from = data_from
        # Create the deltadb index (settings + mapping) on first use.
        if not self.es.indices.exists(index="deltadb"):
            self.setup_deltadb_in_es()

    def load(self):
        """Return the parsed contents of self.data_file.

        Only ``.json`` files are supported; anything else yields {} after
        printing a warning. Raises IOError/ValueError if the file is
        missing or malformed JSON.
        """
        if self.data_file.endswith('.json'):
            with open(self.data_file, 'r') as f:
                return json.load(f)
        print("File not found or not supported file format. Json only")
        return {}

    def update_node(self, node, new_data, cursor_num):
        """Merge new_data into an existing document and return it.

        Only fields that are absent from ``node`` are copied in; existing
        values are deliberately preserved (delta semantics -- the first
        observed value wins). Refreshes the ``updated`` date and appends
        ``cursor_num`` to this source's cursor history when it is newer
        than the last recorded cursor.
        """
        old_keys = node.keys()
        for each_key in new_data.keys():
            if each_key not in old_keys:
                node[each_key] = new_data[each_key]
        # Zero-padded ISO date: the ES mapping types "updated" as `date`,
        # and the default date format requires yyyy-MM-dd. The previous
        # "%s-%s-%s" formatting produced e.g. "2024-1-5", which ES
        # rejects -- and the indexing except-clause then silently
        # dropped the document.
        node["updated"] = time.strftime("%Y-%m-%d")
        if self.data_from not in node:
            node[self.data_from] = [cursor_num]
        # Guard against an empty cursor list before peeking at [-1].
        if not node[self.data_from] or node[self.data_from][-1] < cursor_num:
            node[self.data_from].append(cursor_num)
        return node

    def create_node(self, new_data, cursor):
        """Build and return a brand-new document from new_data.

        Stamps identical ``created``/``updated`` dates (zero-padded
        yyyy-MM-dd, as required by the ES date mapping) and starts this
        source's cursor history with ``cursor``.
        """
        new_node = dict(new_data)
        new_node["created"] = time.strftime("%Y-%m-%d")
        new_node["updated"] = new_node["created"]
        new_node[self.data_from] = [cursor]
        return new_node

    # id is the hash value of the concatenated key fields (see config.key_value)
    def process(self):
        """Load the data file and upsert every record into deltadb."""
        data = self.load()
        print("\nStart processing")
        cursor = Cursor(self.es, self.data_from)
        cursor_num = cursor.get_new_cursor()
        for each_data in data:
            # Document id: murmur3 hash of the configured key fields,
            # concatenated in order. Assumes those fields are strings --
            # TODO confirm against the input schema.
            key_string = ''
            for each_key_string in key_value:
                key_string += each_data[each_key_string]
            hashkey = mmh3.hash(key_string)
            print("parsing id: %s" % hashkey)
            # Fetch any existing document; a lookup failure (missing doc
            # or transport error) falls back to creating a fresh node.
            try:
                res = self.es.get(index="deltadb",
                                  doc_type="data",
                                  id=hashkey)
                if res["found"]:
                    node = self.update_node(res["_source"], each_data, cursor_num)
                else:
                    node = self.create_node(each_data, cursor_num)
            except Exception:
                node = self.create_node(each_data, cursor_num)
            # Write the merged/new document back; best-effort -- skip
            # this record on indexing failure and keep going.
            try:
                res = self.es.index(index="deltadb",
                                    doc_type="data",
                                    id=hashkey,
                                    body=node)
            except Exception:
                continue
        print("\nProcess finish.")

    def setup_deltadb_in_es(self):
        """Create the deltadb index with its settings and "data" mapping."""
        delta_create_body = '''
        {
            "settings": {
                "index": {
                    "store": {
                        "type": "default"
                    },
                    "number_of_shards": 1,
                    "number_of_replicas": 1
                },
                "analysis": {
                    "analyzer": {
                        "a0": {
                            "type": "english"
                        }
                    }
                }
            }
        }
        '''
        delta_mapping_body = '''
        {
            "data": {
                "properties": {
                    "url_base" : {
                        "type" : "string",
                        "store" : true,
                        "index" : "analyzed"
                    },
                    "url_parameters": {
                        "type": "string",
                        "store" : true,
                        "index" : "analyzed"
                    },
                    "http_method": {
                        "type": "string",
                        "store" : true,
                        "index" : "analyzed"
                    },
                    "http_headers": {
                        "type": "string",
                        "store" : true,
                        "index" : "analyzed"
                    },
                    "http_body": {
                        "type": "string",
                        "store" : true,
                        "index" : "analyzed"
                    },
                    "created": {
                        "type": "date",
                        "store" : true,
                        "index" : "not_analyzed"
                    },
                    "updated": {
                        "type": "date",
                        "store" : true,
                        "index" : "not_analyzed"
                    },
                    "tag": {
                        "type": "string",
                        "store" : true,
                        "index" : "analyzed"
                    }
                }
            }
        }
        '''
        self.es.indices.create(index="deltadb", body=delta_create_body)
        self.es.indices.put_mapping(index="deltadb", doc_type="data", body=delta_mapping_body)