forked from haoxizhong/search_engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
insert_new2.py
96 lines (79 loc) · 2.9 KB
/
insert_new2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import config
import json
import thulac
# Target index and document type, passed to update_by_id() for every record.
index = "law_thulac"
doc_type = "big_data"
# Directory whose files are read as input; file names are appended to it
# directly, so the trailing slash is required.
dir_path = "/mnt/new/"
# Location of the THULAC model files handed to the segmenter below.
model_path = "/home/zhx/elasticsearch-5.5.2/plugins/thulac/models"
# Directory containing this script; the config files are resolved against it.
server_dir = os.path.dirname(os.path.realpath(__file__))
config_file = os.path.join(server_dir, 'config.py')
# Optional override file; it is only loaded when it exists.
local_config_file = os.path.join(server_dir, 'local_config.py')
# Shared segmenter instance used by cut(), created once at import time.
# NOTE(review): per THULAC's documented options, seg_only=True should skip
# part-of-speech tagging and T2S=True should convert traditional to simplified
# Chinese -- confirm against the installed thulac version.
cutter = thulac.thulac(seg_only=True, model_path=model_path, T2S=True)
def cut(text):
    """Segment *text* and return its tokens joined by single spaces.

    The text is UTF-8 encoded and passed to the module-level ``cutter``.
    The first element of every item it returns (presumably the word of a
    word/tag pair -- confirm against thulac) is kept, and the pieces are
    joined with one space. An empty segmentation result yields ``""``.

    Raises whatever ``text.encode`` or ``cutter.cut`` raise, e.g.
    ``AttributeError`` when *text* has no ``encode`` method.
    """
    segments = cutter.cut(text.encode('utf8'))
    # join() builds the string in one pass instead of repeated concatenation.
    return " ".join(segment[0] for segment in segments)
if __name__ == '__main__':
    # Bulk import: read every file in dir_path line by line, turn each
    # tab-separated line into a record, segment selected fields with cut()
    # and push the record through update_by_id(). Lines that fail are
    # appended to fail_list2.txt and the run continues.
    from application import app, initialize

    app.config.from_pyfile(config_file)
    if os.path.exists(local_config_file):
        app.config.from_pyfile(local_config_file)

    total = 0  # lines read so far, across all files
    cnt = 0    # records handed to update_by_id()
    count = 0  # lines that raised and were written to fail_list2.txt
    basic = 0  # lines whose running number is below this value are skipped

    from application.elastic import update_by_id
    from application.processor import formatter

    # Column names, in the order the tab-separated values appear on a line.
    text_field = ["caseName", "time", "caseType", "caseNumber", "spcx", "court", "judge", "lawyer",
                  "keyword", "cause",
                  "docType", "punishment", "result", "docId", "document"]
    # Keys of the parsed record whose values are replaced by cut() output.
    need_cut = ["Title", "AJJBQK", "SSJL", "caseName", "content", "WBWB", "FYMC", "WBSB", "CPYZ", "DSRXX",
                "PJJG"]

    for entry in os.listdir(dir_path):
        file_name = dir_path + entry
        # The context manager closes the input file even if the loop aborts.
        with open(file_name, "r") as f:
            for raw_line in f:
                total += 1
                if basic > total:
                    continue
                if total % 100 == 0:
                    # Progress: lines read, records indexed, failures.
                    print("%d %d %d" % (total, cnt, count))
                try:
                    line = raw_line.decode('utf8')
                    arr = line.split('\t')
                    if len(arr) == 1:
                        # No tab on the line at all: silently ignored.
                        continue
                    if len(arr) != len(text_field):
                        # Wrong column count: route the line to the failure log.
                        raise ValueError("expected %d tab-separated fields, got %d"
                                         % (len(text_field), len(arr)))
                    content = dict(zip(text_field, arr))
                    if len(content["document"]) == 3:
                        # A three-character document is treated as empty: it is
                        # replaced by an empty-content JSON string and its docId
                        # is recorded.
                        content["document"] = "{\"content\":\"\"}"
                        with open('no_content2.txt', 'a') as of:
                            # Encode explicitly so a non-ASCII docId cannot fail.
                            of.write(content["docId"].encode("utf8") + "\n")
                    else:
                        # Drop the last two characters of the field (presumably a
                        # closing delimiter plus the newline -- TODO confirm).
                        content["document"] = content["document"][:-2]
                    data = formatter.new_parse(content)
                    if data["content"] == "":
                        continue
                    data["doc_name"] = data["docId"]
                    for field in need_cut:
                        data[field] = cut(data[field])
                    update_by_id(index, doc_type, data["doc_name"], data)
                    cnt += 1
                except Exception as e:
                    # Best effort: note the failure and move on to the next line.
                    count += 1
                    with open('fail_list2.txt', 'a') as of:
                        # Log the raw bytes; re-encoding a line that failed to
                        # decode would raise inside this handler and stop the run.
                        of.write("%s %s %s\n" % (file_name, e, raw_line))