# script_gen_whoosh_database.py -- forked from bukun/TorCMS
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os
import html2text
import tornado.escape
sys.path.append("../")
def html_strip(html):
    """Return the text content of ``html`` with markup tags removed.

    The input is stripped of surrounding whitespace, fed through the
    standard-library HTML parser, and every chunk the parser reports via
    ``handle_data`` is concatenated in document order.

    Args:
        html: A string containing HTML markup. Empty or ``None`` input
            yields an empty string.

    Returns:
        The concatenated text data found between the tags.
    """
    # The parser lives in ``HTMLParser`` on Python 2 and in ``html.parser``
    # on Python 3; try the original location first so Python 2 behaviour is
    # unchanged, then fall back so the helper also works on Python 3.
    try:
        from HTMLParser import HTMLParser
    except ImportError:
        from html.parser import HTMLParser

    if not html:
        return ""

    chunks = []
    parser = HTMLParser()
    # Collect text nodes by routing the data callback straight into the list.
    parser.handle_data = chunks.append
    # strip() with no argument already removes newlines, so a separate
    # strip("\n") pass is unnecessary.
    parser.feed(html.strip())
    parser.close()
    return "".join(chunks)
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer
sys.path.append('/opt/torlite/yunsuan')
from torlite.model.mpost import MPost
from torapp.model.app_model import MApp
def _index_records(writer, records, type_markup, link_template):
    """Add one search document per record to ``writer``.

    Args:
        writer: The Whoosh index writer receiving the documents.
        records: Iterable of records exposing ``title``, ``uid`` and
            ``cnt_html`` attributes.
        type_markup: Value stored in the ``type`` field of every document.
        link_template: ``str.format`` template that receives the record's
            ``uid`` and produces the stored ``link`` value.
    """
    for rec in records:
        # cnt_html is XHTML-unescaped first, then converted to plain text
        # by html2text before being indexed as the document body.
        text = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        print(text)
        writer.add_document(
            title=rec.title,
            type=type_markup,
            link=link_template.format(rec.uid),
            content=text,
        )


mpost = MPost()
recs = mpost.query_all()

# Indexed fields: title and content are tokenised with the jieba Chinese
# analyzer; type and link are stored alongside them for display.
analyzer = ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    type=TEXT(stored=True),
    link=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer),
)

# makedirs (rather than mkdir) so the script also works when the parent
# "lib" directory does not exist yet.
if not os.path.exists("lib/whoosh"):
    os.makedirs("lib/whoosh")

# create_in builds a new index in the directory; open_dir would be the call
# to use for opening an existing index instead.
ix = create_in("lib/whoosh", schema)
writer = ix.writer()
try:
    _index_records(
        writer,
        recs,
        '<span style="color:blue;">[文档]</span>',
        '/post/{0}.html',
    )

    mapp = MApp()
    app_recs = mapp.query_recent(2000)
    _index_records(
        writer,
        app_recs,
        '<span style="color:red;">[计算]</span>',
        '/app/{0}',
    )
except Exception:
    # Drop the pending documents and release the index lock before
    # propagating the failure.
    writer.cancel()
    raise
writer.commit()