-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
66 lines (56 loc) · 2.41 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# For the OpenCC Python API used below, see https://github.com/yichen0831/opencc-python
import csv
from opencc import OpenCC
import os
import re
import sqlite3
# Script body: reads the extracted wiki dump files under ./articles, converts
# every line from Simplified to Traditional Chinese, and reassembles each
# article's id, url, title and text.

# s2twp: Simplified Chinese to Traditional Chinese (Taiwan standard, with
# phrases).
openCC = OpenCC()
openCC.set_conversion('s2twp')

# The parsing below expects every article to open with a header line shaped
# like `<doc id="..." url="..." title="...">` and to close with a line that
# is exactly `</doc>`.
article_start_pattern = "<doc id="
# id and url are captured lazily so that a title which itself contains
# `" url` or `" title` cannot stretch these captures into the title.
article_id_pattern = re.compile(r'<doc id="(.+?)" url')
article_url_pattern = re.compile(r'url="(.+?)" title')
# The title is the last attribute, so a greedy capture up to the final `">`
# keeps titles that contain quotes intact.
article_title_pattern = re.compile(r'title="(.+)">')
# NOTE(review): identical to the original id expression and not referenced
# anywhere in this script; kept only so the module-level name still exists.
article_text_pattern = re.compile(r'<doc id="(.+)" url')

articles_dir = os.path.join(os.getcwd(), 'articles')
db_path = os.path.join(articles_dir, 'wiki.db')
csv_path = os.path.join(articles_dir, '_wiki.csv')

conn = sqlite3.connect(db_path)
try:
    cur = conn.cursor()
    # NOTE(review): this empties the Source table on every run while the
    # INSERT that would refill it is disabled further down - confirm that
    # running the script in this state is intended.
    sql = "DELETE FROM Source"
    cur.execute(sql)
    conn.commit()

    for file_name in ('wiki_00', 'wiki_01'):
        file_path = os.path.join(articles_dir, file_name)

        # Per-file parser state; the header fields are overwritten each time
        # a new `<doc ...>` line is seen.
        article_id = ""
        article_url = ""
        article_title = ""
        text_parts = []

        with open(file_path, 'r', encoding='utf-8') as infile:
            # The CSV is opened in append mode, so rows written by a rerun
            # would be added after the existing ones.
            with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for line in infile:
                    if line == '\n':
                        continue

                    if line == "</doc>\n":
                        # End of article: the collected body lines become
                        # one string (joined once instead of repeated `+=`).
                        article_text = "".join(text_parts)
                        # int() raises ValueError when the id is not numeric,
                        # e.g. a `</doc>` that appears before any header.
                        print(int(article_id), article_url, article_title)
                        # NOTE: persisting the article is disabled; only the
                        # header fields are echoed. The disabled variants
                        # stored the row (int(article_id), article_url,
                        # article_title, article_text) through an INSERT into
                        # the source table or through writer.writerow().
                        text_parts = []
                        continue

                    # Convert first, then drop the newline so that body lines
                    # are concatenated without separators.
                    tw_line = openCC.convert(line).replace("\n", "")
                    if line.startswith(article_start_pattern):
                        # Header line: findall(...)[0] raises IndexError if
                        # an attribute is missing from the header.
                        article_id = article_id_pattern.findall(tw_line)[0]
                        article_url = article_url_pattern.findall(tw_line)[0]
                        article_title = article_title_pattern.findall(tw_line)[0]
                    else:
                        text_parts.append(tw_line)
finally:
    # Release the database handle even when a file is missing or a header
    # cannot be parsed.
    conn.close()