-
Notifications
You must be signed in to change notification settings - Fork 0
/
zhwiki.py
54 lines (42 loc) · 1.43 KB
/
zhwiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
from train import get_train_options
from gensim.corpora import WikiCorpus, dictionary
import settings
from utils.download import download
from utils.processor import (ConvertT2S, CutSentence, Processor,
RemoveNonChineseWords, RemoveStopwords)
from train import train, get_train_options
# Module-wide logger, taken from the project's settings module.
logger = settings.LOGGER
def download_zhwiki():
    """Fetch the zhwiki dump unless something already exists at its path.

    The destination is read from ``settings.ZHWIKI_PATH``. When nothing
    exists there, the source is read from ``settings.ZHWIKI_URL`` and handed
    to ``download`` together with the destination; a log line is written
    before and after the transfer. Otherwise only a notice is logged.
    """
    path = settings.ZHWIKI_PATH
    if not os.path.exists(path):
        # The URL is only looked up when a download is actually needed.
        url = settings.ZHWIKI_URL
        logger.info(f'zhwiki downloading from {url} ...')
        download(url, path)
        logger.info(f'zhwiki downloaded: {path}')
    else:
        logger.info(f'zhwiki already downloaded at {path}')
def preprocess_zhwiki():
    """Write the cleaned zhwiki corpus unless the output already exists.

    The output location is ``settings.ZHWIKI_CLEANED_PATH``. If something
    already exists there, two log lines explain that preprocessing is
    skipped and how to force a redo. Otherwise the texts of a ``WikiCorpus``
    opened on ``settings.ZHWIKI_PATH`` are passed to ``process_all`` of a
    ``Processor`` built from ConvertT2S, CutSentence, RemoveNonChineseWords
    and RemoveStopwords (constructed in that order).
    """
    output_path = settings.ZHWIKI_CLEANED_PATH
    if not os.path.exists(output_path):
        dump_path = settings.ZHWIKI_PATH
        steps = [
            ConvertT2S(),
            CutSentence(),
            RemoveNonChineseWords(),
            RemoveStopwords(),
        ]
        pipeline = Processor(steps)
        # NOTE(review): an explicit empty dict is passed as the dictionary,
        # presumably so gensim does not build a vocabulary from the whole
        # dump first -- confirm against the WikiCorpus documentation.
        corpus = WikiCorpus(dump_path, dictionary={})
        pipeline.process_all(corpus.get_texts(), output_path)
    else:
        logger.info(f'{output_path} existed. Skip preprocess.')
        logger.info(f'Delete {output_path} if preprocess needs to be redone.')
def train_zhwiki():
    """Run ``train`` on the cleaned zhwiki corpus.

    Starts from the options returned by ``get_train_options()``, points
    ``input_file`` at ``settings.ZHWIKI_CLEANED_PATH``, sets ``name_prefix``
    to ``'zhwiki'`` and passes the result to ``train``.
    """
    options = get_train_options()
    # Applied in this order: input file first, then the name prefix.
    overrides = (
        ('input_file', settings.ZHWIKI_CLEANED_PATH),
        ('name_prefix', 'zhwiki'),
    )
    for attribute, value in overrides:
        setattr(options, attribute, value)
    train(options)
if __name__ == '__main__':
    # Script entry point: run the three stages one after another.
    # Each stage is called with no arguments, in the order listed.
    for stage in (download_zhwiki, preprocess_zhwiki, train_zhwiki):
        stage()