forked from nOkuda/activetm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pickle_data.py
51 lines (44 loc) · 2 KB
/
pickle_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from __future__ import division
import argparse
import datetime
import os
import pickle
import time
import ankura.pipeline
from ankura import tokenize
from activetm import labeled
from activetm import utils
def get_dataset(settings):
    """Build the import pipeline described by ``settings`` and run it.

    The pipeline reads the corpus and then applies, in order, the stopword,
    rare-word, common-word and small-document filters, optionally followed by
    ``pregenerate_doc_tokens``.

    Args:
        settings: mapping that provides the keys ``'corpus'``,
            ``'stopwords'``, ``'rare'``, ``'common'``, ``'smalldoc'`` and
            ``'pregenerate'``. The ``'rare'``, ``'common'`` and
            ``'smalldoc'`` values are converted with ``int()``.

    Returns:
        Whatever ``ankura.pipeline.run_pipeline`` returns for the assembled
        pipeline.
    """
    corpus = settings['corpus']
    # A '*' in the corpus setting selects read_glob; anything else goes
    # through read_file.
    if '*' in corpus:
        reader = ankura.pipeline.read_glob
    else:
        reader = ankura.pipeline.read_file

    pipeline = [
        (reader, corpus, tokenize.simple),
        (ankura.pipeline.filter_stopwords, settings['stopwords']),
        (ankura.pipeline.filter_rarewords, int(settings['rare'])),
        (ankura.pipeline.filter_commonwords, int(settings['common'])),
        (ankura.pipeline.filter_smalldocs, int(settings['smalldoc'])),
    ]
    if settings['pregenerate'] == 'YES':
        # The trailing comma matters: without it the parentheses are plain
        # grouping and the bare function, not a 1-tuple, would be appended,
        # unlike every other (callable, args...) step above.
        # NOTE(review): presumably run_pipeline indexes each step -- verify
        # against the ankura version in use.
        pipeline.append((ankura.pipeline.pregenerate_doc_tokens,))
    return ankura.pipeline.run_pipeline(pipeline)
if __name__ == '__main__':
    # Script entry point: parse the settings file named on the command line,
    # pickle the labeled dataset into the output directory unless a pickle
    # with that name is already there, and record the elapsed time.
    cli = argparse.ArgumentParser(description='Pickler of ActiveTM datasets')
    cli.add_argument(
        'settings',
        help='the path to a file containing settings, as described in '
             'README.md in the root ActiveTM directory')
    cli.add_argument('outputdir', help='directory for output')
    args = cli.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    pickle_path = os.path.join(args.outputdir, pickle_name)
    timing_path = os.path.join(args.outputdir, pickle_name+'_import.time')

    # Only build the dataset when no pickle exists at the target path.
    if not os.path.exists(pickle_path):
        corpus = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        labeled_dataset = labeled.LabeledDataset(corpus, labels)
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(labeled_dataset, pickle_file)

    # NOTE(review): the timing file is (re)written on every run, including
    # runs that skipped pickling -- confirm this is intended.
    elapsed = datetime.timedelta(seconds=time.time()-start)
    with open(timing_path, 'w') as timing_file:
        timing_file.write('# import time: {:s}\n'.format(str(elapsed)))