/
tagger.py
56 lines (47 loc) · 1.69 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import csv
import glob
import json
import os
import sys
from collections import Counter

from konlpy.tag import Kkma
def tag_all_reviews(norm, stem, *, tagger=None, min_count=100):
    """POS-tag every review under ``reviews/`` and collect noun statistics.

    For each ``reviews/*.json`` file (a JSON list of review objects that
    carry a ``'text'`` string and a ``'recommend'`` mapping) this function:

    * tags ``review['text']`` and stores the tags whose part-of-speech label
      starts with ``'N'`` or ``'V'`` in ``review['tagged']``;
    * writes the augmented reviews to ``tagged_reviews/<same file name>``;
    * counts how often each ``'N'``-labelled word occurs across all files.

    Finally the words seen at least ``min_count`` times are written to
    ``nouns.csv`` as ``word,count`` rows and the number of rows is printed.

    Args:
        norm: Accepted for interface compatibility; not used by this
            function and not forwarded to the tagger.
        stem: Accepted for interface compatibility; not used by this
            function and not forwarded to the tagger.
        tagger: Object exposing ``pos(text)`` that returns a sequence of
            ``(word, pos_label)`` pairs. Defaults to a fresh ``Kkma()``.
        min_count: Minimum number of occurrences a noun needs in order to
            be written to ``nouns.csv``.

    Returns:
        The set of all keys found in any review's ``'recommend'`` mapping.
    """
    if tagger is None:
        tagger = Kkma()

    recommend_categories = set()
    noun_counts = Counter()

    # The output directory may not exist on a fresh checkout.
    os.makedirs('tagged_reviews', exist_ok=True)

    # glob order is unspecified; sorting keeps nouns.csv row order stable.
    for filename in sorted(glob.glob('reviews/*.json')):
        print('parsing %s...' % filename)
        # Explicit UTF-8: do not depend on the locale's default encoding.
        with open(filename, 'r', encoding='utf-8') as raw_file:
            raw_data = json.load(raw_file)

        for review in raw_data:
            kept = [
                tag for tag in tagger.pos(review['text'])
                if tag[1].startswith(('N', 'V'))
            ]
            review['tagged'] = kept
            # Counter starts a new word at 1 (the original started at 0,
            # under-counting every noun by one).
            noun_counts.update(
                tag[0] for tag in kept if tag[1].startswith('N')
            )
            recommend_categories.update(review['recommend'].keys())

        # basename() instead of split('/') so backslash-separated paths work.
        new_filename = os.path.join('tagged_reviews',
                                    os.path.basename(filename))
        with open(new_filename, 'w', encoding='utf-8') as tagged_file:
            json.dump(raw_data, tagged_file, ensure_ascii=False,
                      sort_keys=True, indent=2, separators=(',', ': '))

    frequent_nouns = [
        (noun, count) for noun, count in noun_counts.items()
        if count >= min_count
    ]
    # newline='' is what the csv module expects for files it writes to.
    with open('nouns.csv', 'w', newline='', encoding='utf-8') as nouns_file:
        csv.writer(nouns_file).writerows(frequent_nouns)
    print(len(frequent_nouns))

    return recommend_categories
if __name__ == '__main__':
    # Command-line entry point: the words 'norm' and 'stem' switch on the
    # corresponding flags. They are only looked for when at least one
    # argument follows the script name; otherwise both flags stay False.
    cli_words = sys.argv if len(sys.argv) > 1 else []
    print(tag_all_reviews('norm' in cli_words, 'stem' in cli_words))