/
refine-stem.py
executable file
·149 lines (140 loc) · 4.24 KB
/
refine-stem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/c/Users/vadim/AppData/Local/Programs/Python/Python37-32/python
# -*- coding: utf-8 -*-
#
# a module for stemming paper titles LRJ
import sys, os.path, glob
from fancy.ANSI import C
from lib.AST import Sleigh
from lib.JSON import parseJSON
from lib.NLP import string2words, ifApproved
from collections import Counter
# import stemming.porter2
import snowballstemmer
# from nltk.stem.snowball import SnowballStemmer
# Root directory of the JSON data, relative to the current working directory.
ienputdir = '../json'
# Optional name-to-file index; an empty mapping is used when the file is absent.
n2f_name = '_name2file.json'
name2file = parseJSON(n2f_name) if os.path.exists(n2f_name) else {}
# Corpus object built from the '/corpus' subdirectory; note that it is
# constructed as soon as this module is loaded, not only when run as a script.
sleigh = Sleigh(ienputdir + '/corpus', name2file)
# Extra console output; set from a trailing '-v' argument when run as a script.
verbose = False
# Every stem produced by checkon() during this run.
ALLSTEMS = set()
def guessYear(P):
    """Guess the publication year of the entry whose key is ``P``.

    The key is split on '-'. If exactly one part is a four-character decimal
    number, that number is returned. Otherwise the entry is looked up with
    ``sleigh.seekByKey`` and its 'year' JSON field or, failing that, its
    ``year`` attribute is returned. When neither exists, a 'YEAR' warning is
    printed and 0 is returned.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() refuses to convert.
    years = [int(part) for part in P.split('-')
             if len(part) == 4 and part.isdecimal()]
    if len(years) == 1:
        return years[0]
    # Zero or several candidates in the key: fall back to the stored entry.
    entry = sleigh.seekByKey(P)
    if 'year' in entry.json:
        return entry.get('year')
    if hasattr(entry, 'year'):
        return entry.year
    print('[ {} ] {}'.format(C.red('YEAR'), P))
    return 0
def _save_paper(fn, o):
    """Write the serialised form of ``o`` (``o.getJSON()``) to ``fn`` as UTF-8."""
    # The context manager closes the file even if getJSON() or write() raises.
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(o.getJSON())


def checkon(fn, o):
    """Bring the 'stemmed' field of paper ``o`` in line with its title.

    Args:
        fn: path of the paper's JSON file; '.json' is appended when the path
            does not exist or is a directory.
        o: paper object; this function uses its ``json`` mapping, ``tags``,
            ``get``, ``getKey`` and ``getJSON``.

    Returns:
        0 if nothing had to change, 1 if the paper has no title, 2 if the
        file was rewritten.
    """
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json:
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and any(lang in o.tags for lang in ('german', 'french', 'portuguese')):
        if 'stemmed' not in o.json:
            return 0
        # if stemmed before marked foreign, remove this info
        del o.json['stemmed']
        _save_paper(fn, o)
        return 2
    # Stemmer choice: snowballstemmer 'english' (~13.5s for 96027 titles).
    # Rejected alternatives: snowballstemmer 'porter' (considered outdated),
    # stemming.porter2 (too slow, ~33s for 96027 titles) and nltk's
    # SnowballStemmer (worse on verbs ending with -ze).
    stem_words = snowballstemmer.stemmer('english').stemWords
    words = string2words(o.get('title'))
    stemmed = stem_words(words)
    if '' in stemmed:
        # Diagnostic output: show the title, its words and their stems.
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(words)
        print(stemmed)
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') == stemmed:
        return 0
    o.json['stemmed'] = stemmed
    _save_paper(fn, o)
    return 2
# somewhat of a code clone
def checkbrand(fn, o):
    """Refresh the 'vocabulary' of brand ``o`` and save it when it changed.

    Args:
        fn: path of the brand's JSON file, written as UTF-8 on change.
        o: brand object; this function uses its ``json`` mapping,
            ``updateStems`` and ``getJSON``.

    Returns:
        2 if the vocabulary changed and the file was rewritten, 0 otherwise.
    """
    # Snapshot the old vocabulary as a fresh Counter: if updateStems() were to
    # modify the stored object in place, comparing it with itself would never
    # show a difference. Wrapping also lets a plain dict take part in the
    # Counter arithmetic below.
    oldvoc = Counter(o.json['vocabulary']) if 'vocabulary' in o.json else Counter()
    o.updateStems()
    newvoc = Counter(o.json['vocabulary']) if 'vocabulary' in o.json else Counter()
    added = newvoc - oldvoc
    removed = oldvoc - newvoc
    # An empty new vocabulary is never written, even when the old one was not.
    if not newvoc or not (added + removed):
        return 0
    print('NEW:', added)
    print('OLD:', removed)
    # The context manager closes the file even if getJSON() or write() raises.
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(o.getJSON())
    return 2
def checkreport(fn, o, br):
    """Check one file and print a status line for it.

    A truthy ``br`` is checked with ``checkbrand``; otherwise ``o`` is checked
    with ``checkon``. The resulting status code (0 PASS, 1 FAIL, 2 FIXD) is
    returned to the caller.
    """
    labels = (C.blue('PASS'), C.red('FAIL'), C.yellow('FIXD'))
    code = checkbrand(fn, br) if br else checkon(fn, o)
    # Passing files stay silent unless verbose output was requested.
    if code != 0 or verbose:
        print('[ {} ] {}'.format(labels[code], fn))
    return code
def two(n):
    """Return ``n`` formatted as text, with a '0' in front when ``n`` is below 10."""
    prefix = '0' if n < 10 else ''
    return prefix + '{}'.format(n)
if __name__ == "__main__":
    # Script entry point: check every paper and brand of every venue, write
    # the approved stems to stems.json and print a summary.
    # Local import: only this section serialises JSON directly.
    import json

    # Deliberately assigned at module level (not inside a function) so that
    # checkon() and checkreport() see the updated global.
    verbose = sys.argv[-1] == '-v'
    peoplez = glob.glob(ienputdir + '/people/*.json')
    print('{}: {} venues, {} papers by {} people\n{}'.format(
        C.purple('BibSLEIGH'),
        C.red(len(sleigh.venues)),
        C.red(sleigh.numOfPapers()),
        C.red(len(peoplez)),
        C.purple('='*42)))
    # Outcome counters keyed by status code: 0 ok, 1 failed, 2 fixed.
    cx = {0: 0, 1: 0, 2: 0}
    # stem ALL the papers!
    for v in sleigh.venues:
        for c in v.getConfs():
            for p in c.papers:
                cx[checkreport(p.filename, p, None)] += 1
        for b in v.getBrands():
            cx[checkreport(b.filename, None, b)] += 1
    # write all stems: shortest first, alphabetical within one length.
    # A (length, word) tuple orders correctly for any length, unlike a
    # two-digit zero-padded string prefix.
    listOfStems = sorted(filter(ifApproved, ALLSTEMS), key=lambda w: (len(w), w))
    # json.dumps with a tab indent yields the same one-stem-per-line layout as
    # joining by hand, but escapes special characters and emits a valid empty
    # list when there are no stems.
    with open(ienputdir + '/stems.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(listOfStems, indent='\t', ensure_ascii=False))
    print(C.red(len(ALLSTEMS)), 'stems found.')
    print('{} files checked, {} ok, {} fixed, {} failed'.format(
        C.bold(cx[0] + cx[1] + cx[2]),
        C.blue(cx[0]),
        C.yellow(cx[2]),
        C.red(cx[1])))