# experiment.py
# Forked from AbeHandler/WordNet-Word2Vec.
from gensim.models import word2vec, KeyedVectors
from wordnetter import synononymous
from wordnetter import hypernomous
from wordnetter import not_in_wordnet
from wordnetter import holonymous
from wordnetter import meronymous
from wordnetter import hyoponomous
from nltk.corpus import reuters
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import random
import argparse
import os
# Prepare the output area before the experiment starts.
#
# printout() appends to files under "textfiles/", and open(..., "a") does not
# create missing parent directories, so make sure the directory exists.
# Best-effort: if it cannot be created, the first write will report the error.
try:
    os.makedirs("textfiles", exist_ok=True)
except OSError:
    pass

# Remove the default results file left over from a previous run. A missing
# file (or any other OS-level failure) is deliberately ignored.
# NOTE(review): only "results.txt" is removed; a file selected with --out is
# appended to across runs -- confirm that this is intended.
try:
    os.remove("textfiles/results.txt")
except OSError:
    pass
# Parse the command line first, so that `--help` or an invalid flag exits
# immediately instead of only after the word2vec vectors have been read
# from disk.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--out', '-o',
    help='name of the file under textfiles/ that result rows are appended '
         'to; when omitted, no rows are written',
)
# Plain dict view of the parsed options; printout() reads args['out'].
args = vars(parser.parse_args())

# Number of distinct Reuters tokens drawn for the experiment.
NUM_WORDS = 10000

# English Snowball stemmer used by same_stem().
stemmer = SnowballStemmer("english")

# Word vectors in binary word2vec format, loaded from the working directory.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
def same_stem(one, two):
    """Return True when both words reduce to the same stem.

    Each argument is run through the module-level ``stemmer`` and the two
    resulting stems are compared for equality.
    """
    return stemmer.stem(one) == stemmer.stem(two)
def printout(line):
    """Append *line* to the results file selected with ``--out``.

    Nothing is written when no output file name was given on the command
    line. Each record is written with a leading newline, and the file is
    opened in append mode with UTF-8 encoding on every call.
    """
    target = args['out']
    if not target:
        return
    destination = "textfiles/" + target
    with open(destination, "a", encoding='utf-8') as handle:
        handle.write('\n' + line)
# Lazily-built cache of the English stop-word list; filled on first use so
# that nothing is loaded until inStopWords() is actually called.
_STOP_WORDS = None


def inStopWords(w):
    """Return True if *w* is in NLTK's English stop-word list.

    The list is fetched once and kept as a frozenset, so repeated calls do
    not re-read the corpus or scan a list linearly. Membership is an exact,
    case-sensitive string match.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return w in _STOP_WORDS
# Draw the sample of Reuters tokens to test.
# random.sample() requires a sequence (sets are rejected on Python 3.11+), so
# the de-duplicated vocabulary is materialised as a sorted list; the fixed
# order also makes the sample reproducible under a fixed random seed.
vocabulary = sorted(set(reuters.words()))
words = random.sample(vocabulary, min(NUM_WORDS, len(vocabulary)))
words = [w for w in words if not inStopWords(w)]

# For every sampled word, fetch its 200 nearest word2vec neighbours and write
# one CSV-style row per neighbour describing how the pair relates in WordNet.
for counter, r in enumerate(words, start=1):
    # Words are passed to most_similar() as str; per the original author's
    # note, bytes objects are not handled by that function.
    try:
        sims = model.most_similar(positive=[r], topn=200)
        for n, (neighbour, score) in enumerate(sims, start=1):
            print("counter: " + str(counter) + "-" + str(n))
            # Columns shared by every row: neighbour, word, similarity, rank.
            row_tail = [neighbour, r, str(score), str(n)]

            search = True
            if same_stem(neighbour, r):
                search = False
                printout(",".join(['same stem'] + row_tail))
            # NOTE(review): only the sampled word is checked here, not the
            # neighbour -- confirm that this is intended.
            if not_in_wordnet(r):
                search = False
                printout(",".join(['not_in_wordnet'] + row_tail))
            if not search:
                continue

            # Relation scores, evaluated and reported in a fixed order.
            relations = (
                ('hyper', hypernomous(neighbour, r)),
                ('hypo', hyoponomous(neighbour, r)),
                ('syn', synononymous(neighbour, r)),
                ('holo', holonymous(neighbour, r)),
                ('mero', meronymous(neighbour, r)),
            )
            # A score strictly between 0 and 1 is reported as a relation.
            for label, value in relations:
                if 0 < value < 1:
                    printout(",".join([label] + row_tail + [str(value)]))
            # All scores 0, or all scores 1, is reported as "none".
            total = sum(value for _, value in relations)
            if total == 0 or total == 5:
                printout(",".join(['none'] + row_tail))
    except KeyError:
        # Typically the sampled word is missing from the model's vocabulary.
        # Record it and show it on the console, then move on to the next word.
        # (printout() returns None, so its result must not be printed.)
        record = ",".join(['KeyError', r])
        printout(record)
        print(record)