forked from garnachod/SimpleDoc2Vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GeneraVectores.py
115 lines (87 loc) · 2.9 KB
/
GeneraVectores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from gensim.models import Doc2Vec
import numpy as np
from numpy import dot
from gensim import utils, matutils
class GeneraVectores(object):
    """Infer document vectors from a trained model, one per line of a file.

    The tokenised documents of the last file read are cached, so repeated
    calls for the same file (for example while sweeping ``steps`` and
    ``alpha``) only re-run inference and do not re-read the file.
    """

    def __init__(self, model):
        """Store the model and the default inference settings.

        model: object exposing ``infer_vector(words, steps=..., alpha=...)``.
        """
        super(GeneraVectores, self).__init__()
        self.model = model
        self.steps = 3
        self.alpha = 0.08
        # Cache of tokenised documents and the (file, max) pair it was built for.
        self.docs = []
        self.lastFile = None
        self._lastMax = -1

    def getVecsFromFile(self, file, max=-1):
        """Return the inferred vectors for the documents stored in ``file``.

        Each line of the file is one document, split on whitespace.

        file: path handed to ``utils.smart_open``.
        max: maximum number of lines to read; ``-1`` (the default) reads all
            of them. The name shadows the builtin but is kept because it is
            part of the public signature.

        Returns a 2-D numpy array with one row per document, in file order,
        or ``None`` when no document was read.
        """
        # The cache is only valid for the file AND the limit it was built
        # with; otherwise a later call could silently get a truncated corpus.
        if file != self.lastFile or max != self._lastMax:
            self.docs = []
            self.lastFile = file
            self._lastMax = max

        if not self.docs:
            self.docs = self._leeDocs(file, max)

        if not self.docs:
            return None

        # Build the matrix in one go; growing it with np.append inside the
        # loop copies the whole array on every iteration.
        return np.array([self.getVecsFromWords(words) for words in self.docs])

    def _leeDocs(self, file, max):
        """Read at most ``max`` lines of ``file`` as lists of tokens."""
        docs = []
        with utils.smart_open(file) as fin:
            for item_no, line in enumerate(fin):
                # ``>=`` so that exactly ``max`` lines are read, not max + 1.
                if max != -1 and item_no >= max:
                    break
                # Convert first, then split: split() with no argument also
                # discards the trailing newline, and no str method is called
                # on a line that may still be bytes.
                docs.append(utils.to_unicode(line).split())
        return docs

    def getVecsFromWords(self, words):
        """Infer one vector for ``words`` using the current steps and alpha."""
        return self.model.infer_vector(words, steps=self.steps, alpha=self.alpha)

    def setModel(self, model):
        """Replace the model used for inference; the document cache is kept."""
        self.model = model
def pruebaCompletaCosenosDM():
    """Sweep inference settings on the DM model and print mean cosines.

    Loads ``./imdb_dm.d2v`` and, for every (alpha, steps) pair, infers the
    vectors of ``data/trainneg.txt`` and compares the first 12500 of them
    with the vectors the model stores under ``TRAIN_NEG_<i>``. One
    tab-separated line ``dm, steps, alpha, mean cosine`` is printed per pair.
    """
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    # Number of documents compared; presumably the size of the negative
    # training set -- confirm against the corpus used to train the model.
    n_docs = 12500
    steps = [1, 2, 3, 5, 7, 10, 15]
    alphas = [0.2, 0.1, 0.075]
    for alpha in alphas:
        for step in steps:
            generador.steps = step
            generador.alpha = alpha
            vecs = generador.getVecsFromFile(source)
            coseno_cum = 0.0
            for i in range(n_docs):
                inferido = matutils.unitvec(vecs[i])
                entrenado = matutils.unitvec(model.docvecs["TRAIN_NEG_" + str(i)])
                # Dot product of the two unitvec results is their cosine.
                coseno_cum += dot(inferido, entrenado)
            # Single parenthesised argument: same output with the Python 2
            # print statement and the Python 3 print function.
            print("dm\t" + str(step) + "\t" + str(alpha) + "\t" + str((coseno_cum / float(n_docs))))
def pruebaCompletaCosenosDBOW():
    """Sweep inference settings on the DBOW model and print mean cosines.

    Loads ``./imdb_dbow.d2v`` and, for every (alpha, steps) pair, infers the
    vectors of ``data/trainneg.txt`` and compares the first 12500 of them
    with the vectors the model stores under ``TRAIN_NEG_<i>``. One
    tab-separated line ``dbow, steps, alpha, mean cosine`` is printed per
    pair.
    """
    model = Doc2Vec.load('./imdb_dbow.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    # Number of documents compared; presumably the size of the negative
    # training set -- confirm against the corpus used to train the model.
    n_docs = 12500
    steps = [1, 2, 3, 5, 7, 10, 15]
    alphas = [0.1, 0.075, 0.035]
    for alpha in alphas:
        for step in steps:
            generador.steps = step
            generador.alpha = alpha
            vecs = generador.getVecsFromFile(source)
            coseno_cum = 0.0
            for i in range(n_docs):
                inferido = matutils.unitvec(vecs[i])
                entrenado = matutils.unitvec(model.docvecs["TRAIN_NEG_" + str(i)])
                # Dot product of the two unitvec results is their cosine.
                coseno_cum += dot(inferido, entrenado)
            # Single parenthesised argument: same output with the Python 2
            # print statement and the Python 3 print function.
            print("dbow\t" + str(step) + "\t" + str(alpha) + "\t" + str((coseno_cum / float(n_docs))))
def puebaSimpleCosenos():
    """Print the cosine between the first inferred and first stored vector.

    Loads ``./imdb_dm.d2v``, infers the vectors of ``data/trainneg.txt`` with
    the default settings of ``GeneraVectores`` and compares the first one
    with the vector the model stores under ``TRAIN_NEG_0``.
    """
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    vecs = generador.getVecsFromFile(source)
    inferido = matutils.unitvec(vecs[0])
    entrenado = matutils.unitvec(model.docvecs["TRAIN_NEG_0"])
    # Single parenthesised argument: same output with the Python 2 print
    # statement and the Python 3 print function.
    print("coseno primer vector, trainneg")
    print(dot(inferido, entrenado))
# Script entry point: run the DM parameter sweep when executed directly.
if __name__ == "__main__":
    pruebaCompletaCosenosDM()