class RandomEmbedding(Embedding):
    """Embedding that gives every previously unseen word a random unit vector.

    Vectors are created lazily on first lookup and stored row-wise in
    ``self.data``; ``self.index`` maps words to ids that are used as row
    numbers.  A faiss L2 index over the rows is built on demand by
    :meth:`search` and dropped whenever a new vector is added.

    NOTE(review): another ``RandomEmbedding`` class is defined later in this
    source; if both live in the same module the later one rebinds the name.
    """

    def __init__(self, vectordim=300):
        """Create an empty embedding whose vectors have ``vectordim`` entries."""
        self.index = Index()
        self.vdim = vectordim
        # One float32 row per known word; starts with zero rows.
        self.data = np.zeros((0, self.vdim), dtype=np.float32)
        # Lazily built faiss index; None means "needs (re)building".
        self.invindex = None

    def getVector(self, word):
        """Return the vector for ``word``, creating a random one if needed.

        A new vector is drawn with ``np.random.rand``, scaled to unit length,
        appended to ``self.data`` and registered in ``self.index``.
        """
        if self.index.hasWord(word):
            return self.data[self.index.getId(word)]

        # Create and normalise a random vector.
        v = np.random.rand(self.vdim).astype(np.float32)
        length = np.linalg.norm(v)
        if length == 0:
            length += 1e-6  # avoid dividing by zero
        v = v / length

        # Register the word itself (the original passed the non-existent
        # attribute ``self.id2w`` here, which raised AttributeError).
        idx = self.index.add(word)
        self.data = np.vstack((self.data, v))
        # The id must be the row just appended, because lookups above use it
        # to index ``self.data``.
        # Assumes Index.add returns the same 0-based id that Index.getId
        # returns later -- TODO confirm against Index.
        assert idx == len(self.data) - 1

        # The search index no longer covers all rows; rebuild on next search.
        self.invindex = None
        return v

    def search(self, q, topk=4):
        """Find the ``topk`` stored vectors closest to the query vector(s).

        ``q`` is a 1-D array of ``self.vdim`` values or a 2-D array with one
        query per row.  Returns ``(I, D)``: row indices and distances as
        produced by the faiss index.  If the query has the wrong number of
        columns, a message is written to stderr and ``None`` is returned.
        """
        if self.invindex is None:
            print('Building faiss index...')
            self.invindex = faiss.IndexFlatL2(self.vdim)
            self.invindex.add(self.data)
            print('Faiss index built:', self.invindex.is_trained)

        if len(q.shape) == 1:
            # Promote a single vector to a one-row batch (plain ndarray
            # instead of the discouraged np.matrix).
            q = q.reshape(1, -1)
        if q.shape[1] != self.vdim:
            print('Wrong shape, expected %d dimensions but got %d.' % (self.vdim, q.shape[1]), file=sys.stderr)
            return None

        # faiss works on contiguous float32 arrays.
        q = np.ascontiguousarray(q, dtype=np.float32)
        D, I = self.invindex.search(q, topk)  # D = distances, I = indices
        return (I, D)

    def wordForVec(self, v):
        """Return ``(word, sim)`` for the stored vector nearest to ``v``.

        ``sim`` is computed as ``1 - distance`` of the best match.  Raises
        ``TypeError`` (from unpacking ``None``) when :meth:`search` rejects
        the query shape.
        """
        idx, dist = self.search(v, topk=1)
        idx = idx[0, 0]
        dist = dist[0, 0]
        # NOTE(review): faiss documents IndexFlatL2 distances as *squared* L2;
        # for unit vectors cosine similarity would be 1 - dist / 2.  Kept
        # as-is so existing callers see the same scores -- confirm intent.
        sim = 1. - dist
        word = self.index.getWord(idx)
        return word, sim

    def containsWord(self, word):
        """Always True: any word gets a vector on demand via getVector."""
        return True

    def vocabulary(self):
        """Return the vocabulary as reported by the underlying index."""
        # NOTE(review): "vocbulary" looks like a misspelling of "vocabulary";
        # left untouched because Index is not visible here -- confirm the
        # method name on Index.
        return self.index.vocbulary()

    def dim(self):
        """Return the dimensionality of the vectors."""
        return self.vdim
class RandomEmbedding(Embedding):
    """Embedding that gives every previously unseen word a random unit vector.

    Vectors are created lazily on first lookup and stored row-wise in
    ``self.data``; ``self.index`` maps words to ids that are used as row
    numbers.

    NOTE(review): an earlier class with the same name appears in this source;
    if both live in the same module this definition rebinds the name.
    """

    def __init__(self, vectordim=300):
        """Create an empty embedding whose vectors have ``vectordim`` entries."""
        self.index = Index()
        self.vdim = vectordim
        # One float32 row per known word; starts with zero rows.
        self.data = np.zeros((0, self.vdim), dtype=np.float32)
        # Reset to None whenever a vector is added; nothing in this class
        # ever assigns a non-None value to it.
        self.invindex = None

    def getVector(self, word):
        """Return the vector for ``word``, creating a random one if needed.

        A new vector is drawn with ``np.random.rand``, scaled to unit length,
        appended to ``self.data`` and registered in ``self.index``.
        """
        if self.index.hasWord(word):
            return self.data[self.index.getId(word)]

        # Create and normalise a random vector.
        v = np.random.rand(self.vdim).astype(np.float32)
        length = np.linalg.norm(v)
        if length == 0:
            length += 1e-6  # avoid dividing by zero
        v = v / length

        # Register the word itself (the original passed the non-existent
        # attribute ``self.id2w`` here, which raised AttributeError).
        idx = self.index.add(word)
        self.data = np.vstack((self.data, v))
        # The id must be the row just appended, because lookups above use it
        # to index ``self.data``.
        # Assumes Index.add returns the same 0-based id that Index.getId
        # returns later -- TODO confirm against Index.
        assert idx == len(self.data) - 1

        # Invalidate any attached search index.
        self.invindex = None
        return v

    def containsWord(self, word):
        """Always True: any word gets a vector on demand via getVector."""
        return True

    def vocabulary(self):
        """Return the vocabulary as reported by the underlying index."""
        # NOTE(review): "vocbulary" looks like a misspelling of "vocabulary";
        # left untouched because Index is not visible here -- confirm the
        # method name on Index.
        return self.index.vocbulary()

    def dim(self):
        """Return the dimensionality of the vectors."""
        return self.vdim