Example #1
import sys

import numpy as np
import faiss  # nearest-neighbour search over the stored vectors

# Embedding (base class) and Index (word <-> id mapping) are provided by the surrounding project.
class RandomEmbedding(Embedding):
  
  def __init__(self, vectordim = 300):
    self.index = Index()
    self.vdim = vectordim
    self.data = np.zeros((0, self.vdim), dtype = np.float32)
    self.invindex = None
  
  def getVector(self, word):
    if not self.index.hasWord(word):
      # create random vector
      v = np.random.rand(self.vdim).astype(np.float32)
      # normalize
      length = np.linalg.norm(v)
      if length == 0:
        length += 1e-6
      v = v / length
      # add
      idx = self.index.add(word)  # register the new word and get its id
      self.data = np.vstack((self.data, v))
      assert idx == len(self.data) - 1  # the new word's id must match its row in self.data
      if self.invindex is not None:
        del self.invindex
        self.invindex = None
      return v
    idx = self.index.getId(word)
    return self.data[idx]
    
  def search(self, q, topk = 4):
    if self.invindex is None:
      print('Building faiss index...')
      self.invindex = faiss.IndexFlatL2(self.vdim)
      self.invindex.add(self.data)
      print('Faiss index built:', self.invindex.is_trained)
    if len(q.shape) == 1:
      q = q.reshape(1, -1)  # faiss expects a 2-D (n_queries, dim) array
    if q.shape[1] != self.vdim:
      print('Wrong shape, expected %d dimensions but got %d.' % (self.vdim, q.shape[1]), file = sys.stderr)
      return
    D, I = self.invindex.search(q, topk) # D = distances, I = indices
    return ( I, D )
    
  def wordForVec(self, v):
    idx, dist = self.search(v, topk=1)
    idx = idx[0,0]
    dist = dist[0,0]
    sim = 1. - dist  # crude similarity score derived from the squared L2 distance
    word = self.index.getWord(idx)
    return word, sim
  
  def containsWord(self, word):
    # every word is "contained": unseen words simply get a fresh random vector
    return True
  
  def vocabulary(self):
    return self.index.vocabulary()
  
  def dim(self):
    return self.vdim
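
A minimal usage sketch (assuming the project's Index class assigns consecutive 0-based ids and exposes the hasWord/getId/getWord/add methods used above):

emb = RandomEmbedding(vectordim=50)

# Unseen words get a fresh normalized random vector on first request.
v_cat = emb.getVector('cat')
v_dog = emb.getVector('dog')

# Nearest-neighbour lookup over all vectors created so far.
ids, dists = emb.search(v_cat, topk=2)
print(ids[0], dists[0])

# Map a vector back to its closest known word.
word, sim = emb.wordForVec(v_dog)
print(word, sim)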