class RandomEmbedding(Embedding):
    """Embedding that assigns each unseen word a random, L2-normalized vector.

    Vectors are created lazily on first lookup and cached in ``self.data``;
    nearest-neighbour search is backed by a lazily (re)built faiss
    ``IndexFlatL2`` that is invalidated whenever a new vector is added.
    """

    def __init__(self, vectordim=300):
        """Create an empty embedding with vectors of size ``vectordim``."""
        self.index = Index()  # word <-> id mapping (project class)
        self.vdim = vectordim
        # Row i of this matrix holds the vector for word id i.
        self.data = np.zeros((0, self.vdim), dtype=np.float32)
        self.invindex = None  # faiss search index, built on demand

    def getVector(self, word):
        """Return the vector for ``word``, creating a random one if unseen."""
        if not self.index.hasWord(word):
            # Create a random vector and normalize it to unit length.
            v = np.random.rand(self.vdim).astype(np.float32)
            length = np.linalg.norm(v)
            if length == 0:
                length += 1e-6  # guard against division by zero
            v = v / length
            # BUGFIX: the original passed `self.id2w` (a nonexistent
            # attribute) to Index.add instead of the word itself.
            idx = self.index.add(word)
            self.data = np.vstack((self.data, v))
            assert idx == len(self.data)
            # The stored vectors changed, so any faiss index is stale.
            if self.invindex is not None:
                del self.invindex
                self.invindex = None
            return v
        idx = self.index.getId(word)
        return self.data[idx]

    def search(self, q, topk=4):
        """Return ``(indices, distances)`` of the ``topk`` nearest vectors.

        ``q`` may be a single vector (1-D) or a batch (2-D, one query per
        row).  Returns None and prints to stderr on a dimension mismatch.
        """
        if self.invindex is None:
            print('Building faiss index...')
            self.invindex = faiss.IndexFlatL2(self.vdim)
            self.invindex.add(self.data)
            print('Faiss index built:', self.invindex.is_trained)
        if len(q.shape) == 1:
            # faiss expects a 2-D batch; np.matrix is deprecated, so
            # reshape the single query into a one-row batch instead.
            q = q.reshape(1, -1)
        if q.shape[1] != self.vdim:
            print('Wrong shape, expected %d dimensions but got %d.'
                  % (self.vdim, q.shape[1]), file=sys.stderr)
            return
        D, I = self.invindex.search(q, topk)  # D = distances, I = indices
        return (I, D)

    def wordForVec(self, v):
        """Return ``(word, similarity)`` for the stored vector nearest ``v``.

        NOTE(review): similarity is the heuristic ``1 - L2 distance``; it is
        only a true cosine similarity for unit-length vectors.
        """
        idx, dist = self.search(v, topk=1)
        idx = idx[0, 0]
        dist = dist[0, 0]
        sim = 1. - dist
        word = self.index.getWord(idx)
        return word, sim

    def containsWord(self, word):
        """Always True: unknown words receive a random vector on demand."""
        return True

    def vocabulary(self):
        """Return the known words.

        BUGFIX: the original called ``self.index.vocbulary()`` (typo), which
        would raise AttributeError; ``TextEmbedding`` spells it correctly.
        """
        return self.index.vocabulary()

    def dim(self):
        """Return the vector dimensionality."""
        return self.vdim
class TextEmbedding(Embedding):
    """Embedding loaded from a text file of ``word v1 v2 ... vN`` lines.

    Call :meth:`load` before any lookup; nearest-neighbour search uses a
    faiss ``IndexFlatL2`` built lazily on the first :meth:`search` call.
    """

    def __init__(self, txtfile, sep=' ', vectordim=300):
        """Remember the file location and format; no I/O happens here.

        :param txtfile: path to the embedding text file
        :param sep: token separator within a line
        :param vectordim: expected number of vector components per word
        """
        self.file = txtfile
        self.vdim = vectordim
        self.separator = sep
        # BUGFIX: search() dereferenced self.invindex, but no code ever
        # initialized it, so the first search raised AttributeError.
        # Initialize it here and build it lazily (as RandomEmbedding does).
        self.invindex = None

    def load(self, skipheader=True, nlines=sys.maxsize, normalize=False):
        """Read vectors from ``self.file`` into memory and return ``self``.

        :param skipheader: skip the first line (word2vec-style count header)
        :param nlines: maximum number of data lines to read
        :param normalize: L2-normalize each vector on load
        """
        self.index = Index()
        # Any previously built search index no longer matches the data.
        self.invindex = None
        print('Loading embedding from %s' % self.file)
        data_ = []
        with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
            if skipheader:
                f.readline()
            for i, line in enumerate(f):
                if i >= nlines:
                    break
                try:
                    line = line.strip()
                    splits = line.split(self.separator)
                    word = splits[0]
                    if self.index.hasWord(word):
                        continue  # keep the first occurrence of a word
                    coefs = np.array(splits[1:self.vdim + 1], dtype=np.float32)
                    if normalize:
                        length = np.linalg.norm(coefs)
                        if length == 0:
                            length += 1e-6  # guard against division by zero
                        coefs = coefs / length
                    if coefs.shape != (self.vdim,):
                        continue  # malformed / truncated line
                    idx = self.index.add(word)
                    data_.append(coefs)
                    assert idx == len(data_)
                except Exception as err:
                    # Best-effort loader: report the bad line and keep going.
                    print('Error in line %d' % i, sys.exc_info()[0],
                          file=sys.stderr)
                    print(' ', err, file=sys.stderr)
                    continue
        self.data = np.array(data_, dtype=np.float32)
        del data_
        return self

    def getVector(self, word):
        """Return the vector for ``word``; a fixed fallback if unknown."""
        if not self.containsWord(word):
            print("'%s' is unknown." % word, file=sys.stderr)
            # Deterministic out-of-vocabulary fallback: unit vector e_0.
            # float32 keeps the dtype consistent with self.data.
            v = np.zeros(self.vdim, dtype=np.float32)
            v[0] = 1
            return v
        idx = self.index.getId(word)
        return self.data[idx]

    def search(self, q, topk=4):
        """Return ``(indices, distances)`` of the ``topk`` nearest vectors.

        ``q`` may be a single vector (1-D) or a batch (2-D, one query per
        row).  Returns None and prints to stderr on a dimension mismatch.
        """
        # BUGFIX: build the faiss index on first use; the original assumed
        # it already existed and crashed with AttributeError.
        if self.invindex is None:
            print('Building faiss index...')
            self.invindex = faiss.IndexFlatL2(self.vdim)
            self.invindex.add(self.data)
            print('Faiss index built:', self.invindex.is_trained)
        if len(q.shape) == 1:
            # faiss expects a 2-D batch; np.matrix is deprecated, so
            # reshape the single query into a one-row batch instead.
            q = q.reshape(1, -1)
        if q.shape[1] != self.vdim:
            print('Wrong shape, expected %d dimensions but got %d.'
                  % (self.vdim, q.shape[1]), file=sys.stderr)
            return
        D, I = self.invindex.search(q, topk)  # D = distances, I = indices
        return (I, D)

    def wordForVec(self, v):
        """Return ``(word, similarity)`` for the stored vector nearest ``v``.

        NOTE(review): similarity is the heuristic ``1 - L2 distance``; it is
        only a true cosine similarity for unit-length vectors.
        """
        idx, dist = self.search(v, topk=1)
        idx = idx[0, 0]
        dist = dist[0, 0]
        sim = 1. - dist
        word = self.index.getWord(idx)
        return word, sim

    def containsWord(self, word):
        """Return True if ``word`` was seen during :meth:`load`."""
        return self.index.hasWord(word)

    def vocabulary(self):
        """Return the known words."""
        return self.index.vocabulary()

    def dim(self):
        """Return the vector dimensionality."""
        return self.vdim