Ejemplo n.º 1
0
 def __init__(self, n, token_vocab, *args, **kwargs):
   """
   Build the vocab: store its inputs, fill its counts, allocate embeddings.

   Args:
     n: stored as `self._n` (presumably the n-gram order -- confirm against
        the methods that read it).
     token_vocab: the token-level vocab this one is derived from; stored as
        `self._token_vocab` and handed to `CharVocab.from_vocab`.
     *args, **kwargs: forwarded to the parent initializer after the two
        options below are removed.

   Keyword options consumed here (not forwarded):
     recount: when true, always call `self.count()`; nothing is loaded from
        or dumped to `self.filename`. Defaults to False.
     initialize_zero: when true, the embedding matrix is all zeros instead
        of samples from `np.random.randn`. Defaults to False.
   """

   # Strip the options handled locally before delegating upward.
   force_recount = kwargs.pop('recount', False)
   zero_init = kwargs.pop('initialize_zero', False)
   # NOTE: super() is anchored at TokenVocab, so the lookup starts *after*
   # TokenVocab in the MRO and TokenVocab.__init__ itself is not run.
   super(TokenVocab, self).__init__(*args, **kwargs)

   self._n = n
   self._token_vocab = token_vocab
   self._token_counts = Counter()
   self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab)
   self._multibucket = Multibucket.from_configurable(
     self,
     embed_model=self.embed_model,
     name=self.name)

   # Reuse the saved file only when no recount was requested and it exists;
   # otherwise count, and save the result unless this was a forced recount.
   if not force_recount and os.path.isfile(self.filename):
     self.load()
   else:
     self.count()
     if not force_recount:
       self.dump()
   self.index_vocab()

   # One row per vocab entry, `embed_size` columns.
   shape = (len(self), self.embed_size)
   if zero_init:
     table = np.zeros(shape)
   else:
     table = np.random.randn(*shape)
   self.embeddings = table
   return
Ejemplo n.º 2
0
      elif self.cased != value.cased:
        cls = value.__class__
        value = cls.from_configurable(value,
                                      cased=self.cased,
                                      recount=True)
    super(SubtokenVocab, self).__setattr__(name, value)
    return

#***************************************************************
class CharVocab(SubtokenVocab):
  """
  Concrete `SubtokenVocab` subclass that adds no attributes or methods.

  All behavior is inherited from `SubtokenVocab`; the name suggests the
  subtokens are individual characters -- confirm against the base class.
  """

#***************************************************************
if __name__ == '__main__':
  # Smoke test: build a word vocab, derive a character vocab from it, and
  # run the counting / indexing / Zipf-fitting steps end to end.

  from nparser import Configurable
  from nparser.vocabs import WordVocab, CharVocab

  chars_file = 'saves/defaults/chars.txt'

  config = Configurable()
  words = WordVocab.from_configurable(config, 1)
  words.fit_to_zipf()

  # Remove any previously saved character vocab before building.
  if os.path.isfile(chars_file):
    os.remove(chars_file)

  # Built twice, in this order, on purpose as far as can be told: the first
  # call runs with no saved file present, the second presumably finds the
  # file written by the first -- TODO confirm against SubtokenVocab.__init__.
  chars = CharVocab.from_vocab(words)
  chars = CharVocab.from_vocab(words)

  chars.token_vocab.count(config.valid_files)
  chars.index_tokens()
  chars.fit_to_zipf()
  print('SubtokenVocab passes',file=sys.stderr)