Example #1
0
 def build_vocabulary(self):
     '''
     Build the vocabulary and store it in self.batch_data.

     The source is chosen by self.args.emb_source, defaulting to
     'scratch' when that attribute is missing:

     - 'pretrain': calls load_vocab_pretrain on the files
       self.args.file_pretrain_vocab and self.args.file_pretrain_vec
       (both under self.args.data_dir) and additionally stores the
       returned vectors under 'pretrain_emb'.
     - 'scratch': calls construct_vocab on self.args.file_vocab
       (under self.args.data_dir) with self.args.max_vocab_size and
       self.args.word_minfreq.

     In both cases 'vocab2id', 'id2vocab' and 'vocab_size' are written
     to self.batch_data and the vocabulary size is printed. Any other
     emb_source value leaves self.batch_data untouched.
     '''
     # Only a missing attribute should trigger the default; a bare
     # except here would also hide KeyboardInterrupt/SystemExit and
     # unrelated errors.
     emb_source = getattr(self.args, 'emb_source', 'scratch')

     if emb_source not in ('pretrain', 'scratch'):
         # Unrecognised source: do nothing, as before.
         return

     data_dir = self.args.data_dir
     if emb_source == 'pretrain':
         vocab2id, id2vocab, pretrain_vec = load_vocab_pretrain(
             os.path.join(data_dir, self.args.file_pretrain_vocab),
             os.path.join(data_dir, self.args.file_pretrain_vec))
     else:
         vocab2id, id2vocab = construct_vocab(
             file_=os.path.join(data_dir, self.args.file_vocab),
             max_size=self.args.max_vocab_size,
             mincount=self.args.word_minfreq)

     vocab_size = len(vocab2id)
     self.batch_data['vocab2id'] = vocab2id
     self.batch_data['id2vocab'] = id2vocab
     if emb_source == 'pretrain':
         self.batch_data['pretrain_emb'] = pretrain_vec
     self.batch_data['vocab_size'] = vocab_size
     print('The vocabulary size: {}'.format(vocab_size))
Example #2
0
 def build_vocabulary(self):
     '''
     Construct the vocabulary from the vocabulary file and record it.

     Calls construct_vocab on the file self.args.file_vocab located
     under self.args.data_dir, passing self.args.max_vocab_size and
     self.args.word_minfreq. The two returned objects and the length
     of the first one are stored in self.batch_data under 'vocab2id',
     'id2vocab' and 'vocab_size', and the size is printed.
     '''
     cfg = self.args
     vocab_path = os.path.join(cfg.data_dir, cfg.file_vocab)
     vocab_to_id, id_to_vocab = construct_vocab(
         file_=vocab_path,
         max_size=cfg.max_vocab_size,
         mincount=cfg.word_minfreq)
     n_entries = len(vocab_to_id)

     # Written one key at a time, in the same order as before.
     entries = (
         ('vocab2id', vocab_to_id),
         ('id2vocab', id_to_vocab),
         ('vocab_size', n_entries),
     )
     for key, value in entries:
         self.batch_data[key] = value

     print('The vocabulary size: {}'.format(n_entries))