def build_vocabulary(self):
    '''
    Populate ``self.batch_data`` with three vocabularies read from
    ``self.args.data_dir``: word-level (with pretrained embeddings),
    POS-tag, and character.
    '''
    data_dir = self.args.data_dir

    # Word-level vocabulary aligned with pretrained embedding vectors.
    word2id, id2word, pretrain_vec = load_vocab_pretrain(
        os.path.join(data_dir, self.args.file_pretrain_vocab),
        os.path.join(data_dir, self.args.file_pretrain_vec))
    self.batch_data['vocab2id'] = word2id
    self.batch_data['id2vocab'] = id2word
    self.batch_data['pretrain_emb'] = pretrain_vec
    self.batch_data['vocab_size'] = len(word2id)
    print('The vocabulary size: {}'.format(len(word2id)))

    # Part-of-speech tag vocabulary.
    pos2id, id2pos = construct_pos_vocab(
        file_=os.path.join(data_dir, self.args.file_vocab_pos))
    self.batch_data['pos_vocab2id'] = pos2id
    self.batch_data['pos_id2vocab'] = id2pos
    self.batch_data['pos_vocab_size'] = len(pos2id)
    print('The vocabulary (pos) size: {}'.format(len(pos2id)))

    # Character vocabulary.
    char2id, id2char = construct_char_vocab(
        file_=os.path.join(data_dir, self.args.file_vocab_char))
    self.batch_data['char_vocab2id'] = char2id
    self.batch_data['char_id2vocab'] = id2char
    self.batch_data['char_vocab_size'] = len(char2id)
    print('The vocabulary (char) size: {}'.format(len(char2id)))
def build_vocabulary(self):
    '''
    Build the vocabulary from the clustering results directory and load
    the k-means aspect centroids onto the configured device.

    Fills ``self.batch_data`` with the token mappings, the pretrained
    w2v embedding matrix, the centroid tensor, and the aspect count.
    '''
    cluster_dir = '../cluster_results'

    # Word vocabulary plus its pretrained w2v embedding matrix.
    tok2id, id2tok, w2v_emb = load_vocab_pretrain(
        os.path.join(cluster_dir, 'vocab.txt'),
        os.path.join(cluster_dir, 'vectors_w2v.npy'))
    n_words = len(tok2id)
    self.batch_data['vocab2id'] = tok2id
    self.batch_data['id2vocab'] = id2tok
    self.batch_data['pretrain_emb'] = w2v_emb
    self.batch_data['vocab_size'] = n_words
    print('The vocabulary size: {}'.format(n_words))

    # K-means aspect centroids: one row per aspect.
    centroids = np.loadtxt(
        os.path.join(cluster_dir, 'aspect_centroid.txt'), dtype=float)
    centroids = torch.FloatTensor(centroids).to(self.args.device)
    self.batch_data['aspect_centroid'] = centroids
    self.batch_data['n_aspects'] = centroids.shape[0]
def build_vocabulary(self):
    '''
    Build the vocabulary, either aligned with pretrained embeddings or
    from scratch from a token-count file.

    Reads ``self.args.emb_source`` ('pretrain' or 'scratch'; treated as
    'scratch' when the attribute is absent) and populates
    ``self.batch_data`` with 'vocab2id', 'id2vocab', 'vocab_size' and,
    in pretrain mode, 'pretrain_emb'.

    Raises:
        ValueError: if ``emb_source`` is neither 'pretrain' nor 'scratch'
            (the original silently did nothing for unknown values).
    '''
    # getattr with a default replaces the original bare ``except:``,
    # which also swallowed KeyboardInterrupt/SystemExit.
    emb_source = getattr(self.args, 'emb_source', 'scratch')

    if emb_source == 'pretrain':
        # Vocabulary aligned with a pretrained embedding matrix.
        vocab2id, id2vocab, pretrain_vec = load_vocab_pretrain(
            os.path.join(self.args.data_dir, self.args.file_pretrain_vocab),
            os.path.join(self.args.data_dir, self.args.file_pretrain_vec))
        self.batch_data['pretrain_emb'] = pretrain_vec
    elif emb_source == 'scratch':
        # Vocabulary built from token counts; embeddings trained from scratch.
        vocab2id, id2vocab = construct_vocab(
            file_=os.path.join(self.args.data_dir, self.args.file_vocab),
            max_size=self.args.max_vocab_size,
            mincount=self.args.word_minfreq)
    else:
        raise ValueError('Unknown emb_source: {}'.format(emb_source))

    # Common bookkeeping, previously duplicated in both branches.
    vocab_size = len(vocab2id)
    self.batch_data['vocab2id'] = vocab2id
    self.batch_data['id2vocab'] = id2vocab
    self.batch_data['vocab_size'] = vocab_size
    print('The vocabulary size: {}'.format(vocab_size))
def build_vocabulary(self):
    '''
    Load the pretrained vocabulary and embedding matrix from
    ``self.args.data_dir`` and register them in ``self.batch_data``.
    '''
    vocab_file = os.path.join(
        self.args.data_dir, self.args.file_pretrain_vocab)
    vec_file = os.path.join(
        self.args.data_dir, self.args.file_pretrain_vec)
    tok2id, id2tok, emb_matrix = load_vocab_pretrain(vocab_file, vec_file)

    self.batch_data['vocab2id'] = tok2id
    self.batch_data['id2vocab'] = id2tok
    self.batch_data['pretrain_emb'] = emb_matrix
    self.batch_data['vocab_size'] = len(tok2id)
    print('The vocabulary size: {}'.format(len(tok2id)))