コード例 #1
0
    def build_vocabulary(self):
        '''
        Load the word, POS and character vocabularies into ``self.batch_data``.

        All files are looked up under ``self.args.data_dir``:

        * word vocabulary and pretrained vectors come from
          ``load_vocab_pretrain`` (``file_pretrain_vocab`` /
          ``file_pretrain_vec``) and are stored under ``vocab2id``,
          ``id2vocab``, ``pretrain_emb`` and ``vocab_size``;
        * the POS vocabulary comes from ``construct_pos_vocab``
          (``file_vocab_pos``) and is stored under ``pos_vocab2id``,
          ``pos_id2vocab`` and ``pos_vocab_size``;
        * the character vocabulary comes from ``construct_char_vocab``
          (``file_vocab_char``) and is stored under ``char_vocab2id``,
          ``char_id2vocab`` and ``char_vocab_size``.

        The size of each vocabulary is printed after it is loaded.
        '''
        cfg = self.args
        store = self.batch_data

        # Word-level vocabulary plus its pretrained vectors.
        word2id, id2word, word_vectors = load_vocab_pretrain(
            os.path.join(cfg.data_dir, cfg.file_pretrain_vocab),
            os.path.join(cfg.data_dir, cfg.file_pretrain_vec))
        n_words = len(word2id)
        store['vocab2id'] = word2id
        store['id2vocab'] = id2word
        store['pretrain_emb'] = word_vectors
        store['vocab_size'] = n_words
        print('The vocabulary size: {}'.format(n_words))

        # POS vocabulary.
        pos_path = os.path.join(cfg.data_dir, cfg.file_vocab_pos)
        pos2id, id2pos = construct_pos_vocab(file_=pos_path)
        n_pos = len(pos2id)
        store['pos_vocab2id'] = pos2id
        store['pos_id2vocab'] = id2pos
        store['pos_vocab_size'] = n_pos
        print('The vocabulary (pos) size: {}'.format(n_pos))

        # Character vocabulary.
        char_path = os.path.join(cfg.data_dir, cfg.file_vocab_char)
        char2id, id2char = construct_char_vocab(file_=char_path)
        n_chars = len(char2id)
        store['char_vocab2id'] = char2id
        store['char_id2vocab'] = id2char
        store['char_vocab_size'] = n_chars
        print('The vocabulary (char) size: {}'.format(n_chars))
コード例 #2
0
ファイル: model_sscl_base.py プロジェクト: tshi04/AspDecSSCL
    def build_vocabulary(self):
        '''
        Load the vocabulary and aspect centroids into ``self.batch_data``.

        Everything is read from the fixed directory ``../cluster_results``:

        * ``vocab.txt`` and ``vectors_w2v.npy`` are passed to
          ``load_vocab_pretrain``; its results are stored under
          ``vocab2id``, ``id2vocab``, ``pretrain_emb`` and ``vocab_size``;
        * ``aspect_centroid.txt`` is parsed with ``np.loadtxt``, converted
          to a ``torch.FloatTensor`` on ``self.args.device`` and stored
          under ``aspect_centroid``; the size of its first dimension is
          stored under ``n_aspects``.

        The vocabulary size is printed once it is known.
        '''
        results_dir = '../cluster_results'
        vocab_path = os.path.join(results_dir, 'vocab.txt')
        wordvec_path = os.path.join(results_dir, 'vectors_w2v.npy')
        centroid_path = os.path.join(results_dir, 'aspect_centroid.txt')
        store = self.batch_data

        # Vocabulary and pretrained word vectors.
        word2id, id2word, word_vectors = load_vocab_pretrain(
            vocab_path, wordvec_path)
        n_words = len(word2id)
        store['vocab2id'] = word2id
        store['id2vocab'] = id2word
        store['pretrain_emb'] = word_vectors
        store['vocab_size'] = n_words
        print('The vocabulary size: {}'.format(n_words))

        # Aspect centroids, moved to the configured device.
        # NOTE(review): presumably one centroid per row -- confirm against
        # the file produced by the clustering step.
        centroids = np.loadtxt(centroid_path, dtype=float)
        centroids = torch.FloatTensor(centroids).to(self.args.device)
        store['aspect_centroid'] = centroids
        store['n_aspects'] = centroids.shape[0]
コード例 #3
0
ファイル: modelMTC_base.py プロジェクト: monordstrom/LeafNATS
 def build_vocabulary(self):
     '''
     Build the word vocabulary and store it in ``self.batch_data``.

     The source is chosen by ``self.args.emb_source``; when ``self.args``
     has no such attribute, ``'scratch'`` is used.

     * ``'pretrain'``: ``load_vocab_pretrain`` is called with
       ``file_pretrain_vocab`` and ``file_pretrain_vec`` (both under
       ``data_dir``); the keys ``vocab2id``, ``id2vocab``,
       ``pretrain_emb`` and ``vocab_size`` are written.
     * ``'scratch'``: ``construct_vocab`` is called with ``file_vocab``
       (under ``data_dir``), ``max_vocab_size`` and ``word_minfreq``;
       the keys ``vocab2id``, ``id2vocab`` and ``vocab_size`` are
       written.

     Any other value leaves ``self.batch_data`` untouched. The vocabulary
     size is printed in both supported modes.
     '''
     # Only a missing attribute falls back to the default. A bare
     # ``except`` here would also hide unrelated failures (including
     # KeyboardInterrupt / SystemExit) raised while reading the option.
     emb_source = getattr(self.args, 'emb_source', 'scratch')

     if emb_source == 'pretrain':
         vocab2id, id2vocab, pretrain_vec = load_vocab_pretrain(
             os.path.join(self.args.data_dir, self.args.file_pretrain_vocab),
             os.path.join(self.args.data_dir, self.args.file_pretrain_vec))
         vocab_size = len(vocab2id)
         self.batch_data['vocab2id'] = vocab2id
         self.batch_data['id2vocab'] = id2vocab
         self.batch_data['pretrain_emb'] = pretrain_vec
         self.batch_data['vocab_size'] = vocab_size
         print('The vocabulary size: {}'.format(vocab_size))
     elif emb_source == 'scratch':
         vocab2id, id2vocab = construct_vocab(
             file_=os.path.join(self.args.data_dir, self.args.file_vocab),
             max_size=self.args.max_vocab_size,
             mincount=self.args.word_minfreq)
         vocab_size = len(vocab2id)
         self.batch_data['vocab2id'] = vocab2id
         self.batch_data['id2vocab'] = id2vocab
         self.batch_data['vocab_size'] = vocab_size
         print('The vocabulary size: {}'.format(vocab_size))
コード例 #4
0
 def build_vocabulary(self):
     '''
     Load the pretrained vocabulary into ``self.batch_data``.

     ``load_vocab_pretrain`` is called with ``file_pretrain_vocab`` and
     ``file_pretrain_vec`` (both under ``self.args.data_dir``). Its three
     results are stored under ``vocab2id``, ``id2vocab`` and
     ``pretrain_emb``; the number of entries in the first is stored under
     ``vocab_size`` and printed.
     '''
     cfg = self.args
     vocab_path = os.path.join(cfg.data_dir, cfg.file_pretrain_vocab)
     vector_path = os.path.join(cfg.data_dir, cfg.file_pretrain_vec)

     word2id, id2word, word_vectors = load_vocab_pretrain(
         vocab_path, vector_path)
     n_words = len(word2id)

     store = self.batch_data
     store['vocab2id'] = word2id
     store['id2vocab'] = id2word
     store['pretrain_emb'] = word_vectors
     store['vocab_size'] = n_words
     print('The vocabulary size: {}'.format(n_words))