def __init__(self, name_scope, vocab_size, size, dtype="float32"):
     super(MyLayer2, self).__init__(name_scope, dtype)
     # Older fluid dygraph style: sub-layers are created under the parent layer's name scope.
     self.embed0 = fluid.Embedding(self.full_name(),
                                   size=(vocab_size, size))
     self.embed1 = fluid.Embedding(self.full_name(),
                                   size=(vocab_size, size))
     self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
     self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
 def init_emb(self, for_test=False):
     """
     初始化Embedding层的参数
     :param for_test: 是否用于训练
     :return:
     """
     if not for_test:
         self.embedding = fluid.Embedding(
             size=[
                 int(len(self.vocab) * self.emb_size_ratio),
                 self.word_emb_dim
             ],
             padding_idx=0,
             param_attr=fluid.ParamAttr(
                 name='embedding',
                 initializer=fluid.initializer.UniformInitializer(low=-0.1,
                                                                  high=0.1),
                 learning_rate=self.lr,
                 trainable=True),
             dtype='float32')
     else:
         self.test_emb_pin += 1
         if len(self.extra_emb) > 0:
             extra_vecs = np.array(self.extra_emb)
             extend_vecs = np.concatenate(
                 (self.emb_numpy[np.arange(len(self.vocab) + 1), :],
                  extra_vecs),
                 axis=0)
             extend_vecs = np.asarray(extend_vecs, dtype='float32')
         else:
             extend_vecs = self.emb_numpy[np.arange(len(self.vocab) + 1), :]
         padding_len = int(
             len(self.vocab) * self.emb_size_ratio) - extend_vecs.shape[0]
         paddings = np.zeros((padding_len, self.word_emb_dim),
                             dtype='float32')
         extend_vecs = np.concatenate((extend_vecs, paddings), axis=0)
         init = fluid.ParamAttr(
             name='embedding_',
             initializer=fluid.initializer.NumpyArrayInitializer(
                 extend_vecs),
             trainable=False)
         self.test_embedding = fluid.Embedding(
             size=[int(len(self.vocab) * self.emb_size_ratio), self.word_emb_dim],
             padding_idx=0,
             param_attr=init,
             dtype='float32')
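
The for_test branch above freezes an embedding table built from a NumPy matrix via NumpyArrayInitializer. A minimal, self-contained sketch of that pattern (assuming the fluid 1.x dygraph API; the random matrix and the name demo_embedding are illustrative stand-ins for extend_vecs):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    vocab_rows, emb_dim = 8, 4
    # Stand-in for the pretrained/extended matrix assembled above.
    pretrained = np.random.uniform(-0.1, 0.1, (vocab_rows, emb_dim)).astype('float32')
    frozen = fluid.ParamAttr(
        name='demo_embedding',  # illustrative parameter name
        initializer=fluid.initializer.NumpyArrayInitializer(pretrained),
        trainable=False)        # frozen, as in the for_test branch
    emb = fluid.Embedding(size=[vocab_rows, emb_dim],
                          padding_idx=0,   # ids equal to 0 map to an all-zero vector
                          param_attr=frozen,
                          dtype='float32')
    ids = fluid.dygraph.to_variable(np.array([[1, 2, 3]], dtype='int64'))
    vectors = emb(ids)          # lookup result has shape [1, 3, emb_dim]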
Example 3
 def __init__(self,
              output_dim,
              kernel_size=5,
              dimension=100,
              conv_filters=40,
              stride=2,
              act='relu',
              words_num=10000,
              use_bias=True,
              padding_id=0):
     super(CNN_pd, self).__init__()
     self.output_dim = output_dim
     self.kernel_size = kernel_size
     self.dimension = dimension
     self.conv_filters = conv_filters
     self.stride = stride
     self.act = act
     self.words_num = words_num
     self.use_bias = use_bias
     self.padding_id = padding_id
     self.if_built = False
     self.embedding = fluid.Embedding(
         size=[self.words_num, self.dimension],
         is_sparse=True,
         padding_idx=self.padding_id,
         param_attr=fluid.ParamAttr(
             name='embedding',
             initializer=fluid.initializer.UniformInitializer(low=-0.05,
                                                              high=0.05)))
     # A 1-D convolution over the token sequence, implemented as Conv2D with a
     # filter that spans the full embedding dimension.
     self.conv1d = fluid.Conv2D(num_filters=self.conv_filters,
                                stride=(self.stride, 1),
                                num_channels=1,
                                filter_size=(self.kernel_size,
                                             self.dimension),
                                act=self.act,
                                bias_attr=self.use_bias)
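
CNN_pd's forward pass is not shown, but the layer shapes imply the usual text-CNN flow: embed the ids, add a channel axis, and convolve with a filter as wide as the embedding so that convolution runs along the sequence only. A hedged sketch of that flow with the same hyperparameters (fluid 1.x dygraph assumed; this is not the author's actual forward):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    ids = fluid.dygraph.to_variable(np.zeros((2, 20), dtype='int64'))  # [batch, seq_len]
    emb = fluid.Embedding(size=[10000, 100], padding_idx=0)
    x = emb(ids)                             # [2, 20, 100]
    x = fluid.layers.unsqueeze(x, axes=[1])  # [2, 1, 20, 100]: one input channel
    conv = fluid.Conv2D(num_channels=1, num_filters=40,
                        filter_size=(5, 100), stride=(2, 1), act='relu')
    y = conv(x)                              # [2, 40, 8, 1]: the width axis collapses to 1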
Example 4
 def _fit(self, text):
     # Accept either raw strings (split on whitespace here) or pre-tokenized word lists.
     try:
         text = [t.split() for t in text]
         print("The form of input text is [text1, text2, ...].")
     except AttributeError:
         print("The form of input text is [[word11, word12, ...], [word21, word22, ...]].")
     self.words_counter = Counter(chain(*text))
     self.vocab = [word for word, freq in self.words_counter.most_common() if freq > self.min_count]
     if self.verbose:
         print('number of all words: ', len(self.words_counter))
         print('vocabulary size: ', len(self.vocab))
     self.vocab_index = {word: index for index, word in enumerate(self.vocab)}
     # To follow the paper, use two different embeddings of the vocabulary, and merge them as the final result.
     self.embedding = fluid.Embedding(size=[len(self.vocab_index), self.dimension],
                                      param_attr=fluid.ParamAttr(name='embedding',
                                                                 initializer=fluid.initializer.UniformInitializer(
                                                                     low=-self.init_scale, high=self.init_scale
                                                                 )))
     # Why is self.embedding a dict while the other parameters are lists? Once training
     # completes, self.embedding is used directly as a dict for word-vector lookup.
     self.bias = fluid.Embedding(size=[len(self.vocab_index), 1],
                                 param_attr=fluid.ParamAttr(name='bias',
                                                            initializer=fluid.initializer.ConstantInitializer(0.0)))
     # Use a sparse form to represent the co-occurrence matrix of words; the high-frequency word
     # is treated as the parent of the low-frequency one. Word pairs that are d words apart
     # contribute 1/d to the total count. This is one way to account for the fact that very
     # distant word pairs are expected to contain less relevant information about the words'
     # relationship to one another.
     self.cooccur = CoOccur()
     self.buffer = Buffer(self.overflow_buffer_size)
     total_length = sum(self.words_counter.values())
     if self.verbose:
         print('pre-processing the text, total length (word counts) of the text is ', total_length)
     start = time.time()
     counter = 0
     for num, words in enumerate(text):
         for index, w in enumerate(words):
             if self.words_counter[w] > self.min_count:
                 pre = max(0, index - self.window)
                 length = len(words[pre:index])
                 if length > 0:
                     for i, w_ in enumerate(words[pre:index]):
                         if self.words_counter[w_] > self.min_count:
                             ind_1 = self.vocab_index[w]
                             ind_2 = self.vocab_index[w_]
                             dis = length - i
                             if (ind_1 + 1) * (ind_2 + 1) <= self.max_product:
                                 if ind_1 < ind_2:
                                     self.cooccur.pair(str(w), str(w_), dis)
                                 elif ind_1 == ind_2:
                                     continue
                                 else:
                                     self.cooccur.pair(str(w_), str(w), dis)
                             else:
                                 if ind_1 < ind_2:
                                     self.buffer.pair(str(w), str(w_), dis)
                                 elif ind_1 == ind_2:
                                     continue
                                 else:
                                     self.buffer.pair(str(w_), str(w), dis)
             counter += 1
             if self.verbose:
                 if counter % 100000 == 0:
                     print('{}/{} processed - cost time: {:.0f}s - ETA: {:.0f}s ......'.
                           format(str(counter).rjust(len(str(total_length))), total_length, time.time() - start,
                                  (time.time() - start) / counter * (total_length - counter)))
     self.buffer.update_file()
     if self.verbose:
         print('pre-processing complete. cost time: {:.0f}s'.format(time.time() - start))
     if self.verbose:
         print("fit complete......")
         print("begin training operation......")
Example 5
 def __init__(self, input_size, vocab_size, size, dtype="float32"):
     # Later fluid dygraph style: no name_scope argument; fluid.Linear replaces fluid.FC.
     super(MyLayer2, self).__init__(dtype=dtype)
     self.embed0 = fluid.Embedding(size=(vocab_size, size))
     self.embed1 = fluid.Embedding(size=(vocab_size, size))
     self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
     self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)
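
No forward is defined in this snippet either; a hedged usage sketch that only exercises the sub-layers (fluid 1.x dygraph assumed, with the class completed around this __init__, and input_size chosen equal to size so linear_0 can consume the embedding output; the real forward may combine embed0/embed1 and linear_0/linear_1 differently):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    layer = MyLayer2(input_size=16, vocab_size=100, size=16)
    ids = fluid.dygraph.to_variable(np.array([[3, 7]], dtype='int64'))
    e0 = layer.embed0(ids)      # [1, 2, 16]: one batch of two token ids
    h0 = layer.linear_0(e0)     # Linear acts on the last axis: [1, 2, 16]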