def __init__(self, name_scope, vocab_size, size, dtype="float32"):
    super(MyLayer2, self).__init__(name_scope, dtype)
    self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
    self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
    self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
    self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
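# A minimal driver sketch, not from the original source: exercising the
# name_scope-style constructor above. It assumes the pre-1.6 dygraph API, where
# Layers take a name_scope and fluid.FC infers its input size on the first
# call; the scope name "my_layer2" and the toy shapes are made up.
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    layer = MyLayer2("my_layer2", vocab_size=100, size=16)
    ids = fluid.dygraph.to_variable(np.array([[3], [7]], dtype="int64"))
    h = layer.embed0(ids) + layer.embed1(ids)   # [batch, 1, size]
    out = layer.fc0(h) + layer.fc1(h)           # FC flattens to [batch, size]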
def init_emb(self, for_test=False):
    """Initialize the parameters of the Embedding layer.

    :param for_test: whether the embedding is built for inference
                     rather than training
    :return: None
    """
    if not for_test:
        # Training: a trainable, uniformly initialized table sized with some
        # headroom (emb_size_ratio > 1) beyond the current vocabulary.
        self.embedding = fluid.Embedding(
            size=[int(len(self.vocab) * self.emb_size_ratio),
                  self.word_emb_dim],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                name='embedding',
                initializer=fluid.initializer.UniformInitializer(low=-0.1,
                                                                 high=0.1),
                learning_rate=self.lr,
                trainable=True),
            dtype='float32')
    else:
        self.test_emb_pin += 1
        # Inference: start from the learned table, optionally append extra
        # (out-of-vocabulary) vectors, and zero-pad up to the training size.
        if len(self.extra_emb) > 0:
            extra_vecs = np.array(self.extra_emb)
            extend_vecs = np.concatenate(
                (self.emb_numpy[np.arange(len(self.vocab) + 1), :], extra_vecs),
                axis=0)
            extend_vecs = np.asarray(extend_vecs, dtype='float32')
        else:
            extend_vecs = self.emb_numpy[np.arange(len(self.vocab) + 1), :]
        padding_len = int(
            len(self.vocab) * self.emb_size_ratio) - extend_vecs.shape[0]
        paddings = np.zeros((padding_len, self.word_emb_dim), dtype='float32')
        extend_vecs = np.concatenate((extend_vecs, paddings), axis=0)
        init = fluid.ParamAttr(
            name='embedding_',
            initializer=fluid.initializer.NumpyArrayInitializer(extend_vecs),
            trainable=False)
        self.test_embedding = fluid.Embedding(
            size=[int(len(self.vocab) * self.emb_size_ratio),
                  self.word_emb_dim],
            padding_idx=0,
            param_attr=init,
            dtype='float32')
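# A hedged usage sketch (assumption; `model`, its training loop, and the order
# of calls are not shown in the source): how the two branches above might be
# driven.
model.init_emb(for_test=False)   # trainable table for training
# ... train, then snapshot the learned weights so the test branch can extend them
model.emb_numpy = model.embedding.parameters()[0].numpy()
model.extra_emb = []             # optional out-of-vocabulary vectors
model.init_emb(for_test=True)    # frozen, zero-padded table for inference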
def __init__(self, output_dim, kernel_size=5, dimension=100, conv_filters=40,
             stride=2, act='relu', words_num=10000, use_bias=True,
             padding_id=0):
    super(CNN_pd, self).__init__()
    self.output_dim = output_dim
    self.kernel_size = kernel_size
    self.dimension = dimension
    self.conv_filters = conv_filters
    self.stride = stride
    self.act = act
    self.words_num = words_num
    self.use_bias = use_bias
    self.padding_id = padding_id
    self.if_built = False
    self.embedding = fluid.Embedding(
        size=[self.words_num, self.dimension],
        is_sparse=True,
        padding_idx=self.padding_id,
        param_attr=fluid.ParamAttr(
            name='embedding',
            initializer=fluid.initializer.UniformInitializer(low=-0.05,
                                                             high=0.05)))
    # A Conv2D whose filter spans the full embedding dimension acts as a 1-D
    # convolution over the token axis, hence the name conv1d.
    self.conv1d = fluid.Conv2D(num_filters=self.conv_filters,
                               stride=(self.stride, 1),
                               num_channels=1,
                               filter_size=(self.kernel_size, self.dimension),
                               act=self.act,
                               bias_attr=self.use_bias)
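# A sketch of the forward pass implied by the constructor above (assumption:
# the source does not show it; method name and shapes are illustrative). Word
# ids are embedded, given a channel axis for Conv2D, convolved, and max-pooled
# over the token axis.
def forward(self, word_ids):
    # word_ids: int64 Variable, shape [batch, seq_len]
    emb = self.embedding(word_ids)               # [batch, seq_len, dimension]
    emb = fluid.layers.unsqueeze(emb, axes=[1])  # [batch, 1, seq_len, dimension]
    conv = self.conv1d(emb)                      # [batch, conv_filters, L', 1]
    return fluid.layers.reduce_max(conv, dim=2)  # max over time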
def _fit(self, text):
    # Accept either raw strings (split on whitespace) or pre-tokenized lists.
    try:
        text = [t.split() for t in text]
        print("The form of input text is [text1, text2, ...].")
    except AttributeError:
        print("The form of input text is [[word11, word12, ...], [word21, word22, ...]].")
    self.words_counter = Counter(chain(*text))
    self.vocab = [word for word, freq in self.words_counter.most_common()
                  if freq > self.min_count]
    if self.verbose:
        print('number of all words: ', len(self.words_counter))
        print('vocabulary size: ', len(self.vocab))
    self.vocab_index = {word: index for index, word in enumerate(self.vocab)}
    # To follow the paper, use two different embeddings of the vocabulary and
    # merge them as the final result. self.embedding will directly serve as a
    # lookup table once training is completed, while the other parameters stay
    # as lists.
    self.embedding = fluid.Embedding(
        size=[len(self.vocab_index), self.dimension],
        param_attr=fluid.ParamAttr(
            name='embedding',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)))
    self.bias = fluid.Embedding(
        size=[len(self.vocab_index), 1],
        param_attr=fluid.ParamAttr(
            name='bias',
            initializer=fluid.initializer.ConstantInitializer(0.0)))
    # Use a sparse form to represent the co-occurrence matrix of words; the
    # high-frequency word is the parent of the low-frequency one.
    # Word pairs that are d words apart contribute 1/d to the total count.
    # This is one way to account for the fact that very distant word pairs are
    # expected to contain less relevant information about the words'
    # relationship to one another.
    self.cooccur = CoOccur()
    self.buffer = Buffer(self.overflow_buffer_size)
    total_length = np.sum([freq for freq in self.words_counter.values()])
    if self.verbose:
        print('pre-processing the text, total length (word counts) of the text is ',
              total_length)
    start = time.time()
    counter = 0
    for num, words in enumerate(text):
        for index, w in enumerate(words):
            if self.words_counter[w] > self.min_count:
                pre = max(0, index - self.window)
                length = len(words[pre:index])
                if length > 0:
                    for i, w_ in enumerate(words[pre:index]):
                        if self.words_counter[w_] > self.min_count:
                            ind_1 = self.vocab_index[w]
                            ind_2 = self.vocab_index[w_]
                            dis = length - i
                            # Frequent pairs go to the in-memory store; rare
                            # ones overflow to the disk-backed buffer. Keys
                            # are ordered so (a, b) and (b, a) share a count.
                            if (ind_1 + 1) * (ind_2 + 1) <= self.max_product:
                                if ind_1 < ind_2:
                                    self.cooccur.pair(str(w), str(w_), dis)
                                elif ind_1 == ind_2:
                                    continue
                                else:
                                    self.cooccur.pair(str(w_), str(w), dis)
                            else:
                                if ind_1 < ind_2:
                                    self.buffer.pair(str(w), str(w_), dis)
                                elif ind_1 == ind_2:
                                    continue
                                else:
                                    self.buffer.pair(str(w_), str(w), dis)
            counter += 1
            if self.verbose and counter % 100000 == 0:
                print('{}/{} processed - cost time: {:.0f}s - ETA: {:.0f}s ......'
                      .format(str(counter).rjust(len(str(total_length))),
                              total_length,
                              time.time() - start,
                              (time.time() - start) / counter
                              * (total_length - counter)))
    self.buffer.update_file()
    if self.verbose:
        print('pre-processing complete. cost time: {:.0f}s'.format(time.time() - start))
        print("fit complete......")
        print("begin training operation......")
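# A hedged sketch (assumption: the source's actual training step is not shown)
# of the GloVe-style objective that the co-occurrence counts gathered above
# feed into: weighted least squares on log co-occurrence,
#     J = sum_ij f(X_ij) * (w_i . w_j + b_i + b_j - log X_ij)^2,
# with f(x) = min((x / x_max) ** alpha, 1). The helper name and the batched
# int64 id / float32 count Variables are illustrative, not from the source.
import paddle.fluid as fluid

def glove_loss(emb, bias, i_ids, j_ids, counts, x_max=100.0, alpha=0.75):
    # i_ids, j_ids: int64 Variables [batch, 1]; counts: float32 [batch, 1]
    w_i = fluid.layers.squeeze(emb(i_ids), axes=[1])   # [batch, dimension]
    w_j = fluid.layers.squeeze(emb(j_ids), axes=[1])
    b_i = fluid.layers.squeeze(bias(i_ids), axes=[1])  # [batch, 1]
    b_j = fluid.layers.squeeze(bias(j_ids), axes=[1])
    pred = fluid.layers.reduce_sum(w_i * w_j, dim=1, keep_dim=True) + b_i + b_j
    weight = fluid.layers.pow(
        fluid.layers.clip(counts / x_max, 0.0, 1.0), factor=alpha)
    diff = pred - fluid.layers.log(counts)
    return fluid.layers.reduce_mean(weight * diff * diff)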
def __init__(self, input_size, vocab_size, size, dtype="float32"):
    super(MyLayer2, self).__init__(dtype=dtype)
    self.embed0 = fluid.Embedding(size=(vocab_size, size))
    self.embed1 = fluid.Embedding(size=(vocab_size, size))
    self.linear_0 = fluid.Linear(input_size, size, dtype=dtype)
    self.linear_1 = fluid.Linear(input_size, size, dtype=dtype)
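# Sketch (assumption): the 1.6+-style counterpart of the first driver above.
# Layers no longer take a name_scope, and fluid.Linear needs its input size up
# front, so input_size must match the embedding width when the linears consume
# the embedded ids. Shapes and values are illustrative.
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    layer = MyLayer2(input_size=16, vocab_size=100, size=16)
    ids = fluid.dygraph.to_variable(np.array([[3], [7]], dtype="int64"))
    h = layer.embed0(ids) + layer.embed1(ids)       # [batch, 1, size]
    out = layer.linear_0(h) + layer.linear_1(h)     # Linear maps the last dim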