def main():
    data_set = DataSet.load(FLAGS.data_set_path)
    embedder = Embedder(data_set.input_vocabulary, FLAGS.checkpoint_path)
    texts = [
        text.replace('\n', '')
        for text in get_text_file(FLAGS.inputs_file_path)
    ]

    logging.info('getting embeddings')
    embeddings = [embedder.get_embedding(text) for text in tqdm(texts)]
    embedder.close()

    logging.info('applying dimensionality reduction')
    embeddings_reduced = do_pca(embeddings, 2, FLAGS.dim_reduction_method)

    logging.info('plotting')
    plt.scatter(embeddings_reduced[:, 0], embeddings_reduced[:, 1])
    for text, x_coord, y_coord in zip(texts, embeddings_reduced[:, 0],
                                      embeddings_reduced[:, 1]):
        plt.annotate(text,
                     xy=(x_coord, y_coord),
                     xytext=(0, 0),
                     textcoords='offset points')
    plt.show()
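# A minimal sketch of the dimensionality-reduction helper that main() relies on,
# assuming scikit-learn is available. The function name matches the call above,
# but the 'pca'/'tsne' method strings and the overall shape are assumptions, not
# part of the original snippet.
def do_pca(embeddings, n_components, method='pca'):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    matrix = np.asarray(embeddings)
    if method == 'tsne':
        reducer = TSNE(n_components=n_components)
    else:
        reducer = PCA(n_components=n_components)
    return reducer.fit_transform(matrix)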
def __init__(self, vocab_size, d_model, N, heads, device, weight_matrix):
    super().__init__()
    self.N = N
    self.embed = Embedder(weight_matrix).to(device)
    self.linear = nn.Linear(weight_matrix.shape[1], d_model)
    self.pe = PositionalEncoder(d_model)
    self.layers = self.get_clones(EncoderLayer(d_model, heads, 0.3), N)
    self.norm = Norm(d_model)
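# A plausible forward pass for the encoder initialised above; a sketch only.
# The mask argument and the EncoderLayer/Norm call signatures are assumptions
# based on common Transformer implementations, not taken from the original code.
def forward(self, src, mask=None):
    # Look up pretrained embeddings, project to d_model, add positional encoding.
    x = self.embed(src)
    x = self.linear(x)
    x = self.pe(x)
    # Run the stack of N encoder layers, then a final layer norm.
    for layer in self.layers:
        x = layer(x, mask)
    return self.norm(x)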
def __init__(self, vocab_file, embed_dim, filter_num, filter_sizes, drop_rate,
             sen_len):
    self.embedding_ = Embedder(vocab_file, embed_dim)
    self.textcnn_ = TextCnn(filter_num, filter_sizes, drop_rate, embed_dim,
                            sen_len)
    self.textmatch_ = TextMatch(filter_num, filter_sizes, drop_rate, embed_dim,
                                sen_len)
    self.sen_len = sen_len
    self.__call__()
class Dataset:
    '''
    Dataset: each entry is
    [question string, question embedding, head entity, answer]
    '''

    def __init__(self, path_KG, path_QA, split_ratio=0.8, using_cache=True):
        self.KG = KnowledgeGraph(path_KG)
        self.embedder = Embedder()
        self.training = True  # whether we are in the training phase
        self._iter_i = 0
        self._split_ratio = split_ratio

        # try to load from cache
        if using_cache and Utility.Binary.exists('dataset'):
            self.questions = Utility.Binary.load('dataset')
            print('{} questions loaded'.format(len(self.questions)))
            return

        # read the original questions
        questions = pd.read_csv(
            path_QA,
            sep='\t',
            header=None,
            names=['question_sentence', 'answer_set', 'answer_path'])
        questions['answer'] = questions['answer_set'].apply(
            lambda x: x.split('(')[0])
        questions['q_split'] = questions['question_sentence'].apply(
            lambda x: x.lower().split(' '))
        questions['e_s'] = questions['answer_path'].apply(
            lambda x: x.split('#')[0])

        # find head entity e_s, answer, and question_list by parsing the
        # question_sentence
        questions['q_str'] = [
            self.parse_question(row['question_sentence'].split('?')[0],
                                row['e_s'])
            for idx, row in questions.iterrows()
        ]
        # embed the questions
        # NOTE: this trades space for time on a small dataset so the questions
        # are not re-embedded on every run; a large dataset needs separate
        # preprocessing
        questions['q'] = questions['q_str'].apply(
            lambda q: self.embed_question(q))

        question_list = questions[['q_str', 'q', 'e_s',
                                   'answer']].values.tolist()
        question_list = [tuple(x) for x in question_list]
        self.questions = question_list
        print('{} questions loaded'.format(len(question_list)))
        if using_cache:
            Utility.Binary.save('dataset', question_list)

    def embed_question(self, question):
        n, idx = len(question), 0
        q_emb = torch.zeros((n, ExpSet.word_embedding_dimension))
        for word in question:
            if word == '<e>':
                continue
            w_emb = self.embedder.get_word_embedding(word)
            if w_emb is not None:
                q_emb[idx] = w_emb
                idx = idx + 1
        return q_emb[:idx]

    def embed_relation(self, relation):
        return self.embedder.get_relation_embedding(relation)

    def parse_question(self, question: str, e_s: str):
        '''
        Split the question into a list of word tokens, find the head entity
        e_s (the longest token of the question that is an entity in the KG),
        and replace it with <e>.
        :param question: question string
        :param e_s: head entity
        :return: list of question tokens (strings)
        '''
        modified_question_list = []
        for item in question.split(' '):
            if item == e_s:
                modified_question_list.append('<e>')
            else:
                if len(item.split('_')) > 0:
                    for x in item.split('_'):
                        if x != '':
                            modified_question_list.append(x)
                else:
                    modified_question_list.append(item)
        return modified_question_list

    def __iter__(self):
        return self

    def __next__(self):
        try:
            d = self[self._iter_i]
            self._iter_i = self._iter_i + 1
            return d
        except IndexError:
            self._iter_i = 0
            raise StopIteration()

    def __getitem__(self, item):
        if item >= self.size:
            raise IndexError(
                'index out of bound, size={}, item={}, training={}'.format(
                    self.size, item, self.training))
        if self.training:
            return self.questions[item]
        return self.questions[self.training_size + item]

    def __len__(self):
        return self.size

    @property
    def size(self):
        if self.training:
            return self.training_size
        return self.testing_size

    @property
    def data_size(self):
        return len(self.questions)

    @property
    def testing_size(self):
        return self.data_size - self.training_size

    @property
    def training_size(self):
        return int(self._split_ratio * self.data_size)

    def train(self, _train=True):
        self.training = _train
        self._iter_i = 0
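# A brief usage sketch for the Dataset class above. It only exercises the API
# shown in the snippet (train(), iteration, the size properties); the two file
# paths are placeholders, not values from the original code.
if __name__ == '__main__':
    dataset = Dataset('data/kg.txt', 'data/qa.txt', split_ratio=0.8)

    dataset.train(True)           # iterate over the training split
    for q_str, q_emb, e_s, answer in dataset:
        pass                      # feed (q_emb, e_s) to a model here

    dataset.train(False)          # switch to the held-out split
    print('train={}, test={}'.format(dataset.training_size,
                                     dataset.testing_size))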
                                        max_length_en)
    target_tensor = tensor_from_sentence("de", output_lang, pair[1],
                                         max_length_de)
    return input_tensor, target_tensor


language, total_data = data_generator(batch_size, 20, device)
train_data, test_data, y_train, y_test = train_test_split(
    total_data, np.zeros(len(total_data)), test_size=0.1, random_state=42)

d_model = 128
heads = 8
N = 6
src_vocab = language.n_words
trg_vocab = language.n_words
en_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/glove.6B.300d.txt", language, 300)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, device,
                    en_weight_matrix, en_weight_matrix)
try:
    model.load_state_dict(
        torch.load("model/transformer.pt", map_location=device))
    model.eval()
except Exception:
    print("no weights exist")
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
                       max_length_de):
    input_tensor = tensor_from_sentence("en", input_lang, pair[0],
                                        max_length_en)
    target_tensor = tensor_from_sentence("de", output_lang, pair[1],
                                         max_length_de)
    return input_tensor, target_tensor


input_lang, output_lang, _ = prepare_data(lang1, lang2, 40)

d_model = 128
heads = 8
N = 6
src_vocab = input_lang.n_words
trg_vocab = output_lang.n_words
en_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/glove.6B.300d.txt", input_lang, 300)
de_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/vn_word2vec_300d.txt", input_lang, 300)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, device,
                    en_weight_matrix, de_weight_matrix)
model.load_state_dict(torch.load("model/transformer.pt", map_location=device))


def translate(model, sentence, lang_input, lang_output, max_len=80):
    model.eval()
    src = tensor_from_sentence("en", lang_input, sentence, len(sentence))
n_way = 3
n_support = 5
n_query = 5
max_length = 75
support, query, label = batch_maker(1, n_way, n_support, n_query, max_length,
                                    train['encoded'], train['label'])

from embedding import Embedder

model = Embedder(r.word_vec_tot, 10)
print('Support: ', support['pos'].shape)
output = model(support['pos'])
query1 = model(query['pos'])
print('Label', label)
print('Support Shape Embedding', output.shape)
output = output.view(3, 5, 750)
query1 = query1.view(15, 1, 750)
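# A sketch of the step that typically follows the reshaping above in a
# prototypical-network style setup: average the support embeddings into one
# prototype per class and score queries by (negative) squared Euclidean
# distance. This is an assumption about the next step, not code from the
# original file.
import torch

prototypes = output.mean(dim=1)                               # (n_way, 750)
distances = torch.cdist(query1.squeeze(1), prototypes) ** 2   # (n_query * n_way, n_way)
log_p_y = (-distances).log_softmax(dim=1)                     # class log-probabilities
predictions = log_p_y.argmax(dim=1)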