def get_data(source_path, target_path, source_embedding_path, target_embedding_path):
    # source_word, target_word = set(), set()
    ## words below a frequency threshold could be filtered out here
    source_word_dict, target_word_dict = defaultdict(int), defaultdict(int)
    # source_path = "/home/FuDawei/NLP/Machine_Translation/dataset/datum2017/Book1_en.txt"
    # target_path = "/home/FuDawei/NLP/Machine_Translation/dataset/datum2017/Book1_cn.txt"
    get_wordset(source_path, source_word_dict)
    # get_wordset(source_dev_path, source_word_dict)
    # get_wordset(target_train_path, target_word_dict)
    get_wordset(target_path, target_word_dict)
    # source_thresh_cnt = 1
    # target_thresh_cnt = 8
    # source_word = list(filter(lambda x: source_word_dict[x] > source_thresh_cnt, source_word_dict.keys()))
    # target_word = list(filter(lambda x: target_word_dict[x] > target_thresh_cnt, target_word_dict.keys()))
    source_word = list(source_word_dict.keys())
    target_word = list(target_word_dict.keys())
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/"
    # get_embedding signature: path, word, glove_dim, ignore_head, debug=True
    target_embedding, target_word2id, target_id2word = get_embedding(target_embedding_path, target_word, 300, 1)
    print("target " + str(len(target_word2id)))
    source_embedding, source_word2id, source_id2word = get_embedding(source_embedding_path, source_word, 300, 0)
    print("source " + str(len(source_word2id)))
    dump_data(source_embedding, save_dir + "source_embedding.json")
    dump_data(source_word2id, save_dir + "source_word2id.json")
    dump_data(source_id2word, save_dir + "source_id2word.json")
    dump_data(target_embedding, save_dir + "target_embedding.json")
    dump_data(target_word2id, save_dir + "target_word2id.json")
    dump_data(target_id2word, save_dir + "target_id2word.json")
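
# Hypothetical invocation of get_data; the corpus and embedding file paths
# below are placeholders for illustration, not files known to exist in the repo.
if __name__ == '__main__':
    get_data("dataset/datum2017/Book1_en.txt",
             "dataset/datum2017/Book1_cn.txt",
             "dataset/en_embedding_300d.txt",
             "dataset/cn_embedding_300d.txt")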
def get_if_duplicate(sentence1, sentence2):
    """Return the duplicate label for a sentence pair using the global model."""
    is_duplicate = 0
    try:
        global model
        if sentence1 and sentence2:
            sentence1 = tokenize_sent(str(sentence1).lower())
            sentence2 = tokenize_sent(str(sentence2).lower())
            sentence1 = [get_embedding(w) for w in sentence1]
            sentence2 = [get_embedding(w) for w in sentence2]
            len1 = len(sentence1)
            len2 = len(sentence2)
            results = model.sess.run(
                model.is_duplicate,
                feed_dict=model.get_feed_dict(
                    [sentence1], [sentence2], [len1], [len2], None
                )
            )
            is_duplicate = results[0]
    except Exception as e:
        print(str(e))
    return dup_dict[is_duplicate]
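
# A minimal usage sketch, assuming the global `model`, `tokenize_sent`,
# `get_embedding`, and `dup_dict` have already been initialised elsewhere in
# this module; the question pair is made up for illustration.
if __name__ == '__main__':
    label = get_if_duplicate("How do I learn Python?",
                             "What is the best way to learn Python?")
    print(label)  # whatever dup_dict maps the predicted 0/1 result to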
def __init__(self, params):
    super(PGN, self).__init__()
    word_model_path = os.path.join(os.path.abspath('../'), 'data', 'w2v.model')
    vocab_path = os.path.join(os.path.abspath('../'), 'data', 'words_frequences.txt')
    self.params = params
    self.matrix = get_embedding(vocab_path, word_model_path, params)
    self.encoder = Encoder(params["vocab_size"], params["embed_size"], self.matrix,
                           params["enc_units"], params["batch_size"])
    self.attention = BahdanauAttention(params["attn_units"])
    self.decoder = Decoder(params["vocab_size"], params["embed_size"], self.matrix,
                           params["dec_units"], params["batch_size"])
    self.pointer = Pointer()
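
# Illustrative only: a params dict with the keys the constructor above reads.
# The values are assumptions, not the project's actual hyperparameters, and
# building the model still requires data/w2v.model and data/words_frequences.txt
# to be present one directory up.
if __name__ == '__main__':
    params = {
        "vocab_size": 30000,
        "embed_size": 256,
        "enc_units": 256,
        "dec_units": 256,
        "attn_units": 256,
        "batch_size": 32,
    }
    pgn = PGN(params)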
def get_similar_documents(query: str, count: int) -> List[Dict[str, str]]:
    """Get the `count` documents most similar to `query` via a kNN search on the embedding field."""
    index = config.get_es_index()
    embedded_query = get_embedding(query)
    knn_query = {"size": count, "query": {"knn": {"embedding": {"vector": embedded_query, "k": count}}}}
    results = es_handler.search(index=index, body=knn_query)["hits"]["hits"]
    documents = []
    for res in results:
        doc = {"id": res["_id"], "score": res["_score"]}
        source = res["_source"]
        source.pop("embedding")
        doc.update(source)
        documents.append(doc)
    return documents
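
# Usage sketch, assuming `config`, `es_handler`, and `get_embedding` are wired
# up as in the rest of this module; the query text is made up.
if __name__ == '__main__':
    for doc in get_similar_documents("approximate nearest neighbour search", count=5):
        print(doc["id"], doc["score"])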
def index_page(pageid: int) -> None:
    """
    Add a page to the index:
    - get the document from S3
    - get an embedding from the document content
    - prepare the ES document
    - index the ES document
    """
    index_name = config.get_es_index()
    s3_bucket = config.get_s3_bucket()
    s3_prefix = config.get_s3_prefix()
    s3_file_uri = f"S3://{s3_bucket}/{s3_prefix}/{pageid}"
    with smart_open(s3_file_uri, "r") as fp:
        page = json.load(fp)
    page_id = page.pop("pageid")
    content = page.pop("content")
    page["uri"] = s3_file_uri
    page["embedding"] = get_embedding(content)
    es_handler.index(index_name, body=page, id=page_id)
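
# Usage sketch: index a single page, assuming the S3 object
# s3://<bucket>/<prefix>/<pageid> exists and holds JSON with "pageid" and
# "content" fields. The page id below is a placeholder.
if __name__ == '__main__':
    index_page(12345)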
def read_corpus(random, max_len):
    vocab, word2id, embedding = get_embedding(random, 300)
    sentsid_, sents_, tags_ = [], [], []
    logging.info("Start reading the dataset")
    with open(config.split_data, encoding='utf-8') as fr:
        lines = fr.readlines()
    sentid_, sent_, tag_ = [], [], []
    for line in lines:
        if line != '\n':
            char, label = line.strip().split()
            tag_.append(tag2label[label])
            if char.startswith("num"):
                sent_.append("num")
                sentid_.append(1)
            elif char.startswith("en"):
                sent_.append("en")
                sentid_.append(2)
            elif '\u4e00' <= char <= '\u9fa5' and char in vocab:
                sent_.append(char)
                sentid_.append(word2id[char])
            else:
                sent_.append("unk")
                sentid_.append(0)
        else:
            if 3 < len(sent_) <= max_len:
                sents_.append(sent_)
                tags_.append(tag_)
                sentsid_.append(sentid_)
                sentid_, sent_, tag_ = [], [], []
            else:
                sentid_, sent_, tag_ = [], [], []
    # Padding is applied later in get_feed_dict instead of padding everything here in advance.
    # padding_tags = tflearn.data_utils.pad_sequences(tags_, maxlen=max_len, value=3)
    # padding_sentsid = tflearn.data_utils.pad_sequences(sentsid_, maxlen=max_len, value=0)
    # print(sents_[0])
    # print(padding_sentsid[0])
    # print(padding_tags[0])
    return sentsid_, sents_, tags_
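
# Usage sketch: load the tagged corpus with randomly initialised 300-d
# embeddings and a maximum sentence length of 100; both argument values are
# assumptions for illustration.
if __name__ == '__main__':
    sentsid, sents, tags = read_corpus(random=True, max_len=100)
    logging.info("loaded %d sentences", len(sentsid))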
        gru_out = tf.concat([tf.squeeze(context, 1), tf.squeeze(gru_out, 1)], 1)
        gru_out = self.wc(gru_out)
        logits = self.ws(gru_out)
        return logits, state_h, state_c, aligment

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.gru_size]),
                tf.zeros([batch_size, self.gru_size]))


from embedding import get_embedding

embedding_matrix1, embedding_matrix2, input_tensor, target_tensor, tokenizer1, tokenizer2 = get_embedding()

BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor) // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(input_tensor) + 1
vocab_tar_size = len(target_tensor) + 1

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


def data_loader(input_tensor, target_tensor):
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, target_tensor)).shuffle(len(input_tensor))
                    default='predict', help='one of three modes: train/test/predict')
parser.add_argument('--embedding_random', type=str2bool, default=True,
                    help='use randomly initialised character embeddings (True) or pretrained ones (False); defaults to random')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='train (update) the embeddings by default')
args = parser.parse_args()

train_data, test_data = get_train_test_data(args.embedding_random, args.max_len)
vocab, word2id, embeddings = get_embedding(args.embedding_random, args.embedding_dim)

configs = tf.ConfigProto()
configs.gpu_options.allow_growth = True
configs.gpu_options.per_process_gpu_memory_fraction = 0.2

# paths setting
paths = {}
output_path = config.output_path
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
    return train_dataloader, val_dataloader


if __name__ == '__main__':
    # vocab_size = 10000
    # vocabulary = get_all_vocabulary(train_file_path='dataset/train.csv', vocab_size=vocab_size)
    # assert isinstance(vocabulary, list)
    # assert isinstance(vocabulary[0], str)
    # assert len(vocabulary) <= vocab_size
    with open('dataset/vocabulary.txt', 'r') as f:
        vocabulary = f.readlines()
    vocabulary = [v.strip() for v in vocabulary]
    embedding, token2id, vocab_size = get_embedding(set(vocabulary))
    X_train, y_train, X_val, y_val, label2id, id2label = get_train_data(
        'dataset/train.csv', vocab2ids=token2id)
    print(X_train, y_train, X_val, y_val, label2id, id2label)
    train_loader, val_loader = build_dataloader(X_train, y_train, X_val, y_val,
                                                batch_size=128)
    for i, (x, y) in enumerate(train_loader):
        ic(x)
        ic(y)
def forward(self, input_ids=None):
    word_embeddings = self.embedding(input_ids)
    sentence_embedding = word_embeddings.unsqueeze(1)
    out = torch.cat([self.conv_and_pool(sentence_embedding, conv) for conv in self.convs], 1)
    out = self.dropout(out)
    out = self.fc(out)
    outputs = (out, )
    return outputs


if __name__ == '__main__':
    some_text_sentence = '今天股市大跌'  # "the stock market fell sharply today"
    words = list(jieba.cut(some_text_sentence))
    embedding, token2id, _ = get_embedding(set(words))
    text_cnn_model = TextCNN(embedding, each_filter_num=128,
                             filter_heights=[2, 3, 5], drop_out=0.3, num_classes=15)
    ids = [token2id[w] for w in words]

    some_text_sentence = '测试一个新句子'  # "test a new sentence"
    words = list(jieba.cut(some_text_sentence))
    # NOTE: this rebuilds token2id from the new sentence; `ids` above still
    # index the vocabulary the model's embedding was built from.
    embedding, token2id, _ = get_embedding(set(words))
    # out = text_cnn_model(torch.tensor([ids]))
    # print(out)