def get_batch_generator(word2id, id2idf, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len):
    """Yield Batch objects holding context/question ids, masks and features.

    Similar to data_batcher.get_batch_generator, but the yielded batches carry
    no answer information (qn_tokens / ans_span / ans_tokens are None).

    Inputs:
      word2id: dict mapping word (string) to word id (int)
      id2idf: dict of idf weights, passed through to the feature builders
      qn_uuid_data: list of unique-id strings
      context_token_data, qn_token_data: lists of token lists (no UNKs, no padding)
      batch_size: number of examples per batch
      context_len, question_len: max lengths; anything longer is truncated

    Yields:
      Batch objects (context/question information only, no answers).
    """
    pending = []
    while True:
        if not pending:
            refill_batches(pending, word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len)
            if not pending:
                break  # data exhausted
        uuids, context_tokens, context_ids, qn_ids = pending.pop(0)

        # Pad to fixed length, vectorize, and mark non-PAD positions.
        qn_ids = np.array(padded(qn_ids, question_len))
        context_ids = np.array(padded(context_ids, context_len))
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        # Feature vectors produced by project helpers from ids + masks + idf table.
        qn_features = get_question_features(word2id, id2idf, context_ids, qn_ids, qn_mask)
        cx_features = get_context_features(word2id, id2idf, context_ids, qn_ids, context_mask)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask,
                    qn_tokens=None, ans_span=None, ans_tokens=None,
                    qn_features=qn_features, cx_features=cx_features, uuids=uuids)
def get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len):
    """Generate Batch objects carrying only context/question data (no answers).

    This is similar to get_batch_generator in data_batcher.py, but with some
    differences: batches are built from raw token data and contain no answer
    fields (qn_tokens / ans_span / ans_tokens are None).
    """
    queue = []
    while True:
        if len(queue) == 0:
            refill_batches(queue, word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len)
        if len(queue) == 0:
            break  # nothing left to batch
        uuids, context_tokens, context_ids, qn_ids = queue.pop(0)

        # Fixed-length padding, then numpy conversion; masks flag non-PAD slots.
        context_ids = np.array(padded(context_ids, context_len))
        qn_ids = np.array(padded(qn_ids, question_len))
        context_mask = (context_ids != PAD_ID).astype(np.int32)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask,
                    qn_tokens=None, ans_span=None, ans_tokens=None, uuids=uuids)
def test_append_batch_both_different():
    """Appending a batch with two distinct timestamps records one entry each:
    an enter event at the first timestamp, a leave event at the second."""
    archiver = CsvArchiver(SAMPLE_PATH)
    archiver.append(Batch([SAMPLE_TIMESTAMP], [SAMPLE_TIMESTAMP2]))

    def row(ts, entered, left):
        # One archive row: zeroed monitoring counters plus the given event counts.
        return {
            LABEL_TIMESTAMP: ts,
            EventType.monitoring_started.name: 0,
            EventType.monitoring_ended.name: 0,
            EventType.person_entered.name: entered,
            EventType.person_left.name: left,
        }

    expected = {
        SAMPLE_TIMESTAMP: row(SAMPLE_TIMESTAMP, 1, 0),
        SAMPLE_TIMESTAMP2: row(SAMPLE_TIMESTAMP2, 0, 1),
    }
    assert archiver._entries == expected
def get_batch_generator(word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, word_len):
    """Yield answer-free Batch objects carrying word- and char-level ids/masks.

    Similar to get_batch_generator in data_batcher.py, but with some
    differences (see explanation in refill_batches).

    Inputs:
      word2id: dict mapping word (string) to word id (int)
      char2id: dict mapping character to char id, consumed by refill_batches
      qn_uuid_data: list of unique-id strings
      context_token_data, qn_token_data: lists of token lists (no UNKs, no padding)
      batch_size: number of examples per batch
      context_len, question_len: max lengths; anything longer is truncated
      word_len: max number of characters per word

    Yields:
      Batch objects with word/char ids, masks and tokens (no answer data).
    """
    queue = []
    while True:
        if not queue:
            refill_batches(queue, word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, word_len)
            if not queue:
                break  # data exhausted
        (uuids, context_tokens, context_ids, qn_ids,
         context_char_tokens, context_char_ids, context_char_mask,
         qn_char_tokens, qn_char_ids, qn_char_mask) = queue.pop(0)

        # Word-level: pad, vectorize, derive non-PAD masks.
        qn_ids = np.array(padded(qn_ids, question_len))
        context_ids = np.array(padded(context_ids, context_len))
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        curr_batch_size = len(qn_ids)

        def _fill(ids, mask, seq_len):
            # Embed the (possibly smaller) char arrays into ones-filled arrays
            # of the full (batch, seq_len, word_len) shape; the ids' shape also
            # bounds the slice written for the mask array.
            # NOTE(review): unfilled slots stay 1 even in the mask array, so
            # padding chars look "real" to the mask — confirm downstream intent.
            ids = np.array(ids)
            a, b, c = ids.shape
            ids_full = np.ones(shape=(curr_batch_size, seq_len, word_len))
            ids_full[:a, :b, :c] = ids
            mask_full = np.ones(shape=(curr_batch_size, seq_len, word_len))
            mask_full[:a, :b, :c] = np.array(mask)
            return ids_full, mask_full

        context_char_ids_np, context_char_mask_np = _fill(context_char_ids, context_char_mask, context_len)
        qn_char_ids_np, qn_char_mask_np = _fill(qn_char_ids, qn_char_mask, question_len)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask,
                    qn_tokens=None, ans_span=None, ans_tokens=None,
                    char_context_ids=context_char_ids_np,
                    char_context_mask=context_char_mask_np,
                    char_context_tokens=context_char_tokens,
                    char_qn_ids=qn_char_ids_np,
                    char_qn_mask=qn_char_mask_np,
                    char_qn_tokens=qn_char_tokens,
                    uuids=uuids)
def get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, num_feats, word_len, mcids_dict):
    """Yield answer-free Batch objects with word ids, char ids, per-token
    feature vectors, and common-word masks / embedding indices.

    Similar to get_batch_generator in data_batcher.py, but with some
    differences (see explanation in refill_batches).

    Inputs:
      word2id: dict mapping word (string) to word id (int)
      qn_uuid_data: list of unique-id strings
      context_token_data, qn_token_data: lists of token lists (no UNKs, no padding)
      batch_size: number of examples per batch
      context_len, question_len: max lengths; anything longer is truncated
      num_feats: feature-vector width for padded2
      word_len: max number of characters per word
      mcids_dict: passed through to refill_batches

    Yields:
      Batch objects (context/question information only, no answers).
    """
    ready = []
    while True:
        if not ready:
            refill_batches(ready, word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, word_len, mcids_dict)
            if not ready:
                break  # data exhausted
        (uuids, context_tokens, context_ids, qn_ids, feats, char_ids,
         commonQ_mask, commonQ_emb_indices, charQ_ids,
         commonC_mask, commonC_emb_indices) = ready.pop(0)

        # Word-level ids: pad, vectorize, mask non-PAD entries.
        qn_ids = np.array(padded(qn_ids, question_len))
        context_ids = np.array(padded(context_ids, context_len))
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        # Per-token feature vectors, padded in two dimensions.
        feats = np.array(padded2(feats, num_feats, context_len))

        # Char-level ids: pad word length then sequence length, then mask.
        char_ids = np.array(padded2(char_ids, word_len, context_len, islist=True))
        char_mask = (char_ids != PAD_ID).astype(np.int32)
        charQ_ids = np.array(padded2(charQ_ids, word_len, question_len, islist=True))
        charQ_mask = (charQ_ids != PAD_ID).astype(np.int32)

        # Common-word masks and embedding indices, padded to fixed lengths.
        commonQ_mask = np.array(paddedBool(commonQ_mask, question_len))
        commonQ_emb_indices = np.array(padded(commonQ_emb_indices, question_len))
        commonC_mask = np.array(paddedBool(commonC_mask, context_len))
        commonC_emb_indices = np.array(padded(commonC_emb_indices, context_len))

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask,
                    qn_tokens=None, ans_span=None, ans_tokens=None,
                    feats=feats, char_ids=char_ids, char_mask=char_mask,
                    commonQ_mask=commonQ_mask, commonQ_emb_indices=commonQ_emb_indices,
                    charQ_ids=charQ_ids, charQ_mask=charQ_mask,
                    commonC_mask=commonC_mask, commonC_emb_indices=commonC_emb_indices,
                    uuids=uuids)
def test_append_empty_batch():
    """Appending a batch with no events must leave the archive without entries."""
    archiver = CsvArchiver(SAMPLE_PATH)
    archiver.append(Batch([], []))
    assert not archiver._entries
def convert_text_as_batch(self, context, question):
    """Convert one raw (context, question) string pair into a single-example Batch.

    Steps: tokenize with POS/NER/lemma/tf annotation, build three
    context-to-question match features, map tokens to ids, wrap every field
    as a batch of size one, pad to the configured lengths, convert to numpy
    arrays and derive the PAD masks. Timing of each stage is logged.
    """
    # Work in unicode (Python 2: str -> unicode).
    context = context.decode("utf-8")
    question = question.decode("utf-8")

    # 1. Tokenization plus POS / NER / lemma / term-frequency annotation.
    logger.debug("#" * 20 + " 开始转换为模型的输入 " + "#" * 20)
    t1 = time.time()
    context_tokens, context_poses, context_ners, context_lemmas, context_tf = tokenize_pos_ner(context)
    question_tokens, question_poses, question_ners, question_lemmas, _ = tokenize_pos_ner(question)
    t2 = time.time()
    logger.debug("分词、词性、实体处理: {}s".format(t2 - t1))

    # 2. Three context-to-question match features: exact, lowercased, lemma.
    question_tokens_lower = [w.lower() for w in question_tokens]
    context_tokens_lower = [w.lower() for w in context_tokens]
    exact_match = c2q_match(context_tokens, set(question_tokens))
    lower_match = c2q_match(context_tokens_lower, set(question_tokens_lower))
    lemma_match = c2q_match(context_lemmas, set(question_lemmas))
    # One [tf, exact, lower, lemma] feature vector per context token.
    context_tf_match = [list(feats) for feats in zip(context_tf, exact_match, lower_match, lemma_match)]
    t3 = time.time()
    logger.debug("文章与问题匹配处理: {}s".format(t3 - t2))

    # 3. Map tokens to ids (unknown words -> UNK_ID, unknown tags/NERs -> 0).
    context_ids = [self.word2id.get(token.lower(), UNK_ID) for token in context_tokens]
    ques_ids = [self.word2id.get(token.lower(), UNK_ID) for token in question_tokens]
    context_pos_ids = [self.tag2id.get(p, 0) for p in context_poses]
    context_ner_ids = [self.ner2id.get(n, 0) for n in context_ners]
    t4 = time.time()
    logger.debug("token->id 处理: {}s".format(t4 - t3))

    # Wrap each field as a batch of size one (a dummy uuid and answer span
    # are supplied, since this path has no ground-truth answer).
    batch_uuids = [1]
    batch_context_ids = [context_ids]
    batch_context_tokens = [context_tokens]
    batch_context_pos_ids = [context_pos_ids]
    batch_context_ner_ids = [context_ner_ids]
    batch_context_features = [context_tf_match]
    batch_ques_ids = [ques_ids]
    batch_ques_tokens = [question_tokens]
    batch_ans_span = [[0, 1]]
    batch_ans_tokens = [[]]

    # Pad to the configured fixed lengths.
    context_len = self.FLAGS.context_len
    ques_len = self.FLAGS.ques_len
    batch_context_ids = pad(batch_context_ids, context_len)
    batch_context_pos_ids = pad(batch_context_pos_ids, context_len)
    batch_context_ner_ids = pad(batch_context_ner_ids, context_len)
    batch_ques_ids = pad(batch_ques_ids, ques_len)
    batch_context_features = pad(batch_context_features, context_len, np.array([0, 0, 0, 0]))

    # Convert to numpy arrays; the masks require the array form.
    batch_context_ids = np.asarray(batch_context_ids)
    batch_context_pos_ids = np.asarray(batch_context_pos_ids)
    batch_context_ner_ids = np.asarray(batch_context_ner_ids)
    batch_context_features = np.asarray(batch_context_features)
    batch_ques_ids = np.asarray(batch_ques_ids)
    batch_ans_span = np.asarray(batch_ans_span)
    batch_context_mask = (batch_context_ids != PAD_ID).astype(np.int32)
    batch_ques_mask = (batch_ques_ids != PAD_ID).astype(np.int32)

    batch = Batch(batch_context_ids, batch_context_mask, batch_context_tokens,
                  batch_context_pos_ids, batch_context_ner_ids, batch_context_features,
                  batch_ques_ids, batch_ques_mask, batch_ques_tokens,
                  batch_ans_span, batch_ans_tokens, batch_uuids)
    t5 = time.time()
    logger.debug("封装为 batch 处理: {}s".format(t5 - t4))
    logger.debug("#" * 10 + " 完成转换为模型的输入,共耗时:{} ".format(t5 - t1) + "#" * 10)
    return batch