def get_batch_generator(word2id, id2idf, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len):
    """
    Generate answer-free batches, each enriched with question and context features.

    Counterpart of get_batch_generator in data_batcher.py; the differences
    live in refill_batches.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      id2idf: forwarded unchanged to get_question_features and get_context_features
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. max sizes of context and question. Anything longer is truncated.

    Yields:
      Batch objects carrying context/question ids, masks, features and uuids.
      qn_tokens, ans_span and ans_tokens are always None.
    """
    pending = []

    while True:
        # Top up the queue once it runs dry; an empty queue after a refill
        # means refill_batches produced nothing more, so the generator ends.
        if not pending:
            refill_batches(pending, word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len)
            if not pending:
                return

        # Oldest batch first
        uuids, context_tokens, raw_context_ids, raw_qn_ids = pending.pop(0)

        # Bring every question / context to its fixed length
        padded_qn = padded(raw_qn_ids, question_len)
        padded_context = padded(raw_context_ids, context_len)

        # Question ids as an array, plus a 0/1 mask marking non-PAD positions
        qn_ids = np.array(padded_qn)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        # Same for the context
        context_ids = np.array(padded_context)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        # Extra features, computed from the padded id arrays
        qn_features = get_question_features(word2id, id2idf, context_ids, qn_ids, qn_mask)
        cx_features = get_context_features(word2id, id2idf, context_ids, qn_ids, context_mask)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask, qn_tokens=None, ans_span=None, ans_tokens=None, qn_features=qn_features, cx_features=cx_features, uuids=uuids)
def get_batch_generator(word2id, qn_uuid_data, context_token_data,
                        qn_token_data, batch_size, context_len, question_len):
    """
    Generate Batch objects that hold only context and question information.

    This is similar to get_batch_generator in data_batcher.py, but with some
    differences: every yielded Batch has qn_tokens, ans_span and ans_tokens
    set to None.

    All arguments are forwarded to refill_batches; context_len and
    question_len are additionally used as the padding lengths.
    """
    queue = []

    while True:
        # Refill when drained; stop for good if the refill added nothing.
        if not queue:
            refill_batches(queue, word2id, qn_uuid_data, context_token_data,
                           qn_token_data, batch_size, context_len,
                           question_len)
            if not queue:
                return

        uuids, context_tokens, raw_context_ids, raw_qn_ids = queue.pop(0)

        padded_qn = padded(raw_qn_ids, question_len)
        padded_context = padded(raw_context_ids, context_len)

        # Masks are 1 wherever the id differs from PAD_ID, 0 elsewhere.
        qn_ids = np.array(padded_qn)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        context_ids = np.array(padded_context)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids,
                    qn_mask, qn_tokens=None, ans_span=None, ans_tokens=None,
                    uuids=uuids)
Beispiel #3
0
def test_append_batch_both_different():
    """Appending a batch whose two timestamp lists differ yields one entry per timestamp."""

    def expected_row(timestamp, entered, left):
        # One archived row: the timestamp label plus a counter per event type.
        return {
            LABEL_TIMESTAMP: timestamp,
            EventType.monitoring_started.name: 0,
            EventType.monitoring_ended.name: 0,
            EventType.person_entered.name: entered,
            EventType.person_left.name: left,
        }

    csv_archiver = CsvArchiver(SAMPLE_PATH)
    csv_archiver.append(Batch([SAMPLE_TIMESTAMP], [SAMPLE_TIMESTAMP2]))

    # First list is expected to count as "entered", second as "left".
    expected_entries = {
        SAMPLE_TIMESTAMP: expected_row(SAMPLE_TIMESTAMP, 1, 0),
        SAMPLE_TIMESTAMP2: expected_row(SAMPLE_TIMESTAMP2, 0, 1),
    }
    assert csv_archiver._entries == expected_entries
def get_batch_generator(word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, word_len):
    """
    Generate answer-free batches that also carry character-level ids and masks.

    Counterpart of get_batch_generator in data_batcher.py; the differences
    live in refill_batches.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      char2id: forwarded unchanged to refill_batches
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. max sizes of context and question. Anything longer is truncated.
      word_len: int. size of the last axis of the character-level arrays

    Yields:
      Batch objects, but they only contain context and question information (no answer information)
    """

    def overlay_on_ones(n_examples, seq_len, char_ids, char_mask):
        # Build two (n_examples, seq_len, word_len) arrays pre-filled with 1
        # and copy the given ids / mask into their leading corner. The corner
        # extent comes from the ids array and is reused for the mask.
        # NOTE(review): positions not covered by the data stay at 1 in BOTH the
        # ids and the mask -- confirm that 1 is the intended fill value.
        ids_full = np.ones(shape=(n_examples, seq_len, word_len))
        ids_arr = np.array(char_ids)
        dim0, dim1, dim2 = ids_arr.shape
        ids_full[:dim0, :dim1, :dim2] = ids_arr

        mask_full = np.ones(shape=(n_examples, seq_len, word_len))
        mask_arr = np.array(char_mask)
        mask_full[:dim0, :dim1, :dim2] = mask_arr
        return ids_full, mask_full

    pending = []

    while True:
        # Refill when drained; stop for good if the refill added nothing.
        if not pending:
            refill_batches(pending, word2id, char2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len, word_len)
            if not pending:
                return

        # Oldest batch first
        (uuids, context_tokens, raw_context_ids, raw_qn_ids,
         context_char_tokens, raw_context_char_ids, raw_context_char_mask,
         qn_char_tokens, raw_qn_char_ids, raw_qn_char_mask) = pending.pop(0)

        # Word-level padding to the fixed question / context lengths
        padded_qn = padded(raw_qn_ids, question_len)
        padded_context = padded(raw_context_ids, context_len)

        # Word-level arrays and 0/1 masks marking non-PAD positions
        qn_ids = np.array(padded_qn)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        context_ids = np.array(padded_context)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        n_examples = len(qn_ids)

        # Character-level arrays, context first and then question
        context_char_ids_np, context_char_mask_np = overlay_on_ones(
            n_examples, context_len, raw_context_char_ids, raw_context_char_mask)
        qn_char_ids_np, qn_char_mask_np = overlay_on_ones(
            n_examples, question_len, raw_qn_char_ids, raw_qn_char_mask)

        yield Batch(context_ids, context_mask, context_tokens, qn_ids, qn_mask, qn_tokens=None, ans_span=None, ans_tokens=None,
                    char_context_ids=context_char_ids_np, char_context_mask=context_char_mask_np, char_context_tokens=context_char_tokens,
                    char_qn_ids=qn_char_ids_np, char_qn_mask=qn_char_mask_np, char_qn_tokens=qn_char_tokens, uuids=uuids)
def get_batch_generator(word2id, qn_uuid_data, context_token_data,
                        qn_token_data, batch_size, context_len, question_len,
                        num_feats, word_len, mcids_dict):
    """
    Generate answer-free batches with extra features, character ids and
    "common word" masks / embedding indices.

    Counterpart of get_batch_generator in data_batcher.py; the differences
    live in refill_batches.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. max sizes of context and question. Anything longer is truncated.
      num_feats: passed to padded2 together with context_len when padding feats
      word_len: passed to refill_batches and to padded2 when padding character ids
      mcids_dict: forwarded unchanged to refill_batches

    Yields:
      Batch objects, but they only contain context and question information (no answer information)
    """
    queue = []

    while True:
        # Refill when drained; stop for good if the refill added nothing.
        if not queue:
            refill_batches(queue, word2id, qn_uuid_data, context_token_data,
                           qn_token_data, batch_size, context_len,
                           question_len, word_len, mcids_dict)
            if not queue:
                return

        # Oldest batch first
        (uuids, context_tokens, raw_context_ids, raw_qn_ids, raw_feats,
         raw_char_ids, raw_commonQ_mask, raw_commonQ_idx, raw_charQ_ids,
         raw_commonC_mask, raw_commonC_idx) = queue.pop(0)

        # Word-level padding to the fixed question / context lengths
        padded_qn = padded(raw_qn_ids, question_len)
        padded_context = padded(raw_context_ids, context_len)

        # Word-level arrays and 0/1 masks marking non-PAD positions
        qn_ids = np.array(padded_qn)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        context_ids = np.array(padded_context)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        # Per-token features for the context
        feats = np.array(padded2(raw_feats, num_feats, context_len))

        # Character ids: padded via padded2 with (word_len, sequence length),
        # then masked the same way as the word ids
        char_ids = np.array(
            padded2(raw_char_ids, word_len, context_len, islist=True))
        char_mask = (char_ids != PAD_ID).astype(np.int32)

        charQ_ids = np.array(
            padded2(raw_charQ_ids, word_len, question_len, islist=True))
        charQ_mask = (charQ_ids != PAD_ID).astype(np.int32)

        # Common-word masks and embedding indices, question then context
        commonQ_mask = np.array(paddedBool(raw_commonQ_mask, question_len))
        commonQ_emb_indices = np.array(padded(raw_commonQ_idx, question_len))

        commonC_mask = np.array(paddedBool(raw_commonC_mask, context_len))
        commonC_emb_indices = np.array(padded(raw_commonC_idx, context_len))

        yield Batch(
            context_ids, context_mask, context_tokens, qn_ids, qn_mask,
            qn_tokens=None, ans_span=None, ans_tokens=None,
            feats=feats, char_ids=char_ids, char_mask=char_mask,
            commonQ_mask=commonQ_mask,
            commonQ_emb_indices=commonQ_emb_indices,
            charQ_ids=charQ_ids, charQ_mask=charQ_mask,
            commonC_mask=commonC_mask,
            commonC_emb_indices=commonC_emb_indices,
            uuids=uuids)
Beispiel #6
0
def test_append_empty_batch():
    """Appending a batch built from two empty lists must leave the archiver without entries."""
    csv_archiver = CsvArchiver(SAMPLE_PATH)
    csv_archiver.append(Batch([], []))
    assert not csv_archiver._entries
Beispiel #7
0
    def convert_text_as_batch(self, context, question):
        """Convert one raw (context, question) pair into a single-example Batch.

        Steps: tokenize both texts, compute context-to-question match
        features, map tokens/tags to ids, wrap every field in a one-element
        list, pad to the configured lengths, convert to numpy arrays and
        build the PAD masks.

        Args:
            context: passage text, either UTF-8 encoded bytes or already
                decoded text.
            question: question text, either UTF-8 encoded bytes or already
                decoded text.

        Returns:
            A Batch holding exactly one example. The uuid (1), answer span
            ([0, 1]) and answer tokens ([]) are fixed placeholder values.
        """
        # Normalise to text. Only byte strings are decoded: calling .decode()
        # unconditionally fails for str on Python 3 (no such method) and, on
        # Python 2, implicitly ASCII-encodes unicode input first, which breaks
        # on non-ASCII text.
        if isinstance(context, bytes):
            context = context.decode("utf-8")
        if isinstance(question, bytes):
            question = question.decode("utf-8")

        # 1. Tokenization, POS tags, NER tags, lemmas, tf
        logger.debug("#" * 20 + " 开始转换为模型的输入 " + "#" * 20)
        t1 = time.time()
        context_tokens, context_poses, context_ners, context_lemmas, context_tf = tokenize_pos_ner(context)
        # Only the question's tokens and lemmas are used below.
        question_tokens, _, _, question_lemmas, _ = tokenize_pos_ner(question)
        t2 = time.time()
        logger.debug("分词、词性、实体处理: {}s".format(t2 - t1))

        # 2. Three kinds of context-to-question match features
        question_tokens_lower = [w.lower() for w in question_tokens]
        context_tokens_lower = [w.lower() for w in context_tokens]
        exact_match = c2q_match(context_tokens, set(question_tokens))  # exact match
        lower_match = c2q_match(context_tokens_lower, set(question_tokens_lower))  # lowercase match
        lemma_match = c2q_match(context_lemmas, set(question_lemmas))  # does the context token's lemma occur in the question
        # One 4-element feature row per context token: [tf, exact, lower, lemma]
        context_tf_match = [[f1, f2, f3, f4] for f1, f2, f3, f4 in
                            zip(context_tf, exact_match, lower_match, lemma_match)]
        t3 = time.time()
        logger.debug("文章与问题匹配处理: {}s".format(t3 - t2))

        # 3. Convert to ids (word lookup is on the lowercased token)
        context_ids = [self.word2id.get(token.lower(), UNK_ID) for token in context_tokens]
        ques_ids = [self.word2id.get(token.lower(), UNK_ID) for token in question_tokens]
        context_pos_ids = [self.tag2id.get(p, 0) for p in context_poses]
        context_ner_ids = [self.ner2id.get(n, 0) for n in context_ners]
        t4 = time.time()
        logger.debug("token->id 处理: {}s".format(t4 - t3))

        def batchry(item):
            """Wrap a single example as a batch, i.e. add one enclosing list."""
            return [item]

        batch_uuids = batchry(1)
        batch_context_ids = batchry(context_ids)
        batch_context_tokens = batchry(context_tokens)
        batch_context_pos_ids = batchry(context_pos_ids)
        batch_context_ner_ids = batchry(context_ner_ids)
        batch_context_features = batchry(context_tf_match)
        batch_ques_ids = batchry(ques_ids)
        batch_ques_tokens = batchry(question_tokens)
        batch_ans_span = batchry([0, 1])
        batch_ans_tokens = batchry([])

        # Padding
        context_len = self.FLAGS.context_len
        ques_len = self.FLAGS.ques_len
        batch_context_ids = pad(batch_context_ids, context_len)
        batch_context_pos_ids = pad(batch_context_pos_ids, context_len)
        batch_context_ner_ids = pad(batch_context_ner_ids, context_len)
        batch_ques_ids = pad(batch_ques_ids, ques_len)
        # Third argument is presumably the per-position pad value for the
        # 4-element feature rows -- confirm against pad().
        batch_context_features = pad(batch_context_features, context_len, np.array([0, 0, 0, 0]))

        # Convert to numpy arrays
        batch_context_ids = np.asarray(batch_context_ids)
        batch_context_pos_ids = np.asarray(batch_context_pos_ids)
        batch_context_ner_ids = np.asarray(batch_context_ner_ids)
        batch_context_features = np.asarray(batch_context_features)
        batch_ques_ids = np.asarray(batch_ques_ids)
        batch_ans_span = np.asarray(batch_ans_span)

        # Build the masks; the element-wise comparison requires numpy arrays
        batch_context_mask = (batch_context_ids != PAD_ID).astype(np.int32)
        batch_ques_mask = (batch_ques_ids != PAD_ID).astype(np.int32)

        batch = Batch(batch_context_ids, batch_context_mask, batch_context_tokens, batch_context_pos_ids,
                      batch_context_ner_ids,
                      batch_context_features, batch_ques_ids, batch_ques_mask, batch_ques_tokens, batch_ans_span,
                      batch_ans_tokens, batch_uuids)
        t5 = time.time()
        logger.debug("封装为 batch 处理: {}s".format(t5 - t4))
        logger.debug("#" * 10 + " 完成转换为模型的输入,共耗时:{} ".format(t5 - t1) + "#" * 10)

        return batch