Example #1
def convert_examples_to_features(examples, config):
    features = []
    for index, example in enumerate(tqdm(examples, desc='Converting Examples')):
        src_seq = example.src.split()
        tgt_seq = example.tgt.split() if example.tgt else []

        # Truncate, then wrap with sentence boundary markers (final length
        # can reach max_seq_length + 2). Note that lowering is applied after
        # the markers are added, so it touches them as well.
        src_seq = [config.sos] + src_seq[:config.max_seq_length] + [config.eos]
        tgt_seq = [config.sos] + tgt_seq[:config.max_seq_length] + [config.eos]
        if config.to_lower:
            src_seq = list(map(str.lower, src_seq))
            tgt_seq = list(map(str.lower, tgt_seq))

        src_ids = convert_list(src_seq, config.src_2_id, config.pad_id, config.unk_id)
        tgt_ids = convert_list(tgt_seq, config.tgt_2_id, config.pad_id, config.unk_id)

        features.append(InputFeatures(example.guid, src_ids, tgt_ids))

        if index < 5:
            logger.info(log_title('Examples'))
            logger.info('guid: {}'.format(example.guid))
            logger.info('source input: {}'.format(src_seq))
            logger.info('source ids: {}'.format(src_ids))
            logger.info('target input: {}'.format(tgt_seq))
            logger.info('target ids: {}'.format(tgt_ids))

    return features
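All of these snippets share a convert_list helper whose definition is not included. Judging from the call sites (tokens to ids here, ids back to tokens in Examples #7 and #8), a minimal sketch could look like the following; treat it as an assumption, since the pad argument never visibly takes effect in any call above:

def convert_list(seq, mapping, pad, unk):
    # Look each element up in `mapping`, falling back to `unk` for
    # out-of-vocabulary elements. `pad` is kept only for signature
    # compatibility; how the original helper uses it is assumed.
    return [mapping.get(item, unk) for item in seq]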
Example #2
    def _read_data(self, data_file):
        src_seq = []
        tgt_seq = []

        counter = 0
        for line in read_json_lines(data_file):
            src = line.get('src', [])
            tgt = line.get('tgt', [])

            if self.config.to_lower:
                src = list(map(str.lower, src))
                tgt = list(map(str.lower, tgt))

            src = src[:self.config.sequence_len]
            tgt = [self.config.sos] + tgt[:self.config.sequence_len - 2] + [self.config.eos]

            src_seq.append(
                convert_list(src, self.config.word_2_id, self.config.pad_id,
                             self.config.unk_id))
            tgt_seq.append(
                convert_list(tgt, self.config.word_2_id, self.config.pad_id,
                             self.config.unk_id))

            counter += 1
            if counter % 10000 == 0:
                print('\rprocessing file {}: {:>6d}'.format(data_file, counter), end='')
        print()

        return src_seq, tgt_seq
Example #3
    def _read_data(self, data_file):
        input_ids = []
        input_mask = []
        segment_ids = []
        input_length = []
        pos_ids = []
        tag_ids = []

        counter = 0
        with open(data_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                line = json.loads(line)
                context = line['context']
                pos_seq = line['pos_seq']
                tag_seq = line['tag_seq']

                # _convert_single_example returns (input_ids, input_mask,
                # segment_ids, input_length, pos_seq, tag_seq); the POS and
                # tag sequences still need mapping to ids.
                v1, v2, v3, v4, v5, v6 = _convert_single_example(
                    context, pos_seq, tag_seq, self.config.sequence_len,
                    self.tokenizer)
                v5 = convert_list(v5, self.pos_2_id, self.config.pad_id,
                                  self.config.unk_id)
                # Tags fall back to id 0 for both padding and unknowns.
                v6 = convert_list(v6, self.tag_2_id, 0, 0)

                input_ids.append(v1)
                input_mask.append(v2)
                segment_ids.append(v3)
                input_length.append(v4)
                pos_ids.append(v5)
                tag_ids.append(v6)

                counter += 1
                print('\rprocessing: {}'.format(counter), end='')
            print()

        return input_ids, input_mask, segment_ids, input_length, pos_ids, tag_ids
Example #4
    def _read_data(self, data_file):
        topic = []
        triple = []
        src = []
        tgt = []

        data_iter = tqdm(list(read_json_lines(data_file)))
        for index, line in enumerate(data_iter):
            sep = ' {} '.format(self.config.sep)
            topic_seq = sep.join(line['topic'])
            triple_seq = sep.join(' '.join(v) for v in line['triples'])
            src_seq = sep.join(line['src'])
            tgt_seq = line['tgt']

            if self.config.to_lower:
                topic_seq = topic_seq.lower()
                triple_seq = triple_seq.lower()
                src_seq = src_seq.lower()
                tgt_seq = tgt_seq.lower()

            topic_tokens = [self.config.sos] + topic_seq.split() + [self.config.eos]
            triple_tokens = ([self.config.sos]
                             + triple_seq.split()[:self.config.max_triple_length]
                             + [self.config.eos])
            # Keep the last max_seq_length source tokens (most recent context).
            src_tokens = ([self.config.sos]
                          + src_seq.split()[-self.config.max_seq_length:]
                          + [self.config.eos])
            tgt_tokens = ([self.config.sos]
                          + tgt_seq.split()[:self.config.max_seq_length]
                          + [self.config.eos])

            topic_ids = convert_list(topic_tokens, self.config.word_2_id,
                                     self.config.pad_id, self.config.unk_id)
            triple_ids = convert_list(triple_tokens, self.config.word_2_id,
                                      self.config.pad_id, self.config.unk_id)
            src_ids = convert_list(src_tokens, self.config.word_2_id,
                                   self.config.pad_id, self.config.unk_id)
            tgt_ids = convert_list(tgt_tokens, self.config.word_2_id,
                                   self.config.pad_id, self.config.unk_id)

            topic.append(topic_ids)
            triple.append(triple_ids)
            src.append(src_ids)
            tgt.append(tgt_ids)

            if index < 5:
                logger.info(log_title('Examples'))
                logger.info('topic tokens: {}'.format(topic_tokens))
                logger.info('topic ids: {}'.format(topic_ids))
                logger.info('triple tokens: {}'.format(triple_tokens))
                logger.info('triple ids: {}'.format(triple_ids))
                logger.info('source tokens: {}'.format(src_tokens))
                logger.info('source ids: {}'.format(src_ids))
                logger.info('target tokens: {}'.format(tgt_tokens))
                logger.info('target ids: {}'.format(tgt_ids))

        return topic, triple, src, tgt
Example #5
def save_result(predicted_ids, alignment_history, id_2_label, input_file, output_file):
    src_inputs = []
    with open(input_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            entity = WikiEntity(line)
            box = entity.get_box()
            if len(box) == 0:
                continue
            src = []
            for value in box.values():
                src += value.split()
            src_inputs.append(src)

    tgt_outputs = []
    for tgt in predicted_ids:
        # Force an eos at the last position so tgt.index() cannot raise
        # ValueError, then cut at the first eos and map ids back to words.
        tgt[-1] = config.eos_id
        tgt_outputs.append(convert_list(tgt[:tgt.index(config.eos_id)],
                                        id_2_label, config.pad, config.unk))

    assert len(src_inputs) == len(tgt_outputs)

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs, alignment_history):
            # Replace each generated unk with the source word its attention
            # alignment points to (a simple copy mechanism).
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'description': ' '.join(tgt)}, ensure_ascii=False), file=fout)
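The unk-replacement loop above is a small copy mechanism. Factored out on its own (the function name is illustrative, not from the source repo), it reads:

def copy_unks(tgt_tokens, src_tokens, alignment, unk='<unk>'):
    # Substitute the aligned source token wherever the decoder emitted
    # the unknown symbol.
    return [src_tokens[i] if tok == unk else tok
            for tok, i in zip(tgt_tokens, alignment)]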
Example #6
def save_result_v2(predicted_ids, id_2_label, output_file):
    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt in predicted_ids:
            # Same eos trick as in save_result above: guarantee the marker
            # exists, then truncate at its first occurrence.
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                               config.pad, config.unk)
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)
Example #7
def save_result(outputs, result_file, tokenizer, id_2_tag):
    print('write file: {}'.format(result_file))
    with open(result_file, 'w', encoding='utf-8') as fout:
        for context, tags in outputs:
            context = tokenizer.convert_ids_to_tokens(context)
            tags = convert_list(tags, id_2_tag, 'O', 'O')
            result = parse_output(tags, context)
            print(json.dumps(result, ensure_ascii=False), file=fout)
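parse_output is defined elsewhere in the source repo, so its exact behavior is not shown here. Assuming conventional BIO tags ('B-X'/'I-X'/'O', which matches the 'O' fallback used above), a parser in that spirit might look like this sketch:

def parse_bio(tags, tokens):
    # Group consecutive B-/I- tags into labeled spans; 'O' closes a span.
    entities, span, label = [], [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            if span:
                entities.append({'type': label, 'text': ''.join(span)})
            span, label = [token], tag[2:]
        elif tag.startswith('I-') and span:
            span.append(token)
        else:
            if span:
                entities.append({'type': label, 'text': ''.join(span)})
            span, label = [], None
    if span:
        entities.append({'type': label, 'text': ''.join(span)})
    return entities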
Example #8
def check_data(data, tokenizer, id_2_pos, id_2_tag):
    input_ids, input_mask, segment_ids, input_length, pos_ids, tag_ids = data

    for _ in range(5):
        print('=' * 20)
        index = np.random.randint(0, len(input_ids))
        print('id: {}'.format(index))
        length = input_length[index]

        input_tokens = tokenizer.convert_ids_to_tokens(input_ids[index])
        print('input tokens: {}'.format(input_tokens[:length]))
        pos_tokens = convert_list(pos_ids[index], id_2_pos, '<pad>', '<unk>')
        print('pos tokens: {}'.format(pos_tokens[:length]))
        tag_tokens = convert_list(tag_ids[index], id_2_tag, 'O', 'O')
        print('tag tokens: {}'.format(tag_tokens[:length]))

        result = refine_output(input_ids[index], tag_ids[index], length, tokenizer, id_2_tag)
        print(result)
Example #9
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(' {} '.format(config.sep).join(line['src']))

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt in zip(src_inputs, predicted_ids):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                               config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'src': src}, ensure_ascii=False), file=fout)
Example #10
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    golden_outputs = []
    for line in read_json_lines(input_file):
        golden_outputs.append(line['tgt'])

    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt, golden in zip(predicted_ids, golden_outputs):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                               config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'golden': golden}, ensure_ascii=False), file=fout)
Example #11
    def convert_data(self, context):
        context_seq = []
        pos_seq = []
        for word, pos in pos_text(cut_text(context)):
            context_seq.append(word)
            pos_seq.append(pos)

        input_tokens = []
        pos_tokens = []
        temp = [self.tokenizer.tokenize(word) for word in context_seq]
        for i, pos in enumerate(pos_seq):
            input_tokens += temp[i]
            pos_tokens += [pos] * len(temp[i])

        # Account for [CLS] and [SEP] with "- 2"
        input_tokens = ['[CLS]'] + input_tokens[:self.config.sequence_len - 2] + ['[SEP]']
        pos_tokens = ['<pad>'] + pos_tokens[:self.config.sequence_len - 2] + ['<pad>']
        input_length = len(input_tokens)

        assert len(pos_tokens) == input_length

        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)
        segment_ids = [0] * input_length
        input_mask = [1] * input_length
        pos_ids = convert_list(pos_tokens, self.pos_2_id, self.config.pad_id,
                               self.config.unk_id)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.sequence_len:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            pos_ids.append(self.config.pad_id)

        assert len(input_ids) == self.config.sequence_len
        assert len(input_mask) == self.config.sequence_len
        assert len(segment_ids) == self.config.sequence_len
        assert len(pos_ids) == self.config.sequence_len

        return input_ids, input_mask, segment_ids, input_length, pos_ids
Example #12
def save_result_v1(predicted_ids, alignment_history, id_2_label, input_file,
                   output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(line['src'])

    tgt_outputs = []
    for tgt in predicted_ids:
        tgt[-1] = config.eos_id
        tgt_outputs.append(
            convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                         config.pad, config.unk))

    assert len(src_inputs) == len(tgt_outputs)

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs,
                                       alignment_history):
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)
Example #13
    def _read_data(self, data_file, max_data_size=None):
        value_seq = []
        attr_seq = []
        pos_fw_seq = []
        pos_bw_seq = []
        desc_seq = []

        counter = 0
        with open(data_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                we = WikiEntity(line)

                value = []
                attr = []
                pos_fw = []
                pos_bw = []
                box = we.get_box()
                for a in box.keys():
                    v = box[a].split()  # tokens of this field's value
                    a = [a] * len(v)  # repeat the attribute name once per token
                    p = list(range(len(v)))  # positions within the field
                    value += v
                    attr += a
                    pos_fw += p
                    pos_bw += reversed(p)  # positions counted from the field's end
                desc = we.get_desc().split()

                # check length and limit the maximum length of input
                assert len(value) == len(attr)
                assert len(value) == len(pos_fw)
                assert len(value) == len(pos_bw)
                if len(value) == 0:
                    continue

                value = value[:self.config.sequence_len]
                attr = attr[:self.config.sequence_len]
                pos_fw = pos_fw[:self.config.sequence_len]
                pos_fw = np.minimum(pos_fw, self.config.pos_size - 1).tolist()  # clip to the largest valid position index
                pos_bw = pos_bw[:self.config.sequence_len]
                pos_bw = np.minimum(pos_bw, self.config.pos_size - 1).tolist()  # clip to the largest valid position index
                desc = desc[:self.config.sequence_len - 2]  # reserve 2 slots for sos and eos

                if self.config.to_lower:
                    value = list(map(str.lower, value))
                    attr = list(map(str.lower, attr))
                    desc = list(map(str.lower, desc))
                desc = [self.config.sos] + desc + [self.config.eos]

                value_seq.append(convert_list(value, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
                attr_seq.append(convert_list(attr, self.config.attr_2_id, self.config.pad_id, self.config.unk_id))
                pos_fw_seq.append(pos_fw)
                pos_bw_seq.append(pos_bw)
                desc_seq.append(convert_list(desc, self.config.word_2_id, self.config.pad_id, self.config.unk_id))

                counter += 1
                if counter % 10000 == 0:
                    print('\rprocessing file {}: {:>6d}'.format(data_file, counter), end='')
                if max_data_size and counter >= max_data_size:
                    break
            print()

        return value_seq, attr_seq, pos_fw_seq, pos_bw_seq, desc_seq
Example #14
def refine_output(input_ids, pred_ids, input_length, tokenizer, id_2_tag):
    context = tokenizer.convert_ids_to_tokens(input_ids)
    pred_tags = convert_list(pred_ids, id_2_tag, 'O', 'O')

    return parse_output(pred_tags[:input_length], context[:input_length])
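Finally, a toy round trip through the assumed convert_list from the note under Example #1 (the vocabulary and values are illustrative):

word_2_id = {'<pad>': 0, '<unk>': 1, 'hello': 2, 'world': 3}
id_2_word = {i: w for w, i in word_2_id.items()}

ids = convert_list(['hello', 'there', 'world'], word_2_id, 0, 1)
print(ids)     # [2, 1, 3] -- 'there' is out of vocabulary
tokens = convert_list(ids, id_2_word, '<pad>', '<unk>')
print(tokens)  # ['hello', '<unk>', 'world']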