Example #1
    def evaluate(self, ref_file, hyp_file, to_lower):
        references = []
        for line in read_json_lines(ref_file):
            ref = line.get(self.key,
                           '').strip().split()  # ref is a list of tokens
            if to_lower:
                ref = list(map(str.lower, ref))
            references.append(ref)

        hypotheses = []
        for line in read_json_lines(hyp_file):
            hyp = line.get(self.key,
                           '').strip().split()  # hyp is a list of tokens
            if to_lower:
                hyp = list(map(str.lower, hyp))
            hypotheses.append(hyp)

        assert len(references) == len(hypotheses)

        results = {}
        results.update(calc_bleu(references, hypotheses))
        results.update(calc_f1(references, hypotheses))
        results.update(calc_distinct_ngram(hypotheses, max_ngram=2))

        for key, value in results.items():
            logger.info('{}: {:>.4f}'.format(key, value))

        return results
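Every snippet on this page iterates over JSON Lines files through the project helper `read_json_lines`, which is not shown here. A minimal sketch of such a reader, assuming one JSON object per non-empty line, could look like this:

import json

def read_json_lines(filename):
    # Assumed behavior: yield one parsed JSON object per non-empty line
    # (JSON Lines format), as the call sites on this page expect.
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line:
                yield json.loads(line)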
Example #2
def build_dict(config):
    counter = collections.Counter()

    for line in read_json_lines(config.train_data):
        src_seq = line.get('src', [])
        if config.to_lower:
            src_seq = list(map(str.lower, src_seq))
        for word in src_seq:
            counter[word] += 1

        tgt_seq = line.get('tgt', [])
        if config.to_lower:
            tgt_seq = list(map(str.lower, tgt_seq))
        for word in tgt_seq:
            counter[word] += 1

    counter[config.pad] = 1e9 - config.pad_id
    counter[config.unk] = 1e9 - config.unk_id
    counter[config.sos] = 1e9 - config.sos_id
    counter[config.eos] = 1e9 - config.eos_id
    counter[config.sep] = 1e9 - config.sep_id
    counter[config.num] = 1e9 - config.num_id
    counter[config.time] = 1e9 - config.time_id
    print('number of words: {}'.format(len(counter)))

    word_dict = {}
    for word, _ in counter.most_common(config.vocab_size +
                                       config.oov_vocab_size):
        word_dict[word] = len(word_dict)

    save_json_dict(word_dict, config.vocab_dict)
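`save_json_dict` is another project helper that is not reproduced on this page. A plausible sketch, assuming it simply serializes the word-to-id mapping as one JSON object:

import json

def save_json_dict(data, filename):
    # Assumption: dump the whole dict to a single JSON file.
    with open(filename, 'w', encoding='utf-8') as fout:
        json.dump(data, fout, ensure_ascii=False, indent=2)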
Example #3
def build_dict(config):
    src_counter = collections.Counter()
    tgt_counter = collections.Counter()
    for line in tqdm(list(read_json_lines(config.train_data)),
                     desc='Building dict'):
        src = line['src']
        tgt = line['tgt']
        if config.to_lower:
            src = src.lower()
            tgt = tgt.lower()
        for word in src.split():
            src_counter[word] += 1
        for word in tgt.split():
            tgt_counter[word] += 1

    src_counter[config.pad] = tgt_counter[config.pad] = 1e9 - config.pad_id
    src_counter[config.unk] = tgt_counter[config.unk] = 1e9 - config.unk_id
    src_counter[config.sos] = tgt_counter[config.sos] = 1e9 - config.sos_id
    src_counter[config.eos] = tgt_counter[config.eos] = 1e9 - config.eos_id
    src_counter[config.sep] = tgt_counter[config.sep] = 1e9 - config.sep_id
    src_counter[config.num] = tgt_counter[config.num] = 1e9 - config.num_id
    src_counter[config.time] = tgt_counter[config.time] = 1e9 - config.time_id
    logger.info('number of source words: {}'.format(len(src_counter)))
    logger.info('number of target words: {}'.format(len(tgt_counter)))

    word_dict = {}
    for word, _ in src_counter.most_common(config.src_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.src_vocab_dict)

    word_dict = {}
    for word, _ in tgt_counter.most_common(config.tgt_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.tgt_vocab_dict)
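The `1e9 - ..._id` assignments in these dictionary-building examples are a small trick: they give the special tokens the highest counts, so `most_common()` returns them first and in the order of their predefined ids. A tiny illustration of the effect (token names and ids are made up):

import collections

counter = collections.Counter({'hello': 3, 'world': 2})
counter['<pad>'] = 1e9 - 0  # pad_id = 0
counter['<unk>'] = 1e9 - 1  # unk_id = 1
word_dict = {w: i for i, (w, _) in enumerate(counter.most_common())}
print(word_dict)  # {'<pad>': 0, '<unk>': 1, 'hello': 2, 'world': 3}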
Example #4
def generate_data(input_file, output_file, is_test=False):
    data = []
    for line in tqdm(list(read_json_lines(input_file))):
        goal = line['goal']
        knowledge = line['knowledge']

        topic = goal[0][1:]
        triples = knowledge + [v for v in goal[1:] if v not in knowledge]
        if not is_test:
            conversation = line['conversation']
            for i in range(len(conversation)):
                src = conversation[:i]
                tgt = conversation[i]
                data.append({
                    'src': src,
                    'tgt': tgt,
                    'topic': topic,
                    'triples': triples
                })
        else:
            src = line['history']
            tgt = line['response']
            data.append({
                'src': src,
                'tgt': tgt,
                'topic': topic,
                'triples': triples
            })

    save_json_lines(data, output_file)
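`save_json_lines` is the write-side counterpart of `read_json_lines`. A minimal sketch, assuming it writes one JSON object per line:

import json

def save_json_lines(data, filename):
    # Assumed behavior: JSON Lines output, one object per line.
    with open(filename, 'w', encoding='utf-8') as fout:
        for item in data:
            print(json.dumps(item, ensure_ascii=False), file=fout)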
Example #5
    def _read_data(self, data_file):
        src_seq = []
        tgt_seq = []

        counter = 0
        for line in read_json_lines(data_file):
            src = line.get('src', [])
            tgt = line.get('tgt', [])

            if self.config.to_lower:
                src = list(map(str.lower, src))
                tgt = list(map(str.lower, tgt))

            src = src[:self.config.sequence_len]
            tgt = ([self.config.sos] + tgt[:self.config.sequence_len - 2]
                   + [self.config.eos])

            src_seq.append(
                convert_list(src, self.config.word_2_id, self.config.pad_id,
                             self.config.unk_id))
            tgt_seq.append(
                convert_list(tgt, self.config.word_2_id, self.config.pad_id,
                             self.config.unk_id))

            counter += 1
            if counter % 10000 == 0:
                print('\rprocessing file {}: {:>6d}'.format(data_file, counter),
                      end='')
        print()

        return src_seq, tgt_seq
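`convert_list` maps tokens to vocabulary ids (and, in the output-saving examples further down, ids back to tokens). Its real implementation is not shown; judging from the call sites, a plausible sketch is a lookup with an unknown-item fallback:

def convert_list(items, mapping, pad, unk):
    # Assumption: look each item up in `mapping`, falling back to `unk`
    # (unk_id when encoding, the unk token when decoding). The `pad`
    # argument is passed at every call site, but no padding step is
    # visible in these excerpts.
    return [mapping.get(item, unk) for item in items]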
Example #6
    def _read_data(self, data_file):
        topic = []
        triple = []
        src = []
        tgt = []

        data_iter = tqdm(list(read_json_lines(data_file)))
        for index, line in enumerate(data_iter):
            topic_seq = ' {} '.format(self.config.sep).join(line['topic'])
            triple_seq = ' {} '.format(self.config.sep).join(
                [' '.join(v) for v in line['triples']])
            src_seq = ' {} '.format(self.config.sep).join(line['src'])
            tgt_seq = line['tgt']

            if self.config.to_lower:
                topic_seq = topic_seq.lower()
                triple_seq = triple_seq.lower()
                src_seq = src_seq.lower()
                tgt_seq = tgt_seq.lower()

            topic_tokens = ([self.config.sos] + topic_seq.split()
                            + [self.config.eos])
            triple_tokens = ([self.config.sos]
                             + triple_seq.split()[:self.config.max_triple_length]
                             + [self.config.eos])
            src_tokens = ([self.config.sos]
                          + src_seq.split()[-self.config.max_seq_length:]
                          + [self.config.eos])
            tgt_tokens = ([self.config.sos]
                          + tgt_seq.split()[:self.config.max_seq_length]
                          + [self.config.eos])

            topic_ids = convert_list(topic_tokens, self.config.word_2_id,
                                     self.config.pad_id, self.config.unk_id)
            triple_ids = convert_list(triple_tokens, self.config.word_2_id,
                                      self.config.pad_id, self.config.unk_id)
            src_ids = convert_list(src_tokens, self.config.word_2_id,
                                   self.config.pad_id, self.config.unk_id)
            tgt_ids = convert_list(tgt_tokens, self.config.word_2_id,
                                   self.config.pad_id, self.config.unk_id)

            topic.append(topic_ids)
            triple.append(triple_ids)
            src.append(src_ids)
            tgt.append(tgt_ids)

            if index < 5:
                logger.info(log_title('Examples'))
                logger.info('topic tokens: {}'.format(topic_tokens))
                logger.info('topic ids: {}'.format(topic_ids))
                logger.info('triple tokens: {}'.format(triple_tokens))
                logger.info('triple ids: {}'.format(triple_ids))
                logger.info('source tokens: {}'.format(src_tokens))
                logger.info('source ids: {}'.format(src_ids))
                logger.info('target tokens: {}'.format(tgt_tokens))
                logger.info('target ids: {}'.format(tgt_ids))

        return topic, triple, src, tgt
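The `' {} '.format(self.config.sep).join(...)` pattern above flattens a list of utterances (or triples) into a single string with a separator token between segments. A tiny illustration with an assumed `<sep>` token:

sep = '<sep>'
src = ['hello there', 'how are you']
src_seq = ' {} '.format(sep).join(src)
print(src_seq)  # hello there <sep> how are you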
Example #7
    def _load_and_cache_data(self, data_file, cache_file=None):
        examples = []
        for index, line in enumerate(tqdm(list(read_json_lines(data_file)), desc='Loading file: {}'.format(data_file))):
            src = line['src']
            tgt = line.get('tgt')
            examples.append(InputExample(index, src, tgt))

        features = convert_examples_to_features(examples, self.config)
        if cache_file:
            # Use a context manager so the cache file handle is closed properly.
            with open(cache_file, 'wb') as fout:
                pickle.dump({'examples': examples, 'features': features}, fout)

        return examples, features
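The cache written above can be read back with the standard `pickle` API. A usage sketch (the cache path is hypothetical):

import pickle

with open('train.cache.pkl', 'rb') as fin:  # hypothetical cache path
    cached = pickle.load(fin)
examples, features = cached['examples'], cached['features']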
Example #8
    def evaluate(self, ref_file, hyp_file, to_lower):
        list_of_references = []
        for line in read_json_lines(ref_file):
            ref = line[self.key]  # ref is a list of words
            if to_lower:
                ref = list(map(str.lower, ref))
            list_of_references.append([ref])

        hypotheses = []
        for line in read_json_lines(hyp_file):
            hyp = line[self.key]  # hyp is a list of words
            if to_lower:
                hyp = list(map(str.lower, hyp))
            hypotheses.append(hyp)

        assert len(list_of_references) == len(hypotheses)

        bleu1 = 100 * corpus_bleu(list_of_references, hypotheses,
                                  (1., 0., 0., 0.),
                                  SmoothingFunction().method4)
        bleu2 = 100 * corpus_bleu(list_of_references, hypotheses,
                                  (0.5, 0.5, 0., 0.),
                                  SmoothingFunction().method4)
        bleu3 = 100 * corpus_bleu(list_of_references, hypotheses,
                                  (0.33, 0.33, 0.33, 0.),
                                  SmoothingFunction().method4)
        bleu4 = 100 * corpus_bleu(list_of_references, hypotheses,
                                  (0.25, 0.25, 0.25, 0.25),
                                  SmoothingFunction().method4)
        print('{:>.4f}, {:>.4f}, {:>.4f}, {:>.4f}'.format(
            bleu1, bleu2, bleu3, bleu4))
        res = {
            'Bleu_1': bleu1,
            'Bleu_2': bleu2,
            'Bleu_3': bleu3,
            'Bleu_4': bleu4,
        }
        return res
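`corpus_bleu` and `SmoothingFunction` come from NLTK, so the snippet assumes imports along these lines. Note that each reference is itself wrapped in a list, because `corpus_bleu` allows several references per hypothesis:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

list_of_references = [[['the', 'cat', 'sat']]]  # one (or more) references per hypothesis
hypotheses = [['the', 'cat', 'sat', 'down']]
bleu1 = 100 * corpus_bleu(list_of_references, hypotheses,
                          weights=(1., 0., 0., 0.),
                          smoothing_function=SmoothingFunction().method4)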
Example #9
def build_word_dict(config, min_freq=5):
    cnt = 0
    word_cnt = collections.Counter()
    attr_cnt = collections.Counter()

    for line in read_json_lines(config.train_data):
        we = WikiEntity(line)

        box = we.get_box()
        for a in box.keys():
            for w in box[a].split():
                if config.to_lower:
                    w = w.lower()
                word_cnt[w] += 1
            if config.to_lower:
                a = a.lower()
            attr_cnt[a] += 1

        desc = we.get_desc()
        for w in desc.split():
            if config.to_lower:
                w = w.lower()
            word_cnt[w] += 1

        cnt += 1
        if cnt % 10000 == 0:
            print('\rprocessing: {}'.format(cnt), end='')
    print()

    word_cnt[config.pad] = attr_cnt[config.pad] = 1e9 - config.pad_id
    word_cnt[config.unk] = attr_cnt[config.unk] = 1e9 - config.unk_id
    word_cnt[config.sos] = attr_cnt[config.sos] = 1e9 - config.sos_id
    word_cnt[config.eos] = attr_cnt[config.eos] = 1e9 - config.eos_id
    word_cnt[config.num] = attr_cnt[config.num] = 1e9 - config.num_id
    word_cnt[config.time] = attr_cnt[config.time] = 1e9 - config.time_id
    print('number of words in word counter: {}'.format(len(word_cnt)))
    print('number of words in attribute counter: {}'.format(len(attr_cnt)))

    word_dict = {}
    for word, cnt in word_cnt.most_common():
        if cnt < min_freq:
            break
        word_dict[word] = len(word_dict)
    save_json(word_dict, config.word_dict)

    attr_dict = {}
    for attr, _ in attr_cnt.most_common():
        attr_dict[attr] = len(attr_dict)
    save_json(attr_dict, config.attr_dict)
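`WikiEntity` is a wrapper class from the same project and is only used here through `get_box()` (attribute name -> value string) and `get_desc()` (description string). A minimal stand-in consistent with that usage, with assumed JSON field names, might be:

class WikiEntity:
    # The field names 'box' and 'desc' are assumptions; only the two
    # accessors below are exercised in the example above.
    def __init__(self, data):
        self._data = data

    def get_box(self):
        return self._data.get('box', {})

    def get_desc(self):
        return self._data.get('desc', '')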
Example #10
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(' {} '.format(config.sep).join(line['src']))

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt in zip(src_inputs, predicted_ids):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                               config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'src': src},
                             ensure_ascii=False),
                  file=fout)
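`id_2_label` is the inverse of the vocabulary mapping built in the dictionary examples; if `word_dict` maps words to ids, the decoding-side mapping can be derived directly (a sketch under that assumption):

word_dict = {'<pad>': 0, '<unk>': 1, 'hello': 2}  # word -> id, as built above
id_2_label = {idx: word for word, idx in word_dict.items()}  # id -> word for decoding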
Example #11
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    golden_outputs = []
    for line in read_json_lines(input_file):
        golden_outputs.append(line['tgt'])

    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt, golden in zip(predicted_ids, golden_outputs):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                               config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'golden': golden},
                             ensure_ascii=False),
                  file=fout)
Example #12
def build_dict(filename, config):
    counter = collections.Counter()

    for line in tqdm(list(read_json_lines(filename))):
        goal = line['goal']
        knowledge = line['knowledge']
        conversation = line['conversation']

        topic = goal[0][1:]
        for entity in topic:
            if config.to_lower:
                entity = entity.lower()
            for token in entity.strip().split():
                counter[token] += 1

        triples = knowledge + [v for v in goal[1:] if v not in knowledge]
        for triple in triples:
            for node in triple:
                if config.to_lower:
                    node = node.lower()
                for token in node.strip().split():
                    counter[token] += 1

        for sequence in conversation:
            if config.to_lower:
                sequence = sequence.lower()
            for token in sequence.strip().split():
                counter[token] += 1

    counter[config.pad] = 1e9 - config.pad_id
    counter[config.unk] = 1e9 - config.unk_id
    counter[config.sos] = 1e9 - config.sos_id
    counter[config.eos] = 1e9 - config.eos_id
    counter[config.sep] = 1e9 - config.sep_id
    counter[config.num] = 1e9 - config.num_id
    counter[config.time] = 1e9 - config.time_id
    logger.info('number of words: {}'.format(len(counter)))

    word_dict = {}
    for word, _ in counter.most_common(config.vocab_size +
                                       config.oov_vocab_size):
        word_dict[word] = len(word_dict)

    save_json_dict(word_dict, config.vocab_dict)
Example #13
def save_result_v1(predicted_ids, alignment_history, id_2_label, input_file,
                   output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(line['src'])

    tgt_outputs = []
    for tgt in predicted_ids:
        tgt[-1] = config.eos_id
        tgt_outputs.append(
            convert_list(tgt[:tgt.index(config.eos_id)], id_2_label,
                         config.pad, config.unk))

    assert len(src_inputs) == len(tgt_outputs)

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs,
                                       alignment_history):
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)
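The final loop is a simple copy heuristic: whenever the decoder emitted the unk token, the attention alignment for that step points back to a source position, and the source word is copied in its place. A small illustration with made-up values:

unk = '<unk>'
src = ['barack', 'obama', 'was', 'president']
tgt = ['<unk>', 'was', 'president']
alignment = [0, 2, 3]  # source index attended to at each target step
for i, (word, index) in enumerate(zip(tgt, alignment)):
    if word == unk:
        tgt[i] = src[index]
print(tgt)  # ['barack', 'was', 'president']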