Example #1
def evaluate_policy_docs():
    opt = make_options()
    dataset = data.Dataset()
    feeder = data.Feeder(dataset)
    model, _ = models.load_or_create_models(opt, False)
    translator = Translator(model, opt.beam_size, opt.min_length,
                            opt.max_length)
    docs = data.load_policy_documents()
    for doc in docs:
        data.parse_paragraphs(doc)
    lines = []
    for doc in docs:
        paras = [p for p in doc.paragraphs if 50 <= len(p) <= 400]
        if not paras:
            continue
        lines.append('=================================')
        lines.append(doc.title)
        # Sample at most 16 paragraphs per document and sort them longest-first.
        if len(paras) > 16:
            paras = random.sample(paras, 16)
        paras = sorted(paras, key=lambda x: -len(x))
        # Convert paragraphs to id sequences; lengths counts non-NULL_ID tokens per row.
        pids = [feeder.sent_to_ids(p) for p in paras]
        pids = data.align2d(pids)
        src = nu.tensor(pids)
        lengths = (src != data.NULL_ID).sum(-1)
        tgt = translator.translate(src.transpose(0, 1), lengths,
                                   opt.best_k_questions)
        questions = [[feeder.ids_to_sent(t) for t in qs] for qs in tgt]
        for p, qs in zip(paras, questions):
            lines.append('--------------------------------')
            lines.append(p)
            for k, q in enumerate(qs):
                lines.append('predict {}: {}'.format(k, q))
    utils.write_all_lines(opt.output_file, lines)
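Every example here reads and writes files through a project-local utils module that is not shown. A minimal sketch of what read_all_lines and write_all_lines presumably look like, judging only from how they are called (the bodies below are assumptions, not the project's actual code):

# Hypothetical stand-ins for the utils helpers used throughout these examples.
def read_all_lines(filename):
    # Yield the file's lines without trailing newlines.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            yield line.rstrip('\n')

def write_all_lines(filename, lines):
    # Write one item per line, overwriting the file.
    with open(filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')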
Example #2
def create_vocab(filename):
    char_vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        for word in line.split(' '):
            for char in word:
                char_vocab[char] += 1
    char_vocab = sorted(char_vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.vocab_file,
                          ['{}:{}'.format(w, n) for w, n in char_vocab])
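The character counting in Example #2 can also be expressed with collections.Counter, whose most_common already returns pairs in descending frequency. A sketch under that assumption; the function name and paths are placeholders, and utils refers to the helpers sketched after Example #1:

from collections import Counter

import utils  # project-local helpers (see the sketch after Example #1)

def create_char_vocab(input_path, vocab_path):
    # Count how often each character occurs in the whitespace-tokenized corpus.
    counts = Counter()
    for line in utils.read_all_lines(input_path):
        for word in line.split(' '):
            counts.update(word)
    # Write "char:count" lines, most frequent first, mirroring Example #2.
    utils.write_all_lines(vocab_path,
                          ['{}:{}'.format(ch, n) for ch, n in counts.most_common()])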
Example #3
def create_question_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        question = sample['segmented_question']
        for word in question:
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
Example #4
def create_answer_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                for word in answer:
                    vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.answer_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
Example #5
def prepare_dataset_with_question_answers(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['question']
        for answer in sample['answers']:
            if len(answer) > len(question) * 2 and len(answer) >= 20:
                lines.append(answer)
                lines.append(question)
                lines.append('<P>')
    utils.write_all_lines(target, lines)
Example #6
def export(doc):
    lines = (['url', doc.url, ''] + ['title', doc.title, ''] +
             ['content'] + doc.paragraphs + [''])
    for k, v in doc.classified_paragraphs.items():
        lines.append(k)
        lines += v
        lines.append('')
    lines.append('qas')
    for qa in doc.qas:
        lines += [
            '------------------------', 'q:' + qa.question, 'a:' + qa.answer
        ]
    utils.write_all_lines('./generate/{}.txt'.format(doc.title), lines)
Example #7
def prepare_dataset_with_document(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = [doc for doc in sample['documents'] if doc['is_selected']]
        questions = [doc['title'] for doc in documents]
        para_indices = [doc['most_related_para'] for doc in documents]
        answers = [
            doc['paragraphs'][k] for doc, k in zip(documents, para_indices)
        ]
        for q, a in zip(questions, answers):
            lines.append(rip_marks(a))
            lines.append(rip_marks(q))
            lines.append('<P>')
    utils.write_all_lines(target, lines)
Example #8
def evaluate_accuracy(model,
                      dataset,
                      batch_size=32,
                      char_limit=16,
                      size=None,
                      output_file='./output/evaluate.txt',
                      profile='dev'):
    model.eval()
    feeder = data.TrainFeeder(dataset, batch_size, char_limit)
    feeder.prepare(profile)
    size = size or feeder.size
    feeder.sort(size)
    lines = []
    total_em, total_f1, total = 0, 0, 0
    while feeder.cursor < size:
        ids, cs, qs, chs, qhs, y1s, y2s, ct, qt = feeder.next(batch_size)
        logits1, logits2 = model(func.tensor(cs), func.tensor(qs),
                                 func.tensor(chs), func.tensor(qhs), ct, qt)
        y1p, y2p = model.calc_span(logits1, logits2)
        for pids, qids, label_start, label_end, predict_start, predict_end in zip(
                cs, qs, y1s, y2s, y1p, y2p):
            lines.append('--------------------------------')
            lines.append(feeder.ids_to_sent(pids))
            lines.append('question:  ' + feeder.ids_to_sent(qids))
            lines.append('reference: ' +
                         feeder.ids_to_sent(pids[label_start:label_end + 1]))
            lines.append('predict:   ' +
                         feeder.ids_to_sent(pids[predict_start:predict_end + 1]))
        em, f1, bs = evaluate_batch(feeder, ids, y1p.tolist(), y2p.tolist())
        total_em += em
        total_f1 += f1
        total += bs
        print('{}/{}'.format(feeder.cursor, size))

    exact_match = total_em / total * 100
    f1 = total_f1 / total * 100
    message = 'EM: {:>.4F}, F1: {:>.4F}, Total: {}'.format(
        exact_match, f1, total)
    lines.append(message)
    utils.write_all_lines(output_file, lines)
    print('evaluation finished with ' + message)
    return exact_match, f1
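evaluate_batch is not shown in this example; from the totals it accumulates, it presumably computes SQuAD-style exact match and token-level F1 over the predicted spans. A minimal sketch of those two metrics for a single prediction/reference pair (the function names are illustrative, not the project's API):

from collections import Counter

def exact_match_score(prediction, ground_truth):
    # 1.0 when the predicted answer string equals the reference exactly.
    return float(prediction == ground_truth)

def f1_score(prediction, ground_truth):
    # Token-level F1 between the predicted and reference answer strings.
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)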
Example #9
def prepare_dataset_with_document(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = sample['documents']
        questions = ([sample['segmented_question']] +
                     [doc['segmented_title'] for doc in documents])
        question_words = set(questions[0]) - stop_words
        questions = [' '.join(question) for question in questions]
        for doc in documents:
            for passage in doc['segmented_paragraphs']:
                passage_words = set(passage) - stop_words
                common = question_words & passage_words
                passage = rip_marks(' '.join(passage))
                if (len(common) / len(question_words) > 0.3
                        and len(passage) > 2 * len(questions[0])):
                    lines.append(passage)
                    lines += list(set(questions))
                    lines.append('<P>')
    utils.write_all_lines(target, lines)
Example #10
def prepare_dataset_with_document(source, target):
    aqs = []
    all = 0
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['segmented_question']
        question_words = set(question) - stop_words
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                answer_words = set(answer) - stop_words
                common = question_words & answer_words
                if len(common) / len(question_words) > 0.3:
                    a = rip_marks(str.join(' ', answer))
                    q = rip_marks(str.join(' ', question))
                    if len(a) > 2 * len(q):
                        aqs.append((a, q))
                all += 1
    print('{}: {}/{} preprocessed'.format(source, len(aqs), all))
    #utils.save_json(target, [{'q': q, 'a': a} for a,q in aqs])
    utils.write_all_lines(target, ['{}\n{}\n'.format(q, a) for a, q in aqs])
    return aqs
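Examples #9 and #10 keep a passage only when more than 30% of the question's non-stop-words also occur in it and the passage is at least twice as long as the question. A tiny self-contained illustration of that overlap test (the stop-word set and sentences are made up):

stop_words = {'the', 'a', 'of', 'is'}  # placeholder stop-word set

question = 'what is the capital of France'.split()
passage = 'Paris is the capital and largest city of France'.split()

question_words = set(question) - stop_words
passage_words = set(passage) - stop_words
overlap = len(question_words & passage_words) / len(question_words)
# Keep the passage when the overlap ratio exceeds 0.3 (and, in the examples
# above, when the passage text is more than twice as long as the question).
print(overlap, overlap > 0.3)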
Example #11
def evaluate_accuracy(model,
                      dataset,
                      batch_size=20,
                      beam_size=5,
                      min_length=5,
                      max_length=20,
                      best_k_questions=3,
                      size=None,
                      output_file=config.evaluate_output_file):
    model.eval()
    feeder = data.TrainFeeder(dataset)
    feeder.prepare('dev', batch_size)
    translator = Translator(model, beam_size, min_length, max_length)
    size = size or feeder.size
    feeder.sort(size)
    lines = []
    correct = 0
    total = 0
    while feeder.cursor < size:
        x, _, lengths, pids, qids = data.next(feeder, batch_size)
        tgt = translator.translate(x, lengths, best_k_questions)
        passages = [feeder.ids_to_sent(t) for t in pids]
        questions = [[feeder.ids_to_sent(t) for t in qs] for qs in tgt]
        gtruths = [feeder.ids_to_sent(t) for t in qids]
        for p, qs, g in zip(passages, questions, gtruths):
            lines.append('--------------------------------')
            lines.append(p)
            lines.append('reference: ' + g)
            for k, q in enumerate(qs):
                lines.append('predict {}: {}'.format(k, q))
            correct += len(set(g) & set(qs[0]))
            total += len(set(qs[0]))
        print('{}/{}'.format(feeder.cursor, size))
    accuracy = correct / total * 100
    lines.append('correct: {}/{}, accuracy: {}'.format(correct, total,
                                                       accuracy))
    print('evaluation finished with accuracy: {:>.2F}'.format(accuracy))
    utils.write_all_lines(output_file, lines)
    return accuracy
Example #12
def create_vocab(filename):
    qv = defaultdict(lambda: 0)
    av = defaultdict(lambda: 0)
    qset = set()
    aset = set()
    for q, a in data.load_qa(filename):
        sq = str.join('', q)
        sa = str.join('', a)
        if sq not in qset:
            for word in q:
                qv[word] += 1
            qset.add(sq)
        if sa not in aset:
            for word in a:
                av[word] += 1
            aset.add(sa)
    qv = sorted(qv.items(), key=lambda x: -x[1])
    av = sorted(av.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in qv])
    utils.write_all_lines(config.answer_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in av])
    utils.write_all_lines('./generate/questions.txt', qset)
    utils.write_all_lines('./generate/answers.txt', aset)
Example #13
    # Fragment of a larger post-processing routine: line, source, target, lines
    # and process_others are defined in the enclosing scope that is not shown.
    def append_source(span):
        part = line[span[0]:span[1]]
        for c in part:
            source.append(c)

    def append_target(span, source_span):
        slen = source_span[1] - source_span[0]
        tag = line[span[0]:span[1]].upper()
        global target
        if slen == 1:
            target.append('S-' + tag)
        else:
            # BMES scheme: a multi-character span starts with B- (Example #14 decodes it that way).
            target += ['B-' + tag] + ['M-' + tag] * (slen - 2) + ['E-' + tag]

    def join(tp):
        return '$'.join(tp)

    last_pos = 0
    for m in re.finditer(r'<(.*?)>(.*?)</.*?>', line):
        start, end = m.span(0)
        process_others(last_pos, start)
        last_pos = end
        append_source(m.span(2))
        append_target(m.span(1), m.span(2))
    process_others(last_pos, len(line))
    lines.append(join(source))
    lines.append(join(target))

utils.write_all_lines('eval.postprocessed.txt', lines)
Example #14
lines = list(utils.read_all_lines('./eval.csv'))[100:200]

source = []
target = []

# Lines alternate between a '$'-joined character sequence and its '$'-joined
# label sequence; collect them into parallel source/target lists.
for line in lines:
    line = line.split('$')
    if len(source) == len(target):
        source.append(line)
    else:
        target.append(line)

assert len(source) == len(target)

lines = []
for s, t in zip(source, target):
    assert len(s) == len(t)
    line = ''
    for x, y in zip(s, t):
        if y.startswith('S-'):
            line += f'<{y[2:]}>{x}</{y[2:]}>'
        elif y.startswith('B-'):
            line += f'<{y[2:]}>{x}'
        elif y.startswith('E-'):
            line += f'{x}</{y[2:]}>'
        else:
            line += x
    lines.append(line)

utils.write_all_lines('./eval.txt', lines)
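Examples #13 and #14 are the two halves of a round trip between inline <TAG>…</TAG> markup and '$'-joined character/label sequences in a BMES-style scheme (S for a single tagged character, B/M/E for the begin, middle and end of a longer span). A small self-contained sketch of the encoding direction; labelling untagged characters with 'O' is an assumption, since the original's process_others helper is not shown:

import re

line = 'Visit <LOC>Paris</LOC> in <TIME>May</TIME>'
source, target = [], []
last = 0
for m in re.finditer(r'<(.*?)>(.*?)</.*?>', line):
    start, end = m.span(0)
    # Untagged characters get a plain 'O' label here (an assumption).
    for c in line[last:start]:
        source.append(c)
        target.append('O')
    last = end
    text, tag = m.group(2), m.group(1).upper()
    source.extend(text)
    if len(text) == 1:
        target.append('S-' + tag)
    else:
        target += ['B-' + tag] + ['M-' + tag] * (len(text) - 2) + ['E-' + tag]
for c in line[last:]:
    source.append(c)
    target.append('O')
print('$'.join(source))  # V$i$s$i$t$ $P$a$r$i$s$ ...
print('$'.join(target))  # O$O$O$O$O$O$B-LOC$M-LOC$M-LOC$M-LOC$E-LOC$ ...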
Example #15
    def __call__(self, message):
        print(message)
        self.lines.append(message)
        utils.write_all_lines(self.output_file, self.lines)
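Example #15 is only a method body; it presumably belongs to a small logger-like object that prints each message and rewrites the full log file on every call. A minimal sketch of such a class (the class name and constructor are assumptions; utils.write_all_lines is the same helper used throughout):

import utils  # project-local helpers (see the sketch after Example #1)

class Logger:
    def __init__(self, output_file):
        self.output_file = output_file
        self.lines = []

    def __call__(self, message):
        # Print the message and rewrite the log file with everything logged so far.
        print(message)
        self.lines.append(message)
        utils.write_all_lines(self.output_file, self.lines)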