Example #1
def parse_qa_dataset(input_dir,
                     word_id=0,
                     word_to_id={},
                     update_word_ids=True):
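    # Note: word_to_id={} is a shared mutable default; pass an explicit dict
    # if each call should build a fresh vocabulary.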
    dataset = []
    questions = []

    article_files = set()
    print("Parsing questions...")
    with open(input_dir + '/question_answer_pairs.txt') as f:
        for line in f:
            # Skip first line
            if 'ArticleFile' in line:
                continue

            line = line.strip()

            # Skip empty lines
            if len(line) == 0:
                continue

            parts = line.split('\t')
            if len(parts) != 6:
                print("Malformed line: " + line)
                continue

            question = parts[1]
            answer = parts[2]
            answer = canonicalize_tokens([only_words(answer).strip().lower()])
            assert (len(answer) == 1)
            answer = answer[0]

            article_name = parts[5]

            # The remaining fields in each line are not used here.

            # The dataset contains repeated questions; they are deduplicated below.

            # Skip questions whose answer is empty or longer than one word.
            if len(answer) == 0 or len(answer.split(' ')) > 1:
                continue

            if not update_word_ids and answer not in word_to_id:
                continue

            question_parts = question.split('\t')
            tokens = clean_sentence(question_parts[0]).strip().split()
            tokens = filter(lambda x: len(x.strip()) > 0, tokens)
            tokens = map(lambda x: x.lower(), tokens)
            tokens = canonicalize_tokens(tokens)

            if not update_word_ids:
                tokens = filter(lambda x: x in word_to_id, tokens)

            question_tokens = tokens
            if update_word_ids:
                for token in (tokens + [answer]):
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1

            # Despite its name, article_no is just the running question index;
            # the deduplication pass below sets it to None to mark dropped duplicates.
            article_no = len(questions)

            article_file = input_dir + '/' + article_name + '.txt.clean'
            article_files.add(article_file)
            dataset.append(question_tokens)
            questions.append(
                [article_no, article_file, None, question_tokens, answer])

    article_data = {}
    print("Parsing articles...")
    for article_file in article_files:
        # Get all statements in the dataset for this question

        print("Parsing: " + article_file)
        statements = []
        with open(article_file) as s_file:
            for statement in s_file:
                if len(statement.strip()) == 0:
                    continue

                sentences = get_sentences(statement.strip())

                for sentence in sentences:
                    tokens = sentence.strip().split()
                    tokens = filter(lambda x: len(x.strip()) > 0, tokens)
                    tokens = map(lambda x: x.lower(), tokens)
                    tokens = canonicalize_tokens(tokens)

                    if not update_word_ids:
                        tokens = filter(lambda x: x in word_to_id, tokens)

                    article = tokens
                    statements.append(article)
                    dataset.append(article)
                    if update_word_ids:
                        for token in tokens:
                            if token not in word_to_id:
                                word_to_id[token] = word_id
                                word_id += 1

        article_data[article_file] = statements

    print("Mapping articles to statements...")
    print("There are %d questions before deduplication" % len(questions))
    question_set = set()
    for i in xrange(len(questions)):
        question = questions[i]
        question_tuple = tuple(question[3])
        if question_tuple in question_set:
            question[0] = None
            continue

        question_set.add(question_tuple)
        question[2] = article_data[question[1]]

    questions = filter(lambda x: x[0] is not None, questions)
    print("There are %d questions after deduplication" % len(questions))

    print("Trying to prune extraneaous statements...")
    questions = prune_statements(dataset, questions)
    before_prune = len(questions)
    questions = filter(lambda x: len(x[2]) > 1, questions)
    after_prune = len(questions)
    print("Pruning invalidated %d questions", (before_prune - after_prune))

    print("Final processing...")
    questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id),
                        questions)
    return dataset, questions_seq, word_to_id, word_id
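
A minimal usage sketch for parse_qa_dataset above (not part of the original example): it assumes the helpers it calls (clean_sentence, only_words, canonicalize_tokens, get_sentences, prune_statements, transform_ques_weak) are importable from the same module, and that input_dir contains question_answer_pairs.txt plus the matching .txt.clean article files. The directory name below is hypothetical.

# Hypothetical driver (Python 2); 'data/S08' stands in for a real dataset directory.
vocab = {}
dataset, questions_seq, vocab, next_word_id = parse_qa_dataset(
    'data/S08', word_id=0, word_to_id=vocab, update_word_ids=True)
print("Vocabulary size: %d" % len(vocab))
print("Questions after pruning: %d" % len(questions_seq))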
Example #2
def parse_mc_test_dataset(questions_file,
                          answers_file,
                          word_id=0,
                          word_to_id={},
                          update_word_ids=True,
                          pad=True,
                          add_pruning=False):
    dataset = []
    questions = []

    # Reserve a dedicated padding token; pad_statement/pad_memories below fill
    # short statements and memories with it.
    null_word = '<NULL>'
    if null_word not in word_to_id:
        if update_word_ids:
            word_to_id[null_word] = word_id
            word_id += 1
        else:
            # Requires `import sys` at module level.
            print "Error: null word '<NULL>' missing from a frozen word_to_id"
            sys.exit(1)
    null_word_id = word_to_id[null_word]

    article_files = set()
    print("Parsing questions %s %s" % (questions_file, answers_file))
    with open(questions_file, 'r') as q_file:
        questions_data = q_file.readlines()
    with open(answers_file, 'r') as a_file:
        answers_data = a_file.readlines()

    assert(len(questions_data) == len(answers_data))

    more_than_1_word_answers = 0
    answer_word_unknown = 0

    # Each line in the questions file holds one story plus its 4 questions
    # (23 tab-separated fields); the answers file has one line per story.
    for i in xrange(len(questions_data)):
        question_line = questions_data[i]
        answer_line = answers_data[i]

        question_pieces = question_line.strip().split('\t')
        assert(len(question_pieces) == 23)

        answer_pieces = answer_line.strip().split('\t')
        assert(len(answer_pieces) == 4)

        # Field 2 is the story text; literal '\newline' markers are flattened
        # into spaces before sentence splitting.
        text = question_pieces[2]
        text = text.replace('\\newline', ' ')
        sentences = get_sentences(text)

        statements = []
        for s in sentences:
            tokens = s.strip().split()

            if update_word_ids:
                for token in tokens:
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1
            else:
                tokens = filter(lambda x: x in word_to_id, tokens)

            statements.append(tokens)
            dataset.append(tokens)

        # Each story has 4 questions; every question occupies 5 tab-separated
        # fields (the question text followed by its 4 answer options).
        for j in range(4):
            q_index = (j * 5) + 3
            q_words = question_pieces[q_index]
            q_words = clean_sentence(q_words).split()

            options = [
                only_words(question_pieces[q_index + 1]),
                only_words(question_pieces[q_index + 2]),
                only_words(question_pieces[q_index + 3]),
                only_words(question_pieces[q_index + 4]),
            ]
            correct = get_answer_index(answer_pieces[j])
            answer = options[correct]

            # if len(answer) > 1:
            #     more_than_1_word_answers += 1
            #     continue

            if update_word_ids:
                for token in q_words:
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1
                for o in options:
                    for token in o:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1
            else:
                q_words = filter(lambda x: x in word_to_id, q_words)

            # MCTest prefixes each question with a 'one:' or 'multiple:' marker;
            # drop it if present (the guard also avoids indexing an empty list).
            if q_words and (q_words[0] == 'multiple' or q_words[0] == 'one'):
                del q_words[0]

            # Ignore questions with unknown words in the answer options.
            # Note: despite its name, options_word_ids collects word tokens, not ids.
            options_word_ids = []
            skip = False
            for o in options:
                option_word_ids = []
                for w in o:
                    if w not in word_to_id:
                        if update_word_ids:
                            word_to_id[w] = word_id
                            word_id += 1
                            option_word_ids.append(w)
                        else:
                            skip = True
                            break
                    else:
                        option_word_ids.append(w)
                if skip:
                    break
                else:
                    #if len(option_word_ids) > 1:
                    #    skip = True
                    #    more_than_1_word_answers += 1
                    #    break
                    options_word_ids.append(option_word_ids)

            if skip:
                answer_word_unknown += 1
                continue

            article_no = len(questions)
            questions.append([article_no, -1, statements, q_words, correct, options_word_ids])

    print "There are %d questions" % len(questions)
    print "There are %d statements" % len(dataset)
    print "There are %d words" % len(word_to_id)
    print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers
    print "Ignored %d questions which had an unknown answer word" % answer_word_unknown

    if add_pruning:
        print("Trying to prune extraneaous statements...")
        questions = prune_statements(dataset, questions)
        before_prune = len(questions)
        questions = filter(lambda x: len(x[2]) > 1, questions)
        after_prune = len(questions)
        print("Pruning invalidated %d questions" % (before_prune - after_prune))

    max_stmts = None
    max_words = None
    if pad:
        s_lens = []
        q_lens = []
        for i in xrange(len(questions)):
            q = questions[i]
            s_lens.append(len(q[2]))
            for j in xrange(len(q[2])):
                q_lens.append(len(q[2][j]))

        max_stmts = max(s_lens)
        max_words = max(q_lens)
        print "Max statement length: ", max_words
        print "Max number of statements: ", max_stmts

        for i in xrange(len(questions)):
            q = questions[i]

            # Pad every statement to max_words tokens, pad the statement list
            # itself to max_stmts entries, and pad the question (q[3]) and the
            # answer options (q[5]) to the same width.
            for j in xrange(len(q[2])):
                q[2][j] = pad_statement(q[2][j], null_word, max_words)

            q[2] = pad_memories(q[2], null_word, max_stmts, max_words)
            q[3] = pad_statement(q[3], null_word, max_words)

            for j in xrange(len(q[5])):
                q[5][j] = pad_statement(q[5][j], null_word, max_words)


    print("Final processing...")
    questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions)
    return dataset, questions_seq, word_to_id, word_id, null_word_id, max_stmts, max_words
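
A minimal usage sketch for parse_mc_test_dataset above (not part of the original example): it assumes the helper functions it calls (clean_sentence, only_words, get_sentences, get_answer_index, prune_statements, pad_statement, pad_memories, transform_ques_weak) are available in the same module. The file names below follow MCTest conventions but are assumptions here.

# Hypothetical driver (Python 2) for one MCTest split, with padding enabled.
vocab = {}
(dataset, questions_seq, vocab, next_word_id,
 null_word_id, max_stmts, max_words) = parse_mc_test_dataset(
    'mc160.train.tsv', 'mc160.train.ans',
    word_id=0, word_to_id=vocab, update_word_ids=True, pad=True)
print("Parsed %d questions; vocabulary size %d" % (len(questions_seq), len(vocab)))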