def parse_qa_dataset(input_dir, word_id=0, word_to_id=None, update_word_ids=True):
    # Use a None sentinel instead of a mutable default argument so that
    # repeated calls do not silently share one vocabulary dict.
    if word_to_id is None:
        word_to_id = {}
    dataset = []
    questions = []
    article_files = set()

    print("Parsing questions...")
    with open(input_dir + '/question_answer_pairs.txt') as f:
        for line in f:
            # Skip the header line
            if 'ArticleFile' in line:
                continue
            line = line.strip()
            # Skip empty lines
            if len(line) == 0:
                continue

            parts = line.split('\t')
            if len(parts) != 6:
                print("Malformed line: " + line)
                continue

            question = parts[1]
            answer = parts[2]
            answer = canonicalize_tokens([only_words(answer).strip().lower()])
            assert len(answer) == 1
            answer = answer[0]
            article_name = parts[5]
            # The remaining fields are unused for now. The dataset also
            # contains repeated questions; those are deduplicated below.

            # Skip questions whose answer is empty or longer than one word.
            if len(answer) == 0 or len(answer.split(' ')) > 1:
                continue
            if not update_word_ids and answer not in word_to_id:
                continue

            tokens = clean_sentence(question).strip().split()
            tokens = filter(lambda x: len(x.strip()) > 0, tokens)
            tokens = map(lambda x: x.lower(), tokens)
            tokens = canonicalize_tokens(tokens)
            if not update_word_ids:
                tokens = filter(lambda x: x in word_to_id, tokens)
            question_tokens = tokens

            if update_word_ids:
                for token in (tokens + [answer]):
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1

            article_no = len(questions)
            article_file = input_dir + '/' + article_name + '.txt.clean'
            article_files.add(article_file)
            dataset.append(question_tokens)
            # Layout: [article_no, article_file, statements, question_tokens, answer]
            questions.append([article_no, article_file, None, question_tokens, answer])

    article_data = {}
    print("Parsing articles...")
    for article_file in article_files:
        # Collect all statements in this article; they become the memory
        # for every question that references it.
        print("Parsing: " + article_file)
        statements = []
        with open(article_file) as s_file:
            for statement in s_file:
                if len(statement.strip()) == 0:
                    continue
                for sentence in get_sentences(statement.strip()):
                    tokens = sentence.strip().split()
                    tokens = filter(lambda x: len(x.strip()) > 0, tokens)
                    tokens = map(lambda x: x.lower(), tokens)
                    tokens = canonicalize_tokens(tokens)
                    if not update_word_ids:
                        tokens = filter(lambda x: x in word_to_id, tokens)
                    statements.append(tokens)
                    dataset.append(tokens)
                    if update_word_ids:
                        for token in tokens:
                            if token not in word_to_id:
                                word_to_id[token] = word_id
                                word_id += 1
        article_data[article_file] = statements

    print("Mapping articles to statements...")
    print("There are %d questions before deduplication" % len(questions))
    question_set = set()
    for i in xrange(len(questions)):
        question = questions[i]
        question_tuple = tuple(question[3])
        if question_tuple in question_set:
            # Mark duplicates for removal.
            question[0] = None
            continue
        question_set.add(question_tuple)
        question[2] = article_data[question[1]]
    questions = filter(lambda x: x[0] is not None, questions)
    print("There are %d questions after deduplication" % len(questions))

    print("Trying to prune extraneous statements...")
    questions = prune_statements(dataset, questions)
    before_prune = len(questions)
    questions = filter(lambda x: len(x[2]) > 1, questions)
    after_prune = len(questions)
    print("Pruning invalidated %d questions" % (before_prune - after_prune))

    print("Final processing...")
    questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions)
    return dataset, questions_seq, word_to_id, word_id
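# Usage sketch (not from the original pipeline): parse_qa_dataset expects
# input_dir to contain question_answer_pairs.txt plus one
# <article>.txt.clean file per article. The paths below are hypothetical
# placeholders.
#
#   dataset, questions_seq, word_to_id, word_id = parse_qa_dataset('data/wiki_qa')
#   # Reuse the learned vocabulary on a held-out split without growing it:
#   _, test_seq, _, _ = parse_qa_dataset('data/wiki_qa_test',
#                                        word_id=word_id,
#                                        word_to_id=word_to_id,
#                                        update_word_ids=False)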
def parse_mc_test_dataset(questions_file, answers_file, word_id=0,
                          word_to_id=None, update_word_ids=True, pad=True,
                          add_pruning=False):
    # Use a None sentinel instead of a mutable default argument so that
    # repeated calls do not silently share one vocabulary dict.
    if word_to_id is None:
        word_to_id = {}
    dataset = []
    questions = []

    null_word = '<NULL>'
    if null_word not in word_to_id:
        if update_word_ids:
            word_to_id[null_word] = word_id
            word_id += 1
        else:
            print "Null word '%s' not found in the supplied vocabulary" % null_word
            sys.exit(1)
    null_word_id = word_to_id[null_word]

    print("Parsing questions %s %s" % (questions_file, answers_file))
    with open(questions_file, 'r') as q_file:
        questions_data = q_file.readlines()
    with open(answers_file, 'r') as a_file:
        answers_data = a_file.readlines()
    assert len(questions_data) == len(answers_data)

    more_than_1_word_answers = 0
    answer_word_unknown = 0
    for i in xrange(len(questions_data)):
        question_line = questions_data[i]
        answer_line = answers_data[i]

        question_pieces = question_line.strip().split('\t')
        assert len(question_pieces) == 23
        answer_pieces = answer_line.strip().split('\t')
        assert len(answer_pieces) == 4

        # Field 2 holds the story text; tokenize it into one statement
        # per sentence.
        text = question_pieces[2]
        text = text.replace('\\newline', ' ')
        statements = []
        for s in get_sentences(text):
            tokens = s.strip().split()
            if update_word_ids:
                for token in tokens:
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1
            else:
                tokens = filter(lambda x: x in word_to_id, tokens)
            statements.append(tokens)
            dataset.append(tokens)

        # Each story carries 4 questions of 5 tab-separated fields each:
        # the question text followed by 4 answer options.
        for j in range(4):
            q_index = (j * 5) + 3
            q_words = clean_sentence(question_pieces[q_index]).split()
            options = [
                only_words(question_pieces[q_index + 1]),
                only_words(question_pieces[q_index + 2]),
                only_words(question_pieces[q_index + 3]),
                only_words(question_pieces[q_index + 4]),
            ]
            correct = get_answer_index(answer_pieces[j])
            answer = options[correct]
            # The multi-word-answer filter is currently disabled, so
            # more_than_1_word_answers stays at zero:
            # if len(answer) > 1:
            #     more_than_1_word_answers += 1
            #     continue

            if update_word_ids:
                for token in q_words:
                    if token not in word_to_id:
                        word_to_id[token] = word_id
                        word_id += 1
                for o in options:
                    for token in o:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1
            else:
                q_words = filter(lambda x: x in word_to_id, q_words)

            # Drop the leading 'multiple'/'one' marker from the question text.
            if q_words[0] == 'multiple' or q_words[0] == 'one':
                del q_words[0]

            # Ignore questions with unknown words in the answer options.
            options_word_ids = []
            skip = False
            for o in options:
                option_word_ids = []
                for w in o:
                    if w not in word_to_id:
                        if update_word_ids:
                            word_to_id[w] = word_id
                            word_id += 1
                            option_word_ids.append(w)
                        else:
                            skip = True
                            break
                    else:
                        option_word_ids.append(w)
                if skip:
                    break
                # A single-word-option filter used to live here; it is
                # also disabled:
                # if len(option_word_ids) > 1:
                #     skip = True
                #     more_than_1_word_answers += 1
                #     break
                options_word_ids.append(option_word_ids)
            if skip:
                answer_word_unknown += 1
                continue

            article_no = len(questions)
            # Layout: [article_no, article_file (-1: none), statements,
            #          question tokens, correct option index, option token lists]
            questions.append([article_no, -1, statements, q_words,
                              correct, options_word_ids])

    print "There are %d questions" % len(questions)
    print "There are %d statements" % len(dataset)
    print "There are %d words" % len(word_to_id)
    print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers
    print "Ignored %d questions which had an unknown answer word" % answer_word_unknown

    if add_pruning:
        print("Trying to prune extraneous statements...")
        questions = prune_statements(dataset, questions)
        before_prune = len(questions)
        questions = filter(lambda x: len(x[2]) > 1, questions)
        after_prune = len(questions)
        print("Pruning invalidated %d questions" % (before_prune - after_prune))

    # Pad statements, questions, and options to fixed sizes so they can
    # be packed into dense arrays.
    max_stmts = None
    max_words = None
    if pad:
        s_lens = []
        q_lens = []
        for i in xrange(len(questions)):
            q = questions[i]
            s_lens.append(len(q[2]))
            for j in xrange(len(q[2])):
                q_lens.append(len(q[2][j]))
        max_stmts = max(s_lens)
        max_words = max(q_lens)
        print "Max statement length: ", max_words
        print "Max number of statements: ", max_stmts

        for i in xrange(len(questions)):
            q = questions[i]
            # Statements
            for j in xrange(len(q[2])):
                q[2][j] = pad_statement(q[2][j], null_word, max_words)
            q[2] = pad_memories(q[2], null_word, max_stmts, max_words)
            # Question and options
            q[3] = pad_statement(q[3], null_word, max_words)
            for j in xrange(len(q[5])):
                q[5][j] = pad_statement(q[5][j], null_word, max_words)

    print("Final processing...")
    questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions)
    return dataset, questions_seq, word_to_id, word_id, null_word_id, max_stmts, max_words
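# For reference, minimal sketches of the padding helpers used above. The
# real pad_statement/pad_memories are defined elsewhere in this module;
# the versions below are assumptions (right-padding with the null word)
# under hypothetical names so they do not shadow the real helpers.
def _pad_statement_sketch(tokens, null_word, max_words):
    # Right-pad a token list to exactly max_words entries.
    return tokens + [null_word] * max(0, max_words - len(tokens))

def _pad_memories_sketch(statements, null_word, max_stmts, max_words):
    # Append all-null statements until there are max_stmts of them.
    null_stmt = [null_word] * max_words
    return statements + [list(null_stmt)
                         for _ in xrange(max(0, max_stmts - len(statements)))]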