Example 1
def run_corenlp(answer_list):

    # open a CoreNLP Server
    with corenlp.CoreNLPServer(port=CORENLP_PORT,
                               logfile=CORENLP_LOG) as server:
        client = corenlp.CoreNLPClient(port=CORENLP_PORT)

        # parsing answers
        print >> sys.stderr, 'Parsing data...'
        for single_answer in answer_list:
            response = client.query_const_parse(single_answer["text"],
                                                add_ner=True)
            # cache[sentence] = response['sentences'][0]
            single_answer["corenlp"] = response['sentences'][0]
Example 2
def run_corenlp(dataset, qas):
	cache = {}
	with corenlp.CoreNLPServer(port=CORENLP_PORT, logfile=CORENLP_LOG) as server:
		client = corenlp.CoreNLPClient(port=CORENLP_PORT)
		print >> sys.stderr, 'Running NER for paragraphs...'
		for article in dataset['data']:
			for paragraph in article['paragraphs']:
				response = client.query_ner(paragraph['context'])
				cache[paragraph['context']] = response
		print >> sys.stderr, 'Parsing questions...'
		for question, answers, context in qas:
			response = client.query_const_parse(question, add_ner=True)
			cache[question] = response['sentences'][0]
	cache_file = CORENLP_CACHES[OPTS.dataset]
	with open(cache_file, 'w') as f:
		json.dump(cache, f, indent=2)
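The cache written above is plain JSON keyed by context or question string, so the load_cache helper referenced in the next example could plausibly look like the sketch below (the actual helper in the source may differ):

def load_cache():
	# Hypothetical counterpart of run_corenlp: re-read the dumped CoreNLP annotations.
	cache_file = CORENLP_CACHES[OPTS.dataset]
	with open(cache_file) as f:
		return json.load(f)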
Example 3
def dump_data(dataset, prefix, use_answer_placeholder=False, alteration_strategy=None):
	corenlp_cache = load_cache()
	nearby_word_dict = load_nearby_words()
	postag_dict = load_postag_dict()
	out_data = []
	out_obj = {'version': dataset['version'], 'data': out_data}
	mturk_data = []

	with corenlp.CoreNLPServer(port=CORENLP_PORT, logfile=CORENLP_LOG) as server:
		client = corenlp.CoreNLPClient(port=CORENLP_PORT)
		for article in dataset['data']:
			out_paragraphs = []
			out_article = {'title': article['title'], 'paragraphs': out_paragraphs}
			out_data.append(out_article)
			for paragraph in article['paragraphs']:
				out_paragraphs.append(paragraph)
				for qa in paragraph['qas']:
					question = qa['question'].strip()
					if not OPTS.quiet:
						print('Question: %s' % question).encode('utf-8')
					if use_answer_placeholder:
						answer = 'ANSWER'
						determiner = ''
					else:
						p_parse = corenlp_cache[paragraph['context']]
						ind, a_toks = get_tokens_for_answers(qa['answers'], p_parse)
						determiner = get_determiner_for_answers(qa['answers'])
						answer_obj = qa['answers'][ind]
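						# Try each answer-phrasing rule in turn; the for-else raises
						# only if no rule produced an answer string.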
						for rule_name, func in ANSWER_RULES:
							answer = func(answer_obj, a_toks, question, determiner=determiner)
							if answer: break
						else:
							raise ValueError('Missing answer')
					answer_mturk = "<span class='answer'>%s</span>" % answer
					q_parse = corenlp_cache[question]
					q_tokens = q_parse['tokens']
					q_const_parse = read_const_parse(q_parse['parse'])
					if alteration_strategy:
						# Easiest to alter the question before converting
						q_list = alter_question(
							question, q_tokens, q_const_parse, nearby_word_dict,
							postag_dict, strategy=alteration_strategy)
					else:
						q_list = [(question, q_tokens, q_const_parse, 'unaltered')]
					for q_str, q_tokens, q_const_parse, tag in q_list:
						for rule in CONVERSION_RULES:
							sent = rule.convert(q_str, answer, q_tokens, q_const_parse)
							if sent:
								if not OPTS.quiet:
									print('  Sent (%s): %s' % (tag, colored(sent, 'cyan'))).encode('utf-8')
								cur_qa = {
									'question': qa['question'],
									'id': '%s-%s' % (qa['id'], tag),
									'answers': qa['answers']
								}
								if OPTS.prepend:
									cur_text = '%s %s' % (sent, paragraph['context'])
									new_answers = []
									for a in qa['answers']:
										new_answers.append({
											'text': a['text'],
											'answer_start': a['answer_start'] + len(sent) + 1
										})
									cur_qa['answers'] = new_answers
								elif OPTS.random:
									sentences = corenlp_cache[paragraph['context']]["sentences"]
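									# sentence_boundaries[i] will hold the index of the first token of sentence i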
									sentence_boundaries = []
									count = 0
									for s in sentences:
										sentence_boundaries.append(count)
										count += len(s['tokens'])
									tokens = [t for s in sentences for t in s['tokens']]
									offsets = [(token['characterOffsetBegin'],
												token['characterOffsetEnd']) for token in tokens]
									# Usually these token offsets are marked perfectly
									sentence_lengths = [len(" ".join(t['word'] for t in s['tokens'])) for s in sentences]
									# Pick a random position to insert the sentence
									insert_position = numpy.random.randint(len(sentences) + 1)
									added_tokens = [token for s in client.query_ner(sent)["sentences"] for token in s['tokens']]
									added_offsets = [(token['characterOffsetBegin'],
												token['characterOffsetEnd']) for token in added_tokens]
									# Locate gold sentence
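									# If the sentence is appended at the very end, existing offsets and
									# answers are left untouched; otherwise the added tokens are re-based to
									# the insertion point's character offset, and every original token (and
									# answer_start) from that point on shifts right by the added sentence's
									# character length plus one separating space.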
									if insert_position == len(sentences):
										change_offset = 0
										next_token = len(tokens)
										offset_shift_added = 1
										offser_shift_rest = 0
									else:
										change_offset = added_offsets[-1][1] - added_offsets[0][0]
										next_token = sentence_boundaries[insert_position]
										offset_shift_added = offsets[next_token][0]
										offser_shift_rest = added_offsets[-1][1] + 1

									new_answers = []
									new_tokens = tokens[:next_token] + added_tokens + tokens[next_token:]
									new_offsets = offsets[:next_token] + [(a[0]+offset_shift_added, a[1] + offset_shift_added) for a in added_offsets] + \
									[(a[0] + offser_shift_rest, a[1] + offser_shift_rest) for a in offsets[next_token:]]
									for ans in qa['answers']:
										if ans is None:
											new_answers.append(ans)
											continue
										found = find_answer(offsets, ans['answer_start'], ans['answer_start'] + len(ans['text']))
										if found is not None:
											start_token, end_token = found
											if start_token >= next_token:
												new_answers.append({
													'text': ans['text'],
													'answer_start': ans['answer_start'] + change_offset + 1
												})
												new_start_token, new_end_token = find_answer(new_offsets, new_answers[-1]['answer_start'], new_answers[-1]['answer_start'] + len(ans['text']))
												assert [t['word'] for t in tokens[start_token:end_token]] == [t['word'] for t in new_tokens[new_start_token:new_end_token]]
											else:
												new_answers.append(ans)
										else:
											new_answers.append(ans)
									# Add the sentence in that location and return the text
									cur_text = " ".join([token['word'] for token in new_tokens])
									# verify again on final splitting
									runtime_tokens = [token for s in client.query_ner(cur_text)["sentences"] for token in s['tokens']]
									runtime_offsets = [(token['characterOffsetBegin'], token['characterOffsetEnd']) for token in runtime_tokens]
									#assert new_offsets == runtime_offsets
									cur_qa['answers'] = new_answers
									# TODO: handle the case where sentence splitting of the resultant text doesn't match at test time
									# Sanity check code to make sure processing the changed text still gets the correct answer
								else:
									cur_text = '%s %s' % (paragraph['context'], sent)
								cur_paragraph = {'context': cur_text, 'qas': [cur_qa]}
								out_paragraphs.append(cur_paragraph)
								sent_mturk = rule.convert(q_str, answer_mturk, q_tokens, q_const_parse)
								mturk_data.append((qa['id'], sent_mturk))
								break



	if OPTS.dataset != 'dev':
		prefix = '%s-%s' % (OPTS.dataset, prefix)
	if OPTS.modified_answers:
		prefix = '%s-mod' % prefix
	if OPTS.prepend:
		prefix = '%s-pre' % prefix
	if OPTS.random:
		prefix = '%s-random' % prefix
	with open(os.path.join('out', prefix + '.json'), 'w') as f:
		json.dump(out_obj, f)
	with open(os.path.join('out', prefix + '-indented.json'), 'w') as f:
		json.dump(out_obj, f, indent=2)
	with open(os.path.join('out', prefix + '-mturk.tsv'), 'w') as f:
		for qid, sent in mturk_data:
			print >> f, ('%s\t%s' % (qid, sent)).encode('ascii', 'ignore')

def main():
	dataset = read_data()
	qas = get_qas(dataset)
	if OPTS.modified_answers:
		global ANSWER_RULES
		ANSWER_RULES = MOD_ANSWER_RULES
	if OPTS.seed >= 0:
		random.seed(OPTS.seed)
		random.shuffle(qas)
	if OPTS.command == 'print-questions':
		print_questions(qas)
	elif OPTS.command == 'print-answers':
		print_answers(qas)
	elif OPTS.command == 'corenlp':
		run_corenlp(dataset, qas)
	elif OPTS.command == 'convert-q':
		run_conversion(qas)
	elif OPTS.command == 'inspect-q':
		inspect_rule(qas, OPTS.rule)
	elif OPTS.command == 'alter-separate':
		alter_questions(qas, alteration_strategy='separate')
	elif OPTS.command == 'alter-best':
		alter_questions(qas, alteration_strategy='best')
	elif OPTS.command == 'alter-all':
		alter_questions(qas, alteration_strategy='all')
	elif OPTS.command == 'gen-a':
		generate_answers(qas)
	elif OPTS.command == 'e2e-lies':
		run_end2end(qas)
	elif OPTS.command == 'e2e-highConf':
		run_end2end(qas, alteration_strategy='high-conf')
	elif OPTS.command == 'e2e-all':
		run_end2end(qas, alteration_strategy='all')
	elif OPTS.command == 'dump-placeholder':
		dump_data(dataset, 'convPlaceholder', use_answer_placeholder=True)
	elif OPTS.command == 'dump-lies':
		dump_data(dataset, 'convLies')
	elif OPTS.command == 'dump-highConf':
		dump_data(dataset, 'convHighConf', alteration_strategy='high-conf')
	elif OPTS.command == 'dump-hcSeparate':
		dump_data(dataset, 'convHCSeparate', alteration_strategy='high-conf-separate')
	elif OPTS.command == 'dump-altAll':
		dump_data(dataset, 'convAltAll', alteration_strategy='all')
	else:
		raise ValueError('Unknown command "%s"' % OPTS.command)

if __name__ == '__main__':
	OPTS = parse_args()
	main()
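main() is driven entirely by OPTS; the sketch below is a rough, hypothetical reconstruction of parse_args, inferred only from the attributes referenced above (the real parser may use different flag names, defaults, and help text):

import argparse

def parse_args():
	parser = argparse.ArgumentParser(description='Convert questions into declarative sentences.')
	parser.add_argument('command', help="one of the commands handled in main(), e.g. 'corenlp', 'convert-q', 'dump-highConf'")
	parser.add_argument('--dataset', default='dev')
	parser.add_argument('--rule', default=None, help="rule name for 'inspect-q'")
	parser.add_argument('--seed', type=int, default=-1)
	parser.add_argument('--prepend', action='store_true')
	parser.add_argument('--random', action='store_true')
	parser.add_argument('--modified-answers', action='store_true')
	parser.add_argument('--quiet', action='store_true')
	return parser.parse_args()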
def dump_data(dataset, prefix, use_answer_placeholder=False, alteration_strategy=None):
	corenlp_cache = load_cache()
	nearby_word_dict = load_nearby_words()
	postag_dict = load_postag_dict()
	out_data = []
	out_obj = {'version': dataset['version'], 'data': out_data}
	mturk_data = []

	with corenlp.CoreNLPServer(port=CORENLP_PORT, logfile=CORENLP_LOG) as server:
		client = corenlp.CoreNLPClient(port=CORENLP_PORT)
		for article in dataset['data']:
			out_paragraphs = []
			out_article = {'title': article['title'], 'paragraphs': out_paragraphs}
			out_data.append(out_article)
			for paragraph in article['paragraphs']:
				out_paragraphs.append(paragraph)
				for qa in paragraph['qas']:
					question = qa['question'].strip()
					if not OPTS.quiet:
						print('Question: %s' % question).encode('utf-8')
					if use_answer_placeholder:
						answer = 'ANSWER'
						determiner = ''
					else:
						p_parse = corenlp_cache[paragraph['context']]
						ind, a_toks = get_tokens_for_answers(qa['answers'], p_parse)
						determiner = get_determiner_for_answers(qa['answers'])
						answer_obj = qa['answers'][ind]
						for rule_name, func in ANSWER_RULES:
							answer = func(answer_obj, a_toks, question, determiner=determiner)
							if answer: break
						else:
							raise ValueError('Missing answer')
					answer_mturk = "<span class='answer'>%s</span>" % answer
					q_parse = corenlp_cache[question]
					q_tokens = q_parse['tokens']
					q_const_parse = read_const_parse(q_parse['parse'])
					if alteration_strategy:
						# Easiest to alter the question before converting
						q_list = alter_question(
							question, q_tokens, q_const_parse, nearby_word_dict,
							postag_dict, strategy=alteration_strategy)
					else:
						q_list = [(question, q_tokens, q_const_parse, 'unaltered')]
					for q_str, q_tokens, q_const_parse, tag in q_list:
						for rule in CONVERSION_RULES:
							sent = rule.convert(q_str, answer, q_tokens, q_const_parse)
							if sent:
								if not OPTS.quiet:
									print('  Sent (%s): %s' % (tag, colored(sent, 'cyan'))).encode('utf-8')
								cur_qa = {
									'question': qa['question'],
									'id': '%s-%s' % (qa['id'], tag),
									'answers': qa['answers']
								}
								if OPTS.prepend:
									cur_text = '%s %s' % (sent, paragraph['context'])
									new_answers = []
									for a in qa['answers']:
										new_answers.append({
											'text': a['text'],
											'answer_start': a['answer_start'] + len(sent) + 1
										})
									cur_qa['answers'] = new_answers
								elif OPTS.random:
									cur_text = None
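									# Repeat the insertion four times: each pass re-tokenizes the current
									# paragraph text with CoreNLP NER and re-maps the answer offsets before
									# the next random insertion.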
									for do in range(4):
										if cur_text is None:
											sentences = corenlp_cache[paragraph['context']]["sentences"]
										else:
											sentences = client.query_ner(cur_text)["sentences"]
										sentence_boundaries = []
										count = 0
										for s in sentences:
											sentence_boundaries.append(count)
											count += len(s['tokens'])
										tokens = [t for s in sentences for t in s['tokens']]
										offsets = [(token['characterOffsetBegin'],
													token['characterOffsetEnd']) for token in tokens]
										# Usually these token offsets are marked perfectly
										sentence_lengths = [len(" ".join(t['word'] for t in s['tokens'])) for s in sentences]
										# Pick a random position to insert the sentence
										insert_position = numpy.random.randint(len(sentences) + 1)
										added_tokens = [token for s in client.query_ner(sent)["sentences"] for token in s['tokens']]
										added_offsets = [(token['characterOffsetBegin'],
													token['characterOffsetEnd']) for token in added_tokens]
										# Locate gold sentence
										if insert_position == len(sentences):
											change_offset = 0
											next_token = len(tokens)
											offset_shift_added = 1
											offser_shift_rest = 0
										else:
											change_offset = added_offsets[-1][1] - added_offsets[0][0]
											next_token = sentence_boundaries[insert_position]
											offset_shift_added = offsets[next_token][0]
											offser_shift_rest = added_offsets[-1][1] + 1

										new_answers = []
										new_tokens = tokens[:next_token] + added_tokens + tokens[next_token:]
										new_offsets = offsets[:next_token] + [(a[0]+offset_shift_added, a[1] + offset_shift_added) for a in added_offsets] + \
										[(a[0] + offser_shift_rest, a[1] + offser_shift_rest) for a in offsets[next_token:]]
										for ans in qa['answers']:
											if ans is None:
												new_answers.append(ans)
												continue
											found = find_answer(offsets, ans['answer_start'], ans['answer_start'] + len(ans['text']))
											if found is not None:
												start_token, end_token = found
												if start_token >= next_token:
													new_answers.append({
														'text': ans['text'],
														'answer_start': ans['answer_start'] + change_offset + 1
													})
													new_start_token, new_end_token = find_answer(new_offsets, new_answers[-1]['answer_start'], new_answers[-1]['answer_start'] + len(ans['text']))
													assert [t['word'] for t in tokens[start_token:end_token]] == [t['word'] for t in new_tokens[new_start_token:new_end_token]]
												else:
													new_answers.append(ans)
											else:
												new_answers.append(ans)
										# Add the sentence in that location and return the text
										cur_text = " ".join([token['word'] for token in new_tokens])
										qa['answers'] = new_answers

									cur_qa['answers'] = new_answers	
								else:
									cur_text = '%s %s' % (paragraph['context'], sent)
								cur_paragraph = {'context': cur_text, 'qas': [cur_qa]}
								out_paragraphs.append(cur_paragraph)
								sent_mturk = rule.convert(q_str, answer_mturk, q_tokens, q_const_parse)
								mturk_data.append((qa['id'], sent_mturk))
								break



	if OPTS.dataset != 'dev':
		prefix = '%s-%s' % (OPTS.dataset, prefix)
	if OPTS.modified_answers:
		prefix = '%s-mod' % prefix
	if OPTS.prepend:
		prefix = '%s-pre' % prefix
	if OPTS.random:
		prefix = '%s-random' % prefix
	with open(os.path.join('out1', prefix + '.json'), 'w') as f:
		json.dump(out_obj, f)
	with open(os.path.join('out1', prefix + '-indented.json'), 'w') as f:
		json.dump(out_obj, f, indent=2)
	with open(os.path.join('out1', prefix + '-mturk.tsv'), 'w') as f:
		for qid, sent in mturk_data:
			print >> f, ('%s\t%s' % (qid, sent)).encode('ascii', 'ignore')