#!/usr/bin/env python
import nltk
import init
import chunker


def punc_loc(question, candidate):
    """Report whether the token right after a candidate answer is punctuation.

    The document is fetched with ``init.get_doc(doc_num)`` and both the
    document and the answer are passed through ``chunker.clean_punctuation``
    before being split on whitespace.

    Args:
        question: Unused by this feature; kept for the shared feature
            function signature.
        candidate: A 5-item sequence ``(answer, doc_num, index, features,
            q_id)``.  ``index`` is used as a position in the whitespace-split
            cleaned document (presumably the offset of the candidate's first
            word -- confirm against the caller).  ``features`` and ``q_id``
            are not read here.

    Returns:
        1 if the whole token at ``index + <answer word count>`` equals one of
        ``,`` ``.`` ``"`` ``:`` ``!``; otherwise 0.  Also 0 when the candidate
        runs to the end of the document and there is no following token.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are a syntax error on Python 3, and callers pass the same single
    # positional sequence either way.
    answer, doc_num, index, features, q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    answer = chunker.clean_punctuation(answer)

    # Word length of the candidate answer (space-delimited).
    answer_len = len(answer.split())
    # Position of the token that may be the punctuation mark.
    punc_word_index = index + answer_len
    dlist = doc.split()

    # A candidate that ends the document has nothing after it; report
    # "no punctuation" instead of raising IndexError.
    if punc_word_index >= len(dlist):
        return 0

    punc_word = dlist[punc_word_index]
    if punc_word in (',', '.', '"', ':', '!'):
        return 1
    return 0


# The test case below was modified to work with a specified doc instead of
# the actual doc, so that it runs against a known index.
def test():
    # NOTE(review): no body is present for this function in this source;
    # `pass` only keeps the module syntactically valid.
    pass
import init
import chunker
import read_questions
import sys

# Sentinel "infinitely far" distance.  sys.maxint only exists on Python 2;
# fall back to sys.maxsize so the module also loads on Python 3.
MAX_INT = getattr(sys, "maxint", sys.maxsize)


def literal_question_distance(question, candidate):
    """Score a candidate by its distance to the matched question fragment.

    The document is fetched with ``init.get_doc(doc_num)``, cleaned with
    ``chunker.clean_punctuation`` and handed to ``find_match`` together with
    the question.  The first and third items of the returned triple are used
    as the fragment's start and length (presumably character offsets into
    the cleaned document -- confirm against ``find_match``).

    Args:
        question: Question text passed to ``find_match``.
        candidate: A 5-item sequence ``(answer, doc_num, index, features,
            q_id)``.  Only ``doc_num`` and ``index`` are read; ``index`` is
            treated as a word position in the whitespace-split document.

    Returns:
        A tuple ``(distance, length)``.  ``distance`` is 0 when the
        candidate's character offset lies within ``[start, start + length]``,
        otherwise the smaller of its distances to the two fragment ends.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are a syntax error on Python 3, and callers pass the same single
    # positional sequence either way.
    answer, doc_num, index, features, q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    (start, _, length) = find_match(question, doc)

    # Turn the word position into a character offset: the length of the
    # text up to and including word `index`, joined with single spaces.
    words = doc.split()
    char_index = len(" ".join(words[0:index + 1]))

    end = start + length
    # 0 when the candidate sits inside the fragment; MAX_INT otherwise so
    # this term never wins the min() below.
    inside = 0 if start <= char_index <= end else MAX_INT
    return (min(abs(start - char_index), abs(end - char_index), inside), length)


def literal_rewrite_distance(question, candidate):
    """Score a candidate against the re-written form of the question.

    The question is passed through ``rewriteQuestion`` and the result is
    scored with ``literal_question_distance``.

    Returns:
        The ``(distance, length of fragment)`` tuple produced by
        ``literal_question_distance``.
    """
    return literal_question_distance(rewriteQuestion(question), candidate)
from difflib import SequenceMatcher as SequenceMatcher
import init
import chunker
import read_questions
import sys

# Sentinel "infinitely far" distance.  sys.maxint only exists on Python 2;
# fall back to sys.maxsize so the module also loads on Python 3.
MAX_INT = getattr(sys, "maxint", sys.maxsize)


def literal_question_distance(question, candidate):
    """Score a candidate by its distance to the matched question fragment.

    The document is fetched with ``init.get_doc(doc_num)``, cleaned with
    ``chunker.clean_punctuation`` and handed to ``find_match`` together with
    the question.  The first and third items of the returned triple are used
    as the fragment's start and length (presumably character offsets into
    the cleaned document -- confirm against ``find_match``).

    Args:
        question: Question text passed to ``find_match``.
        candidate: A 5-item sequence ``(answer, doc_num, index, features,
            q_id)``.  Only ``doc_num`` and ``index`` are read; ``index`` is
            treated as a word position in the whitespace-split document.

    Returns:
        A tuple ``(distance, length)``.  ``distance`` is 0 when the
        candidate's character offset lies within ``[start, start + length]``,
        otherwise the smaller of its distances to the two fragment ends.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are a syntax error on Python 3, and callers pass the same single
    # positional sequence either way.
    answer, doc_num, index, features, q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    (start, _, length) = find_match(question, doc)

    # Turn the word position into a character offset: the length of the
    # text up to and including word `index`, joined with single spaces.
    words = doc.split()
    char_index = len(" ".join(words[0 : index + 1]))

    end = start + length
    # 0 when the candidate sits inside the fragment; MAX_INT otherwise so
    # this term never wins the min() below.
    inside = 0 if start <= char_index <= end else MAX_INT
    return (
        min(abs(start - char_index), abs(end - char_index), inside),
        length,
    )


def literal_rewrite_distance(question, candidate):
    """Score a candidate against the re-written form of the question.

    Documented contract: evaluates a candidate based on how close it is to
    the longest fragment of the re-written question in the document and
    returns (distance, length of fragment).
    """
    # NOTE(review): no statements follow the docstring in this source, so as
    # written the function returns None -- confirm whether the body was
    # truncated.
#!/usr/bin/env python
import nltk
import init
import chunker


def punc_loc(question, candidate):
    """Report whether the token right after a candidate answer is punctuation.

    The document is fetched with ``init.get_doc(doc_num)`` and both the
    document and the answer are passed through ``chunker.clean_punctuation``
    before being split on whitespace.

    Args:
        question: Unused by this feature; kept for the shared feature
            function signature.
        candidate: A 5-item sequence ``(answer, doc_num, index, features,
            q_id)``.  ``index`` is used as a position in the whitespace-split
            cleaned document (presumably the offset of the candidate's first
            word -- confirm against the caller).  ``features`` and ``q_id``
            are not read here.

    Returns:
        1 if the whole token at ``index + <answer word count>`` equals one of
        ``,`` ``.`` ``"`` ``:`` ``!``; otherwise 0.  Also 0 when the candidate
        runs to the end of the document and there is no following token.
    """
    # Unpacked in the body rather than in the signature: tuple parameters
    # are a syntax error on Python 3, and callers pass the same single
    # positional sequence either way.
    answer, doc_num, index, features, q_id = candidate

    doc = chunker.clean_punctuation(init.get_doc(doc_num))
    answer = chunker.clean_punctuation(answer)

    # Word length of the candidate answer (space-delimited).
    answer_len = len(answer.split())
    # Position of the token that may be the punctuation mark.
    punc_word_index = index + answer_len
    dlist = doc.split()

    # A candidate that ends the document has nothing after it; report
    # "no punctuation" instead of raising IndexError.
    if punc_word_index >= len(dlist):
        return 0

    punc_word = dlist[punc_word_index]
    if punc_word in (',', '.', '"', ':', '!'):
        return 1
    return 0


# The test case below was modified to work with a specified doc instead of
# the actual doc, so that it runs against a known index.
def test():
    # NOTE(review): only fixture setup is visible in this source; no call to
    # punc_loc and no assertion appear after these assignments.
    question = 'What was the name, of the first Russian astronaut to do a spacewalk?'
    doc = 'The name of the first Russian astronaut to do a spacewalk is Aleksei A. Leonov!'
    doc_num = "LA072490-0034"