def question_candidates(q_id):
    """Select some useful subset of the candidates for a particular question.

    Calls ``init.get_corpus`` for the question, reads the cached chunks for
    ``q_id`` from the file named by ``cache_file(q_id)``, and keeps the chunks
    whose fourth field is ``"NP"`` (presumably noun phrases) and whose
    alignment with the question text passes both cutoffs.

    q_id : the question id; used to load the corpus, locate the cache file,
           index the uncached chunks and fetch the question text
    returns : a list of the candidates that passed the filter (its length is
              also printed)
    """
    init.get_corpus(qNum=q_id)

    # Close the cache file deterministically rather than leaking the handle
    # from a bare open().  The [q_id] lookup stays inside the block so the
    # file is still open while the chunks are being read.
    with open(cache_file(q_id)) as cache_handle:
        candidates = cache_chunkers.uncache_chunks(cache_handle)[q_id]

    # The question does not change between candidates, so fetch it once.
    question = get_question(q_id)

    selected = []
    for c in candidates:
        if c[3] != "NP":
            continue
        dist = align_question_distance(question, c)
        # Keep the candidate only when dist[0] is below DIST_CUTOFF and
        # dist[1] is above SCORE_CUTOFF.
        if dist[0] < DIST_CUTOFF and dist[1] > SCORE_CUTOFF:
            selected.append(c)

    align.save_cache()
    # Single-argument call form prints the same thing on Python 2 and 3.
    print(len(selected))
    return selected
def get_answer(first=380, last=399):
    """
    the process by which the baseline finds answers from the corpus

    For every question in range, up to five documents returned by
    init.get_corpus are examined.  In each one a random question word is
    located (falling back to the middle of the document when the word does
    not occur) and the words around that position become one answer.

    first : an integer corresponding to the first question id (inclusive) to answer
    last : an integer corresponding to the last question id (inclusive) to answer
    returns : an int list and string list of question id's and answers; the
              two lists are parallel, with one entry per examined document,
              so a question id is repeated once per document
    """
    q_ids = []
    ans_text = []

    # make sure the parameters are good
    if first > last:
        first, last = last, first

    # read in all the questions and iterate through the ones in range
    for question in read_questions.read_questions_no_answers():
        q_id = int(question[0])
        if not first <= q_id <= last:
            continue

        topdoc = init.get_corpus(q_id)
        qs = question[1].split()

        # baseline QA system answer process right here...
        # list() keeps the slice valid when keys() returns a view.
        for key in list(topdoc.keys())[:5]:
            doc_text = topdoc[key].split()

            # find a random word from the question; an empty question has
            # no words to pick, so fall through to the midpoint fallback
            # instead of raising from the random call
            qword = random.choice(qs) if qs else None

            # positions in the document where that word occurs
            positions = [i for i, x in enumerate(doc_text) if x == qword]
            if not positions:
                # floor division keeps this an int on Python 2 and 3
                positions = [len(doc_text) // 2]

            # get a random position
            pos = random.choice(positions)

            # Clamp the window start at 0: a negative start would be read
            # as an offset from the END of the list and silently return an
            # empty or unrelated window for matches near the beginning.
            start = max(0, pos - 5)

            q_ids.append(q_id)
            ans_text.append(' '.join(doc_text[start:pos + 5]))

    return q_ids, ans_text
def question_candidates(q_id):
    """Select some useful subset of the candidates for a particular question.

    Calls ``init.get_corpus`` for the question, reads the cached chunks for
    ``q_id`` from the file named by ``cache_file(q_id)``, and keeps the chunks
    whose fourth field is ``"NP"`` (presumably noun phrases) and whose
    alignment with the question text passes both cutoffs.

    q_id : the question id; used to load the corpus, locate the cache file,
           index the uncached chunks and fetch the question text
    returns : a list of the candidates that passed the filter (its length is
              also printed)
    """
    init.get_corpus(qNum=q_id)

    # Use a context manager so the cache file is always closed; the original
    # bare open() left the handle to the garbage collector.  The [q_id]
    # lookup is done while the file is still open.
    with open(cache_file(q_id)) as cache_handle:
        candidates = cache_chunkers.uncache_chunks(cache_handle)[q_id]

    # Loop-invariant: look the question text up once, not per candidate.
    question = get_question(q_id)

    selected = []
    for c in candidates:
        if c[3] != "NP":
            continue
        dist = align_question_distance(question, c)
        # Accept when dist[0] is under DIST_CUTOFF and dist[1] is over
        # SCORE_CUTOFF.
        if dist[0] < DIST_CUTOFF and dist[1] > SCORE_CUTOFF:
            selected.append(c)

    align.save_cache()
    # Single-argument call form prints the same thing on Python 2 and 3.
    print(len(selected))
    return selected
def run(q_id):
    """Split every document retrieved for a question into phrase chunks.

    Builds a UnigramChunker from conll2000.chunked_sents('train.txt'), then
    for each document returned by init.get_corpus(q_id): cleans punctuation,
    splits on whitespace, POS-tags the words, chunks them, and merges runs of
    consecutive words that carry the same phrase tag.

    q_id : the question id whose documents are chunked
    returns : a list of tuples
              (chunk text, document key, index of the chunk's first word,
               phrase tag, q_id)

    NOTE(review): the chunker is rebuilt on every call; if that turns out to
    be slow, consider building it once and reusing it.
    """
    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)
    import init

    # Each document is tagged into this format before chunking:
    # tagged = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
    #           ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
    #           ("the", "DT"), ("cat", "NN"), (".", ".")]
    topdoc = init.get_corpus(q_id)

    answers = []
    for docnum in topdoc.keys():
        doc_words = clean_punctuation(topdoc[docnum]).split()
        tagged = pos_tag(doc_words)
        chunked = unigram_chunker.parse2(tagged)

        # chunked.pos() yields ((word, pos_tag), phrase_tag) per word.
        words = []
        current_tag = None
        start = 0  # index of the first word of the chunk being collected
        for index, ((word, _pos), phrase_tag) in enumerate(chunked.pos()):
            if words and phrase_tag != current_tag:
                answers.append(
                    (' '.join(words), docnum, start, current_tag, q_id))
                words = []
                start = index
            current_tag = phrase_tag
            words.append(word)

        # Flush the last chunk of this document.  Tracking `start`
        # explicitly avoids the off-by-one of computing it from the final
        # loop index, and the guard skips documents that produced no words
        # (which previously hit an unbound or stale loop index).
        if words:
            answers.append(
                (' '.join(words), docnum, start, current_tag, q_id))

    return answers
def run(q_id):
    """Split every document retrieved for a question into phrase chunks.

    Builds a UnigramChunker from conll2000.chunked_sents('train.txt'), then
    for each document returned by init.get_corpus(q_id): cleans punctuation,
    splits on whitespace, POS-tags the words, chunks them, and groups runs of
    consecutive words that carry the same phrase tag.

    q_id : the question id whose documents are chunked
    returns : a list of tuples
              (chunk text, document key, index of the chunk's first word,
               phrase tag, q_id)
    """
    # Function-scope imports, matching the existing local `import init`.
    from itertools import groupby
    import init

    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)

    # Each document is tagged into this format before chunking:
    # tagged = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
    #           ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
    #           ("the", "DT"), ("cat", "NN"), (".", ".")]
    topdoc = init.get_corpus(q_id)

    answers = []
    for docnum in topdoc.keys():
        doc_words = clean_punctuation(topdoc[docnum]).split()
        tagged = pos_tag(doc_words)
        chunked = unigram_chunker.parse2(tagged)

        # chunked.pos() yields ((word, pos_tag), phrase_tag) per word;
        # groupby collects each run of equal phrase tags into one chunk.
        offset = 0  # index of the first word of the current chunk
        for phrase_tag, run_of_leaves in groupby(chunked.pos(),
                                                 key=lambda leaf: leaf[1]):
            words = [word for (word, _pos), _label in run_of_leaves]
            answers.append(
                (' '.join(words), docnum, offset, phrase_tag, q_id))
            # Advancing by the chunk length gives every chunk, including
            # the last one in the document, its true start index (the old
            # post-loop flush was one too small).  A document with no words
            # yields no groups, so nothing bogus is appended for it.
            offset += len(words)

    return answers
def rewriteQuestionsDict(qList):
    """Rewrite every question in a list of [number, question] rows.

    The rows are read as one flat stream of items in which the items
    alternate key, value, key, value, ... (a trailing key with no value is
    ignored).  Each value is passed through rewriteQuestion.

    qList : an iterable of iterables holding question numbers and questions
    returns : a dict mapping each question number to its rewritten question
    """
    # because a dictionary is easier for me than a list of lists
    # key: string_of_int(question_number)
    # value: question as a string
    flat_items = [item for row in qList for item in row]
    by_number = dict(zip(flat_items[0::2], flat_items[1::2]))

    # now loop through the dict, rewriting if possible
    rewritten = {}
    for number in by_number:
        rewritten[number] = rewriteQuestion(by_number[number])
    return rewritten


if __name__ == "__main__":
    # Manual smoke check: align one sample question against one document.
    init.get_corpus(qNum=209)
    question = "Who is the inventor of the phonograph?"
    doc = "SJMN91-06010225"
    print(align_question_distance(question, (1, doc, 30, {})))