Example #1
0
def question_candidates(q_id):
    '''Select the noun-phrase candidates that align well with a question.

    Loads the cached chunks for ``q_id``, keeps the entries whose fourth
    field is ``"NP"`` and whose alignment with the question passes both
    cutoffs, saves the alignment cache, prints how many entries were kept
    and returns them in a list.

    q_id : the question id; used to load the corpus, to locate the cache
           file and to index the uncached chunks
    returns : list of the candidate entries that passed the filter, in the
              order they appear in the cache
    '''
    init.get_corpus(qNum=q_id)
    # Read the cache inside a ``with`` block so the file handle is closed
    # deterministically instead of being leaked.
    with open(cache_file(q_id)) as chunk_file:
        candidates = cache_chunkers.uncache_chunks(chunk_file)[q_id]

    kept = []
    for c in candidates:
        # Only noun phrases are considered as answer candidates.
        if c[3] != "NP":
            continue
        dist = align_question_distance(get_question(q_id), c)
        # dist[0] must be below DIST_CUTOFF and dist[1] above SCORE_CUTOFF.
        if dist[0] < DIST_CUTOFF and dist[1] > SCORE_CUTOFF:
            kept.append(c)

    align.save_cache()
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(len(kept))
    return kept
Example #2
0
def get_answer(first = 380, last = 399):
    """ the process by which the baseline finds answers from the corpus

        For every question whose id lies in the requested range, up to five
        documents are taken from the corpus returned for that question.
        For each document a random word of the question is looked up in the
        document and a window of up to ten words around one random
        occurrence is emitted as an answer; when the word does not occur
        (or the question has no words) the window is taken around the
        middle of the document.

        first : an integer corresponding to the first question id (inclusive) to answer
        last : an integer corresponding to the last question id (inclusive) to answer
        returns : an int list and string list of question id's and answers;
                  the two lists are parallel, with one entry per
                  (question, document) pair """

    q_ids = []
    ans_text = []

    # make sure the parameters are good
    if first > last: first, last = last, first

    # read in all the questions and iterate through the ones in range
    questions = read_questions.read_questions_no_answers()
    questions = [q for q in questions if first <= int(q[0]) <= last]
    for question in questions:
        q_id = int(question[0])
        topdoc = init.get_corpus(q_id)
        # list(...) keeps the slice valid whether keys() is a list or a view
        doc_nums = list(topdoc.keys())
        qs = question[1].split()

        # baseline QA system answer process right here...
        for key in doc_nums[:5]:
            doc_text = topdoc[key].split()
            positions = []
            if qs:
                # find a random word from the question
                qword = qs[random.randint(0, len(qs) - 1)]
                # collect every position in the doc where that word occurs
                positions = [i for i, x in enumerate(doc_text) if x == qword]
            # no occurrence: fall back to the middle of the document
            # (floor division so the index stays an integer)
            if not positions: positions = [len(doc_text) // 2]
            # get a random position
            pos = positions[random.randint(0, len(positions) - 1)]
            # clamp the window start: a negative start would make the slice
            # count from the end of the document and return the wrong words
            start = max(0, pos - 5)
            q_ids.append(q_id)
            ans_text.append(' '.join(doc_text[start:(pos + 5)]))

    return q_ids, ans_text
Example #3
0
def question_candidates(q_id):
    '''Select the noun-phrase candidates that align well with a question.

    Loads the cached chunks for ``q_id``, keeps the entries whose fourth
    field is ``"NP"`` and whose alignment with the question passes both
    cutoffs, saves the alignment cache, prints how many entries were kept
    and returns them in a list.

    q_id : the question id; used to load the corpus, to locate the cache
           file and to index the uncached chunks
    returns : list of the candidate entries that passed the filter, in the
              order they appear in the cache
    '''
    init.get_corpus(qNum=q_id)
    # Read the cache inside a ``with`` block so the file handle is closed
    # deterministically instead of being leaked.
    with open(cache_file(q_id)) as chunk_file:
        candidates = cache_chunkers.uncache_chunks(chunk_file)[q_id]

    kept = []
    for c in candidates:
        # Only noun phrases are considered as answer candidates.
        if c[3] != "NP":
            continue
        dist = align_question_distance(get_question(q_id), c)
        # dist[0] must be below DIST_CUTOFF and dist[1] above SCORE_CUTOFF.
        if dist[0] < DIST_CUTOFF and dist[1] > SCORE_CUTOFF:
            kept.append(c)

    align.save_cache()
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(len(kept))
    return kept
Example #4
0
def run(q_id):
    """Chunk every corpus document for a question into candidate phrases.

    A ``UnigramChunker`` is trained on the conll2000 ``train.txt`` chunked
    sentences on each call.  Every document returned by
    ``init.get_corpus(q_id)`` is cleaned of punctuation, split on
    whitespace, POS-tagged and chunked; consecutive tokens that share the
    same phrase tag are then joined into one phrase.

    q_id : the question id whose documents are chunked
    returns : list of tuples
              ``(phrase_text, docnum, start, phrase_tag, q_id)`` where
              ``start`` is the index of the phrase's first token in the
              document's flattened token sequence
    """
    from itertools import groupby
    from operator import itemgetter
    import init

    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)

    # Each document is tagged into the form
    # [("the", "DT"), ("little", "JJ"), ("dog", "NN"), ...] before chunking.
    topdoc = init.get_corpus(q_id)
    answers = []
    for docnum in topdoc.keys():
        doc_words = clean_punctuation(topdoc[docnum]).split()
        tagged = pos_tag(doc_words)

        chunked = unigram_chunker.parse2(tagged)
        # chunked.pos() yields ((word, tag), phrasetag) items; group the
        # consecutive items that carry the same phrase tag.
        start = 0
        for phrasetag, members in groupby(chunked.pos(), key=itemgetter(1)):
            words = [word for (word, _tag), _phrasetag in members]
            answers.append((' '.join(words), docnum, start, phrasetag, q_id))
            # A running offset gives the final phrase the correct start
            # index and produces nothing at all for an empty document.
            start += len(words)

    return answers
Example #5
0
def run(q_id):
    """Chunk every corpus document for a question into candidate phrases.

    A ``UnigramChunker`` is trained on the conll2000 ``train.txt`` chunked
    sentences on each call.  Every document returned by
    ``init.get_corpus(q_id)`` is cleaned of punctuation, split on
    whitespace, POS-tagged and chunked; consecutive tokens that share the
    same phrase tag are then joined into one phrase.

    q_id : the question id whose documents are chunked
    returns : list of tuples
              ``(phrase_text, docnum, start, phrase_tag, q_id)`` where
              ``start`` is the index of the phrase's first token in the
              document's flattened token sequence
    """
    from itertools import groupby
    from operator import itemgetter
    import init

    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)

    # Each document is tagged into the form
    # [("the", "DT"), ("little", "JJ"), ("dog", "NN"), ...] before chunking.
    topdoc = init.get_corpus(q_id)
    answers = []
    for docnum in topdoc.keys():
        doc_words = clean_punctuation(topdoc[docnum]).split()
        tagged = pos_tag(doc_words)

        chunked = unigram_chunker.parse2(tagged)
        # chunked.pos() yields ((word, tag), phrasetag) items; group the
        # consecutive items that carry the same phrase tag.
        start = 0
        for phrasetag, members in groupby(chunked.pos(), key=itemgetter(1)):
            words = [word for (word, _tag), _phrasetag in members]
            answers.append((' '.join(words), docnum, start, phrasetag, q_id))
            # A running offset gives the final phrase the correct start
            # index and produces nothing at all for an empty document.
            start += len(words)

    return answers
Example #6
0

def rewriteQuestionsDict(qList):
    """Rewrite every question in ``qList`` and return the results as a dict.

    ``qList`` is a list of lists whose items are read, in order, as
    alternating question numbers and question strings.  Each question
    string is passed through ``rewriteQuestion``.

    qList : list of lists of alternating question number / question text
    returns : dict mapping each question number to its rewritten question
    """
    # Flatten the nested lists, then pair each even-indexed item (the
    # number) with the item that follows it (the text).  A trailing
    # unpaired number is dropped.
    flat = [item for entry in qList for item in entry]
    questions_by_number = {}
    for number, text in zip(flat[0::2], flat[1::2]):
        # A repeated number keeps the text that appears last.
        questions_by_number[number] = text

    # Rewrite each stored question once.
    return dict((number, rewriteQuestion(text))
                for number, text in questions_by_number.items())


if __name__ == "__main__":
    # Manual smoke test: load the corpus for question 209, then print the
    # alignment result for a sample question against one hand-built
    # candidate tuple.
    init.get_corpus(qNum=209)
    question = "Who is the inventor of the phonograph?"
    doc = "SJMN91-06010225"
    # NOTE(review): the tuple presumably mirrors a candidate entry, with the
    # document id second and a position third - confirm against the caller.
    candidate = (1, doc, 30, {})
    print(align_question_distance(question, candidate))
Example #7
0

def rewriteQuestionsDict(qList):
    """Rewrite every question in ``qList`` and return the results as a dict.

    ``qList`` is a list of lists whose items are read, in order, as
    alternating question numbers and question strings.  Each question
    string is passed through ``rewriteQuestion``.

    qList : list of lists of alternating question number / question text
    returns : dict mapping each question number to its rewritten question
    """
    # Flatten the nested lists, then pair each even-indexed item (the
    # number) with the item that follows it (the text).  A trailing
    # unpaired number is dropped.
    flat = [item for entry in qList for item in entry]
    questions_by_number = {}
    for number, text in zip(flat[0::2], flat[1::2]):
        # A repeated number keeps the text that appears last.
        questions_by_number[number] = text

    # Rewrite each stored question once.
    return dict((number, rewriteQuestion(text))
                for number, text in questions_by_number.items())


if __name__ == "__main__":
    # Manual smoke test: load the corpus for question 209, then print the
    # alignment result for a sample question against one hand-built
    # candidate tuple.
    init.get_corpus(qNum=209)
    question = "Who is the inventor of the phonograph?"
    doc = "SJMN91-06010225"
    # NOTE(review): the tuple presumably mirrors a candidate entry, with the
    # document id second and a position third - confirm against the caller.
    candidate = (1, doc, 30, {})
    print(align_question_distance(question, candidate))