Ejemplo n.º 1
0
def constructQueriesSingleTitleCollocations(qtitle, qbody):
    """
    Build the list of queries for a question from its keywords plus collocations.

    The title and body are joined with a single space; collocations found by
    Utils.findCollocations1 are appended to the 10 keywords returned by
    Keywords.keywordsNFromText (after sorting them with Utils.sortTokens),
    the combined tokens are sorted again, and the result is handed to
    Utils.constructQueries with an argument of 3.

    :param qtitle: question title text
    :param qbody: question body text
    :return: list materialised from Utils.constructQueries
    """
    questionText = qtitle + ' ' + qbody

    # Collocations are looked up first, before keyword extraction.
    collocations = Utils.findCollocations1(questionText)

    topKeywords = Keywords.keywordsNFromText(questionText, 10)
    sortedKeywords = Utils.sortTokens(topKeywords, questionText)

    # Merge keywords with collocations and re-sort against the same text.
    mergedTokens = Utils.sortTokens(sortedKeywords + collocations, questionText)

    return list(Utils.constructQueries(mergedTokens, 3))
Ejemplo n.º 2
0
def constructQueriesToGoogle(qtitle, qbody):
    """
    Build the de-duplicated set of queries to be googled for one question.

    Query-construction strategies considered:

    1. Top 10 words by KLD for qtitle + qbody -> construct triples
    2. Top 10 words by KLD for qtitle*2 + qbody -> construct triples
    3. Same as 1, plus repeating bigrams glued together and used as tokens
    4. Same as 2, plus repeating bigrams glued together and used as tokens
    5-8. Fix typos first, then apply 1-4 respectively.

    Only strategies 1-4 are implemented here; 5-8 can be obtained by fixing
    typos in the initial corpus and running this function again.
    Each intermediate keyword list is printed to stdout.

    :param qtitle: question title text
    :param qbody: question body text
    :return: set with the queries produced by all four strategies
    """
    questionText = qtitle + ' ' + qbody

    def inQuestionOrder(tokens):
        # Every keyword list is sorted against the single-title question text.
        return Utils.sortTokens(tokens, questionText)

    # Strategy 1: keywords from title + body.
    singleKeywords = inQuestionOrder(Keywords.keywordsNFromText(questionText, 10))
    print('KLD. Single title :: %s\n' % str(singleKeywords))

    # Strategy 2: keywords from the text with the title repeated in front.
    doubledTitleText = qtitle + ' ' + questionText
    doubleKeywords = inQuestionOrder(Keywords.keywordsNFromText(doubledTitleText, 10))
    print('KLD. Double title :: %s\n' % str(doubleKeywords))

    # Strategies 3 and 4: extend both keyword lists with repeating collocations.
    collocations = Utils.findCollocations1(questionText)

    singleWithCollocations = inQuestionOrder(singleKeywords + collocations)
    print('KLD + collocations. Single title :: %s\n' % str(singleWithCollocations))

    doubleWithCollocations = inQuestionOrder(doubleKeywords + collocations)
    print('KLD + collocations. Double title :: %s\n' % str(doubleWithCollocations))

    # Collect queries from all four keyword lists; the set drops duplicates
    # so the same query is never googled twice.
    uniqueQueries = set()
    keywordLists = (singleKeywords, doubleKeywords,
                    singleWithCollocations, doubleWithCollocations)
    for keywordList in keywordLists:
        uniqueQueries.update(Utils.constructQueries(keywordList, 3))
    return uniqueQueries
Ejemplo n.º 3
0
def rankQueries(pathToDB, pathToIntersectionsFile, outputFile, withCollocations):
    """
    Re-rank query words for each question and write a report with recall/precision.

    The intersections file is read line by line; every non-blank line must be a
    JSON object with the keys 'qid', 'qtitle', 'qbody', 'gtquery' and 'probes'.
    For each question the probes are scored with equal weights, the query words
    are re-ranked (optionally taking collocations into account), and a new
    query is formed from the five top-ranked words. That query is scored,
    added to the probes, and the top 20 scored probes are written to the
    report together with the re-ranked words and recall/precision at M=3
    against the ground-truth query.

    Side effects:
      * writes the report (text with <br/> and <a> markup) to ``outputFile``;
      * writes one newly created query per line to 'newlyCreatedQueries.txt'
        in the current working directory.

    :param pathToDB: passed to SQLWizard to open the database
    :param pathToIntersectionsFile: path to the JSON-lines input file
    :param outputFile: path of the report file to (over)write
    :param withCollocations: when true, collocations from the question text are
        passed to the word re-ranking; otherwise an empty list is passed
    :return: None
    """
    sqlWiz = SQLWizard(pathToDB)
    equalWeights = (0.25, 0.25, 0.25, 0.25)

    M = 3
    recallAtMAccum = 0
    precisionAtMAccum = 0
    counter = 0

    # Context managers guarantee that all three files are closed even when
    # processing a question raises. Files are opened in the same order as
    # before: report, query log, then the input.
    with open(outputFile, 'w') as output, \
            open('newlyCreatedQueries.txt', 'w') as newlyCreatedQueries, \
            open(pathToIntersectionsFile) as intersections:

        for line in intersections:
            # A blank line (e.g. a trailing newline) is not a question; skip it
            # rather than letting json.loads fail on it.
            if not line.strip():
                continue

            question = json.loads(line)
            qid = question['qid']
            qtitle = question['qtitle']
            qbody = question['qbody']
            questionObj = QAS.QuestionSQL(qid, qtitle, qbody, None, None)
            rawQuestion = qtitle + ' ' + qbody

            # A missing/empty ground-truth query is treated as the empty string.
            gtquery = question['gtquery'] or ''

            output.write('%s<br/>\n' % rawQuestion)

            partialIntersections = question['probes']
            scoredProbes = Keywords.scoreQueriesWithWeight(partialIntersections, equalWeights)

            if withCollocations:
                collocations = Utils.findCollocations1(rawQuestion)
            else:
                collocations = []

            scoredWords = Utils.rerankQueryWordsWithScores(scoredProbes, collocations)

            # question top words
            setTop20QWords = set(Keywords.keywordsNFromText(rawQuestion, 20))

            # answers top words
            allAnswers = sqlWiz.getAnswersForQID(questionObj.qid)
            setTop20AWords = set(Keywords.keywordsFromAnswers(allAnswers, 20))

            # Form the new query from the five best re-ranked words and score it
            # the same way the existing probes were scored.
            newlyCreatedQuery = ' '.join([w[0] for w in scoredWords[:5]])
            newlyCreatedQueryScores = scoreQueryWithWordOverlap(
                newlyCreatedQuery, sqlWiz, questionObj, setTop20QWords, setTop20AWords)
            newlyCreatedQueryScoresJSON = {"query": newlyCreatedQuery,
                                           "totWQuest": newlyCreatedQueryScores[0],
                                           "aveWQuest": newlyCreatedQueryScores[1],
                                           "totWAns": newlyCreatedQueryScores[2],
                                           "aveWAns": newlyCreatedQueryScores[3]}

            # Re-score the probes with the new query included so it can be
            # compared against the existing ones.
            partialIntersections.append(newlyCreatedQueryScoresJSON)
            scoredProbes = Keywords.scoreQueriesWithWeight(partialIntersections, equalWeights)
            scoredNewlyCreatedQuery = Keywords.scoreQueriesWithWeight(
                [newlyCreatedQueryScoresJSON], equalWeights)

            for s in scoredProbes[:20]:
                queryStr = s[0]
                searchRef = 'http://www.google.com/search?hl=en&q=' + queryStr
                # Mark the newly created query with an arrow in the listing.
                highlight = '->' if queryStr == newlyCreatedQuery else ''
                output.write('%s <a href=\"%s\">%s</a> :: %s <br/>\n'
                             % (highlight, searchRef, queryStr, str(s[1])))

            searchRef = 'http://www.google.com/search?hl=en&q=' + newlyCreatedQuery
            output.write('Newly created query :: <a href=\"%s\">%s</a> :: %s <br/>\n'
                         % (searchRef, newlyCreatedQuery, str(scoredNewlyCreatedQuery[0][1])))

            output.write('<br/>\n<b>%s</b><br/>\n' % 'Reranked words:')
            for w in scoredWords:
                output.write('%s -- %f<br/>\n' % (w[0], w[1]))

            rankedWordsOnly = [w[0] for w in scoredWords]
            recallAtM = Utils.recallAtM(gtquery, rankedWordsOnly, M)
            precisionAtM = Utils.precisionAtM(gtquery, rankedWordsOnly, M)

            recallAtMAccum += recallAtM
            precisionAtMAccum += precisionAtM
            counter += 1

            output.write('<br/>\nRecall at %d :: %f<br/>\n' % (M, recallAtM))
            output.write('Precision at %d :: %f<br/>\n' % (M, precisionAtM))
            output.write('%s<br/>\n' % '\n****\n')

            newlyCreatedQueries.write('%s\n' % newlyCreatedQuery)

        # With no questions processed there is nothing to average; report 0.0
        # instead of dividing by zero.
        if counter:
            meanRecall = recallAtMAccum / counter
            meanPrecision = precisionAtMAccum / counter
        else:
            meanRecall = meanPrecision = 0.0

        output.write('%s<br/>Total:<br/>\n' % '\n****\n')
        output.write('Recall at %d :: %f<br/>\n' % (M, meanRecall))
        output.write('Precision at %d :: %f<br/>\n' % (M, meanPrecision))