Ejemplo n.º 1
0
def constructQueriesSingleTitleCollocations(qtitle, qbody):
    """Build queries from single-title keywords merged with collocations.

    The title and body are joined with a single space. Collocations are
    taken from ``Utils.findCollocations1`` and the top 10 keywords from
    ``Keywords.keywordsNFromText``; both are passed through
    ``Utils.sortTokens`` against the joined text before queries are built.

    Args:
        qtitle: Question title text.
        qbody: Question body text.

    Returns:
        A list holding everything produced by
        ``Utils.constructQueries(tokens, 3)``.
    """
    question = qtitle + ' ' + qbody

    # Collocations are looked up first, then the keyword list is built.
    collocations = Utils.findCollocations1(question)
    top_keywords = Keywords.keywordsNFromText(question, 10)
    ordered_keywords = Utils.sortTokens(top_keywords, question)

    # Keywords and collocations are sorted again as one combined token list.
    tokens = Utils.sortTokens(ordered_keywords + collocations, question)
    return list(Utils.constructQueries(tokens, 3))
Ejemplo n.º 2
0
def constructQueriesToGoogle(qtitle, qbody):
    """Build every query variant for a question and return them de-duplicated.

    Query-construction strategies considered for a question:

    1. Top 10 words by KLD for title + body -> triples.
    2. Top 10 words by KLD for title*2 + body -> triples.
    3. Same as 1, plus repeating bigrams glued together and used as
       tokens -> triples.
    4. Same as 2, plus repeating bigrams glued together and used as
       tokens -> triples.
    5-8. Strategies 1-4 applied after fixing typos.

    Only strategies 1-4 are implemented here; 5-8 can be obtained by fixing
    typos in the initial corpus and running this function again. Use it when
    all possible queries for a new question should be constructed and
    googled.

    Each of the four keyword lists is printed to stdout as it is computed.

    Args:
        qtitle: Question title text.
        qbody: Question body text.

    Returns:
        A set with the queries of all four strategies, so that the same
        query is never googled twice.
    """
    question = qtitle + ' ' + qbody

    # Strategy 1: top 10 keywords of title + body.
    single = Utils.sortTokens(Keywords.keywordsNFromText(question, 10), question)
    print('KLD. Single title :: %s\n' % str(single))

    # Strategy 2: top 10 keywords with the title counted twice. The tokens
    # are still sorted against the plain title + body text.
    title_doubled = qtitle + ' ' + question
    double = Utils.sortTokens(Keywords.keywordsNFromText(title_doubled, 10), question)
    print('KLD. Double title :: %s\n' % str(double))

    # Strategies 3 and 4: extend both keyword lists with the collocations.
    collocations = Utils.findCollocations1(question)

    single_with_colloc = Utils.sortTokens(single + collocations, question)
    print('KLD + collocations. Single title :: %s\n' % str(single_with_colloc))

    double_with_colloc = Utils.sortTokens(double + collocations, question)
    print('KLD + collocations. Double title :: %s\n' % str(double_with_colloc))

    # Build the queries for all four keyword lists, in strategy order.
    collected = []
    for keywords in (single, double, single_with_colloc, double_with_colloc):
        collected.extend(Utils.constructQueries(keywords, 3))

    # Collapse duplicates so that no query gets googled twice.
    return set(collected)
Ejemplo n.º 3
0
def constructQueriesKLDDoubleTitle(qtitle, qbody):
    """Build queries from the top 10 keywords with the title counted twice.

    Keywords are extracted from ``qtitle + ' ' + qtitle + ' ' + qbody`` but
    sorted with ``Utils.sortTokens`` against the plain title + body text.

    Args:
        qtitle: Question title text.
        qbody: Question body text.

    Returns:
        A list holding everything produced by
        ``Utils.constructQueries(keywords, 3)``.
    """
    question = qtitle + ' ' + qbody

    # Prepending the title once more makes it appear twice in the text
    # handed to the keyword extractor.
    title_doubled = qtitle + ' ' + question
    top_keywords = Keywords.keywordsNFromText(title_doubled, 10)
    keywords = Utils.sortTokens(top_keywords, question)
    return list(Utils.constructQueries(keywords, 3))
Ejemplo n.º 4
0
def constructQueriesKLDSingleTitle(qtitle, qbody):
    """Build queries from the top 10 keywords of title + body.

    Args:
        qtitle: Question title text.
        qbody: Question body text.

    Returns:
        A list holding everything produced by
        ``Utils.constructQueries(keywords, 3)``.
    """
    question = qtitle + ' ' + qbody
    top_keywords = Keywords.keywordsNFromText(question, 10)
    keywords = Utils.sortTokens(top_keywords, question)
    return list(Utils.constructQueries(keywords, 3))