def constructQueriesSingleTitleCollocations(qtitle, qbody):
    """Build queries from the question's top keywords plus its collocations.

    The question text is ``qtitle + ' ' + qbody`` (the title counted once).
    The top 10 keywords of that text are combined with the collocations
    reported by ``Utils.findCollocations1``, and the combined token list is
    handed to ``Utils.constructQueries`` with a group size of 3.

    :param qtitle: question title text.
    :param qbody: question body text.
    :return: list of whatever ``Utils.constructQueries`` yields for the
        combined token list.
    """
    question = qtitle + ' ' + qbody

    # Collocations are looked up before the keywords, as in the original order.
    collocations = Utils.findCollocations1(question)

    # NOTE(review): sortTokens presumably orders tokens relative to the
    # question text -- confirm against Utils.
    top_keywords = Utils.sortTokens(
        Keywords.keywordsNFromText(question, 10), question)
    combined_tokens = Utils.sortTokens(top_keywords + collocations, question)

    return list(Utils.constructQueries(combined_tokens, 3))
def constructQueriesToGoogle(qtitle, qbody):
    """Construct every query variant for a question and return them de-duplicated.

    Query construction strategies:

    1. Top 10 words by KLD for title + body -> triples.
    2. Top 10 words by KLD for title*2 + body -> triples.
    3. Top 10 by KLD (title + body), plus repeating bigrams glued together
       and used as tokens -> triples.
    4. Top 10 by KLD (title*2 + body), plus repeating bigrams glued together
       and used as tokens -> triples.
    5-8. The same as 1-4 after fixing typos.

    Only 1-4 are implemented here; 5-8 can be obtained by fixing typos in
    the initial corpus and running this function again.

    Use this to build all possible queries for new questions before they
    are googled. Each of the four keyword lists is printed to stdout.

    :param qtitle: question title text.
    :param qbody: question body text.
    :return: set of the queries produced by ``Utils.constructQueries`` for
        all four keyword lists (duplicates collapsed).
    """
    question = qtitle + ' ' + qbody

    # 1. Top 10 keywords, title counted once.
    single_title = Utils.sortTokens(
        Keywords.keywordsNFromText(question, 10), question)
    print('KLD. Single title :: %s\n' % str(single_title))

    # 2. Top 10 keywords, title counted twice. Tokens are still sorted
    #    against the plain title + body text.
    double_title = Utils.sortTokens(
        Keywords.keywordsNFromText(qtitle + ' ' + question, 10), question)
    print('KLD. Double title :: %s\n' % str(double_title))

    # 3 and 4. Add the repeating collocations to each keyword list.
    collocations = Utils.findCollocations1(question)
    single_title_colloc = Utils.sortTokens(
        single_title + collocations, question)
    print('KLD + collocations. Single title :: %s\n' % str(single_title_colloc))
    double_title_colloc = Utils.sortTokens(
        double_title + collocations, question)
    print('KLD + collocations. Double title :: %s\n' % str(double_title_colloc))

    # Build the queries for the four keyword lists in a fixed order. A set
    # is used because the same query should never be googled twice.
    unique_queries = set()
    keyword_lists = (
        single_title,
        double_title,
        single_title_colloc,
        double_title_colloc,
    )
    for tokens in keyword_lists:
        unique_queries.update(Utils.constructQueries(tokens, 3))
    return unique_queries
def constructQueriesKLDDoubleTitle(qtitle, qbody):
    """Build queries from the top 10 keywords with the title counted twice.

    Keywords are extracted from ``qtitle + ' ' + qtitle + ' ' + qbody``,
    while ``Utils.sortTokens`` is given the plain ``qtitle + ' ' + qbody``
    text. The sorted keywords go to ``Utils.constructQueries`` with a group
    size of 3.

    :param qtitle: question title text.
    :param qbody: question body text.
    :return: list of whatever ``Utils.constructQueries`` yields.
    """
    question = qtitle + ' ' + qbody
    # Prepending the title once more makes it appear twice in the text
    # that the keyword extractor sees.
    title_doubled_text = qtitle + ' ' + question

    top_keywords = Keywords.keywordsNFromText(title_doubled_text, 10)
    ordered_keywords = Utils.sortTokens(top_keywords, question)

    return list(Utils.constructQueries(ordered_keywords, 3))
def constructQueriesKLDSingleTitle(qtitle, qbody):
    """Build queries from the top 10 keywords of title + body.

    Keywords are extracted from ``qtitle + ' ' + qbody``, passed through
    ``Utils.sortTokens`` with that same text, and then handed to
    ``Utils.constructQueries`` with a group size of 3.

    :param qtitle: question title text.
    :param qbody: question body text.
    :return: list of whatever ``Utils.constructQueries`` yields.
    """
    question = qtitle + ' ' + qbody

    top_keywords = Keywords.keywordsNFromText(question, 10)
    ordered_keywords = Utils.sortTokens(top_keywords, question)

    return list(Utils.constructQueries(ordered_keywords, 3))