def buildContentSummary(categories, categoryData, database):
    """Build the content summary for a database.

    For each of the first one or two classified categories (the most
    general category and, when the classification went deeper, the next
    one), gather the taxonomy keywords of that category and every
    category after it, fetch the unique document URLs those keywords
    matched, and hand them to the crawler to build the summary.

    Args:
        categories: ordered list of accepted category names ("Root" first).
        categoryData: query -> url map accumulated during classification.
        database: identifier of the database being summarized.
    """
    # Summarize two levels when classification went below "Root",
    # otherwise just "Root" itself.
    iters = 2 if len(categories) > 1 else 1
    keywords = [TAXONOMY.get(cat) for cat in categories[:iters]]
    for i in range(iters):
        # Flatten the keyword lists of this category and all later ones.
        # A nested comprehension is O(n) where reduce(list.__add__, ...)
        # was O(n^2); it also tolerates a missing taxonomy entry (None)
        # and an empty tail, and works on Python 3 where reduce is no
        # longer a builtin.
        keys = [kw for kws in keywords[i:] if kws for kw in kws]
        urls = getUniqueDocs(keys, categoryData)
        logger("Building the content summary for " + categories[i] +
               ". Total docs to fetch: " + str(len(urls)), highlight=True)
        crawler.getContentSummary(database, categories[i], urls, categoryData)
def classifyDb(database, Tc=100, Ts=0.6):
    """Classify a database against the taxonomy.

    Starting from "Root", probe the database with each category's
    keyword queries. A child category is accepted when its keyword
    coverage reaches at least Tc hits and its specificity (its share of
    all hits at this level) is at least Ts. Accepted categories are
    appended to the list while it is being iterated, so the walk
    descends the taxonomy level by level.

    Args:
        database: identifier of the database to classify.
        Tc: coverage threshold — minimum total hit count for a keyword.
        Ts: specificity threshold — minimum hit share (0..1).

    Returns:
        Tuple (categories, categoryData): the accepted category names and
        the accumulated query -> url map for every probed category.
    """
    categories, categoryData = ["Root"], {}
    # NOTE: appending to `categories` inside this loop is deliberate —
    # list iteration picks up items added during the walk.
    for cat in categories:
        logger("Analyzing " + cat + " category")
        filename = cat.lower() + ".txt"
        keywords = TAXONOMY.get(cat)
        if keywords:
            queryUrlMap = buildQueryUrlMap(database, filename)
            categoryData.update(queryUrlMap)
            # Total hit count per keyword across all of its query URLs.
            # (.values() instead of the Python-2-only .itervalues(), so
            # the code also runs on Python 3.)
            keywordCount = {k: sum(q["count"] for q in queryUrlMap[k].values())
                            for k in keywords}
            N = float(sum(keywordCount.values()))
            # No hits at all at this level: specificity is undefined and
            # nothing can qualify — skip instead of dividing by zero.
            if N == 0:
                continue
            for k, v in keywordCount.items():
                logger("Coverage for {0} : {1}, Specificity: {2}".format(
                    k, str(v), str(v / N)))
                if v >= Tc and v / N >= Ts:
                    logger(">>>>>> Adding " + k + " to category <<<<<<")
                    categories.append(k)
    return (categories, categoryData)