Example no. 1
from functools import reduce  # needed on Python 3, where reduce is no longer a builtin

def buildContentSummary(categories, categoryData, database):
    # Builds the content summary for a database: one summary for Root,
    # plus one for the first accepted subcategory, if any.
    iters = 2 if len(categories) > 1 else 1
    # Default to [] so reduce never concatenates None for an unknown category.
    keywords = [TAXONOMY.get(cat, []) for cat in categories[:iters]]
    for i in range(iters):
        # Merge the keyword lists of this category and the ones after it.
        keys = reduce(list.__add__, keywords[i:])
        urls = getUniqueDocs(keys, categoryData)
        logger("Building the content summary for " + categories[i] +
               ". Total docs to fetch: " + str(len(urls)), highlight=True)
        crawler.getContentSummary(database, categories[i], urls, categoryData)
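
Example no. 1 assumes several helpers from the surrounding project (TAXONOMY, getUniqueDocs, logger, crawler) that are not shown in the excerpt. A minimal sketch of getUniqueDocs, assuming categoryData maps each keyword to a {url: {"count": n, ...}} dict as Example no. 2 builds it:

def getUniqueDocs(keys, categoryData):
    # Hypothetical sketch: collect the distinct URLs matched by any of
    # the given keywords; categoryData[k] is assumed to be keyed by URL.
    urls = set()
    for k in keys:
        urls.update(categoryData.get(k, {}))
    return list(urls)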
Example no. 2
def classifyDb(database, Tc=100, Ts=0.6):
    # Classifies a database against the taxonomy: a subcategory is
    # accepted when its coverage (total matches) is at least Tc and
    # its specificity (share of all matches) is at least Ts.
    categories, categoryData = ["Root"], {}
    # Note: categories grows while we iterate over it, so accepted
    # subcategories are analyzed in turn, walking down the taxonomy.
    for cat in categories:
        logger("Analyzing " + cat + " category")
        filename = cat.lower() + ".txt"
        keywords = TAXONOMY.get(cat)
        if keywords:
            queryUrlMap = buildQueryUrlMap(database, filename)
            categoryData.update(queryUrlMap)
            # Coverage per keyword: total hit count over all matched URLs.
            keywordCount = {k: sum(q["count"] for q in
                                   queryUrlMap[k].values()) for k in keywords}

            N = float(sum(keywordCount.values()))
            if N == 0:
                continue  # no hits for this category; skip the ratio test
            for k, v in keywordCount.items():
                logger("Coverage for {0}: {1}, Specificity: {2}".format(k, v, v / N))
                if v >= Tc and v / N >= Ts:
                    logger(">>>>>> Adding " + k + " to category <<<<<<")
                    categories.append(k)
    return (categories, categoryData)
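
Taken together, the two examples form a classify-then-summarize pipeline. A hedged usage sketch (the database value is a placeholder for whatever handle buildQueryUrlMap and crawler actually expect):

# Hypothetical driver: classify first, then summarize the result.
database = "example-db"  # placeholder for the project's database handle
categories, categoryData = classifyDb(database, Tc=100, Ts=0.6)
buildContentSummary(categories, categoryData, database)

With the defaults, a keyword is accepted only if it yields at least 100 matches and accounts for at least 60% of all matches for its category (e.g. 120 hits out of 150 total gives specificity 0.8 and passes both tests).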