Esempio n. 1
0
def buildContentSummary(categories, categoryData, database):
    """Build the content summary of a database for its leading categories.

    Only the first two entries of ``categories`` are summarised. The summary
    for entry ``i`` is built from the keywords of that entry and of every
    later entry among those first two, so the first category also covers the
    keywords of the second one.

    Args:
        categories: list of category names; each one is looked up in TAXONOMY.
        categoryData: passed through to ``getUniqueDocs`` and
            ``crawler.getContentSummary``.
        database: passed through to ``crawler.getContentSummary``.
    """
    # At most two categories get a summary; an empty list yields no work.
    selected = categories[:2]
    # A category with no TAXONOMY entry contributes no keywords instead of
    # breaking the concatenation below.
    keywords = [TAXONOMY.get(cat) or [] for cat in selected]
    for i, category in enumerate(selected):
        # Flatten the keyword lists of this category and the ones after it.
        keys = [key for group in keywords[i:] for key in group]
        urls = getUniqueDocs(keys, categoryData)
        logger("Building the content summary for " + category +
               ". Total docs to fetch: " + str(len(urls)), highlight=True)
        crawler.getContentSummary(database, category, urls, categoryData)
Esempio n. 2
0
 def _sorted_homologenes(self, homologenes):
     """Return ``homologenes`` ordered by the position of each species in TAXONOMY.

     ``homologenes`` is an iterable of ``(taxid, geneid)`` pairs. Each pair
     is ranked by the index of its tax id among the TAXONOMY entries; a tax
     id that is not found there is ranked by its own value instead. Ties are
     broken by ``taxid`` and then ``geneid``.

     Returns a list of ``(taxid, geneid)`` tuples.
     """
     # Map every known tax id to the position of its species in TAXONOMY.
     rank_by_taxid = {
         TAXONOMY[name]["tax_id"]: position
         for position, name in enumerate(TAXONOMY.keys())
     }
     # Decorate with the rank, sort, then strip the rank again.
     decorated = []
     for taxid, geneid in homologenes:
         decorated.append((rank_by_taxid.get(taxid, taxid), taxid, geneid))
     decorated.sort()
     return [(taxid, geneid) for _, taxid, geneid in decorated]
Esempio n. 3
0
def classifyDb(database, Tc=100, Ts=0.6):
    """Classify a database using coverage and specificity thresholds.

    Starting from "Root", every category that has keywords in TAXONOMY is
    analysed: the query/URL map for that category is built, the match counts
    of each keyword are summed (coverage), and the share of each keyword in
    the category total is computed (specificity). Keywords that meet both
    thresholds are added as categories and analysed in turn.

    Args:
        database: passed through to ``buildQueryUrlMap``.
        Tc: minimum coverage a keyword needs to become a category.
        Ts: minimum specificity a keyword needs to become a category.

    Returns:
        A ``(categories, categoryData)`` tuple: the list of selected category
        names (always starting with "Root") and the merged query/URL maps of
        every analysed category.
    """
    categories, categoryData = ["Root"], {}
    # ``categories`` doubles as the work list: entries appended inside the
    # loop are visited by this same loop later on.
    for cat in categories:
        logger("Analyzing " + cat + " category")
        keywords = TAXONOMY.get(cat)
        if not keywords:
            continue

        filename = cat.lower() + ".txt"
        queryUrlMap = buildQueryUrlMap(database, filename)
        categoryData.update(queryUrlMap)
        # ``values()`` works on both Python 2 and 3, unlike ``itervalues()``.
        keywordCount = {
            k: sum(q["count"] for q in queryUrlMap[k].values())
            for k in keywords
        }

        N = float(sum(keywordCount.values()))
        for k, v in keywordCount.items():
            # With no matches at all the ratio is undefined; treat it as 0
            # rather than dividing by zero.
            specificity = v / N if N else 0.0
            logger("Coverage for {0} : {1}, Specificity: {2}".format(
                k, str(v), str(specificity)))
            if v >= Tc and specificity >= Ts:
                logger(">>>>>> Adding " + k + " to category <<<<<<")
                categories.append(k)
    return (categories, categoryData)
Esempio n. 4
0
 def __init__(self, data_folder):
     """Register every TAXONOMY species and locate the data file.

     ``data_folder`` is stored on the instance and joined with
     ``self.DATAFILE`` to form ``self.datafile``.
     """
     # All species keys from TAXONOMY are included.
     every_species = list(TAXONOMY.keys())
     self.set_species_li(every_species)
     self.data_folder = data_folder
     datafile_path = os.path.join(self.data_folder, self.DATAFILE)
     self.datafile = datafile_path