def test(self):
        """Classify the testing documents and report evaluation metrics.

        Each document from ``self.testing_docs.find()`` that passes
        ``database.doc_has_data`` is classified with ``self.classify`` and
        the result is compared with its ``doc['field']`` label.

        Returns:
            dict: for every class in ``self.classes`` a dict with the keys
            ``precision``, ``recall``, ``f1`` and ``features`` (the value of
            ``len(self.features)``), plus a ``'global_accuracy'`` entry.
            A metric whose denominator is zero is the string ``'nan'``.
        """
        undefined = 'nan'

        def ratio(numerator, denominator):
            # Report an undefined ratio instead of raising ZeroDivisionError
            if not denominator:
                return undefined
            return numerator / float(denominator)

        def shown(value):
            # '%.3f' cannot render the 'nan' placeholder string
            return value if value == undefined else '%.3f' % value

        stats = {}
        for label in self.classes:
            stats[label] = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
        success = 0
        errors = 0

        start = time.time()
        for doc in self.testing_docs.find():
            if not database.doc_has_data(doc):
                continue

            expected = doc['field']
            predicted = self.classify(doc)
            if expected == predicted:
                success += 1
                stats[predicted]['tp'] += 1
                log.debug('Success (%s/%s)', doc['author'], expected)
            else:
                errors += 1

                # The predicted class claimed a document that belongs to
                # another class: false positive for the predicted class.
                stats[predicted]['fp'] += 1

                # The expected class missed one of its own documents:
                # false negative for the expected class.
                stats[expected]['fn'] += 1
                log.debug('Error (%s/%s)', doc['author'], expected)

        log.info("Took %.3f secs to classify", time.time() - start)
        total = success + errors
        info_stats = {}

        for cls, stat in stats.items():
            # Everything that involved neither the label nor the prediction
            # of this class is a true negative for it.
            stat['tn'] = total - stat['tp'] - stat['fp'] - stat['fn']
            log.info('Class %s (tp=%d,fp=%d,fn=%d,tn=%d)', cls, stat['tp'],
                     stat['fp'], stat['fn'], stat['tn'])

            precision = ratio(stat['tp'], stat['tp'] + stat['fp'])
            recall = ratio(stat['tp'], stat['tp'] + stat['fn'])
            if undefined in (precision, recall) or not (precision + recall):
                f1 = undefined
            else:
                f1 = 2.0 * precision * recall / (precision + recall)

            log.debug('\tPrecision=%s, Recall=%s, F1=%s', shown(precision),
                      shown(recall), shown(f1))

            info_stats[cls] = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'features': len(self.features),
            }

        global_accuracy = ratio(success, total)
        if global_accuracy == undefined:
            log.warning('No documents were classified, accuracy is undefined')
        else:
            log.info('Global accuracy: %.2f %%', global_accuracy * 100.0)
        info_stats['global_accuracy'] = global_accuracy

        return info_stats
    def test(self):
        """Classify the testing documents and report evaluation metrics.

        Each document from ``self.testing_docs.find()`` that passes
        ``database.doc_has_data`` is classified with ``self.classify`` and
        the result is compared with its ``doc['field']`` label.

        Returns:
            dict: for every class in ``self.classes`` a dict with the keys
            ``precision``, ``recall``, ``f1`` and ``features`` (the value of
            ``len(self.features)``), plus a ``'global_accuracy'`` entry.
            A metric whose denominator is zero is the string ``'nan'``.
        """
        undefined = 'nan'

        def ratio(numerator, denominator):
            # Report an undefined ratio instead of raising ZeroDivisionError
            if not denominator:
                return undefined
            return numerator / float(denominator)

        def shown(value):
            # '%.3f' cannot render the 'nan' placeholder string
            return value if value == undefined else '%.3f' % value

        stats = {}
        for label in self.classes:
            stats[label] = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
        success = 0
        errors = 0

        start = time.time()
        for doc in self.testing_docs.find():
            if not database.doc_has_data(doc):
                continue

            expected = doc['field']
            predicted = self.classify(doc)
            if expected == predicted:
                success += 1
                stats[predicted]['tp'] += 1
                log.debug('Success (%s/%s)', doc['author'], expected)
            else:
                errors += 1

                # The predicted class claimed a document that belongs to
                # another class: false positive for the predicted class.
                stats[predicted]['fp'] += 1

                # The expected class missed one of its own documents:
                # false negative for the expected class.
                stats[expected]['fn'] += 1
                log.debug('Error (%s/%s)', doc['author'], expected)

        log.info("Took %.3f secs to classify", time.time() - start)
        total = success + errors
        info_stats = {}

        for cls, stat in stats.items():
            # Everything that involved neither the label nor the prediction
            # of this class is a true negative for it.
            stat['tn'] = total - stat['tp'] - stat['fp'] - stat['fn']
            log.info('Class %s (tp=%d,fp=%d,fn=%d,tn=%d)', cls, stat['tp'],
                     stat['fp'], stat['fn'], stat['tn'])

            precision = ratio(stat['tp'], stat['tp'] + stat['fp'])
            recall = ratio(stat['tp'], stat['tp'] + stat['fn'])
            if undefined in (precision, recall) or not (precision + recall):
                f1 = undefined
            else:
                f1 = 2.0 * precision * recall / (precision + recall)

            log.debug('\tPrecision=%s, Recall=%s, F1=%s', shown(precision),
                      shown(recall), shown(f1))

            info_stats[cls] = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'features': len(self.features),
            }

        global_accuracy = ratio(success, total)
        if global_accuracy == undefined:
            log.warning('No documents were classified, accuracy is undefined')
        else:
            log.info('Global accuracy: %.2f %%', global_accuracy * 100.0)
        info_stats['global_accuracy'] = global_accuracy

        return info_stats
Beispiel #3
0
    def train(self):
        """Fit the naive-Bayes model on the training documents.

        For every class in ``self.classes`` the log prior is computed from
        the share of training documents carrying that class, and for every
        term in ``self.features`` the add-one smoothed log conditional
        probability of the term given the class.  The results are stored in
        ``self.prior`` (class -> log prior) and ``self.cond``
        (term -> class -> log conditional).
        """
        corpus = self.training_docs
        total_docs = corpus.count()
        log_priors = {}
        log_conds = {}
        log_cache = {}

        log.debug('starting training on %d documents..', total_docs)
        began = time.time()

        for label in self.classes:
            # Prior: maximum likelihood estimate from the class share
            class_docs = corpus.find({'field': label}).count()
            log_priors[label] = numpy.log(class_docs / float(total_docs))

            # Gather the tokenised text of the whole class so the vocabulary
            # can be counted in a single pass
            chunks = []
            for doc in corpus.find({'field': label}):
                if database.doc_has_data(doc):
                    line = ' '.join(tokenize(doc['data'])) + ' '
                    chunks.append(line.encode('utf-8', 'replace'))

            # Occurrences of each feature term inside the joined text
            occurrences = dict.fromkeys(self.features, 0)
            for token in ''.join(chunks).split():
                if token in self.features:
                    occurrences[token] += 1

            # Shared denominator of the smoothed estimator
            denominator = float(sum(occurrences.values()) + len(occurrences))

            for term in self.features:
                smoothed = (occurrences[term] + 1) / denominator
                # Many terms share a count, so each logarithm is taken once
                if smoothed not in log_cache:
                    log_cache[smoothed] = numpy.log(smoothed)
                log_conds.setdefault(term, {})[label] = log_cache[smoothed]

        log.info('finished training (took %.3f secs)',
                 time.time() - began)

        self.prior = log_priors
        self.cond = log_conds
    def train(self):
        """Fit the naive-Bayes model on the training documents.

        For every class in ``self.classes`` the log prior is computed from
        the share of training documents carrying that class, and for every
        term in ``self.features`` the add-one smoothed log conditional
        probability of the term given the class.  The results are stored in
        ``self.prior`` (class -> log prior) and ``self.cond``
        (term -> class -> log conditional).
        """
        corpus = self.training_docs
        total_docs = corpus.count()
        log_priors = {}
        log_conds = {}
        log_cache = {}

        log.debug('starting training on %d documents..', total_docs)
        began = time.time()

        for label in self.classes:
            # Prior: maximum likelihood estimate from the class share
            class_docs = corpus.find({'field': label}).count()
            log_priors[label] = numpy.log(class_docs / float(total_docs))

            # Gather the tokenised text of the whole class so the vocabulary
            # can be counted in a single pass
            chunks = []
            for doc in corpus.find({'field': label}):
                if database.doc_has_data(doc):
                    line = ' '.join(tokenize(doc['data'])) + ' '
                    chunks.append(line.encode('utf-8', 'replace'))

            # Occurrences of each feature term inside the joined text
            occurrences = dict.fromkeys(self.features, 0)
            for token in ''.join(chunks).split():
                if token in self.features:
                    occurrences[token] += 1

            # Shared denominator of the smoothed estimator
            denominator = float(sum(occurrences.values()) + len(occurrences))

            for term in self.features:
                smoothed = (occurrences[term] + 1) / denominator
                # Many terms share a count, so each logarithm is taken once
                if smoothed not in log_cache:
                    log_cache[smoothed] = numpy.log(smoothed)
                log_conds.setdefault(term, {})[label] = log_cache[smoothed]

        log.info('finished training (took %.3f secs)', time.time() - began)

        self.prior = log_priors
        self.cond = log_conds
    def select_features(self):
        """Select the most frequent terms of the training documents.

        Counts every term yielded by ``bag.bag_of_words(doc['data'])`` over
        the training documents that pass ``database.doc_has_data`` and keeps
        the terms whose count reaches
        ``(highest count + mean count) / self.size_divider``.

        Returns:
            dict: selected term -> number of times it was counted; empty
            when no term was collected at all.
        """
        start_time = time.time()
        log.info('selecting features..')

        counts = {}
        for doc in self.training_docs.find():
            if not database.doc_has_data(doc):
                continue

            for term in bag.bag_of_words(doc['data']):
                # Start from zero so the first sighting counts once; seeding
                # with 1 and then incrementing inflated every count by one.
                counts[term] = counts.get(term, 0) + 1

        if counts:
            values = list(counts.values())
            cut = (numpy.max(values) + numpy.mean(values)) / self.size_divider
            higher = dict((term, count) for term, count in counts.items()
                          if count >= cut)
        else:
            # numpy.max raises ValueError on an empty sequence
            higher = {}

        log.info('selected %d terms (took %.3f secs)', len(higher),
                 time.time() - start_time)
        return higher
    def select_features(self, size_divider=8.0):
        """Select the most frequent terms of the training documents.

        Counts every term yielded by ``bag.bag_of_words(doc['data'])`` over
        the training documents that pass ``database.doc_has_data`` and keeps
        the terms whose count reaches
        ``(highest count + mean count) / size_divider``.

        Args:
            size_divider: divisor applied to the sum of the highest and the
                mean count to obtain the cut-off; a larger value keeps more
                terms.  Defaults to the previously hard-coded ``8.0``.

        Returns:
            dict: selected term -> number of times it was counted; empty
            when no term was collected at all.
        """
        start_time = time.time()
        log.info('selecting features..')

        counts = {}
        for doc in self.training_docs.find():
            if not database.doc_has_data(doc):
                continue

            for term in bag.bag_of_words(doc['data']):
                # Start from zero so the first sighting counts once; seeding
                # with 1 and then incrementing inflated every count by one.
                counts[term] = counts.get(term, 0) + 1

        if counts:
            values = list(counts.values())
            cut = (numpy.max(values) + numpy.mean(values)) / size_divider
            higher = dict((term, count) for term, count in counts.items()
                          if count >= cut)
        else:
            # numpy.max raises ValueError on an empty sequence
            higher = {}

        log.info('selected %d terms (took %.3f secs)', len(higher),
                 time.time() - start_time)
        return higher