def chi_sqr():
    """ chi-square statistic (CHI)
    A - ndocs where t and c co-occur
    B - ndocs where t and not c occurs
    C - ndocs where c and not t occurs
    D - ndocs where neither c or t occurs
    N - total of docs
    """
    def chisqr(A, B, C, D, N):
        den = (A + C) * (B + D) * (A + B) * (C + D)
        if den == 0:
            return -10**5
        return (N * (A * D - C * B)**2) / den

    # Find one with data
    doc = db.theses.find_one({'data': {'$exists': True}})
    bag = bag_of_words(doc['data'])
    field = doc['field']

    print 'Computing chi^2 for category', field
    # Compute I(t, c) for each bag term
    for term in bag:
        A = 0
        B = 0
        C = 0
        D = 0
        N = 0

        for doc in db.theses.find():
            if not has_data(doc):
                continue

            _bag = bag_of_words(doc['data'])

            if doc['field'] == field and term in _bag:
                A += 1
            elif doc['field'] != field and term in _bag:
                B += 1
            elif doc['field'] == field and term not in _bag:
                C += 1
            else:
                D += 1

            N += 1
        print 'Term: %s (A=%d, B=%d, C=%d, D=%d, N=%d), chi^2=%.4f' % (
            term, A, B, C, D, N, chisqr(A, B, C, D, N))
def chi_sqr():
    """ chi-square statistic (CHI)
    A - ndocs where t and c co-occur
    B - ndocs where t and not c occurs
    C - ndocs where c and not t occurs
    D - ndocs where neither c or t occurs
    N - total of docs
    """

    def chisqr(A, B, C, D, N):
        den = (A + C) * (B + D) * (A + B) * (C + D)
        if den == 0:
            return -10**5
        return (N * (A * D - C * B)**2) / den

    # Find one with data
    doc = db.theses.find_one({'data': { '$exists': True }})
    bag = bag_of_words(doc['data'])
    field = doc['field']

    print 'Computing chi^2 for category', field
    # Compute I(t, c) for each bag term
    for term in bag:
        A = 0
        B = 0
        C = 0
        D = 0
        N = 0

        for doc in db.theses.find():
            if not has_data(doc):
                continue

            _bag = bag_of_words(doc['data'])

            if doc['field'] == field and term in _bag:
                A += 1
            elif doc['field'] != field and term in _bag:
                B += 1
            elif doc['field'] == field and term not in _bag:
                C += 1
            else:
                D += 1

            N += 1
        print 'Term: %s (A=%d, B=%d, C=%d, D=%d, N=%d), chi^2=%.4f' % (term, A, B, C, D, N, chisqr(A, B, C, D, N))
def mi():
    """ Mutual Information for term t and category c

    A - ndocs where t and c co-occur
    B - ndocs where t and not c occurs
    C - ndocs where c and not t occurs
    N - total of docs
    """
    def I(A, B, C, N):
        return numpy.log(A * N / float((A + C) * (A + B)))

    # Find one with data
    doc = db.theses.find_one({'data': {'$exists': True}})
    bag = bag_of_words(doc['data'])
    field = doc['field']

    print 'Computing MI for category', field
    # Compute I(t, c) for each bag term
    for term in bag:
        A = 0
        B = 0
        C = 0
        N = 0

        for doc in db.theses.find():
            if not has_data(doc):
                continue

            _bag = bag_of_words(doc['data'])

            if doc['field'] == field and term in _bag:
                A += 1
            elif doc['field'] != field and term in _bag:
                B += 1
            elif doc['field'] == field and term not in _bag:
                C += 1

            N += 1
        print 'Term: %s (A=%d, B=%d, C=%d, N=%d), MI=%.4f' % (term, A, B, C, N,
                                                              I(A, B, C, N))
def mi():
    """ Mutual Information for term t and category c

    A - ndocs where t and c co-occur
    B - ndocs where t and not c occurs
    C - ndocs where c and not t occurs
    N - total of docs
    """

    def I(A, B, C, N):
       return numpy.log(A * N / float((A + C) * (A + B)))

    # Find one with data
    doc = db.theses.find_one({'data': { '$exists': True }})
    bag = bag_of_words(doc['data'])
    field = doc['field']

    print 'Computing MI for category', field
    # Compute I(t, c) for each bag term
    for term in bag:
        A = 0
        B = 0
        C = 0
        N = 0

        for doc in db.theses.find():
            if not has_data(doc):
                continue

            _bag = bag_of_words(doc['data'])

            if doc['field'] == field and term in _bag:
                A += 1
            elif doc['field'] != field and term in _bag:
                B += 1
            elif doc['field'] == field and term not in _bag:
                C += 1

            N += 1
        print 'Term: %s (A=%d, B=%d, C=%d, N=%d), MI=%.4f' % (term, A, B, C, N, I(A, B, C, N))
def df():
    """ Document frequency thresholding (DF) """
    account = {}
    freqs = {}

    for t in db.theses.find():
        if 'data' in t and len(t['data']) > 10:
            if t['field'] not in account:
                account[t['field']] = 0
            account[t['field']] += 1
    print 'total:', account

    for field in account:
        print 'Creating frequency map for', field
        f = {}
        for t in db.theses.find({'field': field}):
            if 'data' not in t or len(t['data']) == 0:
                continue
            bag = bag_of_words(t['data'])
            for item in bag:
                if item not in f:
                    f[item] = 1
                else:
                    f[item] += 1
        freqs[field] = f

    print 'Found frequencies, now filtering...'

    for name, field in freqs.items():
        fcut = (0.3 * numpy.max(field.values()) +
                0.7 * numpy.mean(field.values())) / 2.0
        freqs[name] = dict((k, v) for k, v in field.iteritems() if v > fcut)

        max = numpy.max(freqs[name].values())
        min = numpy.min(freqs[name].values())
        print '\n%s (cut=%d, min=%d, max=%d)' % (name, fcut, min, max)
        print '\tfeature dimension=%d' % len(freqs[name])
        print '\tMost frequent:', freqs[name].keys()[0:10]

        db.features.update({"field": name}, {
            "field": name,
            "features": freqs[name]
        },
                           upsert=True)
    def select_features(self):
        """ Select most frequent terms.

        Counts how often each term appears across all training docs with
        data, then keeps terms whose count reaches a cutoff derived from
        the max and mean counts scaled by self.size_divider.

        Returns a dict mapping term -> count for the selected terms.
        """
        start_time = time.time()
        log.info('selecting features..')

        f = {}
        for doc in self.training_docs.find():
            if not database.doc_has_data(doc):
                continue

            for term in bag.bag_of_words(doc['data']):
                # get()+1 counts the first occurrence as 1; the previous
                # setdefault(term, 1) followed by += 1 started every
                # term at 2 (off-by-one on all frequencies).
                f[term] = f.get(term, 0) + 1

        cut = (numpy.max(f.values()) + numpy.mean(f.values()))/self.size_divider
        higher = dict(filter(lambda n: n[1] >= cut, f.iteritems()))
        log.info('selected %d terms (took %.3f secs)', len(higher), time.time() - start_time)
        return higher
    def select_features(self):
        """ Select most frequent terms.

        Builds a term-count map over all training docs with data and
        keeps the terms whose count reaches (max + mean) / 8.

        Returns a dict mapping term -> count for the selected terms.
        """
        start_time = time.time()
        log.info('selecting features..')

        f = {}
        for doc in self.training_docs.find():
            if not database.doc_has_data(doc):
                continue

            for term in bag.bag_of_words(doc['data']):
                # Fix off-by-one: setdefault(term, 1) followed by += 1
                # made every first occurrence count as 2.
                f[term] = f.get(term, 0) + 1

        cut = (numpy.max(f.values()) + numpy.mean(f.values())) / 8.0
        higher = dict(filter(lambda n: n[1] >= cut, f.iteritems()))
        log.info('selected %d terms (took %.3f secs)', len(higher),
                 time.time() - start_time)
        return higher
def df():
    """ Document frequency thresholding (DF) """
    account = {}
    freqs = {}

    for t in db.theses.find():
        if 'data' in t and len(t['data']) > 10:
            if t['field'] not in account:
                account[t['field']] = 0
            account[t['field']] += 1
    print 'total:', account

    for field in account:
        print 'Creating frequency map for', field
        f = {}
        for t in db.theses.find({'field': field}):
            if 'data' not in t or len(t['data']) == 0:
                continue
            bag = bag_of_words(t['data'])
            for item in bag:
                if item not in f:
                    f[item] = 1
                else:
                    f[item] += 1
        freqs[field] = f

    print 'Found frequencies, now filtering...'

    for name, field in freqs.items():
        fcut = (0.3*numpy.max(field.values()) + 0.7*numpy.mean(field.values()))/2.0
        freqs[name] = dict((k, v) for k, v in field.iteritems() if v > fcut)

        max = numpy.max(freqs[name].values())
        min = numpy.min(freqs[name].values())
        print '\n%s (cut=%d, min=%d, max=%d)' % (name, fcut, min, max)
        print '\tfeature dimension=%d' % len(freqs[name])
        print '\tMost frequent:', freqs[name].keys()[0:10]

        db.features.update({"field": name}, {"field": name, "features": freqs[name]}, upsert=True)