def refine(dataset, n_fold = 4):
    '''
    Filter a labelled dataset by cross-validated model agreement.

    The dataset is split into n_fold folds with divide(). Each fold in turn
    is held out as the target; one model is trained on every other fold via
    getModelFromPreprocessed(). A held-out (user_id, label) instance is kept
    only when voting_scheme accepts the list of per-model results produced
    by matching_scheme(prediction, label) for that instance.

    dataset -- handed to divide(); each resulting fold must iterate as
               (user_id, label) pairs
    n_fold  -- number of folds to split into (default 4)

    Returns a list of the (user_id, label) tuples that passed the vote,
    accumulated over all folds. Instances whose preprocessed context cannot
    be loaded (loader returns None) are skipped.
    '''
    folds = divide(dataset, n_fold)
    filtered_set = []

    matching_scheme = exactMatch
    voting_scheme = consensusVote

    for i in range(n_fold):
        # i-th fold is the target of refining; train on all the others
        target_set = folds[i]
        models = [getModelFromPreprocessed(folds[j])
                  for j in range(n_fold) if j != i]

        # test
        for user_id, label in target_set:
            context = loadPreprocessedContext(user_id)
            if context is None:
                continue

            # The votes must be collected per instance. Previously this list
            # lived outside the loop, so each instance was judged together
            # with the results of every earlier instance in the fold.
            hits = [matching_scheme(m.predict(context), label)
                    for m in models]

            if voting_scheme(hits):
                filtered_set.append((user_id, label))

        # Single-argument parenthesised form: same output under the Python 2
        # print statement, and also valid as a Python 3 function call.
        print("Finished %d-th fold" % i)
        # NOTE(review): the count below is cumulative over the folds handled
        # so far, not the number kept from this fold alone.
        print("  Outputted %d instances in %d-th fold" % (len(filtered_set), i))

    return filtered_set
def getModelFromPreprocessed(dataset, meta = 0):
    '''
    Build and train a cmaxent.MaxentModel from preprocessed user contexts
    (Twitter data, according to the original description).

    dataset -- iterable of (user_id, target) pairs
    meta    -- forwarded unchanged to loadPreprocessedContext (default 0)

    Every pair whose context loads successfully is registered as a training
    event with weight 1; pairs for which the loader returns None are left
    out. Returns the trained model.
    '''
    unit_weight = 1

    model = cmaxent.MaxentModel()
    model.begin_add_event()

    # register one training event per user that has a usable context
    for user_id, target in dataset:
        features = loadPreprocessedContext(user_id, meta = meta)
        if features is not None:
            model.add_event(features, target, unit_weight)

    # PRUNE_COUNT, LBFGS_ITERATION, PRIOR_WEIGHT and TOLERANCE are
    # module-level settings defined elsewhere in the file.
    model.end_add_event(PRUNE_COUNT)
    model.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return model