def refine(dataset, n_fold = 4):
    ''' Filter a labelled dataset by cross-fold agreement.

    The dataset is split into n_fold folds with divide().  Each fold in turn
    becomes the target: one model is trained on every other fold, every
    model predicts each target instance, and the instance is kept when
    consensusVote accepts the per-model exactMatch results.

    dataset -- labelled instances; handed to divide(), whose folds are
               iterated as (user_id, label) pairs
    n_fold  -- number of folds to split into (default 4)

    Returns the list of (user_id, label) pairs that passed the vote.
    Instances whose preprocessed context is None are skipped.
    '''
    folds = divide(dataset, n_fold)
    filtered_set = []
    matching_scheme = exactMatch
    voting_scheme = consensusVote
    for i in range(n_fold):
        # i-th fold is the target of refining; train one model on each of
        # the remaining folds.
        target_set = folds[i]
        models = [getModelFromPreprocessed(folds[j])
                  for j in range(n_fold) if j != i]
        # test
        for user_id, label in target_set:
            context = loadPreprocessedContext(user_id)
            if context is None:
                continue
            # Collect a fresh set of votes for every instance.  Sharing one
            # list across the whole fold would let the results for earlier
            # instances influence the vote on later ones.
            hits = [matching_scheme(model.predict(context), label)
                    for model in models]
            if voting_scheme(hits):
                filtered_set.append((user_id, label))
        # Single-argument parenthesised print emits the same text whether
        # print is a statement or a function.
        print("Finished %d-th fold" % i)
        # NOTE: the count reported here is the running total over all folds
        # processed so far, since filtered_set is never reset.
        print(" Outputted %d instances in %d-th fold" % (len(filtered_set), i))
    return filtered_set
def getModelFromPreprocessed(dataset, meta = 0):
    ''' Build and train a maxent model from preprocessed contexts.

    dataset -- iterable of (user_id, target) pairs
    meta    -- forwarded unchanged to loadPreprocessedContext (default 0)

    Every pair whose context loads successfully is registered as one event
    of weight 1; pairs whose context is None are left out.  Returns the
    trained cmaxent.MaxentModel.
    '''
    model = cmaxent.MaxentModel()

    # Register one event per instance that has a usable context.
    model.begin_add_event()
    for user_id, target in dataset:
        features = loadPreprocessedContext(user_id, meta = meta)
        if features is not None:
            model.add_event(features, target, 1)
    model.end_add_event(PRUNE_COUNT)

    model.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return model