def predict(model, features, cluster_type, atom_type, start_doc, ntest, **cluster_args):
    '''
    Scores the test passages with <model> and reports how well it did.

    Builds the feature-confidence matrix (and the actual labels) for the
    passages parsed from start_doc -> start_doc + ntest documents, asks
    <model> for both its hard predictions and its class probabilities,
    prints summary statistics, draws a ROC curve and returns the list of
    confidences of plag. (one per passage).

    NOTE: <cluster_args> is accepted but is not used anywhere in this function.

    TODO do one document at a time
    '''
    test_matrix, actuals = _get_feature_conf_and_actuals(
        features, cluster_type, atom_type, start_doc, ntest)

    predicted = model.predict(test_matrix)

    # predict_proba yields, per passage, the probability of each class.
    # There are only two classes (0 == nonplag, 1 == plag), so column 1 is
    # the confidence of plag.
    class_probabilities = model.predict_proba(test_matrix)
    confidences = [row[1] for row in class_probabilities]

    # Split the confidences by the true label of their passage
    pos_predictions = []
    neg_predictions = []
    for confidence, label in zip(confidences, actuals):
        if label == 1:
            pos_predictions.append(confidence)
        elif label == 0:
            neg_predictions.append(confidence)

    print('for those which are pos')
    print(five_num_summary(pos_predictions))
    print('for those which are neg')
    print(five_num_summary(neg_predictions))

    plag_fraction = sum(actuals) / float(len(actuals))
    print('%s %s' % ('pct. plag', plag_fraction))

    print('pct correct:')
    num_correct = sum([guess == truth for guess, truth in zip(predicted, actuals)])
    print(num_correct / float(len(predicted)))

    # Extra keyword arguments handed to draw_roc alongside the curve data
    roc_metadata = {
        'features' : features,
        'cluster_type' : cluster_type,
        'feature_selection' : True,
        'atom_type' : atom_type,
        'start_doc' : start_doc,
        'ntest' : ntest
    }
    path, auc = BaseUtility.draw_roc(actuals, confidences, **roc_metadata)
    print('%s %s' % (path, auc))

    return confidences
def compare_params():
    '''
    Trains one model per (regularization, class_weight) combination and
    evaluates each of them against the same test set, drawing a ROC curve
    per combination.

    Returns a list of (regularization, class_weight, auc, path) tuples, one
    per combination. The list is also printed after every combination and
    once more at the end.

    Results recorded from an earlier run:
    [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'),
     ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'),
     ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'),
     ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf')
    ]
    '''
    # Use every feature except the unigram/trigram ones
    all_feature_names = FeatureExtractor.get_all_feature_function_names()
    features = [
        name for name in all_feature_names
        if not ('unigram' in name or 'trigram' in name)
    ]

    cluster_type = 'outlier'
    atom_type = 'paragraph'
    start_doc = 0
    ntrain = 100
    ntest = 200

    # Process the test set once; it is shared by every model below.
    # The test documents start at index <ntrain> -- presumably right after
    # the documents used for training (TODO confirm against train()).
    test_matrix, actuals = _get_feature_conf_and_actuals(
        features, cluster_type, atom_type, ntrain, ntest)

    # Options for Log regression
    regularization_options = ['l1', 'l2']
    class_weight_options = ['auto', None]
    param_grid = [
        (penalty, weighting)
        for penalty in regularization_options
        for weighting in class_weight_options
    ]

    results = []
    for regularization, class_weight in param_grid:
        model = train(
            features, cluster_type, atom_type, ntrain,
            start_doc=start_doc,
            regularization=regularization,
            class_weight=class_weight)

        # Column 1 of predict_proba is kept as the confidence, as in predict()
        class_probabilities = model.predict_proba(test_matrix)
        confidences = [row[1] for row in class_probabilities]

        path, auc = BaseUtility.draw_roc(
            actuals, confidences, combination='Using Combination')
        results.append((regularization, class_weight, auc, path))
        # Show the results gathered so far
        print(results)

    print(results)
    return results