def testOn(self, m, test_groups):
    '''
    Perform tests with the provided model 'm' and test set, and return a
    tests object. A prediction is recorded only when its confidence
    exceeds the threshold f_thre.
    '''
    print "test set:", Counter(zip(*test_groups)[1]).most_common()
    t = tests.tester(4)
    trials = []
    f_thre = 0  # confidence threshold; 0 records every prediction
    for pair in test_groups:
        max_label, max_prob = self.getPredictionWithConfidence(m, pair[0])
        true = pair[1]
        if max_prob > f_thre:
            trials.append((true, max_label))
    trials = zip(*trials)
    t.record(trials[0], trials[1])
    return t
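# Nearly every routine below records results through tests.tester. The real
# implementation lives in the tests module; this is only a minimal sketch of
# the interface these functions rely on, inferred from how it is called here.
from collections import defaultdict

class tester(object):
    def __init__(self, num_label):
        self.num_label = num_label
        self.trues = []
        self.predictions = []
        self.metas = []

    def record(self, trues, predictions):
        # accumulate one batch of (true, predicted) labels
        self.trues.extend(trues)
        self.predictions.extend(predictions)

    def recordMeta(self, metas):
        self.metas.extend(metas)

    def accuracy(self):
        hits = sum(1 for t, p in zip(self.trues, self.predictions) if t == p)
        return float(hits) / len(self.trues)

    def confusionMatrix(self, to_numeric=None):
        # keyed by (true, predicted); labels can be mapped to numeric form
        # via to_numeric (e.g. friends.groupToNumeric)
        counts = defaultdict(int)
        for t, p in zip(self.trues, self.predictions):
            if to_numeric is not None:
                t, p = to_numeric(t), to_numeric(p)
            counts[(t, p)] += 1
        return dict(counts)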
def run_diagnose(self):
    standards = serial.objFromFile('prob_groups.txt')
    c = ClusterBridge()
    name_net = c.getNameNet()
    for thres in range(1, 10):
        # Experimental: keep only users with exactly 'thres' friend names
        def lengthFilter(triplet):
            names = triplet[1]
            return len(names) == thres
        filtered_net = filter(lengthFilter, name_net)
        # Experimental-end
        observed = gstat.convertNameNet2Observed(filtered_net)
        prediction = matchstat.convertObserved2Prediction(standards, observed)
        # Format becomes:
        # [(true1, true2, ...), (predict1, predict2, ...), (meta1, meta2, ...)]
        pairs = matchstat.unzipGroupPrediction(prediction)
        num_label = 4
        tester = tests.tester(num_label)
        assert(len(pairs) >= 2)  # sanity check
        tester.record(pairs[0], pairs[1])
        tester.recordMeta(pairs[2])
        print "threshold:", thres
        print "Accuracy:", tester.accuracy()
        print "Conf matrix:"
        print tester.confusionMatrix(friends.groupToNumeric)
def useOnlyIntersection(self):
    # feature selection on the full dataset (the "oracle" selection)
    X, y = self.getData()
    fs = self.getFeatureSelector()
    fs.fit(X, y)
    oracle_indices = self.indices(fs, X.shape[1])

    # feature selection on part of the dataset (first fold only)
    fs = LogisticRegression(penalty='l1', tol=1e-6, C=5e-1)
    kfold = cross_validation.KFold(X.shape[0], k=5)
    for train, test in kfold:
        fs.fit(X[train], y[train])
        normal_indices = self.indices(fs, X.shape[1])
        break  # one fold is enough for the comparison

    common_indices = oracle_indices & normal_indices
    print 'intersection size:', len(common_indices)

    # cross-validate a classifier restricted to the common features
    clf = LogisticRegression(penalty='l2', tol=1e-6, C=1e-1)
    kfold = cross_validation.KFold(X.shape[0], k=5)
    tester = tests.tester(4)
    for train, test in kfold:
        clf.fit(self.reduceWith(X[train], common_indices), y[train])
        predicted = clf.predict(self.reduceWith(X[test], common_indices))
        print clf.score(self.reduceWith(X[test], common_indices), y[test])
        tester.record(y[test], predicted)
    print tester.confusionMatrix()
    print "cv accuracy:", tester.accuracy()
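# Hypothetical sketches of the indices() and reduceWith() helpers used above;
# the real implementations live elsewhere. indices() is assumed to collect
# the features with a nonzero coefficient in any one-vs-rest weight vector
# (i.e. the features that survive L1 regularization), and reduceWith() to
# keep only those columns.
import numpy as np

def indices(self, fitted_clf, n_features):
    coef = np.asarray(fitted_clf.coef_).reshape(-1, n_features)
    return set(np.nonzero(np.any(coef != 0, axis=0))[0])

def reduceWith(self, X, kept_indices):
    # sorted() gives the kept columns a stable order
    return X[:, sorted(kept_indices)]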
def reg(self):
    X, y = self.getData()
    # degree-2 expansion: append the element-wise squares as extra features
    X_2 = X.multiply(X)
    X = hstack((X, X_2)).tocsr()
    print X.shape
    kfold = cross_validation.KFold(X.shape[0], k=5)
    rms_errors = []
    deviations = []
    for train, test in kfold:
        regr = LinearRegression()
        regr.fit(X[train], y[train])
        y_pred = regr.predict(X[test])
        rms_e = math.sqrt(np.mean((y_pred - y[test]) ** 2))
        dev_e = np.mean(np.absolute(y_pred - y[test]))
        rms_errors.append(rms_e)
        deviations.append(dev_e)
    print 'rms_e:', np.mean(np.array(rms_errors))
    print 'deviation:', np.mean(np.array(deviations))
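# The multiply/hstack pair above is a cheap degree-2 feature expansion
# (squares only, no cross terms). A tiny self-contained illustration of the
# same scipy.sparse calls:
from scipy.sparse import csr_matrix, hstack

A = csr_matrix([[1.0, 2.0],
                [3.0, 0.0]])
A_aug = hstack((A, A.multiply(A))).tocsr()
print A_aug.toarray()
# [[ 1.  2.  1.  4.]
#  [ 3.  0.  9.  0.]]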
def testOn(self, m, test_groups):
    t = tests.tester(4)
    trials = []
    for pair in test_groups:
        predicted = m.predict(pair[0])
        true = pair[1]
        # keep the (true, predicted) order expected by tester.record
        trials.append((true, predicted))
    trials = zip(*trials)
    t.record(trials[0], trials[1])
    return t
def main():
    pairs = pipelines(500, 100)
    # Use tester class
    num_label = 4
    tester = tests.tester(num_label)
    assert(len(pairs) >= 2)  # sanity check
    tester.record(pairs[0], pairs[1])
    tester.recordMeta(pairs[2])
    print "Accuracy:", tester.accuracy()
    print "Conf matrix:"
    print tester.confusionMatrix(friends.groupToNumeric)
def run(self):
    # get data
    X, y = self.getData()
    clf = self.classifier()
    kfold = cross_validation.KFold(X.shape[0], k=5)
    tester = tests.tester(4)
    for train, test in kfold:
        # note: this custom classifier's fit() also takes the unlabeled
        # test split as a third argument
        clf.fit(X[train], y[train], X[test])
        predicted = clf.predict(X[test])
        tester.record(y[test], predicted)
    print tester.confusionMatrix()
    print "cv accuracy:", tester.accuracy()
def run(self):
    X, y = self.getTextData()
    kfold = cross_validation.KFold(X.shape[0], k=5)
    tester = tests.tester(4)
    for train, test in kfold:
        # during training, the part of the training data used for feature
        # selection should be discarded afterwards
        clf = self.getTrained(X[train], y[train])
        predicted = self.getPredicted(clf, X[test])
        tester.record(y[test], predicted)
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
def outputOnlyMatched():
    maxent.set_verbose(1)
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'
    tester = tests.tester(4)
    n_total = 0
    n_emit = 0
    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)
        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # prediction: emit only when the two models agree
        trials = []
        for datum in test:
            text_datum, following_datum = datum
            # both datums share the same target label
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_pre = t_model.predict(text_context)
            f_pre = f_model.predict(following_context)
            if t_pre == f_pre:
                trials.append((target, t_pre))
                n_emit += 1
            n_total += 1
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
    print 'emitted portion:', float(n_emit) / float(n_total)
def doCrossValidation(self, dataset, size_limit):
    tester = tests.tester(4)
    for train, test in data.kFolds(dataset):
        # training on a random subsample capped at size_limit examples
        train = random.sample(train, size_limit)
        m = self.trainedModelOn(train)
        # prediction
        trials = []
        for datum in test:
            context, target, weight = datum
            pre_target = m.predict(context)
            trials.append((target, pre_target))
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print size_limit, tester.accuracy()
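# doCrossValidation prints one accuracy per training-set size, so a learning
# curve falls out of sweeping size_limit. A hypothetical driver ('runner' is
# an instance of the owning class and the sizes are illustrative; each size
# must not exceed a fold's training size, or random.sample raises ValueError):
for size_limit in [50, 100, 200, 400, 800]:
    runner.doCrossValidation(dataset, size_limit)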
def simulateIdealFriendshipNetwork(n_friends):
    standards = serial.objFromFile('prob_groups.txt')
    #standards = gstat.generateEmpiricalDistributionFromSample(500, 20)
    name_net = friends.getPseudoNameNetSampled(n_sample=n_friends, n_num=50)
    observed = gstat.convertNameNet2Observed(name_net)
    prediction = convertObserved2Prediction(standards, observed)
    pairs = unzipGroupPrediction(prediction)
    # Use tester class
    num_label = 4
    tester = tests.tester(num_label)
    assert(len(pairs) >= 2)  # sanity check
    tester.record(pairs[0], pairs[1])
    tester.recordMeta(pairs[2])
    #print "Accuracy:", tester.accuracy()
    #print "Conf matrix:"
    #print tester.confusionMatrix(friends.groupToNumeric)
    return tester.accuracy()
def simpleEnsemble(pickup):
    maxent.set_verbose(1)
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'
    tester = tests.tester(4)
    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)
        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # prediction: combine the two models' confidences via pickup()
        trials = []
        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)
            pre_target = str(pickup(t_conf, f_conf))
            trials.append((target, pre_target))
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
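# simpleEnsemble leaves the combination rule to the caller. Assuming
# eval_all() returns a list of (label, probability) pairs, one plausible
# pickup sums the two models' per-label confidences and takes the argmax:
def sum_pickup(t_conf, f_conf):
    combined = {}
    for label, prob in t_conf:
        combined[label] = combined.get(label, 0.0) + prob
    for label, prob in f_conf:
        combined[label] = combined.get(label, 0.0) + prob
    return max(combined, key=combined.get)

simpleEnsemble(sum_pickup)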
def baggedTest(self, classifiers_models, test_set):
    '''
    Perform tests with the provided (classifier, model) pairs and test set,
    and return a tests object.
    classifiers_models = [(classifier, model), ...]
    test_set = [(user_group, label), ...]
    '''
    t = tests.tester(4)
    trials = []
    for pair in test_set:
        predicted = self.baggedPredict(classifiers_models, pair)
        true = pair[1]
        # baggedPredict returns None when there is no consensus
        if predicted is not None:
            trials.append((true, predicted))
    trials = zip(*trials)
    t.record(trials[0], trials[1])
    return t
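# Hypothetical sketch of baggedPredict, consistent with how baggedTest calls
# it: each model votes on the example, and None is returned unless some label
# wins a strict majority. How each (classifier, model) pair produces its vote
# is an assumption here.
from collections import Counter

def baggedPredict(self, classifiers_models, pair):
    votes = [model.predict(pair[0]) for classifier, model in classifiers_models]
    label, count = Counter(votes).most_common(1)[0]
    if count * 2 > len(votes):
        return label
    return None  # no consensus; baggedTest skips this example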
def doCrossValidation(dataset):
    tester = tests.tester(4)
    for train, test in data.kFolds(dataset):
        # training
        m = trainedModelOn(train)
        print 'train size', len(train)
        # prediction
        trials = []
        for datum in test:
            context, target, weight = datum
            pre_target = m.predict(context)
            trials.append((target, pre_target))
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
def run(self):
    standards = serial.objFromFile('prob_groups.txt')
    c = ClusterBridge()
    name_net = c.getNameNet()
    observed = gstat.convertNameNet2Observed(name_net)
    prediction = matchstat.convertObserved2Prediction(standards, observed)
    # Format becomes:
    # [(true1, true2, ...), (predict1, predict2, ...), (meta1, meta2, ...)]
    pairs = matchstat.unzipGroupPrediction(prediction)
    num_label = 4
    tester = tests.tester(num_label)
    assert(len(pairs) >= 2)  # sanity check
    tester.record(pairs[0], pairs[1])
    tester.recordMeta(pairs[2])
    print "Accuracy:", tester.accuracy()
    print "Conf matrix:"
    print tester.confusionMatrix(friends.groupToNumeric)
def cv(self):
    # the L1 LinearSVC selector below is an unused alternative; it was
    # immediately overridden by SelectKBest
    #fs = LinearSVC(penalty='l1', dual=False, tol=1e-4, C=1e1,
    #               multi_class='ovr', fit_intercept=True)
    fs = SelectKBest(chi2, k=3000)
    data, target = self.getData(fs)

    #clf = MultinomialNB()
    #clf = svm.SVC(kernel='linear')
    clf = LinearSVC(penalty='l2', loss='l2', dual=True, tol=1e-4, C=1000.0,
                    multi_class='ovr', fit_intercept=True)
    #clf = LogisticRegression(penalty='l2', tol=1e-6, C=1e-1)

    # Set up feature selection
    fs_enable = False

    # Start CV
    kfold = cross_validation.KFold(data.shape[0], k=5)
    tester = tests.tester(4)
    for train, test in kfold:
        if fs_enable:
            print "before feature selection:", data[train].shape
            d_train = fs.fit_transform(data[train], target[train])
            d_test = fs.transform(data[test])
            print "feature selected:", d_train.shape
        else:
            print "no feature selection:", data[train].shape
            d_train = data[train]
            d_test = data[test]

        # experimental
        def _sparse_mean(A):
            # note: despite the name, this returns the row *sum*
            return reduce(lambda x, y: x + y,
                          [A[k] for k in range(A.shape[0])])

        X = d_train
        y = target[train]
        # combine the examples of each class into one array
        #X = vstack([_sparse_mean(X[np.nonzero(y == k)[0]])
        #            for k in np.unique(y)])
        #y = np.unique(y)
        #X = X.tocsr()

        # a very simple interpolation between neighboring class prototypes
        #gamma = 0.1
        #Xi = lil_matrix(X.shape)
        #Xi[0,:] = X[0] + X[1] * gamma
        #Xi[1,:] = X[1] + (X[0] + X[2]) * gamma
        #Xi[2,:] = X[2] + (X[1] + X[3]) * gamma
        #Xi[3,:] = X[3] + X[2] * gamma
        #Xi = Xi.tocsr()

        clf.fit(X, y)
        predicted = clf.predict(d_test)
        print "training accuracy:", clf.score(d_train, target[train])
        tester.record(target[test], predicted)
    print tester.confusionMatrix()
    print "cv accuracy:", tester.accuracy()
def classifierEnsemble():
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'
    tester = tests.tester(4)

    def _conf_to_feature(conf1, conf2):
        # prefix each model's confidence features with '0'/'1' so the
        # chooser can tell the two sources apart
        def _append_to_key(c):
            def _append(f):
                return (c + f[0], f[1])
            return _append
        conf1 = map(_append_to_key('0'), conf1)
        conf2 = map(_append_to_key('1'), conf2)
        confs = conf1
        confs.extend(conf2)
        return confs

    for train, test in data.kFolds(dataset):
        # hold out the last 20% of the training fold for the chooser
        coffset = int(len(train) * .8)
        text_train, following_train = zip(*train[:coffset])
        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # train a chooser: a stacked maxent model over both models' confidences
        chooser = cmaxent.MaxentModel()
        chooser.begin_add_event()
        for datum in train[coffset:]:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)
            confs = _conf_to_feature(t_conf, f_conf)
            chooser.add_event(confs, target)
        chooser.end_add_event(0)
        chooser.train(50, 'lbfgs', 1e-1, 1e-4)
        # retrain the underlying classifiers on the full training fold
        text_train, following_train = zip(*train)
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # prediction
        trials = []
        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)
            confs = _conf_to_feature(t_conf, f_conf)
            pre_target = chooser.predict(confs)
            trials.append((target, pre_target))
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()