def testOn(self, m, test_groups):
        ''' Perform tests with the provided model 'm' and test set, and
        return a tests object. A prediction is recorded only when its
        confidence exceeds the threshold f_thre. '''

        print "test set:", Counter(zip(*test_groups)[1]).most_common()

        t = tests.tester(4)
        trials = []
        f_thre = 0      # confidence threshold; 0 records every prediction

        for pair in test_groups:
            max_label, max_prob = self.getPredictionWithConfidence(m, pair[0])
            #predicted = m.predict(pair[0])
            true = pair[1]

            if max_prob > f_thre:
                trials.append((true, max_label))

            #if true != max_label and true == 'g3':
            #    meta = pair[2]
            #    print meta['user_id']

        trials = zip(*trials)
        t.record(trials[0], trials[1])


        return t
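    # A minimal sketch of the getPredictionWithConfidence helper used above,
    # assuming the model exposes a maxent-style eval_all(context) returning
    # (label, prob) pairs; the real helper may differ.
    def getPredictionWithConfidence(self, m, context):
        scored = m.eval_all(context)        # [(label, prob), ...]
        max_label, max_prob = max(scored, key=lambda lp: lp[1])
        return max_label, max_prob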
    def run_diagnose(self):

        standards = serial.objFromFile('prob_groups.txt')

        c = ClusterBridge()
        name_net = c.getNameNet()

        for thres in range(1, 10):
            # Experimental: keep only entries whose name list has exactly 'thres' names
            def lengthFilter(triplet):
                names = triplet[1]
                return len(names) == thres

            filtered_net = filter(lengthFilter, name_net)
            # Experimental-end

            observed = gstat.convertNameNet2Observed(filtered_net)
            prediction = matchstat.convertObserved2Prediction(standards, observed)

            # Format becomes:
            # [(true1, true2,...), (predict1, predict2, ...), (meta1, meta2, ...)]
            pairs = matchstat.unzipGroupPrediction(prediction)

            num_label = 4
            tester = tests.tester(num_label)
            assert(len(pairs) >= 2)     # sanity check
            tester.record(pairs[0], pairs[1])
            tester.recordMeta(pairs[2])

            print "thres:", thres
            print "Accuracy:", tester.accuracy()
            print "Conf matrix:"
            print tester.confusionMatrix(friends.groupToNumeric)
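    # matchstat.unzipGroupPrediction presumably just transposes the list of
    # (true, predicted, meta) triples into the parallel tuples described in
    # the format comment above; a one-line sketch:
    #
    #     def unzipGroupPrediction(prediction):
    #         return zip(*prediction)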
    def useOnlyIntersection(self):
        # feature selection on the full dataset
        X, y = self.getData()
        fs = self.getFeatureSelector()
        fs.fit(X, y)

        oracle_indices = self.indices(fs, X.shape[1])

        # feature selection on part of the dataset
        fs = LogisticRegression(penalty='l1', tol=1e-6, C=5e-1)

        kfold = cross_validation.KFold(X.shape[0], k = 5)
        for train, test in kfold:
            fs.fit(X[train], y[train])
            normal_indices = self.indices(fs, X.shape[1])
            break   # one fold suffices for the partial-data selection

        common_indices = oracle_indices & normal_indices
        print 'intersection size:', len(common_indices)

        # cross-validate a classifier restricted to the intersected features
        clf = LogisticRegression(penalty='l2', tol=1e-6, C=1e-1)

        kfold = cross_validation.KFold(X.shape[0], k = 5)
        tester = tests.tester(4)

        for train, test in kfold:
            clf.fit(self.reduceWith(X[train], common_indices), y[train])
            predicted = clf.predict(self.reduceWith(X[test], common_indices))
            print clf.score(self.reduceWith(X[test], common_indices), y[test])

            tester.record(y[test], predicted)

        print tester.confusionMatrix()
        print "cv accuracy:", tester.accuracy()
    def reg(self):
        X, y = self.getData()
        X_2 = X.multiply(X)                 # element-wise squares (degree-2 terms)
        X = hstack((X, X_2)).tocsr()        # scipy.sparse.hstack assumed imported

        print X.shape

        kfold = cross_validation.KFold(X.shape[0], k = 5)
        tester = tests.tester(4)

        rms_errors = []
        deviations = []

        for train, test in kfold:
            regr = LinearRegression()
            regr.fit(X[train], y[train])

            y_pred = regr.predict(X[test])

            rms_e = math.sqrt(np.mean((y_pred - y[test]) ** 2))   # root-mean-square error
            div_e = np.mean(np.absolute(y_pred - y[test]))        # mean absolute deviation

            rms_errors.append(rms_e)
            deviations.append(div_e)

        print 'rms_e:', np.mean(np.array(rms_errors))
        print 'deviation:', np.mean(np.array(deviations))
    def testOn(self, m, test_groups):
        t = tests.tester(4)
        trials = []

        for pair in test_groups:
            predicted = m.predict(pair[0])
            true = pair[1]
            trials.append((true, predicted))    # (true, predicted), the order tester.record expects

        trials = zip(*trials)
        t.record(trials[0], trials[1])

        return t
def main():
    pairs = pipelines(500, 100)

    # Use tester class
    num_label = 4
    tester = tests.tester(num_label)
    assert(len(pairs) >= 2)     # sanity check
    tester.record(pairs[0], pairs[1])
    tester.recordMeta(pairs[2])

    print "Accuracy:", tester.accuracy()
    print "Conf matrix:"
    print tester.confusionMatrix(friends.groupToNumeric)
    def run(self):
        # get data
        X, y = self.getData()

        clf = self.classifier()

        kfold = cross_validation.KFold(X.shape[0], k = 5)
        tester = tests.tester(4)

        for train, test in kfold:
            clf.fit(X[train], y[train], X[test])    # this classifier's fit() also receives the test split
            predicted = clf.predict(X[test])

            tester.record(y[test], predicted)

        print tester.confusionMatrix()
        print "cv accuracy:", tester.accuracy()
    def run(self):
        X, y = self.getTextData()

        kfold = cross_validation.KFold(X.shape[0], k = 5)
        tester = tests.tester(4)

        for train, test in kfold:
            # in the training stage, the part of the training data consumed
            # by the feature selection should be discarded afterwards.
            clf = self.getTrained(X[train], y[train])

            predicted = self.getPredicted(clf, X[test])
            tester.record(y[test], predicted)

        print 'accuracy:', tester.accuracy()
        print 'confusion matrix:'
        print tester.confusionMatrix()
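    # One plausible shape for the getTrained/getPredicted pair, with the
    # selector fit on the training split only so the test fold never leaks
    # into feature selection; the real helpers may differ.
    def getTrained(self, X_train, y_train):
        self.fs = SelectKBest(chi2, k=3000)
        clf = LinearSVC(penalty='l2', dual=True, tol=1e-4, C=1e3)
        clf.fit(self.fs.fit_transform(X_train, y_train), y_train)
        return clf

    def getPredicted(self, clf, X_test):
        return clf.predict(self.fs.transform(X_test))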
def outputOnlyMatched():
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    n_total = 0
    n_emit = 0

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum    # same target in both views

            t_pre = t_model.predict(text_context)
            f_pre = f_model.predict(following_context)

            if t_pre == f_pre:
                trials.append((target, t_pre))
                n_emit += 1

            n_total += 1

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
    print 'emitted portion:', float(n_emit) / float(n_total)
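# A minimal sketch of the data.kFolds helper assumed throughout: yield
# (train, test) list splits. The real helper may shuffle or use a
# different fold count.
#
#     def kFolds(dataset, k=5):
#         n = len(dataset)
#         for i in range(k):
#             lo, hi = i * n // k, (i + 1) * n // k
#             yield dataset[:lo] + dataset[hi:], dataset[lo:hi]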
    def doCrossValidation(self, dataset, size_limit):
        tester = tests.tester(4)

        for train, test in data.kFolds(dataset):
            # training
            train = random.sample(train, size_limit)
            m = self.trainedModelOn(train)

            # prediction
            trials = []

            for datum in test:
                context, target, weight = datum
                pre_target = m.predict(context)
                trials.append((target, pre_target))

            trials = zip(*trials)
            tester.record(trials[0], trials[1])

        print size_limit, tester.accuracy()
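    # Hypothetical driver: sweep the training-set size limit to trace a
    # learning curve with the method above.
    #
    #     for limit in (50, 100, 200, 400, 800):
    #         self.doCrossValidation(dataset, limit)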
def simulateIdealFriendshipNetwork(n_friends):
    standards = serial.objFromFile('prob_groups.txt')
    #standards = gstat.generateEmpiricalDistributionFromSample(500, 20)

    name_net = friends.getPseudoNameNetSampled(n_sample = n_friends, n_num = 50)
    observed = gstat.convertNameNet2Observed(name_net)
    prediction = convertObserved2Prediction(standards, observed)
    pairs = unzipGroupPrediction(prediction)

    # Use tester class
    num_label = 4
    tester = tests.tester(num_label)
    assert(len(pairs) >= 2)     # sanity check
    tester.record(pairs[0], pairs[1])
    tester.recordMeta(pairs[2])

    #print "Accuracy:", tester.accuracy()
    #print "Conf matrix:"
    #print tester.confusionMatrix(friends.groupToNumeric)

    return tester.accuracy()
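# Hypothetical usage: sweep the simulated friend-list size and watch how
# accuracy responds as more of the network becomes observable.
#
#     for n in (5, 10, 20, 50):
#         print n, simulateIdealFriendshipNetwork(n)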
def simpleEnsemble(pickup):
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            pre_target = str(pickup(t_conf, f_conf))

            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
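# One plausible pickup() for simpleEnsemble above: sum the two models'
# per-label confidences and return the best label. eval_all is assumed to
# return (label, prob) pairs; adjust if the real format differs.
def sumPickup(t_conf, f_conf):
    combined = {}
    for label, prob in list(t_conf) + list(f_conf):
        combined[label] = combined.get(label, 0.0) + prob
    return max(combined, key=combined.get)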
    def baggedTest(self, classifiers_models, test_set):
        ''' Perform tests with the provided classifier/model pairs and test
        set, and return a tests object.

            classifiers_models = [(classifier, model), ...]
            test_set = [(user_group, label), ...]
        '''

        t = tests.tester(4)
        trials = []

        for pair in test_set:
            predicted = self.baggedPredict(classifiers_models, pair)
            true = pair[1]

            if predicted is not None:
                trials.append((true, predicted))

        trials = zip(*trials)
        t.record(trials[0], trials[1])

        return t
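    # A hedged sketch of the baggedPredict helper used above: majority vote
    # over the bagged models, abstaining (None) unless some label wins a
    # strict majority.
    def baggedPredict(self, classifiers_models, pair):
        votes = Counter()
        for classifier, m in classifiers_models:
            votes[m.predict(pair[0])] += 1      # model API as used in testOn
        label, count = votes.most_common(1)[0]
        if 2 * count > len(classifiers_models):
            return label
        return None                             # abstain: no strict majority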
def doCrossValidation(dataset):
    tester = tests.tester(4)

    for train, test in data.kFolds(dataset):
        # training
        m = trainedModelOn(train)
        print 'train size', len(train)

        # prediction
        trials = []

        for datum in test:
            context, target, weight = datum
            pre_target = m.predict(context)
            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
    def run(self):
        standards = serial.objFromFile('prob_groups.txt')

        c = ClusterBridge()
        name_net = c.getNameNet()

        observed = gstat.convertNameNet2Observed(name_net)
        prediction = matchstat.convertObserved2Prediction(standards, observed)

        # Format becomes:
        # [(true1, true2,...), (predict1, predict2, ...), (meta1, meta2, ...)]
        pairs = matchstat.unzipGroupPrediction(prediction)

        num_label = 4
        tester = tests.tester(num_label)
        assert(len(pairs) >= 2)     # sanity check
        tester.record(pairs[0], pairs[1])
        tester.recordMeta(pairs[2])

        print "Accuracy:", tester.accuracy()
        print "Conf matrix:"
        print tester.confusionMatrix(friends.groupToNumeric)
    def cv(self):
        #fs = LinearSVC(penalty='l1', dual=False, tol=1e-4,
        #        C=1e1, multi_class='ovr', fit_intercept=True)
        fs = SelectKBest(chi2, k=3000)
        data, target = self.getData(fs)

        #clf = MultinomialNB()
        #clf = svm.SVC(kernel = 'linear')
        clf = LinearSVC(penalty='l2', loss='l2', dual=True, tol=1e-4,
                C=1000.0, multi_class='ovr', fit_intercept=True)
        #clf = LogisticRegression(penalty='l2', tol=1e-6, C=1e-1)


        # Set up feature selection
        fs_enable = False

        # Start CV
        kfold = cross_validation.KFold(data.shape[0], k = 5)
        tester = tests.tester(4)

        for train, test in kfold:

            if fs_enable:
                print "before feature selection:", data[train].shape

                d_train = fs.fit_transform(data[train], target[train])
                d_test = fs.transform(data[test])

                print "feature selected:", d_train.shape

            else:
                print "no feature selection:", data[train].shape
                d_train = data[train]
                d_test = data[test]

            # experimental; note: despite its name, this returns the row
            # *sum* of the sparse matrix, not the mean
            def _sparse_mean(A):
                A = reduce(lambda x, y: x + y, [A[k] for k in range(A.shape[0])])
                return A

            X = d_train
            y = target[train]

            # combine examples into one array
            #X = vstack(
            #        [_sparse_mean(
            #            X[np.nonzero(y == k)[0]]
            #            )
            #            for k in np.unique(y)])
            #y = np.unique(y)
            #X = X.tocsr()

            # a very simple interpolation
            #gamma = 0.1
            #Xi = lil_matrix(X.shape)
            #Xi[0,:] = X[0] + X[1] * gamma
            #Xi[1,:] = X[1] + (X[0] + X[2]) * gamma
            #Xi[2,:] = X[2] + (X[1] + X[3]) * gamma
            #Xi[3,:] = X[3] + X[2] * gamma
            #Xi = Xi.tocsr()

            clf.fit(X, y)
            predicted = clf.predict(d_test)

            print "training accuracy:", clf.score(d_train, target[train])

            tester.record(target[test], predicted)

        print tester.confusionMatrix()
        print "cv accuracy:", tester.accuracy()
def classifierEnsemble():
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    def _conf_to_feature(conf1, conf2):
        def _append_to_key(c):
            def _append(f):
                return (c + f[0], f[1])
            return _append

        conf1 = map(_append_to_key('0'), conf1)
        conf2 = map(_append_to_key('1'), conf2)

        confs = conf1
        confs.extend(conf2)

        return confs
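    # For example (hypothetical values):
    #   _conf_to_feature([('g1', 0.7)], [('g2', 0.6)])
    #   -> [('0g1', 0.7), ('1g2', 0.6)]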


    for train, test in data.kFolds(dataset):
        coffset = int(len(train) * .8)      # first 80% trains the base models; the rest trains the chooser
        text_train, following_train = zip(*train[:coffset])

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # train a chooser
        chooser = cmaxent.MaxentModel()
        chooser.begin_add_event()

        for datum in train[coffset:]:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            confs = _conf_to_feature(t_conf, f_conf)
            chooser.add_event(confs, target)

        chooser.end_add_event(0)
        chooser.train(50, 'lbfgs', 1e-1, 1e-4)

        # retrain the underlying classifiers
        text_train, following_train = zip(*train)
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)


        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            confs = _conf_to_feature(t_conf, f_conf)
            pre_target = chooser.predict(confs)

            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()