Code example #1
import time

from sklearn import svm
from sklearn.metrics import accuracy_score


def svmClassify(X_train, y_train, X_test, y_test, iteration):
    print("******************* SVM classification *********************\n")
    svm_model = svm.SVC(C=1, gamma=0.1)
    start_train_svm = time.time()
    svm_model.fit(X_train, y_train)
    end_train_svm = time.time()
    training_time_svm = end_train_svm - start_train_svm
    print("Training SVM model_selection %d took %.5f\n" %
          (iteration, training_time_svm))

    predict_train_svm = svm_model.predict(X_train)
    print("training accuracy")
    print(accuracy_score(y_train, predict_train_svm))
    print("\n")

    start_test_svm = time.time()
    predict_test_svm = svm_model.predict(X_test)
    end_test_svm = time.time()
    testing_time_svm = end_test_svm - start_test_svm
    print("Testing SVM model_selection %d took %.5f\n" %
          (iteration, testing_time_svm))
    print("testing accuracy")
    print(accuracy_score(y_test, predict_test_svm))
    print("\n")

    return training_time_svm, testing_time_svm
Code example #2
import time

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score


def gaussianProcess(X_train, y_train, X_test, y_test, iteration):
    print("************ Gaussian Process Classification **************\n")
    gp_rbf_fix = GaussianProcessClassifier(kernel=76.5**2 *
                                           RBF(length_scale=179),
                                           optimizer=None)
    start_train_gp = time.time()
    gp_rbf_fix.fit(X_train, y_train)
    end_train_gp = time.time()
    training_time_gp = end_train_gp - start_train_gp
    print("Training GP model_selection %d took %.5f\n" %
          (iteration, training_time_gp))

    predict_train_gp = gp_rbf_fix.predict(X_train)
    print("training accuracy")
    print(accuracy_score(y_train, predict_train_gp))
    print("\n")

    start_test_gp = time.time()
    predict_test_gp = gp_rbf_fix.predict(X_test)
    end_test_gp = time.time()
    testing_time_gp = end_test_gp - start_test_gp
    print("Testing GP model_selection %d took %.5f\n" %
          (iteration, training_time_gp))
    print("testing accuracy")
    print(accuracy_score(y_test, predict_test_gp))
    print("\n")

    return training_time_gp, testing_time_gp
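
Both helpers above only time and score a model with fixed hyperparameters. A minimal driver sketch (assumed usage, not part of the original project; scikit-learn's digits dataset stands in for the real data):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    digits = load_digits()
    # toy data; the original feature matrices are unknown
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.25, random_state=42)
    for i in range(3):
        svmClassify(X_train, y_train, X_test, y_test, i)
        gaussianProcess(X_train, y_train, X_test, y_test, i)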
Code example #3
def main():
    samplesIMG, labels = prepare_samples()

    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samplesIMG,
                                    labels,
                                    test_size=0.25,
                                    random_state=42)

    print("TREE")
    testResults = tree(trainSamples, trainLabels, testSamples)
    accTree = accuracy_score(testLabels.argmax(axis=1),
                             testResults.argmax(axis=1))

    print("FLAT")
    testResults = flat_network(trainSamples, trainLabels, testSamples)
    accFlat = accuracy_score(testLabels.argmax(axis=1),
                             testResults.argmax(axis=1))

    print("CNN")
    testResults = cnn_network(trainSamples, trainLabels, testSamples)
    accCnn = accuracy_score(testLabels.argmax(axis=1),
                            testResults.argmax(axis=1))

    print("Accuracy TREE: {}".format(accTree))
    print("Accuracy FLAT: {}".format(accFlat))
    print("Accuracy CNN: {}".format(accCnn))
    plot_accuracy((accTree, accFlat, accCnn))
Code example #4
def build_metrics(y_train, p_train, y_test, p_test, dataset):
    metrics = {}

    if y_train is not None and p_train is not None:
        y_train_argmax = y_train if dataset.binary else np.argmax(y_train,
                                                                  axis=1)
        p_train_argmax = p_train.round() if dataset.binary else np.argmax(
            p_train, axis=1)
        metrics['final_train_accuracy'] = accuracy_score(
            y_train_argmax, p_train_argmax)
    else:
        logging.getLogger(__name__).warning(
            "No training data available during report generation.")

    if y_test is not None and p_test is not None:
        y_test_argmax = y_test if dataset.binary else np.argmax(y_test, axis=1)
        p_test_argmax = p_test.round() if dataset.binary else np.argmax(p_test,
                                                                        axis=1)

        metrics['accuracy'] = accuracy_score(y_test_argmax, p_test_argmax)
        metrics['confusion_matrix'] = confusion_matrix(y_test_argmax,
                                                       p_test_argmax)

        metrics['classes'] = []
        p, r, f1, s = precision_recall_fscore_support(y_test_argmax,
                                                      p_test_argmax,
                                                      average=None)
        for (i, l) in enumerate(dataset.label_names):
            metrics['classes'].append({
                'name': l,
                'recall': r[i],
                'precision': p[i],
                'f1': f1[i],
                'support': s[i]
            })

        if dataset.binary:
            metrics['precision_recall_curve'] = precision_recall_curve(
                y_test, p_test)
            metrics['roc_curve'] = roc_curve(y_test, p_test)
        else:
            metrics['curves'] = []
            for (i, l) in enumerate(dataset.label_names):
                metrics['curves'].append({
                    'name': l,
                    'precision_recall_curve': precision_recall_curve(y_test[:, i], p_test[:, i]),
                    'roc_curve': roc_curve(y_test[:, i], p_test[:, i]),
                })
    else:
        logging.getLogger(__name__).warning(
            "No test data available during report generation.")

    return metrics
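
build_metrics only touches dataset.binary and dataset.label_names, so a stand-in object is enough for a quick smoke test. A sketch with made-up data (numpy and the sklearn metrics are assumed to be imported as in the original module):

from types import SimpleNamespace

dataset = SimpleNamespace(binary=True, label_names=['neg', 'pos'])
y = np.array([0, 1, 1, 0])
p = np.array([0.2, 0.8, 0.6, 0.4])  # predicted probabilities of the positive class
report = build_metrics(y, p, y, p, dataset)
print(report['accuracy'])  # 1.0 on this toy data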
Code example #5
    def by_class_evaluation(attack_test_y,
                            target_y,
                            p,
                            attack_test_x,
                            labels=None):
        if labels is None:
            labels = np.unique(target_y)

        precisions = [
            precision_score(attack_test_y[target_y == c], p[target_y == c]) *
            100 for c in np.unique(target_y)
        ]
        accuracies = [
            accuracy_score(attack_test_y[target_y == c], p[target_y == c]) *
            100 for c in np.unique(target_y)
        ]
        f1_scores = [
            f1_score(attack_test_y[target_y == c], p[target_y == c]) * 100
            for c in np.unique(target_y)
        ]
        recalls = [
            recall_score(attack_test_y[target_y == c], p[target_y == c]) * 100
            for c in np.unique(target_y)
        ]
        c_train_accs = [
            accuracy_score(
                target_y[np.logical_and(target_y == c, attack_test_y == 1)],
                np.argmax(attack_test_x[np.logical_and(target_y == c,
                                                       attack_test_y == 1)],
                          axis=1)) * 100 for c in np.unique(target_y)
        ]
        c_test_accs = [
            accuracy_score(
                target_y[np.logical_and(target_y == c, attack_test_y == 0)],
                np.argmax(attack_test_x[np.logical_and(target_y == c,
                                                       attack_test_y == 0)],
                          axis=1)) * 100 for c in np.unique(target_y)
        ]

        x = PrettyTable()
        x.float_format = '.2'
        x.add_column("Class", labels)
        x.add_column('Target Accuracy Train', np.round(c_train_accs, 2))
        x.add_column('Target Accuracy Test', np.round(c_test_accs, 2))
        x.add_column("Attack Precision", np.round(precisions, 2))
        x.add_column("Attack Accuracy", np.round(accuracies, 2))
        x.add_column("Attack Recall", np.round(recalls, 2))
        x.add_column("Attack F-1 Score", np.round(f1_scores, 2))
        x.add_column(
            "Percentage of Data",
            np.round(
                np.array([
                    len(target_y[target_y == c]) / len(target_y) * 100
                    for c in np.unique(target_y)
                ]), 2))
        print(x.get_string(title='Per Class Evaluation'))
Code example #7
    def metric_scores(self, estimator, testt, testlabelt):

        y_pred = estimator.predict(testt)
        #secret_cm.append(accuracy_score(testlabelt, y_pred))
        training_manCV.secret_cm.append(
            metrics.confusion_matrix(testlabelt, y_pred).flatten())

        #print training_manCV.secret_cm
        acc = accuracy_score(testlabelt, y_pred)
        training_manCV.secret_score.append(acc)
        return acc
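
metric_scores matches scikit-learn's scorer signature (estimator, X_test, y_test) and returns a float, so a bound method can be passed directly as scoring=. A hedged sketch, assuming training_manCV is the enclosing class and defines secret_cm and secret_score as class-level lists:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
runner = training_manCV()  # assumed constructible with no arguments
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y,
                         scoring=runner.metric_scores, cv=5)
# after the run, training_manCV.secret_cm holds one flattened confusion matrix per fold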
Code example #8
def get_ada():
    tree = DecisionTreeClassifier(max_depth=None)
    ada = AdaBoostClassifier(base_estimator=tree,
                             n_estimators=300,
                             learning_rate=0.1)
    ada.fit(X_train, y_train)
    y_train_pred = ada.predict(X_train)
    y_test_pred = ada.predict(X_test)
    ada_train_score = accuracy_score(y_train, y_train_pred)
    ada_test_score = accuracy_score(y_test, y_test_pred)
    print('AdaBoost train/test accuracies: %.4f/%.4f'
          % (ada_train_score, ada_test_score))
Code example #9
def get_rf():
    rf = RandomForestClassifier(n_estimators=200, max_depth=None,
                                random_state=1, bootstrap=True)
    rf.fit(X_train, y_train)

    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)
    y_train_score = accuracy_score(y_train, y_train_pred)
    y_test_score = accuracy_score(y_test, y_test_pred)

    print('Random Forest train/test accuracies: %.4f/%.4f'
          % (y_train_score, y_test_score))
Code example #10
    def eval(self, fold, print_result=True):
        assert fold < self.loader.get_n_splits(), "fold >= {}".format(self.loader.get_n_splits())
        X_train, y_train, X_test, y_test = self.loader.get_train_test_xy(fold)
        gpc = self.gpc_dict[fold]
        train_acc = accuracy_score(y_train, gpc.predict(X_train))
        test_acc = accuracy_score(y_test, gpc.predict(X_test))

        if print_result:
            print("Fold: {}, Kernel: {}".format(fold, gpc.kernel))
            print("Train Acc: {}".format(train_acc))
            print("Test Acc: {}".format(test_acc))
            print("=" * 10)

        return train_acc, test_acc
Code example #11
def classification_example():
    nsamples = 500
    periods = 500
    loss = np.zeros((periods, ))

    df = DataFactory()
    df.create_circles(nsamples)
    # df.create_moons(nsamples)

    train_x, train_y = df.get_train_samples()
    net = create_net(2, 2)
    net_loss = layers.CrossEntropyLayer()
    opt = optims.RMSPropOptim(net.named_parameters(),
                              lr=options["lr"],
                              weight_decay=options["weight_decay"],
                              beta=options["beta"])

    # begin to train.
    for j in range(periods):
        opt.zero_grad()
        y = net.forward(train_x)
        l = net_loss(y, train_y)
        loss[j] = l.data[0, 0]
        l.backward()
        opt.step()
    # plot train loss
    plt.plot(loss)
    plt.show()

    # plot train result
    with sn.no_grad():
        predict_y = net(train_x)
        l = net_loss(predict_y, train_y)
    print("trian set loss:", l.item())
    print(
        "train set accuracy score",
        accuracy_score(np.argmax(train_y.data, axis=1),
                       np.argmax(predict_y.data, axis=1)))

    # plot test result
    test_x, test_y = df.get_test_samples()
    with sn.no_grad():
        predict_y = net(test_x)
        l = net_loss(predict_y, test_y)
    print("\ntest set loss:", l.item())
    print(
        "test set accuracy score",
        accuracy_score(np.argmax(test_y.data, axis=1),
                       np.argmax(predict_y.data, axis=1)))
Code example #12
File: train.py Project: l0he1g/keras-exp
def run():
  data = BaiduQA(conf.baiduQA_pt)
  train_ys, test_ys, train_xs, test_xs = data.split()
  print("n(train)=%d, n(test)=%d" % (train_xs.shape[0], test_xs.shape[0]))

  lr = LogisticRegression()
  print("begin training")
  lr.fit(train_xs, train_ys)

  train_predicts = lr.predict(train_xs)
  test_predicts = lr.predict(test_xs)

  train_acc = accuracy_score(train_ys, train_predicts)
  test_acc = accuracy_score(test_ys, test_predicts)
  print("train_acc=%f, test_acc=%f" % (train_acc, test_acc))
Code example #13
def run_experiment(clf_cls, loader, fold, print_result=True, **kwargs):
    X_train, y_train, X_test, y_test = loader.get_train_test_xy(fold)
    clf = clf_cls(**kwargs)
    clf.fit(X_train, y_train)

    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))

    if print_result:
        print("{}, fold: {}, params: {}".format(clf_cls.__name__, fold, kwargs))
        print("Train Acc: {}".format(train_acc))
        print("Test Acc: {}".format(test_acc))
        print("=" * 10)

    return train_acc, test_acc
Code example #14
    def train_repeat_forest(self, seed, train, trainlabel, test, testlabel,
                            number_trees, number_features, repeat_times):
        seed_of_tree = {
            'rf': RandomForestClassifier(n_estimators=number_trees,
                                         max_features=number_features),
            'adb': AdaBoostClassifier(n_estimators=number_trees),
            'bag': BaggingClassifier(n_estimators=number_trees),
            'ext': ExtraTreesClassifier(n_estimators=number_trees,
                                        max_features=number_features),
            'gbt': GradientBoostingClassifier(n_estimators=number_trees,
                                              max_features=number_features),
            'bagging': RandomForestClassifier(n_estimators=number_trees,
                                              max_features=12),
        }
        rawforest = seed_of_tree[seed]
        score_list = []
        for i in np.arange(repeat_times):
            forest = rawforest.fit(train, trainlabel)
            outputtest = forest.predict(test)
            accuracy_test = accuracy_score(testlabel, outputtest)
            score_list.append(accuracy_test)
        score = np.mean(score_list)
        return score
Code example #15
def agnews_bembmeans(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_agnews_data(size=sample)

    if sample:
        test_size = int(round(np.sum(2000*df.category.value_counts().values/32000)))
    else:
        test_size = 2000*4

    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train_split, test_split = next(split.split(df, df.category))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_sents = DataframeSentences(train_df, cols=['title', 'description'])
    vect = ClusteredEmbeddingsVectorizer(n_clusters=50000).fit(train_sents)

    train_docs = DataframeSentences(train_df, cols=['title', 'description'], flatten=True)
    test_docs = DataframeSentences(test_df, cols=['title', 'description'], flatten=True)
    X_train = vect.transform(train_docs)
    y_train = train_df.category
    X_test = vect.transform(test_docs)
    y_test = test_df.category

    model = LogisticRegression()
    grid = GridSearchCV(model, {'C': [.0001, .0003, .001, .003, .01, .03, .1, .3, 1, 3, 10, 30, 100]},
                        n_jobs=n_procs, verbose=1, cv=5)
    grid.fit(X_train, y_train)

    print(accuracy_score(y_test, grid.best_estimator_.predict(X_test)), grid.best_params_)
Code example #16
    def testforest_confu(self, test, testlabel, forest):
        outputtest = forest.predict(test)
        accuracytrain = accuracy_score(testlabel, outputtest)
        # print "The size of the test set is"
        # print np.shape(test)
        # print "The accuracy for the test set is %r" % accuracytrain, "and the confusion matrix is"
        # print confusion_matrix(outputtest, testlabel)
        # output the classification report
        # print classification_report(testlabel, outputtest)
        # generate probability
        output_proba = forest.predict_proba(test)
        out_perfor = {
            'Classprob0': output_proba[:, 0],
            'Classprob1': output_proba[:, 1],
            'Classprob2': output_proba[:, 2],
            'output': outputtest,
            'target': testlabel
        }
        outframe = DataFrame(out_perfor)
        # print accuracytrain
        # print outframe
        # save the output probability
        # outframe.to_csv(r'D:\allprob.csv', header=0)

        # return outputtest
        # return outframe
        # print confusion_matrix(outputtest, testlabel)
        return accuracytrain
Code example #17
File: run.py Project: zbagdzevicius/breast_cancer
 def deep_belief_network_prediction(
     self,
     learning_rate,
     training_iterations,
     testing_iterations=10,
     hidden_layer_sizes_array=[10, 10],
 ):
     accuracy_list = []
     for x in range(testing_iterations):
         self.prepare_training_data_from_csv_data(self.csv_data)
         classifier = SupervisedDBNClassification(
             hidden_layers_structure=hidden_layer_sizes_array,
             learning_rate_rbm=learning_rate / 2,
             learning_rate=learning_rate,
             n_epochs_rbm=int(training_iterations / 10),
             n_iter_backprop=training_iterations,
             batch_size=256,
             activation_function="relu",
             dropout_p=0.2,
         )
         classifier.fit(self.x_data_training, self.y_data_training)
         y_data_prediction = classifier.predict(self.x_data_testing)
         classifier_accuracy = accuracy_score(self.y_data_testing, y_data_prediction)
         accuracy_list.append(classifier_accuracy)
     return max(accuracy_list)
Code example #18
def test_with_unigram_tfidf():
    train_x, train_y, test_x, test_y = get_features('dbn')
    train_x = np.array(train_x, dtype=np.float32)
    train_y = np.array(train_y, dtype=np.int32)
    test_x = np.array(test_x, dtype=np.float32)
    test_y = np.array(test_y, dtype=np.int32)
    print(test_x.shape)
    classifier = SupervisedDBNClassification(
        hidden_layers_structure=[256, 256, 256],
        learning_rate_rbm=0.05,
        learning_rate=0.1,
        n_epochs_rbm=10,
        n_iter_backprop=100,
        batch_size=32,
        activation_function='relu',
        dropout_p=0.2)
    classifier.fit(train_x, train_y)
    accuracies = []
    f_measures = []
    for i in range(1):
        y_pred = classifier.predict(test_x)
        accuracy = accuracy_score(test_y, y_pred)
        f_measure = f1_score(test_y, y_pred)
        accuracies.append(accuracy)
        f_measures.append(f_measure)

    classifier.save('SentimentClassification.pkl')

    print(accuracies)
    print('Accuracy ', mean(accuracies))
    print('F-measure', mean(f_measures))
    return
Code example #19
def printConfusionMatrix(y_true, y_pred, class_names=None):
    """ Print a confusion matrix similar to R's confusionMatrix """
    confMatrix = classification.confusion_matrix(y_true, y_pred)
    accuracy = classification.accuracy_score(y_true, y_pred)

    print('Confusion Matrix (Accuracy {:.4f})\n'.format(accuracy))
    _printConfusionMatrix(confMatrix, class_names)
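
The classification prefix above refers to sklearn.metrics.classification, a private module that was deprecated in scikit-learn 0.22 and later removed. On current releases the equivalent imports the same functions from sklearn.metrics directly (the _printConfusionMatrix helper is assumed to come from the surrounding project):

from sklearn.metrics import accuracy_score, confusion_matrix


def printConfusionMatrix(y_true, y_pred, class_names=None):
    """ Print a confusion matrix similar to R's confusionMatrix """
    confMatrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print('Confusion Matrix (Accuracy {:.4f})\n'.format(accuracy))
    _printConfusionMatrix(confMatrix, class_names)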
Code example #20
File: test_ovr.py Project: andrely/mimir
def test_iris_benchmark():
    data = iris()
    x = add_bias(data['x'])
    y = binarize(data['y'])

    train_split = [12, 39, 23, 5, 3, 29, 49, 47, 21, 30, 34, 48, 20, 45, 31, 27, 17, 22,
                   41, 6, 40, 38, 42, 19, 26, 15, 35, 10, 46, 25, 0, 32, 1, 16, 4, 13,
                   24, 33, 43, 18, 81, 65, 62, 50, 93, 92, 53, 58, 87, 55, 70, 72, 83,
                   56, 52, 73, 78, 64, 68, 59, 74, 89, 67, 51, 66, 98, 90, 69, 95, 63,
                   82, 54, 86, 85, 96, 97, 79, 71, 94, 80, 142, 147, 125, 145, 119, 101,
                   141, 105, 129, 138, 122, 120, 139, 124, 134, 111, 148, 117, 132, 133,
                   104, 130, 128, 115, 127, 131, 136, 112, 107, 143, 149, 106, 109, 108,
                   102, 100, 126, 103, 146, 113]

    test_split = [2, 7, 8, 9, 11, 14, 28, 36, 37, 44, 57, 60, 61, 75, 76, 77, 84, 88,
                  91, 99, 110, 114, 116, 118, 121, 123, 135, 137, 140, 144]

    xTrain = x[train_split, :]
    yTrain = y[train_split, :]
    xTest = x[test_split, :]
    yTest = y[test_split, :]

    model = OVRClassifier(LogisticModel(rho=1.)).train(xTrain, yTrain, verbose=False)
    pred = binarize(model.predict(xTest))
    assert_almost_equal(accuracy_score(yTest, pred), 0.96667, decimal=3)
Code example #21
    def predict(self, model, X, y):
        predictions = model.predict_proba(X)
        if np.isfinite(y).all():
            self.accuracy.append(
                accuracy_score(y, np.argmax(predictions, axis=1)))
            # print('Accuracy: ', accuracy_score(y, np.argmax(predictions, axis=1)))
        return predictions
Code example #22
def print_test_results(true_labels, pred_labels, pred_probs):
    """
    Print the prediction results, including accuracy and AUC.
    """
    print('Prediction accuracy: %.2f' % accuracy_score(true_labels, pred_labels))
    print('Prediction AUC: %.4f' % roc_auc_score(true_labels, pred_probs[:, 1]))
    print()
Code example #23
    def multi_class_measures(cls, y_true: list,
                             y_predicted: list) -> OrderedDict:
        """Assessment measures of a classification task with multiple
        classes i.e. multi-label and or multi-class task

        Parameters
        ----------
        y_true : list
            Expected class labels in binary form
        y_predicted : list
            Predicted class labels in binary form

        Returns
        -------
        OrderedDict
            An ordered dictionary of assessment measures
        """
        measures = OrderedDict()
        measures['accuracy'] = accuracy_score(y_true, y_predicted)
        measures['coverage error'] = coverage_error(y_true, y_predicted)
        measures['label ranking loss'] = label_ranking_loss(
            y_true, y_predicted)
        b_true = np.array(y_true)
        b_pred = np.array(y_predicted)
        measures['unsupported hamming loss'] = np.sum(
            np.not_equal(b_true, b_pred)) / float(b_true.size)
        measures['label ranking average precision'] = \
            label_ranking_average_precision_score(y_true, y_predicted)
        return measures
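
A small usage sketch for these measures, with toy three-label indicator data (the enclosing class is assumed importable and is called Measures here for illustration):

y_true = [[1, 0, 1], [0, 1, 0], [1, 1, 0]]
y_pred = [[1, 0, 0], [0, 1, 0], [1, 0, 0]]
for name, value in Measures.multi_class_measures(y_true, y_pred).items():
    print(name, value)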
Code example #24
def main():
    diabetes = datasets.fetch_openml('diabetes')
    y = sklearn.preprocessing.LabelEncoder().fit_transform(diabetes['target'])

    X_train, X_test, y_train, y_test = train_test_split(diabetes['data'], y)

    preds = []

    accs = []

    for x in range(0, 500):
        model = DecisionTreeClassifier()

        X_train_cur, _, y_train_cur, _ = train_test_split(X_train, y_train)

        model.fit(X_train_cur, y_train_cur)

        y_hat = model.predict_proba(X_test)

        preds.append(y_hat)

        acc = accuracy_score(y_test,
                             np.argmax(np.mean(preds, axis=0), axis=1))
        accs.append(acc)
        print(acc)

    plt.plot(np.arange(1, 501), accs, label='Accuracy')
    plt.savefig('../figures/ensemble.pdf')
Code example #25
    def _validate_model(self, x: np.ndarray, y: np.ndarray, validation_file_name: str = "validation.json") -> dict:
        logging.info("Creating predictions ...")
        y_predicted_categories = self._model.predict(x, batch_size=self._batch_size)
        gc.collect()

        from sklearn.metrics import accuracy_score, precision_recall_fscore_support
        y_expected_1dim = self._label_enc.max_category(y)
        y_predicted_1dim = self._label_enc.max_category(y_predicted_categories)
        logging.info("Results:")
        logging.info("{}".format(precision_recall_fscore_support(y_true=y_expected_1dim, y_pred=y_predicted_1dim)))
        accuracy = accuracy_score(y_true=y_expected_1dim, y_pred=y_predicted_1dim)
        logging.info("{}".format(accuracy))

        from sklearn.metrics import classification_report
        logging.info("\n{}".format(classification_report(y_true=y_expected_1dim,
                                                         y_pred=y_predicted_1dim,
                                                         target_names=["neg", "pos"],
                                                         )))

        results = classification_report(y_true=y_expected_1dim,
                                        y_pred=y_predicted_1dim,
                                        target_names=["neg", "pos"],
                                        output_dict=True)
        results["accuracy"] = accuracy
        write_text_file(
            file_path=self._experiment_folder / validation_file_name,
            text=json.dumps(results))

        return results
Code example #26
def yelpstars_bembmeans(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_yelp_stars_data(size=sample)

    if sample:
        test_size = floor(len(df) * 1./14)
    else:
        test_size = 10000*len(df.stars.unique())

    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train_split, test_split = next(split.split(df, df.stars))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_sents = DataframeSentences(train_df, cols=['text'])
    vect = ClusteredEmbeddingsVectorizer(n_clusters=50000).fit(train_sents)

    train_docs = DataframeSentences(train_df, cols=['text'], flatten=True)
    test_docs = DataframeSentences(test_df, cols=['text'], flatten=True)
    X_train = vect.transform(train_docs)
    y_train = train_df.stars
    X_test = vect.transform(test_docs)
    y_test = test_df.stars

    model = LogisticRegression()
    grid = GridSearchCV(model, {'C': [.0001, .0003, .001, .003, .01, .03, .1, .3, 1, 3, 10, 30, 100]},
                        n_jobs=n_procs, verbose=1, cv=5)
    grid.fit(X_train, y_train)

    print(accuracy_score(y_test, grid.best_estimator_.predict(X_test)), grid.best_params_)
Code example #27
File: classify.py Project: alindquist19/ml-hw
def debug_accuracy(classifier, x, y, examples):
    predictions = classifier.predict(x)

    errors = []

    false_positive = 0
    false_negative = 0

    for i in range(len(predictions)):
        if predictions[i] != y[i]:
            errors.append((examples[i], y[i], predictions[i]))
            if predictions[i] == 1:
                false_positive += 1
            else:
                false_negative += 1

    for i in range(min(50, len(errors))):
        print(
            "True Label: %s, Prediction: %s, Data: %s, \t Original Data: %s" %
            (errors[i][1], errors[i][2], add_features(
                errors[i][0]), errors[i][0]))

    print("Accuracy: %f" % accuracy_score(y, predictions))

    print("False positive: %s \t False negative: %s" %
          (false_positive, false_negative))
Code example #28
def main():
    ks = [3, 5, 10, 20]
    mapk = 200
    train, test = load_data()
    train, test = train.to_numpy(), test.to_numpy()
    x = train.T
    res = np.zeros(9)
    for u in range(train.shape[0]):
        y = x[:, u]
        truth = test[u]
        clf = LogisticRegression(random_state=42, C=0.001, solver='lbfgs')
        clf.fit(x, y)
        pred_buy_proba = clf.predict_proba(x)[:, 1].ravel()
        pruned_buy_proba = pred_buy_proba - y.ravel()
        pred_order = pruned_buy_proba.argsort()[::-1]
        actual_bought = truth.nonzero()[0]
        score = apk(actual_bought, pred_order, mapk)
        tmp = [score]
        for k in ks:
            tmp.append(prec(actual_bought, pred_order, k))
            tmp.append(recall(actual_bought, pred_order, k))
        res += np.array(tmp)
        if u % 50 == 0:
            print(res / (u + 1))
            print(u, classification.accuracy_score(clf.predict(x), y))
    return res / (u + 1)
Code example #29
 def Predict(self, inp, labels, classifier, folds, name, paramdesc):
     X= inp
     y = labels
     X, y = X[y != 2], y[y != 2]
     n_samples, n_features = X.shape
     
     ###############################################################################
     # Classification and ROC analysis
     
     # Run classifier with cross-validation and plot ROC curves
     cv = StratifiedKFold(n_splits=folds)
     
     mean_tpr = 0.0
     mean_fpr = np.linspace(0, 1, 100)
     all_tpr = []
     
     _precision = 0.0
     _recall = 0.0
     _accuracy = 0.0
     _f1 = 0.0
     
     for i, (train, test) in enumerate(cv.split(X, y)):
         probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
         pred_ = classifier.predict(X[test])
         _precision += precision_score(y[test], pred_)
         _recall += recall_score(y[test], pred_)
         _accuracy += accuracy_score(y[test], pred_)
         _f1 += f1_score(y[test], pred_)
         # Compute ROC curve and area the curve
         fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
         mean_tpr += np.interp(mean_fpr, fpr, tpr)
         mean_tpr[0] = 0.0
         roc_auc = auc(fpr, tpr)
         plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
     
     _precision /= folds
     _recall /= folds
     _accuracy /= folds
     _f1 /= folds
     
     
     plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
     
     mean_tpr /= folds
     mean_tpr[-1] = 1.0
     mean_auc = auc(mean_fpr, mean_tpr)
     plt.plot(mean_fpr, mean_tpr, 'k--',
              label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
     
     plt.xlim([-0.05, 1.05])
     plt.ylim([-0.05, 1.05])
     plt.xlabel('False Positive Rate')
     plt.ylabel('True Positive Rate')
     plt.title('Receiver operating characteristic - {0}'.format(name))
     plt.legend(loc="lower right")
     plt.savefig(self.configObject['outputdir'] + '/' + name + '.png')
     plt.close()
     
     result = self.OutputResult(name, paramdesc, len(inp), floor(labels.size / folds), _precision, _recall, _accuracy, _f1) 
     Announce(result)
Code example #30
def sogou_bwords(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_sogou_data(size=sample)

    input = [
        ' '.join([title, content])
        for title, content in zip(df.contenttitle.values, df.content.values)
    ]
    target = df.cat_en

    if sample:
        test_size = int(
            round(np.sum(12000 * df.cat_en.value_counts().values / 102000)))
    else:
        test_size = 12000 * 5

    X, X_, y, y_ = train_test_split(input,
                                    target,
                                    stratify=target,
                                    test_size=test_size)

    grid = bag_words_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)),
          grid.best_params_)
Code example #31
    def get_cv_metrics(self, cv):
        fold_avg_p = []
        fold_avg_r = []
        fold_avg_f1 = []
        fold_accuracy = []
        fold_test_support = []
        fold_train_support = []
        for i, (train, test) in enumerate(cv):
            train_df, train_y = self.X.iloc[train], self.y.iloc[train]
            test_df, test_y = self.X.iloc[test], self.y.iloc[test]
            estimator = clone(self.pipeline)
            estimator.fit(train_df, train_y)
            y_pred = estimator.predict(test_df)
            p, r, f1, s = precision_recall_fscore_support(test_y, y_pred)
            accuracy = accuracy_score(test_y, y_pred)
            # support-weighted average of precision, recall, f1 across classes
            avg_p, avg_r, avg_f1 = (np.average(p, weights=s),
                                    np.average(r, weights=s),
                                    np.average(f1, weights=s))
            test_support = test_y.shape[0]
            train_support = train_y.shape[0]
            fold_avg_p.append(avg_p)
            fold_avg_r.append(avg_r)
            fold_avg_f1.append(avg_f1)
            fold_accuracy.append(accuracy)
            fold_test_support.append(test_support)
            fold_train_support.append(train_support)
        return (np.average(fold_avg_p), np.average(fold_avg_r),
                np.average(fold_avg_f1), np.average(fold_accuracy),
                np.average(fold_test_support), np.average(fold_train_support))
Code example #32
    def evaluate_special(self,
                         session: tf.Session,
                         val_generator,
                         batch_size: int,
                         classification_samples,
                         size,
                         emnist=True,
                         class_weights=None):
        test_acc = []
        samples_per_shot = 6200
        total_data_processed = 0.0
        correct = 0.0
        correct_avg = 0.0

        # stuff for the weighted accuracy
        predictions = []
        sample_weights = []
        ground_truth = []

        for data, labels in val_generator(batch_size):
            data = data.reshape((data.shape[0], 28, 28, 1))
            print('[INFO] processing', total_data_processed, 'of', size)

            # classify a single sample
            for i in range(len(data)):
                if emnist:
                    x1, y1 = classification_samples(samples_per_shot // 62)
                else:
                    x1, y1 = classification_samples(samples_per_shot // 10)
                x2 = np.asarray([list(data[i])] * len(y1))

                pc = session.run([self.sigmoidal_out],
                                 feed_dict={
                                     self.X1: x1,
                                     self.X2: x2
                                 })
                prediction = y1[np.argmin(pc)]
                prediction_avg = self._get_mean_prediction(np.squeeze(pc),
                                                           y1,
                                                           emnist=emnist)

                if prediction == labels[i]:
                    correct += 1.0
                if prediction_avg == labels[i]:
                    correct_avg += 1.0
                predictions.append(prediction)
                sample_weights.append(class_weights[labels[i]])
                ground_truth.append(labels[i])

                total_data_processed += 1.0
                # keep track of loss and accuracy
            try:
                weighted_acc = accuracy_score(ground_truth, predictions,
                                              normalize=True,
                                              sample_weight=sample_weights)
            except Exception:
                weighted_acc = None
            print('weighted_acc:', weighted_acc)
        accuracy = correct / total_data_processed
        avg_acc = correct_avg / total_data_processed
        return accuracy, avg_acc, weighted_acc
Code example #33
def agnews_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_agnews_data(size=sample)

    input = [
        ' '.join([title, descr])
        for title, descr in zip(df.title.values, df.description.values)
    ]
    target = df.category

    if sample:
        test_size = int(
            round(np.sum(2000 * df.category.value_counts().values / 32000)))
    else:
        test_size = 2000 * 4

    X, X_, y, y_ = train_test_split(input,
                                    target,
                                    stratify=target,
                                    test_size=test_size)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)),
          grid.best_params_)
Code example #34
def train_model(base_model, X, y, minm_image_process=None,
                threshold_accuracy=.9, classes=range(10), dump_file_path=None):
    """
    Incremental training module.

    Returns the model after repeated partial fits on the given data.
    X: 128-dimensional feature vectors
    y: labels of the vectors
    minm_image_process: how many images of each label to train on; over- and
        under-sampling is applied to reach this count
    classes: the classes this model will handle; they must be declared in advance
    """
    print("entering training module")
    X_train, X_test, y_train, y_test = get_stratified_sample(X, y, verbose=False)
    if minm_image_process is not None:
        X_processed, y_processed = process_data(X_train, y_train,
                                                minm_num=minm_image_process)
    else:
        X_processed, y_processed = X_train, y_train
    if dump_file_path is not None:
        pickle.dump([X_processed, y_processed],
                    open(dump_file_path + '_resampled.pickle', 'wb'))
    accuracy = 0
    idx = 0
    while accuracy < threshold_accuracy:
        try:
            base_model.partial_fit(X_processed, y_processed)
        except Exception as e:
            print(e)
            base_model.partial_fit(X_processed, y_processed, classes=classes)
        y_pred = base_model.predict(X_test)
        accuracy = classification.accuracy_score(y_test, y_pred)
        print("accuracy in iteration", idx + 1, "is =", accuracy)
        idx += 1
        if idx > 10:
            break
    print("returning from train module")
    return base_model
Code example #35
    def get_training_results(self,
                             database_file,
                             dataset,
                             cross_validation=10):

        # Connect DB
        self.apk_db.connect_db(database_file)

        results = []
        # K-Fold Cross Validation
        kf = KFold(n_splits=cross_validation, shuffle=True)
        for train, test in kf.split(dataset):

            # Get training and testing dataset
            training_dataset = [dataset[i] for i in train]
            testing_dataset = [dataset[i] for i in test]

            # Fit model
            self.fit(training_dataset)

            # Predict labels for testing samples
            testing_labels, predicted_labels = self.i_predict(testing_dataset)

            # Get score
            result = {}
            result['accuracy'] = accuracy_score(testing_labels,
                                                predicted_labels,
                                                normalize=True)
            result['f-score'] = f1_score(testing_labels, predicted_labels)

            results.append(result)

        # Disconnect DB
        self.apk_db.disconnect_db()

        return results
Code example #36
def calc_fit(model, metric, train_x, train_y, test_x, test_y, p):
    # p is a boolean mask selecting the feature columns to keep
    train_x = [list(compress(x, p)) for x in train_x]
    test_x = [list(compress(x, p)) for x in test_x]
    clf = model.fit(train_x, train_y)
    predictions = clf.predict(test_x)
    if metric == 'precision':
        return precision_score(test_y, predictions, labels=[0, 1])
    elif metric == 'recall':
        return recall_score(test_y, predictions, labels=[0, 1])
    elif metric == 'accuracy':
        return accuracy_score(test_y, predictions)
    return (precision_score(test_y, predictions, labels=[0, 1])
            + recall_score(test_y, predictions, labels=[0, 1])
            + accuracy_score(test_y, predictions))
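
calc_fit treats p as a boolean mask consumed by itertools.compress, selecting which feature columns survive. A hypothetical call with toy shapes (every value below is made up):

from itertools import compress

from sklearn.linear_model import LogisticRegression

# keep feature columns 0 and 2, drop column 1
score = calc_fit(LogisticRegression(), 'accuracy',
                 [[1.0, 9.0, 3.0], [4.0, 9.0, 6.0]], [0, 1],
                 [[1.5, 9.0, 3.5]], [0],
                 [1, 0, 1])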
Code example #37
File: ProjectAnalysis.py Project: jbjorne/CAMDA2015
 def analyse(self, inDir, fileStem=None, hidden=False, tag=None, clear=True, projects=None):
     meta = self._getMeta(inDir, fileStem)
     if clear:
         meta.drop("project_analysis")
     self.predictions = None
     if "prediction" in meta.db:
         self.predictions = {x["example"]:x["predicted"] for x in meta.db["prediction"].all()}
     #print predictions
     self.grouped = {}
     for example in meta.db.query("SELECT * FROM example"):
         projectCode = example["project_code"]
         if projects and projectCode not in projects:
             continue
         self._addToProject(example, example["project_code"])
         self._addToProject(example, "all projects")
     rows = []
     for project in sorted(self.grouped.keys()):
         for setName in ("train", "hidden"):
             labels = self.grouped[project][setName]["labels"]
             groups = self.grouped[project][setName]["groups"]
             predictions = self.grouped[project][setName]["predictions"]
             row = OrderedDict([("project",project), ("setName", setName), ("tag", tag)])
             row["examples"] = len(labels)
             row["pos"] = len([x for x in labels if x > 0])
             row["neg"] = len([x for x in labels if x < 0])
             row["majority"] = None
             if row["pos"] > 0 or row["neg"] > 0:
                 row["majority"] = max(set(labels), key=labels.count)
             row["auc_baseline"] = None
             row["auc"] = None
             #row["bas_baseline"] = None
             #row["bas"] = None
             row["accuracy"] = None
             row["accuracy_baseline"] = None
             if row["pos"] > 0 and row["neg"] > 0:
                 majorityPredictions = getMajorityPredictions(labels, groups)
                 row["auc"] = aucForPredictions(labels, self.grouped[project][setName]["predictions"])
                 row["auc_baseline"] = aucForPredictions(labels, majorityPredictions)
                 #row["bas"] = balanced_accuracy_score(labels, [(-1.0 if x < 0 else 1.0) for x in predictions])
                 #row["bas_baseline"] = majorityBaseline(labels, [(-1.0 if x < 0 else 1.0) for x in majorityPredictions])
                 row["accuracy"] = accuracy_score(labels, [(-1.0 if x < 0 else 1.0) for x in predictions])
                 row["accuracy_baseline"] = accuracy_score(labels, [(-1.0 if x < 0 else 1.0) for x in majorityPredictions])
             rows.append(row)
     meta.insert_many("project_analysis", rows, True)
Code example #38
def dbpedia_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    X, X_, y, y_ = dbpedia_train_test_split(sample=sample)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
Code example #39
    def test_classification(self, test, testlabel, bestmodel):
        outputtest = bestmodel.predict(test)
        accuracytest = accuracy_score(testlabel, outputtest)
        print("The accuracy for the test set is %r" % accuracytest,
              "and the confusion matrix is")
        print(confusion_matrix(outputtest, testlabel))
        print(classification_report(testlabel, outputtest))
        # probaout = bestmodel.predict_proba(test)
        # probaout = DataFrame(probaout)
        # print(probaout)
        return outputtest
Code example #40
File: main.py Project: lewellen/digit-recognizer
def estimateAccuracy(model, limit):
    asTrain, asTest = split("../data/train.csv", limit)

    model.fit(asTrain)

    testY = [x.Y for x in asTest]
    testPredictions = model.predict(asTest)

    print("%f" % (accuracy_score(testY, testPredictions)))

    print(confusion_matrix(testY, testPredictions))
Code example #41
File: estimator.py Project: bizreach/common-ml
    def score(self, X, y, sample_weight=None):
        from commonml.skchainer.classifier import Classifier
        from commonml.skchainer.regressor import Regressor
        if isinstance(self.model, Classifier):
            from sklearn.metrics import accuracy_score
            return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        elif isinstance(self.model, Regressor):
            from sklearn.metrics import r2_score
            return r2_score(y, self.predict(X), sample_weight=sample_weight,
                            multioutput='variance_weighted')
        else:
            raise ValueError('Unsupported model.')
Code example #42
File: clasificador.py Project: mikamb93/TFG
def evaluacion_train_test():

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=0)

    # Create and train the classifier
    rfc = RFC(n_estimators=100, n_jobs=-1)
    rfc.fit(X_train, y_train)

    # Predict the test labels and evaluate
    y_pred = rfc.predict(X_test)
    y_pred_proba = rfc.predict_proba(X_test)

    return accuracy_score(y_test, y_pred), rfc
Code example #43
def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train_split, test_split = next(split.split(df, df.category))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in test_docs],
                                     max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2, weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
Code example #44
def four_algorythms(algorythm, n_est):
    if algorythm == "RandomForestClassifier":
        prediction = RandomForestClassifier(n_estimators=n_est)
    if algorythm == "ExtraTreesClassifier":
        prediction = ExtraTreesClassifier(n_estimators=n_est)
    if algorythm == "AdaBoostClassifier":
        prediction = AdaBoostClassifier(n_estimators=n_est)
    if algorythm == "GradientBoostingClassifier":
        prediction = GradientBoostingClassifier(n_estimators=n_est)

    prediction = prediction.fit(train, train_y)
    validate_y = np.concatenate((np.ones(50), np.zeros(50)))
    predicted_y = prediction.predict(validate)
    return accuracy_score(validate_y, predicted_y)
Code example #45
def __print_and_log_results(clf, classifier, x_train, x_test, y_test, out_file_name,
                            args):
    probablistic_predictions = False
    if args.predict_proba:
        predict_proba_func = getattr(clf, "predict_proba", None)
        if predict_proba_func is not None:
            probablistic_predictions = True
            prob_predictions = clf.predict_proba(x_test)
            predictions = []
            pos_predictions = []
            for prediction in prob_predictions:
                pos_predictions.append(prediction[1])
                if prediction[1] > args.predict_threshold:
                    predictions.append(1)
                else:
                    predictions.append(-1)
            pos_predictions = np.array(pos_predictions)
            mean_confidence = np.mean(pos_predictions)
            max_confidence = max(pos_predictions)
            min_confidence = min(pos_predictions)
            print "Mean confidence: " + str(mean_confidence)
            print "Max confidence: " + str(max_confidence)
            print "Min confidence: " + str(min_confidence)
            predictions = np.array(predictions)
        else:
            predictions = clf.predict(x_test)
    else:
        predictions = clf.predict(x_test)
    precision = precision_score(y_test, predictions, labels=[-1, 1])
    recall = recall_score(y_test, predictions, labels=[-1, 1])
    auc_score = roc_auc_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print("Train/test set sizes: " + str(len(x_train)) + "/" + str(len(x_test)))
    print("Precision is: " + str(precision))
    print("Recall is: " + str(recall))
    print("AUC ROC Score is: " + str(auc_score))
    print("Accuracy is: " + str(accuracy))
    true_count = len([1 for p in predictions if p == 1])
    actual_count = len([1 for y in y_test if y == 1])
    print("True count (prediction/actual): " + str(true_count) + "/" + str(actual_count))

    if args.write_to_log:
        # Write out results as a table to log file
        write_log(out_file_name=out_file_name, args=args, classifier=classifier,
                    precision=precision, recall=recall,
                    true_count=true_count, actual_count=actual_count,
                    X_train=x_train, X_test=x_test,
                    auc=auc_score, accuracy=accuracy,
                    probablistic_prediction=probablistic_predictions,
                    prediction_threshold=args.predict_threshold)
Code example #46
File: RFclass.py Project: jp1989326/Quant-Ver1
    def testforest(self, test, testlabel, forest):
        outputtest = forest.predict(test)
        accuracytrain = accuracy_score(testlabel, outputtest)
        print("The size of the test set is")
        print(np.shape(test))
        print("The accuracy for the test set is %r" % accuracytrain,
              "and the confusion matrix is")
        # print(confusion_matrix(outputtest, testlabel))
        print(classification_report(testlabel, outputtest))
        # generate probability
        outputproba = forest.predict_proba(test)
        outperfor = {'prob0': outputproba[:, 0], 'prob1': outputproba[:, 1],
                     'output': outputtest, 'target': testlabel}
        outframe = DataFrame(outperfor)
        print(outframe)
        # outframe.to_csv(r'D:\allprob.csv', header=0)
        return accuracytrain, outframe
Code example #47
    def train(self, data, target, deep):
        """Run 10-fold CV to train the network, with a hidden-layer
        expansion of between 20-75%. The training algorithm is
        Stochastic Gradient Descent."""
        # 10-Fold Cross Validation
        folds = 10
        iters = 10
        kf = KFold(n_splits=folds)
        if deep:
            hiddenNodes = np.arange(data.shape[1], 2 * data.shape[1]) + 1
        else:
            hiddenNodes = np.arange(data.shape[1], 10 * data.shape[1]) + 1
        hiddenNodes = hiddenNodes[hiddenNodes > 0]
        Error_HNodes = []
        Nets_HNodes = []
        for j in hiddenNodes:
            self.setHiddenNodes([j])
            Mean_error_iter = []
            Mean_nets_iter = []
            for train_index, val_index in kf.split(data):
                X, Xval = data[train_index], data[val_index]
                T, Tval = target[train_index], target[val_index]
                Error_iter = []
                Nets_iter = []
                for i in np.arange(iters):
                    self.initialization()  # Common initializations
                    Out, H, N = self.sim(X)
                    H = H[-1]
                    self.Weights[-1] = np.dot(pinv(H), T)
                    # Validation
                    Out_val, H_val, N_val = self.sim(Xval)
                    # Store the error and the network
                    # MSE = [mean_squared_error(Tval, Out_val)]
                    # Classification error (1 - accuracy, so lower is better)
                    Error = [1 - accuracy_score(Tval, Out_val)]
                    # Error = [1 - f1_score(Tval, Out_val)]
                    Networks = [self.Weights]
                    Error_iter.append(np.min(Error))
                    Nets_iter.append(Networks[np.argmin(Error)])
                Mean_error_iter.append(np.mean(Error_iter))
                Mean_nets_iter.append(Nets_iter[np.argmin(Error_iter)])
            Error_HNodes.append(np.mean(Mean_error_iter))
            Nets_HNodes.append(Mean_nets_iter[np.argmin(Mean_error_iter)])
        self.Weights = Nets_HNodes[np.argmin(Error_HNodes)]
        Final_Error = np.min(Error_HNodes)
        selected_Nodes = hiddenNodes[np.argmin(Error_HNodes)]
        self.setHiddenNodes([selected_Nodes])
        return Final_Error
Code example #48
File: hw4.py Project: Newstoryworld/homework
def resh(classifier, x):
    if classifier == "RandomForestClassifier":
        pred = RandomForestClassifier(n_estimators=x)

    if classifier == "ExtraTreesClassifier":
        pred = ExtraTreesClassifier(n_estimators=x)

    if classifier == "AdaBoostClassifier":
        pred = AdaBoostClassifier(n_estimators=x)

    if classifier == "GradientBoostingClassifier":
        pred = GradientBoostingClassifier(n_estimators=x)

    pred = pred.fit(train, train_y)
    validate_y = np.concatenate((np.ones(146), np.zeros(112)))
    predicted_y = pred.predict(validate)
    return accuracy_score(validate_y, predicted_y)
Code example #49
File: RFclass.py Project: jp1989326/Quant-Ver1
    def trainforest(self, seed, train, trainlabel, number_trees,
                    accuracy_train_calculation=False):
        seed_of_tree = {
            'rf': RandomForestClassifier(n_estimators=number_trees, max_features=8),
            'adb': AdaBoostClassifier(n_estimators=number_trees),
            'bag': BaggingClassifier(n_estimators=number_trees, max_features=8),
            'ext': ExtraTreesClassifier(n_estimators=number_trees, max_features=8),
            'gbt': GradientBoostingClassifier(n_estimators=number_trees, max_features=8),
        }
        rawforest = seed_of_tree[seed]
        forest = rawforest.fit(train, trainlabel)
        outputtrain = forest.predict(train)

        print("The size of the training set is %r , %r"
              % (np.shape(train)[0], np.shape(train)[1]))
        if accuracy_train_calculation:
            accuracytrain = accuracy_score(trainlabel, outputtrain)
            print("The accuracy for the training set is %r" % accuracytrain)

        # print "The method is %r" % seed
        # print "The accuracy for the training set is %r" % accuracytrain, "and the confusion matrix is"
        # print confusion_matrix(outputtrain, trainlabel)
        return forest
Code example #50
def agnews_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_agnews_data(size=sample)

    input = [' '.join([title, descr]) for title, descr in zip(df.title.values, df.description.values)]
    target = df.category

    if sample:
        test_size = int(round(np.sum(2000*df.category.value_counts().values/32000)))
    else:
        test_size = 2000*4

    X, X_, y, y_ = train_test_split(input, target, stratify=target, test_size=test_size)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
Code example #51
def yelpstars_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_yelp_stars_data(size=sample)

    input = df.text
    target = df.stars

    if sample:
        test_size = floor(len(df) * 1./14)
    else:
        test_size = 10000*len(df.stars.unique())

    X, X_, y, y_ = train_test_split(input, target, stratify=target, test_size=test_size)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
Code example #52
def sogou_bwords(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_sogou_data(size=sample)

    input = [' '.join([title, content]) for title, content in zip(df.contenttitle.values, df.content.values)]
    target = df.cat_en

    if sample:
        test_size = int(round(np.sum(12000*df.cat_en.value_counts().values/102000)))
    else:
        test_size = 12000*5

    X, X_, y, y_ = train_test_split(input, target, stratify=target, test_size=test_size)

    grid = bag_words_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
Code example #53
    def fineTuning(self, data, target):
        # Once all the weights are established, fine-tuning proceeds
        epoch = 0
        Error = []
        Networks = []
        while epoch <= 10:
            Out, H, N = self.sim(data)
            H = H[-1]
            pseudoinverse = pinv(H)
            beta = np.dot(pseudoinverse, target)
            self.Weights[-1] = beta
            # Validation
            Out, H, N = self.sim(data)
            # Regression error (MSE)
            # Error.append(mean_squared_error(data, Out))
            Networks.append(self.Weights)
            # Classification error (1 - accuracy, so lower is better)
            Error.append(1 - accuracy_score(target, Out))
            # Error.append(1 - f1_score(target, Out))
            epoch += 1
        Final_Error = np.min(Error)
        self.Weights = Networks[np.argmin(Error)]
        return Final_Error
Code example #54
def __print_and_log_results(clf, classifier, x_train, x_test, y_test, out_file_name,
                            args):
    predictions = clf.predict(x_test)
    precision = precision_score(y_test, predictions, labels=[-1, 1])
    recall = recall_score(y_test, predictions, labels=[-1, 1])
    auc_score = roc_auc_score(y_test, predictions, average=None)
    accuracy = accuracy_score(y_test, predictions)
    print("Train/test set sizes: " + str(len(x_train)) + "/" + str(len(x_test)))
    print("Precision is: " + str(precision))
    print("Recall is: " + str(recall))
    print("AUC ROC Score is: " + str(auc_score))
    print("Accuracy is: " + str(accuracy))
    true_count = len([1 for p in predictions if p == 1])
    actual_count = len([1 for y in y_test if y == 1])
    print("True count (prediction/actual): " + str(true_count) + "/" + str(actual_count))

    if args.write_to_log:
        # Write out results as a table to the log file
        write_log(out_file_name=out_file_name, args=args, classifier=classifier,
                  precision=precision, recall=recall,
                  true_count=true_count, actual_count=actual_count,
                  X_train=x_train, X_test=x_test,
                  auc=auc_score, accuracy=accuracy)
コード例 #55
0
File: task0.py  Project: zhivkoplias/pythoncourse
train = np.concatenate((train_meme_dogs, train_snuffle_dogs))
validate = np.concatenate((validate_meme_dogs, validate_snuffle_dogs))

train_y = np.concatenate((np.ones(150), np.zeros(150)))

list_of_n_estimators = (10, 20, 40, 80, 100, 150, 200, 300, 400, 500, 1000)
list_of_forests = (RandomForestClassifier, ExtraTreesClassifier,
                   AdaBoostClassifier, GradientBoostingClassifier)
a = []
validate_y = np.concatenate((np.ones(39), np.zeros(39)))  # fixed validation labels
for j in list_of_forests:
    for i in list_of_n_estimators:
        forest = j(n_estimators=i).fit(train, train_y)
        predicted_y = forest.predict(validate)
        acc = accuracy_score(validate_y, predicted_y)
        print(acc)
        a.append(acc)

with open('forests_table.txt', 'w') as output_trees:
    output_trees.write('Algorithm\tn=10\tn=20\tn=40\tn=80\tn=100\tn=150\tn=200\tn=300\tn=400\tn=500\tn=1000\n')
    output_trees.write('RandomForestClassifier\t')
    for res in a[0:11]:
        output_trees.write("%s \t" % res)
    output_trees.write('\n')
    output_trees.write('ExtraTreesClassifier\t')
    for res in a[11:22]:
        output_trees.write("%s \t" % res)
    output_trees.write('\n')
    output_trees.write('AdaBoostClassifier\t')
    for res in a[22:33]:
        output_trees.write("%s \t" % res)
コード例 #56
0
# read the training set
X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features)

clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30)

print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")

# read the dev set
X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
print("Element-wise accuracy ", accuracy_score(y_dev, y_pred))
print("Mean F1-score macro   ", f1_score(y_dev, y_pred, average="macro"))

print("\nPredictions on test set")

# read the test set
X_test, _, lengths_test = load_conll(open("../resources/test.data", "r"), features)
y_pred = clf.predict(X_test, lengths_test)

print(pd.Series(y_pred).value_counts())

print("Saving predicted as a submission")

with open("submission.csv", "w") as wf:
    wf.write("id,tag\n")
    for id, tag in enumerate(list(y_pred)):
        wf.write("%s,%s\n" % (id, tag))
コード例 #57
0
def dbpedia_smallcharconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text] for text
                  in train_df[['title', 'abstract']].apply(lambda cols: u'\n'.join(cols), axis=1).values]
    bin = LabelBinarizer()

    x_train = np.array(pad_sentences(train_docs, max_length=1014, padding_word=CHAR_MAP.index(' ')))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text] for text
                 in test_df[['title', 'abstract']].apply(lambda cols: u'\n'.join(cols), axis=1).values]
    x_test = np.array(pad_sentences(test_docs, max_length=1014, padding_word=CHAR_MAP.index(' ')))  # pad with the same character as the training set
    y_test = bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    # 14 mutually exclusive classes: softmax output with categorical cross-entropy
    model.add(Dense(14, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5, validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
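char_embedding() is not defined in this snippet; given the frozen (len(CHAR_MAP) + 1) x (len(CHAR_MAP) + 1) weight matrix it is presumably a fixed one-hot lookup table. A sketch under that assumption, with a stand-in alphabet:

import numpy as np

CHAR_MAP = 'abcdefghijklmnopqrstuvwxyz0123456789 \n'  # stand-in alphabet

def char_embedding():
    # Row i is the one-hot vector for character index i; the extra
    # row covers the out-of-alphabet index len(CHAR_MAP).
    return np.eye(len(CHAR_MAP) + 1)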
コード例 #58
0
validate_y = np.concatenate((np.ones(38), np.zeros(38)))

results = open("scores.txt", "w")
results.write("n_estimators" + "\t")
for i in accuracy:  # 'accuracy' holds the n_estimators values to try (defined above)
    results.write(str(i) + "\t")
results.write("\n")

for classifier in algorithms:
    results.write(classifier.__name__ + "\t")
    for score in accuracy:
        algorithm = classifier(n_estimators=score)
        algorithm = algorithm.fit(train, train_y)

        predicted_y = algorithm.predict(validate)
        acc_score = accuracy_score(validate_y, predicted_y)
        results.write(str(acc_score) + "\t")
        print("done for " + classifier.__name__ + "with n_estimators" + str(score) + " " + str(acc_score))
    results.write("\n")


# I've got the highest accuracy score (0.8684) with AdaBoostClassifier and n_estimators=300
ada = AdaBoostClassifier(n_estimators=300)
ada_train = ada.fit(train, train_y)

predicted_y = ada_train.predict(validate)
acc_score = accuracy_score(validate_y, predicted_y)
print(acc_score)

unknown = get_images("unknown")
un_predicted = ada_train.predict(unknown)
コード例 #59
0
    X_test_img = []
    for idx in ids_test:
        product = json.loads(data[idx]['product'])
        X_test_img.append(data[idx]['image_emb'])
        description = product['Description']
        tokenized = word_tokenize(description)
        tfidf = np.zeros(vocab_size)
        for w, c in Counter(tokenized).items():
            if w in vocab:
                tfidf[vocab_dict[w]] = idfs[w] * float(c) / len(tokenized)
        X_test_txt.append(tfidf)
    X_test_txt = np.array(X_test_txt)
    X_test_img = np.array(X_test_img)
    
    # Training
    for cat, (y_train, y_test) in enumerate(zip(y_trains, y_tests)):
        lr_txt = LogisticRegression()
        lr_img = LogisticRegression()
        lr_txt.fit(X_train_txt, y_train)
        lr_img.fit(X_train_img, y_train)
        classes = lr_img.classes_
        p_txt = lr_txt.predict_proba(X_train_txt)
        p_img = lr_img.predict_proba(X_train_img)
        p = p_img + p_txt
        train_score = 100*accuracy_score(classes[p.argmax(axis=1)], y_train)    
        p_txt = lr_txt.predict_proba(X_test_txt)
        p_img = lr_img.predict_proba(X_test_img)
        p = p_img + p_txt
        test_score = 100*accuracy_score(classes[p.argmax(axis=1)], y_test)
        fwrite('Category %d:\n\tTrain score = %2.1f%%\n\tTest score = %2.1f%%\n\n' % (cat+1,train_score, test_score))
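The fusion here is score-level: each modality's predict_proba output is summed and the argmax class wins. A standalone sketch of that pattern (the function name is illustrative; both classifiers must be fit on the same label set so the probability columns align class-for-class):

def late_fusion_predict(clf_txt, clf_img, X_txt, X_img):
    # Sum per-class probabilities from the two modality-specific
    # classifiers, then pick the highest-scoring class.
    p = clf_txt.predict_proba(X_txt) + clf_img.predict_proba(X_img)
    return clf_txt.classes_[p.argmax(axis=1)]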
コード例 #60
0
# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)