Code example #1
def version2():  # Data cleaning for the NLP model
    corpus = []
    stop_words = set(sw.words('english'))  # build the stop-word set once, not per word
    stemmer = ps()  # create the stemmer once, outside the loop

    for i in range(len(df)):  # originally hard-coded as range(0, 527383)
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])  # keep only letters in each review
        review = review.lower().split()
        review = [stemmer.stem(word) for word in review if word not in stop_words]
        corpus.append(" ".join(review))

    labels = df.iloc[:, -1]

    # Split the raw text first, then vectorize; the original passed 100
    # positionally, assumed here to be the random seed.
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)

    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)  # fit on training text only
    features_test_vectorized = vectorizer.transform(features_test)  # reuse the fitted vocabulary

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))  # ROC AUC score
    print(cm(labels_test, predictions))  # confusion matrix

    return model
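The fix above hinges on reusing one fitted CountVectorizer: transform() only works on a fitted instance, and the test text must be encoded with the training vocabulary (the original here and in example #5 created a second, unfitted vectorizer). A minimal, self-contained sketch of the pattern with toy data, using the standard scikit-learn API:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

docs = ["good product", "bad product", "good value", "bad value"]
labels = [1, 0, 1, 0]
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, labels, test_size=0.5, random_state=100)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train)  # learn the vocabulary from training text only
X_test = vectorizer.transform(docs_test)        # encode test text with the same vocabulary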
Code example #2
def loop_through_csv(df):
    FPR = []
    TPR = []
    n = 1  # n stores the index of the current run
    # the for loop steps through points in the data set
    for i in range(1, 29, 3):
        y_pred = set_pred(df, i).copy()  # sets the predicted values
        y_true = set_true(df, i).copy()  # sets the true values
        # print which run this is
        print("Run " + str(n))
        # use sklearn to check that the matrix and summary are correct
        print("Confusion matrix with summary, using sklearn as a check")
        cfm = cm(y_true, y_pred)  # compute once, print below
        print(cfm)
        print(classification_report(y_true, y_pred))
        # the same summary computed without the library
        print("Confusion matrix with summary, calculated without sklearn")
        print(pd.crosstab(y_true, y_pred))
        # print the labeled confusion matrix with margins
        print(
            pd.crosstab(y_true,
                        y_pred,
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True))
        # set_plot returns the four confusion-matrix cell counts
        (TP, FP, TN, FN) = set_plot(cfm, n)
        print_metrics(TP, FP, TN, FN)
        FPRm, TPRm = return_rates(TP, FP, TN, FN)
        FPR.append(FPRm)
        TPR.append(TPRm)
        n = n + 1

    return FPR, TPR
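set_plot and return_rates are project-specific helpers; assuming binary labels, here is a self-contained sketch of how the appended FPR/TPR values can be derived from a confusion matrix (an assumption about what return_rates computes):

from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
FPRm = fp / (fp + tn)  # false positive rate
TPRm = tp / (tp + fn)  # true positive rate (recall)
print(FPRm, TPRm)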
Code example #3
def train_predict(classifier, sample_size, X_train, X_test, y_train, y_test, typ):
    
    # inputs:
    #   classifier: the learning algorithm to be trained and predicted on
    #   sample_size: the number of samples to be drawn from the training set
    #   X_train: features training set
    #   y_train: Activity_number_ID training set
    #   X_test: features testing set
    #   y_test: Activity_number_ID testing set
    #   typ: data-set variant (1, 2 or 3) that selects the confusion-matrix labels
    
    # Empty dictionary that will hold all dataframes and info related to training and testing.
    results = {}
    
    # Fitting the classifier to the training data using slicing with 'sample_size'
    start = timer()  # get start time
    classifier = classifier.fit(X_train[0:sample_size, :], y_train[0:sample_size])  # fitting the classifier
    end = timer() # Get end time
    
    # Calculate the training time
    results['train_time'] = end-start
        
    # Get the predictions on the test set(X_test),
    # then get predictions on the first 3000 training samples(X_train) using .predict()
    start = timer() # Get start time
    predictions_test = classifier.predict(X_test) # predict
    predictions_train = classifier.predict(X_train[:3000, :])
    end = timer() # Get end time
    
    # Calculate the total prediction time
    results['pred_time'] = end - start
            
    # Compute accuracy on the first 3000 training samples, i.e. y_train[:3000]
    results['acc_train'] = accuracy(y_train[:3000],predictions_train)
        
    # Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy(y_test,predictions_test)
    
    # Adapting the confusion matrix shape to the type of data used
    if typ == 1:
        confusion_matrix = cm(y_test, predictions_test, labels=[1, 2, 3, 4, 5, 6], sample_weight=None)
        columns = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD']
        index = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD']
    elif typ == 2:
        confusion_matrix = cm(y_test, predictions_test, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], sample_weight=None)
        columns = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD', 'St-Si', 'Si-St', 'Si-Li', 'Li-Si', 'St-Li', 'Li-St']
        index = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD', 'St-Si', 'Si-St', 'Si-Li', 'Li-Si', 'St-Li', 'Li-St']
    elif typ == 3:
        confusion_matrix = cm(y_test, predictions_test, labels=[1, 2, 3, 4, 5, 6, 7], sample_weight=None)
        columns = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD', 'PT']
        index = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD', 'PT']
    
    if sample_size == len(X_train):  # if 100% of the training set was used
        # apply the full_confusion_matrix function to the contingency table
        confusion_matrix_df = pd.DataFrame(data=confusion_matrix, columns=columns, index=index).pipe(full_confusion_matrix)
    else:
        # create a dataframe from the contingency table
        confusion_matrix_df = pd.DataFrame(data=confusion_matrix, columns=columns, index=index)
        
    # Return the results
    return (results, confusion_matrix_df)
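The labeled-DataFrame step depends on this project's activity codes; the same idea in a self-contained sketch with toy labels (the names reuse the first three abbreviations from the snippet above):

import pandas as pd
from sklearn.metrics import confusion_matrix

y_true = [1, 2, 2, 3, 1, 3]
y_pred = [1, 2, 3, 3, 2, 3]
names = ['WK', 'WU', 'WD']  # first three activity codes from the snippet above
matrix = confusion_matrix(y_true, y_pred, labels=[1, 2, 3])
df = pd.DataFrame(matrix, index=names, columns=names)
print(df)  # rows are true activities, columns are predicted activities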
Code example #4
    def classifier(self):
        db = self.db_prepared.copy()
        db['quality_range'] = db.quality.apply(
            lambda q: 0 if q <= 4 else (1 if q <= 7 else 2))  # low / mid / high quality buckets
        db['type'] = db.type.apply(lambda q: 0 if q == 'white' else 1)
        X = db[[
            'type', 'alcohol', 'density', 'volatile acidity', 'chlorides',
            'citric acid', 'fixed acidity', 'free sulfur dioxide',
            'total sulfur dioxide', 'sulphates', 'residual sugar', 'pH'
        ]]
        y = db.quality_range
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=40)
        lr = LogisticRegression(random_state=40)
        lr.fit(X_train, y_train)
        train_accuracy = lr.score(X_train, y_train)
        test_accuracy = lr.score(X_test, y_test)
        print('One-vs-rest',
              '-' * 35,
              'Accuracy in Train Group   : {:.2f}'.format(train_accuracy),
              'Accuracy in Test  Group   : {:.2f}'.format(test_accuracy),
              sep='\n')
        predictions = lr.predict(X_test)
        score = round(accuracy_score(y_test, predictions), 3)
        cm1 = cm(y_test, predictions)
        sns.heatmap(cm1, annot=True, fmt=".0f")
        plt.xlabel('Predicted Values')
        plt.ylabel('Actual Values')
        plt.title('Accuracy Score: {0}'.format(score), size=15)
        plt.show()

        pred_test = predictions  # same predictions as above; no need to re-predict
        pred_train = lr.predict(X_train)

        confusion_matrix_train = cm(y_train, pred_train)  # kept for reference
        confusion_matrix_test = cm(y_test, pred_test)

        # NOTE: quality_range has three classes, so the matrix is 3x3; the 2x2
        # TN/TP/FP/FN breakdown below only covers the first two classes.
        TN = confusion_matrix_test[0][0]
        TP = confusion_matrix_test[1][1]
        FP = confusion_matrix_test[0][1]
        FN = confusion_matrix_test[1][0]

        print("(Total) True Negative       :", TN)
        print("(Total) True Positive       :", TP)
        print("(Total) False Positive      :", FP)
        print("(Total) False Negative      :", FN)

        print("Accuracy Score of Our Model     : ", lr.score(X_test, y_test))
        Error_Rate = 1 - (accuracy_score(y_test, pred_test))
        print("Error rate: ", Error_Rate)
Code example #5
def version1():  # Logistic Regression Model
    # The original discarded the split and passed 100 positionally; assumed to be the random seed.
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)

    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)  # reuse the fitted vocabulary

    model = lr().fit(features_train_vectorized,
                     labels_train)  # Model creation for logistic regression
    predictions = model.predict(features_test_vectorized)

    print(ras(labels_test, predictions))  # print the ROC AUC prediction score
    print(cm(labels_test, predictions))

    return model
Code example #6
def version3():  # TF-IDF model
    global vect

    # The original discarded the split and passed 100 positionally; assumed to be the random seed.
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)

    vect = TfidfVectorizer(min_df=5)
    features_train_vectorized = vect.fit_transform(features_train)
    features_test_vectorized = vect.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    print(ras(labels_test, predictions))  # ROC AUC score
    print(cm(labels_test, predictions))

    return model
Code example #7
File: feature_model.py Project: Ladvien/ladvien_ml
 def confusion_matrix_printed(self, actual_y, y_hat):
     tn, fp, fn, tp = cm(actual_y, y_hat).ravel()  # sklearn's ravel order is (tn, fp, fn, tp)
     error = fn + fp
     correct = tn + tp
     total = error + correct
     print(f'Error: {round(error / total * 100, 2)}%')
     print(f'Accuracy: {round(correct / total * 100, 2)}%')
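The unpacking order above matters: for binary labels, scikit-learn's confusion_matrix(...).ravel() yields (tn, fp, fn, tp), which a two-line check confirms:

from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]).ravel()
print(tn, fp, fn, tp)  # 1 1 0 2: one true negative, one false positive, two true positives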
Code example #8
    def RandomForest_Classifier(self):  # classification with Random Forest
        xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                           veri["sex"],
                                           test_size=0.33)

        rfc = RFC(n_estimators=10)
        rfc.fit(xtrain, ytrain)
        tahmin = rfc.predict(xtest)
        ConfMatris = cm(ytest, tahmin)  # (y_true, y_pred) so rows = actual, columns = predicted
        ConfMatris = p.DataFrame(data=ConfMatris,
                                 index=["Actual Male", "Actual Female"],
                                 columns=["Predicted Male", "Predicted Female"])
        plt.title(
            "VISUALIZATION OF CLASSIFICATION WITH THE RANDOM FOREST ALGORITHM\n"
        )
        plt.pcolormesh(ConfMatris)
        plt.show()
        print("CONFUSION MATRIX FOR CLASSIFICATION WITH THE RANDOM FOREST ALGORITHM")
        print(ConfMatris)
        dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]  # correct classifications
        yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]  # misclassifications
        print("\nNumber of correct classifications: {}"
              "\nNumber of incorrect classifications: {}".format(dogru, yanlis))
Code example #9
File: ML_snippets.py Project: roywright/ML_snippets
def cm_labeled(clf, Xtest, ytest, threshold = 0.5):
    '''Show a nicely-labeled version of the confusion matrix.'''
    return pd.DataFrame(
        cm(ytest, clf.predict_proba(Xtest)[:,1] >= threshold, labels = [1,0]), 
        columns = ['Predicted positive', 'Predicted negative'],
        index   = ['Actually positive',  'Actually negative']
    )
Code example #10
    def SupportVectorMachine(self):  # classification with the Support Vector Machine algorithm
        xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                           veri["sex"],
                                           test_size=0.33)

        supportvector = SVC(kernel="linear")
        supportvector.fit(xtrain, ytrain)
        tahmin = supportvector.predict(xtest)
        ConfMatris = cm(ytest, tahmin)  # (y_true, y_pred) so rows = actual, columns = predicted
        ConfMatris = p.DataFrame(data=ConfMatris,
                                 index=["Actual Male", "Actual Female"],
                                 columns=["Predicted Male", "Predicted Female"])
        plt.title(
            "VISUALIZATION OF CLASSIFICATION WITH THE SUPPORT VECTOR MACHINE\n"
        )
        plt.pcolormesh(ConfMatris)
        plt.show()
        print("CONFUSION MATRIX FOR THE SUPPORT VECTOR MACHINE")
        print(ConfMatris)
        dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]  # correct classifications
        yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]  # misclassifications
        print("\nNumber of correct classifications: {}"
              "\nNumber of incorrect classifications: {}".format(dogru, yanlis))
Code example #11
def acc(loader):
    accuracy = 0
    num_batches = 0
    act = np.array([])
    pred = np.array([])
    for batch in loader:
        gpu = batch.question_text.to(device).long()
        preds = bid_lstm_cnn(gpu)
        target = batch.target.numpy()
        preds = preds.cpu().detach().numpy()
        preds = np.array([np.argmax(row) for row in preds])
        total_correct = sum(target == preds)

        act = np.concatenate((act, target))
        pred = np.concatenate((pred, preds))

        accuracy += total_correct
        num_batches += 1
    avg_acc = accuracy / len(act)  # divide by the true sample count; the last batch may be smaller than batch_size
    print(avg_acc)
    formula1 = f1(act, pred)
    print(formula1)
    tn, fp, fn, tp = cm(act, pred).ravel()
    print(
        'True positives -> {}\nFalse positives -> {}\nTrue negatives -> {}\nFalse negatives -> {}\n'
        .format(tp, fp, tn, fn))
    return avg_acc, formula1
Code example #12
def error_display(result, num=1):
    y_true = result['gt_class']
    y_pred = result['pre_class']
    cmatrix = cm(y_true, y_pred)

    num_finechips = sum(cmatrix[0])
    num_flawchips = len(result) - num_finechips  # currently unused
    num_pre_finechips = sum(cmatrix[:, 0])
    num_pre_flawchips = len(result) - num_pre_finechips  # currently unused

    print('confusion matrix:\n')
    print(cmatrix)

    pres = []
    recs = []
    for i in range(4):
        # the +1 in the denominators avoids division by zero for empty classes
        precision = cmatrix[i, i] / (sum(cmatrix[i]) + 1) * 100
        recall = cmatrix[i, i] / (sum(cmatrix[:, i]) + 1) * 100
        pres.append(precision)
        recs.append(recall)
        print('precision and recall on class%d : %d%%   %d%% \n' % (i, precision, recall))

    print('total validation samples num :  %d' % len(result))
    print('mean precision : %d%%' % (np.mean(pres)))
    print('mean recall : %d%%' % (np.mean(recs)))
    print('mean ap :  %d%% ' % (np.mean(result['ap']) * 100))
    return
Code example #13
def cm_f1_test(model, test_data, test_labels):
    # Return the confusion matrix plus the two classes with the lowest F1 scores.
    test_pred = model.predict(test_data)
    scores = f1(test_labels, test_pred, average=None)
    argSort = scores.argsort()
    scores = scores[argSort]
    return cm(test_labels, test_pred), (argSort[:2], scores[:2])
Code example #14
File: cnn.py Project: q13245632/CourseGoodPractice
def train(ctx, vocab_size, num_classes, filter_num, batch_size,
          word_embed_size, training_steps, learning_rate, print_loss_every,
          confusion_matrix, keep_proba, filter_sizes, save_model):

    # Load dataset
    (x_train, y_train), (x_test, y_test) = get_dataset(ctx.train_path,
                                                       ctx.test_path)
    sequence_length = x_train.shape[1]
    dataset_size = x_train.shape[0]

    tf.reset_default_graph()
    with tf.Graph().as_default():
        cnn = TextCNN(sequence_length, vocab_size, word_embed_size,
                      filter_sizes, filter_num, num_classes)

        # Set eval feed_dict
        train_feed_dict = {
            cnn.input_x: x_train,
            cnn.input_y: y_train,
            cnn.keep_proba: 1.0
        }
        test_feed_dict = {
            cnn.input_x: x_test,
            cnn.input_y: y_test,
            cnn.keep_proba: 1.0
        }

        # Train
        saver = tf.train.Saver()
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(cnn.loss)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(training_steps):
                start = (i * batch_size) % dataset_size
                end = min(start + batch_size, dataset_size)
                feed_dict = {
                    cnn.input_x: x_train[start:end],
                    cnn.input_y: y_train[start:end],
                    cnn.keep_proba: keep_proba
                }
                sess.run(train_step, feed_dict=feed_dict)
                if i % print_loss_every == 0:
                    avg_cost = cnn.loss.eval(feed_dict=feed_dict)
                    train_acc = cnn.accuracy.eval(feed_dict=train_feed_dict)
                    test_acc = cnn.accuracy.eval(feed_dict=test_feed_dict)
                    test_pred = cnn.pred.eval(feed_dict=test_feed_dict)
                    print(f"Epoch: {i:04d} | AvgCost: {avg_cost:7.4f}", end="")
                    print(f" | Train/Test ACC: {train_acc:.3f}/{test_acc:.3f}")

            # After training, save the sess
            if save_model:
                save_path = saver.save(sess, SESS_PATH)
                print(f'Model state has been saved to {save_path}!')

        if confusion_matrix:
            binary = cm(y_true=y_test, y_pred=test_pred)
            print('\n', 'Confusion Matrix: ')
            print(binary)
            plot_confusion_matrix(binary)
            plt.show()
Code example #15
    def KNN_Classification(self):  # classification with the K-Nearest Neighbour algorithm
        xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                           veri["sex"],
                                           test_size=0.33)

        knn = KNN(n_neighbors=3)
        knn.fit(xtrain, ytrain)
        tahmin = knn.predict(xtest)
        ConfMatris = cm(ytest, tahmin)  # (y_true, y_pred) so rows = actual, columns = predicted
        ConfMatris = p.DataFrame(data=ConfMatris,
                                 index=["Actual Male", "Actual Female"],
                                 columns=["Predicted Male", "Predicted Female"])
        plt.title(
            "VISUALIZATION OF CLASSIFICATION WITH K-NN\n"
        )
        plt.pcolormesh(ConfMatris)
        plt.show()
        print("CONFUSION MATRIX FOR K-NN CLASSIFICATION")
        print(ConfMatris)
        dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]  # correct classifications
        yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]  # misclassifications
        print("\nNumber of correct classifications: {}"
              "\nNumber of incorrect classifications: {}".format(dogru, yanlis))
Code example #16
File: Palko_class.py Project: mrgtreeskier/Titanic
 def update(self, model, trnx, trny, tstx, tsty, prediction, count, con, score):
     model.fit(trnx, trny)
     acc = model.score(tstx, tsty)
     prediction[:, count] = model.predict(tstx)  # predict the outcomes
     con.append(cm(tsty, prediction[:, count]))  # record the confusion matrix for this model
     prediction[:, count] = prediction[:, count] * acc  # weight the predictions by the model's accuracy
     score[:, count] = acc
Code example #17
    def confusion_matrix(self, X, y):
        """Return a confusion matrix with format:
                      -----------
                      | TP | FP |
                      -----------
                      | FN | TN |
                      -----------
        Parameters
        ----------
        X : ndarray - raw feature data; vectorized internally
        y : ndarray - 1D array of true labels

        Returns
        -------
        ndarray - 2D
        """
        x_vectorized = self._vectorizer.transform(X)
        y_pred = self._classifier.predict(x_vectorized)
        [[tn, fp], [fn, tp]] = cm(y, y_pred)
        print '-----------'
        print '| TP | FP |'
        print '-----------'
        print '| FN | TN |'
        print '-----------'
        return np.array([[tp, fp], [fn, tn]])
Code example #18
def cosine_similar_measure(test_firingtime, y_test, a, b, c, avg_class_dist):
    # (a, b and c are accepted for interface compatibility but unused here)
    i = 0
    y_pred_val = []
    sim = []
    tot_sim = []
    for a_val in test_firingtime:
        sim = []
        for b_val in avg_class_dist:
            sim.append(
                cosine_similarity(a_val.reshape(1, len(a_val)),
                                  b_val.reshape(1, len(b_val))))
        tot_sim.append(sim)
        y_pred_val.append(np.argmax(tot_sim[i]))
        i = i + 1
    from sklearn.metrics import (precision_score, recall_score, f1_score,
                                 accuracy_score, mean_squared_error,
                                 mean_absolute_error)
    accuracy = accuracy_score(y_test, y_pred_val) * 100
    recall = recall_score(y_test, y_pred_val, average="macro")
    precision = precision_score(y_test, y_pred_val, average="macro")
    f1 = f1_score(y_test, y_pred_val, average="macro")
    print("accuracy")
    print("%.3f" % accuracy)
    print("precision")
    print("%.3f" % precision)
    print("recall")
    print("%.3f" % recall)
    print("f1score")
    print("%.3f" % f1)
    from sklearn.metrics import confusion_matrix as cm
    conf_mat = cm(y_test, y_pred_val)  # use a distinct name so the imported 'cm' is not shadowed
    print("Confusion matrix\n", conf_mat)
    return y_pred_val
Code example #19
    def print_metrics(self, predicted_output):
        """
        Print some MVP metrics. sklearn is used to calculate all of the
        metric values. The confusion-matrix values (true positive, false
        negative, false positive and true negative), precision, recall,
        f1-score and accuracy are calculated. The classification report
        includes a few other metrics, but they are not broken out here.

        We need the actual labels and the predicted labels to calculate the
        metrics. We can get the actual labels from the class variable and
        the predicted output or predicted labels are passed as a parameter
        after running each algorithm.

        :param predicted_output: Predicted labels

        """

        res = cm(self.y_test, predicted_output)
        # sklearn's binary confusion matrix is laid out [[TN, FP], [FN, TP]]
        tn = res[0][0]
        fp = res[0][1]
        fn = res[1][0]
        tp = res[1][1]
        print("Accuracy: ", acs(self.y_test, predicted_output))
        print("TP: ", tp, ", FN: ", fn, ", FP: ", fp, ", TN: ", tn)
        print(cr(self.y_test, predicted_output))
Code example #20
File: yogapose.py Project: cnzmeca/tufts
def eval(Y_true, Y_pred):
    # Average the confusion matrices over several sets of predictions (e.g. CV folds).
    cms = []
    for Y in Y_pred:
        cms.append(cm(Y_true, Y))
    cms = np.array(cms)
    cm_mean = np.mean(cms, axis=0)
    cm_std = np.std(cms, axis=0)
    return cm_mean, cm_std
Code example #21
def main():
    train_data, train_labels, test_data, test_labels = test_train_split()
    predicted_output = predict(train_data, test_data[:, :57])
    print("confusion matrix : \n", cm(test_labels, predicted_output))
    print("Recall : ", recall(test_labels, predicted_output))
    print("Accuracy:",
          accuracy_score(test_labels, predicted_output) * 100, "%")
    print("precision : ", precision_score(test_labels, predicted_output))
Code example #22
File: process.py Project: acumartini/neuralnet
def lrw():
	global log  # 'log' is reassigned below, so it must be declared global
	lw(str(clfr) + '\n')
	lw(cr(y_test, y_))
	lw('\n\n')
	lw(str(cm(y_test, y_)))
	lw('\n\n')
	log.close()
	log = open(log_file, "a")
Code example #23
def self_cm(X, y):
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    from sklearn.metrics import confusion_matrix as cm
    print(cm(y_test, gs3.predict(X_test)))  # gs3: assumed to be a fitted grid-search estimator from the surrounding scope
Code example #24
def confusion_matrix(truth, predictions):

    # I use Sklearn's confusion_matrix and add the names of the classes.
    classes = ['art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim']
    df = pd.DataFrame(cm(truth, predictions, labels=classes))
    df.index = classes
    df.columns = classes
    return df
Code example #25
def compAccu(X_dev, y_dev_df, pred_y, vocab, k=1):
    X_def_list = list(X_dev['word'])
    y_dev = list(y_dev_df['tag'])
    correct = 0
    total = 0
    unk_count = 0
    incorrect = []
    unk_incorrect = []
    unk_incorrect_label = []
    known_incorrect = []
    # print (pred_y)
    sentence_labels = []
    sentence_preds = []
    optimal = 0
    suboptimal = 0
    for word, pred, truth in zip(X_def_list, pred_y, y_dev):
        # print (pred, dev)
        if truth == '*' or truth == '<STOP>':
            continue
        # NOTE: the branches below only make sense if is_unk() returns True
        # for known (in-vocabulary) words; swap them if it returns True for OOV words.
        if not is_unk(word, vocab):
            unk_count += 1
        if pred == truth:
            correct += 1
        else:
            incorrect.append([word, pred, truth])
            if is_unk(word, vocab):
                known_incorrect.append([word, pred, truth])
            else:
                unk_incorrect.append([word, pred, truth])
                unk_incorrect_label.append(pred)
                unk_incorrect_label.append(truth)
        total += 1
        sentence_labels.append(truth)
        sentence_preds.append(pred)
        if word == '.' or word == '!' or word == '?':
            if not correct_sentence(sentence_preds, sentence_labels):
                suboptimal += 1
            else:
                optimal += 1
            sentence_labels = []
            sentence_preds = []

    accu = correct / total  # computed once, after the loop

    print(
        f"Suboptimal: {suboptimal} / {optimal + suboptimal} = {suboptimal / (optimal + suboptimal)}")

    incorrect_df = pd.DataFrame(incorrect, columns=['X', 'pred', 'truth'])
    unk_incorrect_df = pd.DataFrame(
        unk_incorrect, columns=['X', 'pred', 'truth'])
    unk_incorrect_df.to_csv('unk_incorrect.csv')
    #print (incorrect_df)
    #print (unk_incorrect_df)
    #print ('known_accu: ', 1-len(known_incorrect)/(total-unk_count))
    print('unk_accu: ', 1-len(unk_incorrect)/unk_count)
    conf = cm(unk_incorrect_df['truth'], unk_incorrect_df['pred'], labels=[
              'NN', 'NNS', 'NNP', 'NNPS'])
    np.savetxt("conf", conf, delimiter=",", fmt='%3.0f')
    return accu
Code example #26
    def confusion_matrix(self, y_true, y_pred, labels=None):
        """Implementation of the confusion matrix.
        :param y_true: numpy.array
        :param y_pred: numpy.array
        :param labels: list[str] | list[int]
        :rtype: numpy.array
        """

        return cm(y_true, y_pred, labels=labels)
Code example #27
def ACC(gt, pred):
    from sklearn.metrics import confusion_matrix as cm
    if isinstance(gt, list):
        gt = np.array(gt)
    if isinstance(pred, list):
        pred = np.array(pred)
    conf = cm(gt, pred)  # 'conf' rather than 'acc': this is a matrix, not an accuracy
    conf_norm = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]  # row-normalize: per-class recall
    aca = np.diag(conf_norm).mean()  # average class accuracy

    return conf, aca
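The aca value (mean of the row-normalized diagonal) is the macro-averaged recall, and recent scikit-learn versions (0.20+) expose the same number directly; a quick cross-check sketch:

from sklearn.metrics import balanced_accuracy_score

gt = [0, 0, 1, 1, 1]
pred = [0, 1, 1, 1, 0]
print(balanced_accuracy_score(gt, pred))  # 0.5833...: the mean of the per-class recalls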
Code example #28
def evaluate(input_dir, corpus, metrics):
    classes = [
        "NOT", "PART-OF", "INTERACTOR", "REGULATOR-POSITIVE",
        "REGULATOR-NEGATIVE"
    ]
    predictions = []
    true_classes = []
    correct_predictions = 0

    with open(corpus, "r") as data_file:
        data_list = data_file.read().splitlines()  # splitlines() avoids a trailing empty entry
    device = torch.device("cuda")

    # Load model
    model = BertForSequenceClassification.from_pretrained(
        input_dir, local_files_only=True, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained(input_dir)
    model.to(device)

    # Predict classes
    for seq in data_list:
        text = json.loads(seq)["text"]
        true_class = json.loads(seq)["custom_label"]
        input_ids = torch.tensor(
            tokenizer.encode(text)).unsqueeze(0).to(device)
        outputs = model(input_ids)

        pred_class = classes[torch.softmax(outputs.logits, dim=1).argmax()]

        if pred_class == true_class:
            correct_predictions += 1

        predictions.append(pred_class)
        true_classes.append(true_class)

    precision, recall, fscore, _ = score(true_classes,
                                         predictions,
                                         average='macro')

    # Print the classification report and confusion matrices
    print(classification_report(true_classes, predictions))
    print(
        cm(true_classes,
           predictions,
           labels=[
               "INTERACTOR", "NOT", "PART-OF", "REGULATOR-NEGATIVE",
               "REGULATOR-POSITIVE"
           ]))

    metrics["f1-score"].append(fscore)
    metrics["recall"].append(recall)
    metrics["precision"].append(precision)
    metrics["accuracy"].append(correct_predictions / len(data_list))

    return metrics
Code example #29
def conf_matrix(pred, test, tlist):
	'''Computes the confusion matrix over the predictions from the model.
	-pred: set of predictions
	-test: set of ground truth
	-tlist: list of classes.'''
	test.loc[:, 1] = pred
	test.loc[:, 0] = [tlist[i] for i in test.loc[:, 0]]
	test.loc[:, 1] = [tlist[i] for i in test.loc[:, 1]]
	classes = np.unique(test.loc[:, 0])
	conf_mat = cm(test.loc[:, 0], test.loc[:, 1], labels=classes)
	return conf_mat, classes
Code example #30
def confusion_matrix(y_true,
                     y_pred,
                     classes,
                     normalize=False,
                     title='Confusion matrix',
                     cmap=plt.cm.Blues):
    c = cm(y_true.tolist(), y_pred.tolist(), labels=classes)
    plot_confusion_matrix(c,
                          classes,
                          normalize=normalize,
                          title=title,
                          cmap=cmap)
Code example #31
File: 03CNN.py Project: monado3/HAIT
def plot_cm(y_true, y_pred):
    confmat = cm(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    plt.xticks(np.arange(0, 5, 1))  # set the x-axis tick positions
    plt.yticks(np.arange(0, 5, 1))
    plt.xlabel('predicted label')  # columns of an sklearn confusion matrix are predictions
    plt.ylabel('true label')       # rows are ground truth
    plt.show()
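On scikit-learn 1.0 or newer (an assumption about the environment), the manual matshow/text loop can be replaced by the built-in display helper, which also labels the axes correctly:

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 1, 1]
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=plt.cm.Blues)
plt.show()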
Code example #32
def objective_recall(W, X1, X2, X1_label, X2_label):
    """
    Fairness objective based on recall
    """
    weighted_sum_m_norm, weighted_sum_f_norm = prepare_normalized_weighted_sum(
        W, X1, X2)

    predictions1 = [1 if w >= 0.5 else 0 for w in weighted_sum_m_norm]
    conf_mat1 = cm(X1_label, predictions1)
    TN1, FP1, FN1, TP1 = conf_mat1.ravel()  # sklearn's ravel order is (tn, fp, fn, tp)
    recall1 = TP1 / (TP1 + FN1)

    predictions2 = [1 if w >= 0.5 else 0 for w in weighted_sum_f_norm]
    conf_mat2 = cm(X2_label, predictions2)
    TN2, FP2, FN2, TP2 = conf_mat2.ravel()
    recall2 = TP2 / (TP2 + FN2)

    ratio = get_ratio(recall1, recall2)

    return -(1 - ratio)
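Since only recall is needed here, the confusion-matrix detour can be skipped entirely; a sketch of the equivalent computation with recall_score:

from sklearn.metrics import recall_score

X1_label = [0, 1, 1, 0, 1]
predictions1 = [0, 1, 0, 0, 1]
print(recall_score(X1_label, predictions1))  # 2/3 = TP / (TP + FN) for the positive class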
Code example #33
File: main.py Project: mattdns100689/iris
	def test(self):
		score = self.model.evaluate(self.X_te,self.y_te,
					batch_size=self.bs,
					verbose=1)

		print("*"*20)
		print("Test acc of ", score[1])
		self.pred_te = self.model.predict(self.X_te).argmax(1)
		y_te = self.y_te.argmax(1)
		cm_te = cm(y_te,self.pred_te)
		print('Test cm ==> ')
		print(cm_te)
		print("*"*20)
Code example #34
File: metrics.py Project: ElbaKramer/kaggle_EEG
def kappa(y_true, y_pred):
    O = cm(y_true, y_pred)

    N = max(max(y_true), max(y_pred)) + 1
    W = np.zeros((N, N), "float32")
    for i in np.arange(N):
        for j in np.arange(N):
            W[i, j] = (i - j) ** 2
    W /= (N - 1) ** 2

    hist_true = np.bincount(y_true, minlength=N)
    hist_pred = np.bincount(y_pred, minlength=N)
    E = np.outer(hist_true, hist_pred).astype("float32") / len(y_true)

    return 1 - (np.sum(W * O) / np.sum(W * E))
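This hand-rolled quadratic-weighted kappa should agree with scikit-learn's built-in version (available since roughly 0.18), which makes a convenient cross-check:

from sklearn.metrics import cohen_kappa_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 1, 1, 0]
print(cohen_kappa_score(y_true, y_pred, weights="quadratic"))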
Code example #35
def classifier(file_name):
    review_sparse_vect, rating_sparse_vect = bag_of_words(file_name)
    # support vector classifier one vs all
    clf = SVC(C=1, kernel='linear', gamma=1, verbose=False, probability=False,
              decision_function_shape='ovr')
    clf.fit(review_sparse_vect, rating_sparse_vect)
    # model fitting completed
    # print("Fitting completed")
    predicted = cv.cross_val_predict(clf, review_sparse_vect,
                                     rating_sparse_vect, cv=10)  # 'cv' aliases the old sklearn.cross_validation module
    # calculation of metrics
    print("accuracy_score\t", acc_score(rating_sparse_vect, predicted))
    print("precision_score\t", pre_score(rating_sparse_vect, predicted))
    print("recall_score\t", rc_score(rating_sparse_vect, predicted))
    print("\nclassification_report:\n\n", cr(rating_sparse_vect, predicted))
    print("\nconfusion_matrix:\n", cm(rating_sparse_vect, predicted))
Code example #36
File: process.py Project: acumartini/neuralnet
	def benchmark(self, clf, X_train, y_train, X_test, y_test):
		output(80 * '_')

		# fit
		output("Training:")
		t0 = time()
		clf.fit(X_train, y_train)
		train_time = time() - t0
		output("train time: %0.3fs" % train_time)

		# predict
		t0 = time()
		pred = clf.predict(X_test)
		try:
			proba = clf.predict_proba(X_test)
		except Exception:  # not every classifier exposes predict_proba
			proba = None
		try:
			log_proba = clf.predict_log_proba(X_test)
		except Exception:
			log_proba = None
		test_time = time() - t0
		output("test time:  %0.3fs" % test_time)

		# get metrics for the positive class only (heavy class imbalance)
		# p_score = mlu.get_pos_precision(cm(y_test, pred))
		# r_score = mlu.get_pos_recall(cm(y_test, pred))
		# f_measure = mlu.get_f_measure(p_score, r_score)

		# get metrics
		p_scores, r_scores, f_measures, support = get_scores(y_test, pred, self.beta)
		p_score_avg = p_scores.mean()
		r_score_avg = r_scores.mean()
		f_measure_avg = f_measures.mean()
		output("precision:  %0.3f \trecall:  %0.3f" % (p_score_avg, r_score_avg))

		# output results
		output("Classification results:")
		output(cr(y_test, pred))
		output(cm(y_test, pred))

		clf_descr = str(clf).split('(')[0] # get the name of the classifier from its repr()

		return clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time, test_time, proba
Code example #37
               # (excerpt: these lines sit inside nested loops over classifiers j and CV folds i)
               print('Fold [%s]' % i)
               X_train = train_pca[train_index]
               Y_train = tr_labels_tr[train_index]
               X_cv = train_pca[cv_index]
               Y_cv = tr_labels_tr[cv_index]
               clf.fit(X_train, Y_train)
               blend_train_pca[cv_index, j] = clf.predict(X_cv)
               blend_test_pca[:, j] = clf.predict(test_pca)
          blend_test_pca[:, j] = blend_test_pca.mean(1)

blend_test_train = np.append(blend_train, blend_train_pca, axis=1)
blend_test_test = np.append(blend_test, blend_test_pca, axis=1)

bclf = LogisticRegression()
bclf.fit(blend_test_train, tr_labels_tr)
pred = bclf.predict(blend_test_test)  # predict with the blender; the original passed the fitted model itself to cm
print(cm(te_labels, pred))

## class "OTHER" adaboost:
base_clf = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=None, class_weight='auto')
n_estimators = [10, 20, 30, 40, 50]
for n in n_estimators:
     clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
     clf.fit(train_, tr_labels_tr)
     pred = clf.predict(valid_)
     print("Validation data:")
     print(cm(tr_labels_va, pred))  # print the matrix; a bare cm(...) call discards it
     print("with {n} estimators:".format(n=n))
     pred_test = clf.predict(test_)
     print("Test data:")
     print(cm(te_labels, pred_test))
Code example #38
File: pilotTest3.py Project: julian-ramos/sampling

# log.act("creating full random sample")
# randData=np.genfromtxt(randFilename,delimiter=" ")
# botsData=np.genfromtxt(botsFilename,delimiter=" ")
# print(randData.shape)
# print(botsData.shape)
# fullRandData=np.vstack((randData,botsData))
# log.sum(fullRandData.shape,"data set size")
# log.sum(np.sum(fullRandData[:,1]),"number of bots")
# log.sum(fullRandData.shape[0]-np.sum(fullRandData[:,1]),"number of people")
# np.savetxt(fullRandFilename,fullRandData,delimiter="\t",fmt="%.2f")

log.act("building classifier")
data=np.genfromtxt(fullRandFilename,delimiter="\t")
lr=LogisticRegression(penalty="l1",C=0.8)
log.prevLine()
X=np.c_[data[:,2:27],data[:,28]]
print(X.shape)
y=data[:,1]
ids = np.argwhere(np.isnan(X))  # locate NaNs; LogisticRegression cannot fit if any remain
print(ids.shape)
lr.fit(X,y)
# log.params("LogisticRegression(penalty=\"l1\",C=0.1)")
log.sum(f1_score(y, lr.predict(X)), "f1 score")  # f1_score expects (y_true, y_pred)
log.sum(cm(y, lr.predict(X)), " confusion matrix")
print(lr.get_params())
print(lr.coef_)


log.act("end")
Code example #39
def confusion_matrix(actual, prediction):
    matrix = cm(actual, prediction)

    print matrix
Code example #40
from sklearn.ensemble import RandomForestClassifier

def test_logit_classifier(X_train, X_valid, X_test, y_train, y_valid, y_test, param_c, pca=False):
     if pca:  # 'if pca = True:' was a syntax error
          pca_transform = PCA()
          X_train = pca_transform.fit_transform(X_train)
          X_valid = pca_transform.transform(X_valid)
          X_test = pca_transform.transform(X_test)
          print("--PCA-transformed data--")
     for c in param_c:
          clf = LogisticRegression(C=c, class_weight='auto', penalty='l1', dual=False)
          clf.fit(X_train, y_train)
          pred = clf.predict(X_valid)
          print("Validation data:")
          print("  parameter C = {num}".format(num=c))
          print(cm(y_valid, pred))
          pred_test = clf.predict(X_test)
          print("Test data:")  # the closing quote was missing
          print("  parameter C = {num}".format(num=c))
          print(cm(y_test, pred_test))

def test_rf_classifier(X_train, X_valid, X_test, y_train, y_valid, y_test, param_n, pca=False):
     if pca:
          pca_transform = PCA()
          X_train = pca_transform.fit_transform(X_train)
          X_valid = pca_transform.transform(X_valid)
          X_test = pca_transform.transform(X_test)
          print("--PCA-transformed data--")
     for n_estimators in param_n:
          clf = RandomForestClassifier(n_estimators=n_estimators)
          clf.fit(X_train, y_train)
Code example #41
def start_split_data(data_list):
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        while mark < len(random_list):
            for train_ele in range(0, mark):  # preceding folds become training data
                training_list.append(random_list[train_ele])
            else:  # for-else: this block runs once the loop above finishes without a break
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, int(len(random_list))):
                    training_list.append(random_list[training_element])
                    # print(training_list)
                    # fold completion
                Node.children = []
                Node.leaf_children = []
                Node.temp_children = []
                Node.new_children = []
                Node.len_training_list = len(training_list)
                Node.old_pessi_err = (node_err_cal(training_list, max_class(
                    training_list, class_column), class_column) + 1) / \
                                     Node.len_training_list
                root = Node(training_list)
                # print(root.data)
                root.node_type = 'root'
                build_tree(root)
                predicted_temp_list = []
                actual_list = []
                temp_root = dc(root)
                for test_element in test_list:
                    actual_list.append(int(test_element[class_column]))
                    found = int(class_finder(test_element, temp_root))
                    predicted_temp_list.append(found)
                    predicted_list.append(found)
                acc_list.append(
                    accuracy(actual_list, predicted_temp_list, class_column))
                break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list),len(predicted_list))
    # truncate the actual labels to match the number of predictions
    act_class_list = act_class_list[:len(predicted_list)]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)'
                  % file_name[0])
        plt.plot(fpr, tpr,
                 label='%s  AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        plt.title('ROC for %s ' % file_name[0])
        plt.plot(fpr, tpr, label='%s  AUC = %0.2f' % (formula_measure,
                                                      roc_auc))  # the duplicate plot call is dropped
        plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
Code example #42
 def update(self, model, trnx, trny, tstx, tsty):
     model.fit(trnx, trny)
     self.acc = model.score(tstx, tsty)
     prediction = model.predict(tstx)  # predict the outcomes
     self.con = cm(tsty, prediction) # creates confusion matrix
Code example #43
def main(train_file, test_file, load_method="csv", opti_method=None, maxiter=100, 
		 batch_size=-1, units=None, lmbda=0, alpha=100, beta=1000):
	"""
	Manages files and operations for the neural network model creation, training, and testing.
	@parameters: 
		load_method - the dataset file format, either "csv" or "hdf"
		opti_method - specifies the optimization method to use, "l-bfgs", "cg", or
					   None (defaults to SGD)
		maxiter - the maximum number of iterations allowed for training
		batch_size - the number of instances in each mini-batch; -1 implies batch processing
		units - a sequence of integers separated by '.' such that each integer represents 
				 the number of units in a sequence of hidden layers.
		lmbda - the regularization term
		alpha - the numerator for the learning rate schedule (relevant for SGD only)
		beta - the denominator for the learning rate schedule (relevant for SGD only)
	"""
	# open and load csv files
	if load_method == "csv":
		X_train, y_train = mlu.load_csv(train_file, True) # load and shuffle training set
		X_test, y_test = mlu.load_csv(test_file)
	elif load_method == "hdf":
		X_train, y_train = mlu.loadh(train_file, True) # load and shuffle training set
		X_test, y_test = mlu.loadh(test_file)
	else:
		raise Exception("Dataset file type not recognized: acceptable formats are 'csv' and 'hdf'.")

	# perform feature scaling
	X_train = mlu.scale_features(X_train, 0.0, 1.0)
	X_test = mlu.scale_features(X_test, 0.0, 1.0)

	# create the neural network classifier using the training data
	NNC = NeuralNetClassifier(opti_method, maxiter, batch_size, units, lmbda, alpha, beta)
	print "\nCreated a neural network classifier\n\t", NNC

	# fit the model to the loaded training data
	print "\nFitting the training data..."
	# costs, mags = NNC.fit(X_train, y_train)
	NNC.fit(X_train, y_train)

	# predict the results for the test data
	print "\nGenerating probability prediction for the test data..."
	y_pred = NNC.predict(X_test)

	### output classification results ###
	# output class prediction probability for each instance in the test set
	print "\nThe probabilities for each instance in the test set are:\n"
	for prob in NNC.predict_proba(X_test):
		print prob
	# output accuracy
	print 'Accuracy: ', mlu.compute_accuracy(y_test, y_pred)

	# output sklearn style results if the module is availble
	try:
		from sklearn.metrics import classification_report as cr
		from sklearn.metrics import confusion_matrix as cm
		print
		print "Classification results:"
		print cr(y_test, y_pred)
		print cm(y_test, y_pred)
	except ImportError:
		pass

	# save model parameters as a pickle
	NNC.save_model("NNCModel.p")
Code example #44
File: util.py Project: tgsmith61591/skutil
def report_confusion_matrix(actual, pred, return_metrics=True):
    """Return a dataframe with the confusion matrix, and a series
    with the classification performance metrics.

    Parameters
    ----------

    actual : np.ndarray, shape=(n_samples,)
        The array of actual values

    pred : np.ndarray, shape=(n_samples,)
        The array of predicted values

    return_metrics : bool, optional (default=True)
        Whether to return the metrics in a pd.Series. If False,
        index 1 of the returned tuple will be None.


    Returns
    -------

    conf : pd.DataFrame, shape=(2, 2)
        The confusion matrix

    ser : pd.Series or None
        The metrics if ``return_metrics`` else None
    """

    # ensure only two classes in each
    lens = [len(set(actual)), len(set(pred))]
    max_len = np.max(lens)
    if max_len > 2:
        raise ValueError('max classes is 2, but got %i' % max_len)

    cf = cm(actual, pred)
    # format: (col = pred, index = act)
    # array([[TN, FP],
    #        [FN, TP]])

    ser = None
    if return_metrics:
        total_pop = np.sum(cf)
        condition_pos = np.sum(cf[1, :])
        condition_neg = np.sum(cf[0, :])

        # alias the elements in the matrix
        tp = cf[1, 1]
        fp = cf[0, 1]
        tn = cf[0, 0]
        fn = cf[1, 0]

        # sums of the prediction cols
        pred_pos = tp + fp
        pred_neg = tn + fn

        acc = (tp + tn) / total_pop  # accuracy
        tpr = tp / condition_pos  # sensitivity, recall
        fpr = fp / condition_neg  # fall-out
        fnr = fn / condition_pos  # miss rate
        tnr = tn / condition_neg  # specificity
        prev = condition_pos / total_pop  # prevalence
        plr = tpr / fpr  # positive likelihood ratio, LR+
        nlr = fnr / tnr  # negative likelihood ratio, LR-
        dor = plr / nlr  # diagnostic odds ratio
        prc = tp / pred_pos  # precision, positive predictive value
        fdr = fp / pred_pos  # false discovery rate
        fomr = fn / pred_neg  # false omission rate
        npv = tn / pred_neg  # negative predictive value

        # define the series
        d = {
            'Accuracy': acc,
            'Diagnostic odds ratio': dor,
            'Fall-out': fpr,
            'False discovery rate': fdr,
            'False Neg. Rate': fnr,
            'False omission rate': fomr,
            'False Pos. Rate': fpr,
            'Miss rate': fnr,
            'Neg. likelihood ratio': nlr,
            'Neg. predictive value': npv,
            'Pos. likelihood ratio': plr,
            'Pos. predictive value': prc,
            'Precision': prc,
            'Prevalence': prev,
            'Recall': tpr,
            'Sensitivity': tpr,
            'Specificity': tnr,
            'True Pos. Rate': tpr,
            'True Neg. Rate': tnr
        }

        ser = pd.Series(data=d)
        ser.name = 'Metrics'

    # create the DF
    conf = pd.DataFrame.from_records(data=cf, columns=['Neg', 'Pos'])
    conf.index = ['Neg', 'Pos']

    return conf, ser
Code example #45
File: metrics.py Project: ElbaKramer/kaggle_EEG
def confusion(y_true, y_pred):
    return cm(y_true, y_pred)