Example #1
        print("count_one : ", count_one)
        print("count_zero : ", count_zero)
        df_new.to_csv('sample_data_4.csv')
        return df_new

    def show_topK(self, classifier, vectorizer, categories, K=10):
        # get_feature_names() was removed in scikit-learn 1.2; on newer
        # versions use vectorizer.get_feature_names_out() instead
        feature_names = np.asarray(vectorizer.get_feature_names())
        for i, category in enumerate(categories):
            # NOTE: coef_[0] is the only weight row of a binary linear model;
            # for a multiclass classifier this should be coef_[i] so each
            # category gets its own weights rather than the same list
            topK = np.argsort(classifier.coef_[0])[-K:]
            print("%s: %s" % (category, " ".join(feature_names[topK])))


if __name__ == '__main__':

    mf = main_file()
    cl = classification()

    df = pd.read_csv(mf.input_file_path, sep=',')
    X, y = mf.get_input_text_and_label(df)
    #mf.create_new_dataset(df)
    #sys.exit()
    '''
    X = data_preprocessing_1().process_data(X)
    X = data_preprocessing_2().process_data(X)
    X = data_preprocessing_3().preprocess_text(X)

    X = mf.remove_nan(X)

    df = mf.update_dataframe(df, X)
    df.to_csv(mf.output_file_path)
    '''

Example #2
    def basic_model(self, data, X, X_vec, labels, labelled_set, unlabelled_set,
                    n_gram, clf_name):
        skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
        #vectorizer = TfidfVectorizer(ngram_range=(1,1), use_idf=True, smooth_idf=True, norm='l2')
        #X_vec = vectorizer.fit_transform(X)
        resp_label = np.copy(labels)
        final_confusion_matrix = [[0, 0], [0, 0]]
        X_labelled = X[labelled_set]
        y_labelled = labels[labelled_set]
        X_unlabelled = X[unlabelled_set]
        data['feature_response_labels'] = -1

        #print(X_vec[labelled_set].shape)
        for train_index, test_index in skf.split(X_vec[labelled_set],
                                                 y_labelled):
            y = np.copy(labels)
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X_labelled[train_index], X_labelled[test_index]
            y_train, y_test = y_labelled[train_index], y_labelled[test_index]
            #labelled_set = train_index
            #print("y shape before ", y.shape)
            y = np.delete(y, test_index)
            cl = classification()
            print("y shape after ", y1.shape, y2.shape)
            #print("Y before ", labels)
            print("X train ", X_train.shape)
            print("X test ", X_test.shape)

            train_index_orig = labelled_set[train_index]
            test_index_orig = labelled_set[test_index]
            print("train_index_orig shape ", train_index_orig.shape)
            print("test_index_orig shape ", test_index_orig.shape)

            pipeline = Pipeline([
                # Use FeatureUnion to combine the features from subject and body
                (
                    'union',
                    FeatureUnion(
                        transformer_list=[

                            # Pipeline for pulling features from the post's subject line
                            ('deadline_ppl',
                             Pipeline([
                                 ('selector',
                                  Custom_features_2(key='deadline_weight')),
                             ])),

                            # Pipeline for standard bag-of-words model for body
                            ('text_ppl',
                             Pipeline([
                                 ('selector', Custom_features_3(key='Text')),
                                 ('tfidf',
                                  TfidfVectorizer(ngram_range=(n_gram[0],
                                                               n_gram[1]),
                                                  use_idf=True,
                                                  smooth_idf=True,
                                                  norm='l2')),
                             ])),
                        ],

                        # weight components in FeatureUnion
                        transformer_weights={
                            'deadline_ppl': 1.0,
                            'text_ppl': 1.0,
                        },
                    )),
            ])
            #vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf=True, smooth_idf=True, norm='l2')
            X_ = pipeline.fit_transform(data)
            print(X_.shape)
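            # NOTE: the slicing below assumes the rows of `data` come ordered
            # as [train, test, unlabelled]; feature_model makes this ordering
            # explicit by building `train_df` in that order before vectorizing.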

            X_train_vec = X_[0:train_index_orig.shape[0]]
            X_test = X_[train_index_orig.shape[0]:train_index_orig.shape[0] +
                        test_index_orig.shape[0]]
            X_unlabelled_vec = X_[-unlabelled_set.shape[0]:]
            y_ = np.concatenate((y_train, resp_label[unlabelled_set]), axis=0)

            print(X_train_vec.shape, X_unlabelled_vec.shape, X_test.shape)

            #final_labels, clf = semi_supervised_classification().pseudo_labelling(y, X_train, y_train, X_unlabelled, labelled_set, unlabelled_set, sample_rate)
            if (clf_name == 'EM'):
                final_labels, clf = cl.expectation_maximization(
                    X_train_vec, y_train, X_unlabelled_vec)
            elif (clf_name == 'LS'):
                final_labels, clf = cl.label_spreading(X_train_vec, y_,
                                                       X_unlabelled_vec)
            elif (clf_name == 'LP'):
                final_labels, clf = cl.label_propagation(
                    X_train_vec, y_, X_unlabelled_vec)

            #print("Y after ", labels)
            pred_labels = clf.predict(X_test)
            print("pred_labels :", pred_labels, "\tReal labels: ", y_test)
            confusion_mat = confusion_matrix(y_test,
                                             pred_labels,
                                             labels=[0, 1])
            print(confusion_mat)
            tn, fp, fn, tp = confusion_mat.ravel()
            print(tn, fp, fn, tp)
            final_confusion_matrix[0][0] += tn
            final_confusion_matrix[0][1] += fp
            final_confusion_matrix[1][0] += fn
            final_confusion_matrix[1][1] += tp

            print("Final confiusion matrix ", final_confusion_matrix)

        tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()

        u_precision = tp / (tp + fp)
        u_recall = tp / (tp + fn)
        u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)

        non_u_precision = tn / (tn + fn)
        non_u_recall = tn / (tn + fp)
        non_u_f1_score = 2 * non_u_precision * non_u_recall / (
            non_u_precision + non_u_recall)

        accuracy = (tp + tn) / (tp + tn + fp + fn)

        return np.array(
            final_confusion_matrix
        ), u_precision, u_recall, u_f1_score, non_u_precision, non_u_recall, non_u_f1_score, accuracy
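        # u_* scores treat label 1 as the positive class, while the non_u_*
        # scores mirror them with label 0 as positive; the accumulated matrix
        # layout is [[tn, fp], [fn, tp]].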

Example #3
    def feature_model(self, data, X, X_vec, y1, y2, labelled_set,
                      unlabelled_set, n_gram, clf_name):
        skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
        #vectorizer = TfidfVectorizer(ngram_range=(1,1), use_idf=True, smooth_idf=True, norm='l2')
        #X_vec = vectorizer.fit_transform(X)
        resp_label = np.copy(y1)
        urg_labels = np.copy(y2)
        final_confusion_matrix = [[0, 0], [0, 0]]
        X_labelled = X[labelled_set]
        y_labelled = y2[labelled_set]
        X_unlabelled = X[unlabelled_set]
        data['feature_response_labels'] = -1

        #print(X_vec[labelled_set].shape)
        for train_index, test_index in skf.split(X_vec[labelled_set],
                                                 y_labelled):
            #y = np.copy(labels)
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X_labelled[train_index], X_labelled[test_index]
            y_train, y_test = y_labelled[train_index], y_labelled[test_index]
            #labelled_set = train_index
            #print("y shape before ", y.shape)
            y1 = np.delete(y1, test_index)
            y2 = np.delete(y2, test_index)
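            # NOTE: reassigning y1/y2 shrinks them cumulatively across folds;
            # if a per-fold copy was intended (as in basic_model, which
            # re-copies labels on each iteration), take np.copy inside the loop.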
            cl = classification()
            print("y shape after ", y1.shape, y2.shape)
            #print("Y before ", labels)
            print("X train ", X_train.shape)
            print("X test ", X_test.shape)

            clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
            #clf = IsolationForest(max_samples=100, random_state = np.random.RandomState(42), contamination='auto')
            ppl = Pipeline([
                # Use FeatureUnion to combine the features from subject and body
                (
                    'union',
                    FeatureUnion(
                        transformer_list=[
                            ('Custom_features_ppl',
                             Pipeline([
                                 ('selector', Custom_features()),
                             ])),

                            # Pipeline for standard bag-of-words model for body
                            ('text_ppl',
                             Pipeline([
                                 ('tfidf',
                                  TfidfVectorizer(ngram_range=(n_gram[0],
                                                               n_gram[1]),
                                                  use_idf=True,
                                                  smooth_idf=True,
                                                  norm='l2')),
                             ])),
                        ],
                        # weight components in FeatureUnion
                        transformer_weights={
                            'Custom_features_ppl': 1.0,
                            'text_ppl': 1.0,
                        },
                    )),
                #('to_dense', DenseTransformer()),
                ('clf', clf)
            ])
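            # Stage 1: the one-class model is fit only on this fold's labelled
            # training texts, then flags unlabelled messages that resemble
            # them; the flags become the 'feature_response_labels' column
            # consumed by the second pipeline below.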

            lab = data['Response_needed']
            unique, count = np.unique(lab, return_counts=True)
            #print(dict(zip(unique,count)))

            ppl.fit(X_train)
            y_pred = ppl.predict(X_unlabelled)

            filtered_index_orig_one = unlabelled_set[np.where(y_pred == 1)[0]]
            print(filtered_index_orig_one.shape)
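            # OneClassSVM.predict returns +1 for inliers and -1 for outliers,
            # so this keeps the unlabelled indices judged similar to the
            # training texts.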

            y_response_label = np.concatenate(
                (filtered_index_orig_one, labelled_set), axis=0)
            print("Shapes ", y1.shape[0] + test_index.shape[0])
            response_labels = []
            for i in range(data.shape[0]):
                if i in y_response_label:
                    response_labels.append(1)
                else:
                    response_labels.append(0)

            #print(response_labels)
            p = data['feature_response_labels']
            unique, count = np.unique(p, return_counts=True)
            print(dict(zip(unique, count)))

            response_labels = pd.Series(response_labels)
            print(response_labels.shape)

            #train_df_clf2.iloc[:,28] = combined_response_labels
            data = data.assign(feature_response_labels=response_labels.values)

            train_index_orig = labelled_set[train_index]
            test_index_orig = labelled_set[test_index]
            print("train_index_orig shape ", train_index_orig.shape)
            print("test_index_orig shape ", test_index_orig.shape)
            combined_train_index_orig = np.concatenate(
                (train_index_orig, test_index_orig, unlabelled_set), axis=0)
            print("combined_train_index_orig shape ",
                  combined_train_index_orig.shape)

            train_df = data.iloc[combined_train_index_orig, :]
            print(train_df.shape)

            pipeline = Pipeline([
                # Use FeatureUnion to combine the features from subject and body
                (
                    'union',
                    FeatureUnion(
                        transformer_list=[

                            # Pipeline for pulling features from the post's subject line
                            ('deadline_ppl',
                             Pipeline([
                                 ('selector',
                                  Custom_features_2(key='deadline_weight')),
                             ])),
                            ('response_label_ppl',
                             Pipeline([
                                 ('selector',
                                  Custom_features_2(
                                      key='feature_response_labels')),
                             ])),

                            # Pipeline for standard bag-of-words model for body
                            ('text_ppl',
                             Pipeline([
                                 ('selector', Custom_features_3(key='Text')),
                                 ('tfidf',
                                  TfidfVectorizer(ngram_range=(n_gram[0],
                                                               n_gram[1]),
                                                  use_idf=True,
                                                  smooth_idf=True,
                                                  norm='l2')),
                             ])),
                        ],

                        # weight components in FeatureUnion
                        transformer_weights={
                            'deadline_ppl': 1.0,
                            'response_label_ppl': 1.0,
                            'text_ppl': 1.0,
                        },
                    )),
            ])
            #vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf=True, smooth_idf=True, norm='l2')
            X_ = pipeline.fit_transform(train_df)
            print(X_.shape)

            X_train_vec = X_[0:train_index_orig.shape[0]]
            X_test = X_[train_index_orig.shape[0]:train_index_orig.shape[0] +
                        test_index_orig.shape[0]]
            X_unlabelled_vec = X_[-unlabelled_set.shape[0]:]

            print(X_train_vec.shape, X_unlabelled_vec.shape, X_test.shape)
            y_ = np.concatenate((y_train, resp_label[unlabelled_set]), axis=0)
            #final_labels, clf = semi_supervised_classification().pseudo_labelling(y, X_train, y_train, X_unlabelled, labelled_set, unlabelled_set, sample_rate)
            if (clf_name == 'EM'):
                final_labels, clf = cl.expectation_maximization(
                    X_train_vec, y_train, X_unlabelled_vec)
            elif (clf_name == 'LS'):
                final_labels, clf = cl.label_spreading(X_train_vec, y_,
                                                       X_unlabelled_vec)
            elif (clf_name == 'LP'):
                final_labels, clf = cl.label_propagation(
                    X_train_vec, y_, X_unlabelled_vec)

            #print("Y after ", labels)
            pred_labels = clf.predict(X_test)
            print("pred_labels :", pred_labels, "\tReal labels: ", y_test)
            confusion_mat = confusion_matrix(y_test,
                                             pred_labels,
                                             labels=[0, 1])
            print(confusion_mat)
            tn, fp, fn, tp = confusion_mat.ravel()
            print(tn, fp, fn, tp)
            final_confusion_matrix[0][0] += tn
            final_confusion_matrix[0][1] += fp
            final_confusion_matrix[1][0] += fn
            final_confusion_matrix[1][1] += tp

            print("Final confiusion matrix ", final_confusion_matrix)

        tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()

        u_precision = tp / (tp + fp)
        u_recall = tp / (tp + fn)
        u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)

        non_u_precision = tn / (tn + fn)
        non_u_recall = tn / (tn + fp)
        non_u_f1_score = 2 * non_u_precision * non_u_recall / (
            non_u_precision + non_u_recall)

        accuracy = (tp + tn) / (tp + tn + fp + fn)

        return np.array(
            final_confusion_matrix
        ), u_precision, u_recall, u_f1_score, non_u_precision, non_u_recall, non_u_f1_score, accuracy

Example #4
 def pseudo_labelling(self, final_y, X_train, y_train, X_test, labelled_set, unlabelled_set, sample_rate, clf=None):
 #def pseudo_labelling(self, X, y, X_train, y_train, X_test, X_orig):
     
     if(-1 not in final_y):
         return final_y, clf
     
     # NOTE: the sample_rate argument is shadowed here by self.sample_rate,
     # which is what actually controls the batch size per pass
     num_of_samples = math.ceil(len(X_train) * self.sample_rate)
     print("num_of_samples : ", num_of_samples)
     #print("Y Labels: ", final_y, final_y.shape)
     #print("X_train ", X_train.shape)
     #print("y_train ", y_train, y_train.shape)
     #print("x_test ", X_test.shape)
     #print("labelled set : ", labelled_set, labelled_set.shape)
     #print("unlabelled set : ", unlabelled_set, unlabelled_set.shape)
     
     cl = classification()
     predicted_labels, prediction_confidence, clf = cl.linear_svc(X_train, y_train, X_test)
     #sys.exit()
     #print(predicted_labels, predicted_labels.shape)
     #print("Prediction_confidence_before along with predicted labels: ", prediction_confidence, "\t", predicted_labels)
     prediction_confidence = self.normalization(prediction_confidence)
     pred_conf_sorted = np.argsort(np.absolute(prediction_confidence))
     p_index = pred_conf_sorted[-num_of_samples:]
     #print("Prediction_confidence : \n", prediction_confidence)
     #print(pred_conf_sorted, "\n", p_index, "\n",  prediction_confidence[p_index])
     #print(unlabelled_set.shape)
     
     '''
     deadline_values = metadata().calculate_deadline_weight(unlabelled_set)
     deadline_val_sorted = np.argsort(np.absolute(deadline_values))
     d_index = deadline_val_sorted[-num_of_samples:]
     '''
     #print("deadline values : \n", deadline_values, "\n", deadline_val_sorted, "\n", d_index, "\n", deadline_values[d_index])
     #pred_conf = prediction_confidence + 0 * deadline_values
     pred_conf = prediction_confidence
     #print("Combined \n:", pred_conf)
     pseudo_labels, pseudo_labelled_indices = self.compute_final_label(pred_conf, num_of_samples)
     
     '''
     sorted_indices = np.argsort(np.absolute(pred_conf))
     print("Sorted indices: ", sorted_indices)
     
     #print("prediction confidence ",prediction_confidence[sorted_indices[-num_of_samples:]])
     
     pseudo_labelled_indices = sorted_indices[-num_of_samples:]
     print("pseudo_labelled_indices :", pseudo_labelled_indices)
     #sys.exit()
     '''
     
     
     
     new_train_X = []
     new_train_y = []
     #unlabelled_indices = unlabelled_set.copy()
     for ind in pseudo_labelled_indices:
         #print("Index ", ind)
         #print("unlabelled_indices :", unlabelled_set)
         delete_orig_index = unlabelled_set[ind]
         #print("Delete index : ", delete_orig_index)
         if(final_y[delete_orig_index] == -1):
             final_y[delete_orig_index] = pseudo_labels[ind]
             new_train_y.append(pseudo_labels[ind])
             new_train_X.append(X_test[ind])
             labelled_set = np.append(labelled_set, delete_orig_index)
             #unlabelled_set = np.delete(unlabelled_set, ind, axis = 0)
         else:
             print("Value already been updated : ", delete_orig_index, final_y[delete_orig_index])
             sys.exit()
     unlabelled_set = np.delete(unlabelled_set, pseudo_labelled_indices, axis = 0)    
     new_train_X = np.array(new_train_X)
     new_train_y = np.array(new_train_y)
     #print("New train X : ", new_train_X, new_train_X.shape)
     #print("New train Y : ", new_train_y, new_train_y.shape)
     X_train = np.concatenate((X_train, new_train_X), axis = 0)
     y_train = np.concatenate((y_train, new_train_y), axis = 0)
     X_test = np.delete(X_test, pseudo_labelled_indices, axis=0)
     print()
     return self.pseudo_labelling(final_y, X_train, y_train, X_test, labelled_set, unlabelled_set, sample_rate, clf)    
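     # The recursion terminates once no -1 entries remain in final_y; each
     # pass pseudo-labels the most confident predictions (num_of_samples of
     # them) and moves them from the unlabelled to the labelled set.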

Example #5
 def __init__(self):
     self.x = 0
     self.cl = classification()

Example #6
    def skfold_cv(self, X1, y1, X2, y2, response_labels, labelled_set, unlabelled_set, ppl, data, ngrams, semi_clf):
        
        skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
        labels = np.copy(y2)
        final_confusion_matrix = [[0,0],[0,0]] 
        X_labelled = X2[labelled_set]
        y_labelled = y2[labelled_set]
        X_unlabelled = X2[unlabelled_set]
        y_unlabelled = y2[unlabelled_set]
        i = 1
        for train_index, test_index in skf.split(X_labelled, y_labelled):
            print("Cross Validation iteration #",i)
            i+=1
            y2 = np.copy(labels)
            #print("TRAIN:", train_index, "TEST:", test_index)
            print("Train_index_shape ", train_index.shape, "\t Test index shape ", test_index.shape)
            X_train, X_test = X_labelled[train_index], X_labelled[test_index]
            y_train, y_test = y_labelled[train_index], y_labelled[test_index]
            response_labels_train = response_labels[train_index]
            response_labels_test = response_labels[test_index]
            
            X_train_clf1 = np.concatenate((X1, X_train),axis=0)
            y_train_clf1 = np.concatenate((y1, response_labels_train),axis=0)
            y_train_clf1 = y_train_clf1.astype(int)
            
            #labelled_set = train_index
            print("y shape before ", y2.shape)
            y2 = np.delete(y2, test_index)
            print("y shape after ", y2.shape)
            sample_rate=0.2
            
            #unique, counts = np.unique(y_train_clf1, return_counts=True)
            #print(dict(zip(unique, counts)))
            
            ppl.fit(X_train_clf1, y_train_clf1)
            y_test_pred = ppl.predict(X_test)
            #print(y_test_pred, "\n", y_test_pred.shape)
            
            y_unlabelled_pred = ppl.predict(X_unlabelled)
            print(y_unlabelled_pred, y_unlabelled_pred.shape)
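            # The first classifier (ppl) is trained on the auxiliary set X1/y1
            # plus this fold's training texts; its predictions on the
            # unlabelled pool supply the response_required_label feature
            # assembled below.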
            
            cl = classification()
            train_index_orig = labelled_set[train_index]
            test_index_orig = labelled_set[test_index]
            
            #Combining predicted response labels with the original ones to pass as a feature for vectorization
            combined_train_index_orig = np.concatenate((train_index_orig, test_index_orig, unlabelled_set),axis=0)
            response_label_pred = y_unlabelled_pred
            combined_response_labels =  np.concatenate((response_labels_train, response_labels_test, response_label_pred),axis=0)
            print(response_label_pred.shape, response_labels_train.shape)
            print(combined_train_index_orig.shape,combined_response_labels.shape)
            train_df_clf2 = data.iloc[combined_train_index_orig,:]
            
            #combined_response_labels = np.transpose(np.matrix(combined_response_labels))      
            combined_response_labels = pd.Series(combined_response_labels)
            response_required_label = combined_response_labels
            
            print("Shape before ", train_df_clf2.shape, combined_response_labels.shape)
            train_df_clf2 = train_df_clf2.assign(response_required_label= response_required_label.values)
            
            #print(dict(zip(combined_train_index_orig, combined_response_labels)))
            #print(train_df_clf2.iloc[130:180,])
            
            pipeline = Pipeline([
                # Use FeatureUnion to combine the features from subject and body
                ('union', FeatureUnion(
                    transformer_list=[
            
                        # Pipeline for pulling features from the post's subject line
                        ('deadline_ppl', Pipeline([
                            ('selector', Custom_features_2(key = 'deadline_weight')),
                        ])),
                        
                        ('response_label_ppl', Pipeline([
                            ('selector', Custom_features_2(key = 'response_required_label')),
                        ])),            
                        
                        # Pipeline for standard bag-of-words model for body
                        ('text_ppl', Pipeline([
                            ('selector', Custom_features(key = 'Text')),
                            ('tfidf',  TfidfVectorizer(ngram_range = ngrams, use_idf=True, smooth_idf=True, norm='l2')),
                        ])),
            
                    ],
            
                    # weight components in FeatureUnion
                    transformer_weights={
                        'deadline_ppl': 1.0,
                        'response_label_ppl':1.0,
                        'text_ppl': 1.0,
                    },
                )),
            ])
                
            X_vec = pipeline.fit_transform(train_df_clf2)
            #print(X_vec, X_vec.shape)
                        
            '''
            vectorizer = TfidfVectorizer(ngram_range=(1,3), norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
            X_vec = vectorizer.fit_transform(X)
            #print("Vec torized_text \n", X_vec)
            print(X_vec.shape)
            '''
            
            X_train_vec = X_vec[0:train_index.shape[0]]
            X_test_vec = X_vec[train_index.shape[0]:(train_index.shape[0]+test_index.shape[0])]
            X_unlabelled_vec = X_vec[-X_unlabelled.shape[0]:]
            print(X_vec.shape, X_train_vec.shape, X_unlabelled_vec.shape, X_test_vec.shape)  
            
            '''
            X_unlabelled_vec = X_vec[0: X_unlabelled.shape[0]]
            X_labelled_vec = X_vec[-X_labelled.shape[0]:]
            X_train_vec = X_labelled_vec[train_index]
            X_test_vec = X_labelled_vec[test_index]
            print(X_unlabelled_vec.shape, X_labelled_vec.shape)
            '''
            #print(X_unlabelled_vec.shape, X_labelled_vec.shape, y_train.shape)
            #print("XYZZZZZ \n", X_unlabelled_vec[0])
            #predicted_labels, prediction_confidence, clf = cl.linear_svc(X_train, y_train, X_test)
            y_ = np.concatenate((y_train, y_unlabelled), axis=0)  
            if(semi_clf == 'LS'):
                predicted_labels, clf = cl.label_spreading(X_train_vec, y_, X_unlabelled_vec)
            elif(semi_clf == 'EM'):
                predicted_labels, clf = cl.expectation_maximization(X_train_vec, y_train, X_unlabelled_vec)
            #print("final_labels :", predicted_labels, predicted_labels.shape)
            unique, counts = np.unique(predicted_labels, return_counts=True)
            print("Predicted label summary ", dict(zip(unique, counts)))
            y_pred = clf.predict(X_test_vec)
            #print(classification_report(y_test, y_pred))
            #print("Accuracy ", accuracy_score(y_test, y_pred))
            #print(sklearn.metrics.confusion_matrix(y_test, y_pred))
            print("pred_labels :", y_pred, "\tReal labels: ", y_test)
            confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
            print(confusion_matrix)
            print("Type is ", type(confusion_matrix))
            tn, fp, fn, tp = confusion_matrix.ravel()
            #print(tn, fp, fn, tp)
            final_confusion_matrix[0][0] += tn
            final_confusion_matrix[0][1] += fp
            final_confusion_matrix[1][0] += fn
            final_confusion_matrix[1][1] += tp

            #print("Final confiusion matrix ", final_confusion_matrix)  
        
        #tn, fp, fn, tp = final_confusion_matrix[0][0], final_confusion_matrix[0][1], final_confusion_matrix[1][0], final_confusion_matrix[1][1]
        tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()
            
        u_precision = tp/(tp + fp)
        u_recall = tp/(tp + fn)
        u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)
        
        non_u_precision = tn/(tn + fn)
        non_u_recall = tn/(tn + fp)
        non_u_f1_score = 2 * non_u_precision * non_u_recall / (non_u_precision + non_u_recall)
        
        
        accuracy = (tp + tn)/(tp + tn + fp + fn)
        
        return np.array(final_confusion_matrix), u_precision, u_recall, u_f1_score, non_u_precision, non_u_recall, non_u_f1_score, accuracy
        '''
            
            
            #unique, counts = np.unique(y_unlabelled_pred, return_counts=True)
            #print(dict(zip(unique, counts)))
            #sys.exit()
            
            
            final_labels, clf = semi_supervised_classification().pseudo_labelling(y, X_train, y_train, X_unlabelled, labelled_set, unlabelled_set, sample_rate)
            #final_labels, clf = self.cl.expectation_maximization(X_train, y_train, X_unlabelled)
            #final_labels, clf = self.cl.label_spreading(X_train, y, X_unlabelled)
            print("Y after ", labels)
            pred_labels = clf.predict(X_test)
            print("pred_labels :", pred_labels, "\tReal labels: ", y_test)
            print(self.classification_rep(X_train, y_train, clf))
            confusion_matrix = self.confusion_mat(X_test, y_test, clf)
            print(confusion_matrix)
            tn, fp, fn, tp = confusion_matrix.ravel()
            print(tn, fp, fn, tp)
            final_confusion_matrix[0][0] += tn
            final_confusion_matrix[0][1] += fp
            final_confusion_matrix[1][0] += fn
            final_confusion_matrix[1][1] += tp

            print("Final confiusion matrix ", final_confusion_matrix)  
        
        tn, fp, fn, tp = final_confusion_matrix[0][0], final_confusion_matrix[0][1], final_confusion_matrix[1][0], final_confusion_matrix[1][1]
        overall_precision = tp/(tp + fp)
        overall_recall = tp/(tp + fn)
        overall_accuracy = (tp + tn)/(tp + tn + fp + fn)
        overall_f1_score = 2 * overall_precision * overall_recall / (overall_precision + overall_recall)
        return np.array(final_confusion_matrix), overall_precision, overall_recall, overall_accuracy, overall_f1_score
            
         '''