Example #1
0
    def test(self):
        """Train the segment CNN on the full training set, then either write
        test-set predictions (BRAT workflow) or print evaluation metrics."""
        # Training-data segments
        pre_tr = self.data_model.preceding
        mid_tr = self.data_model.middle
        suc_tr = self.data_model.succeeding
        con1_tr = self.data_model.concept1
        con2_tr = self.data_model.concept2
        train_labels = self.data_model.train_label

        binary_train_y = self.data_model.binarize_labels(train_labels, True)

        # Test-data segments
        pre_te = self.data_model.test_preceding
        mid_te = self.data_model.test_middle
        suc_te = self.data_model.test_succeeding
        con1_te = self.data_model.test_concept1
        con2_te = self.data_model.test_concept2
        track_te = self.data_model.test_track

        if not self.data_model.write_Predictions:
            # Gold test labels only exist (and are only needed) when we
            # evaluate rather than write predictions.
            binary_test_y = self.data_model.binarize_labels(
                self.data_model.y_test, True)

        labels = [str(c) for c in self.data_model.encoder.classes_]

        cnn_model = self.build_segment_cnn(
            len(self.data_model.encoder.classes_))
        cnn_model.fit([pre_tr, mid_tr, suc_tr, con1_tr, con2_tr],
                      binary_train_y,
                      epochs=self.epochs,
                      batch_size=self.batch_size)

        test_inputs = [pre_te, mid_te, suc_te, con1_te, con2_te]
        if self.data_model.write_Predictions:
            predictions = evaluate.predict_test_only(cnn_model, test_inputs,
                                                     labels)
            # save files in numpy to write predictions in BRAT format
            np.save('track', np.array(track_te))
            np.save('pred', np.array(predictions))
            Predictions(self.initial_predictions, self.final_predictions,
                        self.write_No_rel)
        else:
            y_pred, y_true = evaluate.predict(cnn_model, test_inputs,
                                              binary_test_y, labels)
            print(classification_report(y_true, y_pred, labels=labels))
            print(confusion_matrix(y_true, y_pred))
Example #2
0
    def test(self):
        """
        Train - Test - Split evaluation (or prediction writing) for the
        sentence-level model.
        """
        train_x = self.data_model.train
        train_y = self.data_model.train_label
        binary_train_y = self.data_model.binarize_labels(train_y, True)

        labels = [str(c) for c in self.data_model.encoder.classes_]

        test_x = self.data_model.x_test
        test_track = self.data_model.test_track
        if not self.data_model.write_Predictions:
            # Gold labels only needed when evaluating, not when writing
            # predictions for unlabeled data.
            binary_test_y = self.data_model.binarize_labels(
                self.data_model.y_test, True)

        cnn_model = self.model_without_Label(
            len(self.data_model.encoder.classes_))

        if self.data_model.multilabel:
            cnn_model.fit(train_x,
                          binary_train_y,
                          epochs=self.epochs,
                          batch_size=self.batch_size)
            y_pred, y_true = evaluate.multilabel_predict(
                cnn_model, test_x, binary_test_y)
            print(classification_report(y_true, y_pred, target_names=labels))
        else:
            cnn_model = self.fit_Model(cnn_model, train_x, binary_train_y)
            if self.data_model.write_Predictions:
                predictions = evaluate.predict_test_only(
                    cnn_model, test_x, labels)
                # save files in numpy to write predictions in BRAT format
                np.save('track', np.array(test_track))
                np.save('pred', np.array(predictions))
                Predictions(self.initial_predictions, self.final_predictions,
                            self.write_No_rel)
            else:
                y_pred, y_true = evaluate.predict(cnn_model, test_x,
                                                  binary_test_y, labels)
                print(confusion_matrix(y_true, y_pred))
                print(
                    classification_report(y_true, y_pred,
                                          target_names=labels))
Example #3
0
    def cross_validate(self, num_folds=5):
        """
        Train the NN model while running cross validation.

        :param num_folds: number of CV folds (default = 5)
        :raises ValueError: if num_folds <= 1
        """
        X_data = self.data_model.train
        Y_data = self.data_model.train_label

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        assert X_data is not None and Y_data is not None, \
            "Must have features and labels extracted for cross validation"

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
        skf.get_n_splits(X_data, Y_data)

        # The binarized targets and label names do not change across folds;
        # compute them once instead of on every iteration (this is the only
        # binarize_labels call in this method, so the encoder state is stable).
        binary_Y = self.data_model.binarize_labels(Y_data, True)
        labels = [str(i) for i in self.data_model.encoder.classes_]

        evaluation_statistics = {}
        fold = 1

        for train_index, test_index in skf.split(X_data, Y_data):
            x_train, x_test = X_data[train_index], X_data[test_index]
            y_train, y_test = binary_Y[train_index], binary_Y[test_index]
            # Bug fix: the original passed `fold` as a second argument to
            # print, emitting the tuple ('Training Fold %i', 1) instead of
            # formatting the fold number into the message.
            print("Training Fold %i" % fold)

            cv_model = self.define_model(len(self.data_model.encoder.classes_))
            cv_model, loss, acc = self.fit_Model(cv_model, x_train, y_train)

            y_pred, y_true = evaluate.predict(cv_model, x_test, y_test, labels)
            fold_statistics = evaluate.cv_evaluation_fold(
                y_pred, y_true, labels)

            evaluation_statistics[fold] = fold_statistics
            fold += 1

        evaluate.cv_evaluation(labels, evaluation_statistics)
Example #4
0
    def cross_validate(self, num_folds=5):
        """
        Train the CNN model while running cross validation.

        :param num_folds: number of CV folds (default = 5)
        :raises ValueError: if num_folds <= 1
        """
        X_data = self.data_model.train
        Y_data = self.data_model.train_label
        if self.data_model.with_Labels:
            C1_label = self.data_model.train_concept1_label
            # Bug fix: the original read train_concept1_label twice, so the
            # second concept's labels were a copy of the first.
            C2_label = self.data_model.train_concept2_label

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        assert X_data is not None and Y_data is not None, \
            "Must have features and labels extracted for cross validation"

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
        skf.get_n_splits(X_data, Y_data)

        evaluation_statistics = {}
        fold = 1

        originalclass = []
        predictedclass = []
        # to track the entity pairs for each relation
        brat_track = []

        if self.data_model.multilabel:

            binary_Y = self.data_model.binarize_labels(Y_data, True)
            for train_index, test_index in skf.split(X_data,
                                                     binary_Y.argmax(1)):
                x_train, x_test = X_data[train_index], X_data[test_index]
                y_train, y_test = binary_Y[train_index], binary_Y[test_index]
                # Bug fix: format the fold number into the message instead of
                # printing a (string, int) tuple.
                print("Training Fold %i" % fold)

                labels = [str(i) for i in self.data_model.encoder.classes_]
                cv_model = self.model_without_Label(
                    len(self.data_model.encoder.classes_))

                cv_model.fit(x_train,
                             y_train,
                             epochs=self.epochs,
                             batch_size=self.batch_size)
                y_pred, y_true = evaluate.multilabel_predict(
                    cv_model, x_test, y_test)
                originalclass.extend(y_true)
                predictedclass.extend(y_pred)
                print(
                    "--------------------------- Results ------------------------------------"
                )
                print(
                    classification_report(y_true, y_pred, target_names=labels))
                # Bug fix: the original never advanced the fold counter in
                # this branch, so every fold printed as fold 1.
                fold += 1

            print(
                "--------------------- Results --------------------------------"
            )
            print(
                classification_report(np.array(originalclass),
                                      np.array(predictedclass),
                                      target_names=labels))

        else:
            Track = self.data_model.train_track

            for train_index, test_index in skf.split(X_data, Y_data):
                # NOTE: binarize_labels(Y_data) is deliberately re-run every
                # fold — it re-fits the encoder on the relation classes before
                # `labels` is read below, because the concept-label binarize
                # calls further down re-fit the encoder on concept classes.
                binary_Y = self.data_model.binarize_labels(Y_data, True)
                y_train, y_test = binary_Y[train_index], binary_Y[test_index]
                x_train, x_test = X_data[train_index], X_data[test_index]
                train_track, test_track = Track[train_index], Track[test_index]

                labels = [str(i) for i in self.data_model.encoder.classes_]
                if self.data_model.with_Labels:
                    binary_C1_label = self.data_model.binarize_labels(
                        C1_label, True)
                    binary_C2_label = self.data_model.binarize_labels(
                        C2_label, True)
                    if self.data_model.generalize:
                        # Feed both concepts' labels as auxiliary inputs.
                        train_C1_label, test_C1_label = binary_C1_label[
                            train_index], binary_C1_label[test_index]
                        train_C2_label, test_C2_label = binary_C2_label[
                            train_index], binary_C2_label[test_index]
                        cv_model = self.model_with_Label(
                            len(self.data_model.encoder.classes_),
                            train_C1_label, train_C2_label)
                        cv_model.fit([x_train, train_C1_label, train_C2_label],
                                     y_train,
                                     epochs=self.epochs,
                                     batch_size=self.batch_size)
                        y_pred, y_true = evaluate.predict(
                            cv_model, [x_test, test_C1_label, test_C2_label],
                            y_test, labels)
                    else:
                        # Only the first concept's labels are used as input.
                        train_C1_label, test_C1_label = binary_C1_label[
                            train_index], binary_C1_label[test_index]

                        cv_model = self.model_with_Label(
                            len(self.data_model.encoder.classes_),
                            train_C1_label)
                        cv_model.fit([x_train, train_C1_label],
                                     y_train,
                                     epochs=self.epochs,
                                     batch_size=self.batch_size)
                        y_pred, y_true = evaluate.predict(
                            cv_model, [x_test, test_C1_label], y_test, labels)
                        print(y_true, y_pred)
                else:
                    cv_model = self.model_without_Label(
                        len(self.data_model.encoder.classes_))
                    cv_model = self.fit_Model(cv_model, x_train, y_train)
                    y_pred, y_true = evaluate.predict(cv_model, x_test, y_test,
                                                      labels)

                originalclass.extend(y_true)
                predictedclass.extend(y_pred)
                brat_track.extend(test_track)

                print(
                    "--------------------------- Results ------------------------------------"
                )
                print(classification_report(y_true, y_pred, labels=labels))
                # print(confusion_matrix(y_true, y_pred))
                fold_statistics = evaluate.cv_evaluation_fold(
                    y_pred, y_true, labels)

                evaluation_statistics[fold] = fold_statistics
                fold += 1
            print(
                "--------------------- Results --------------------------------"
            )
            print(
                classification_report(np.array(originalclass),
                                      np.array(predictedclass),
                                      target_names=labels))
            print(
                confusion_matrix(np.array(originalclass),
                                 np.array(predictedclass)))

            if self.data_model.write_Predictions:
                # save files in numpy to write predictions in BRAT format
                np.save('track', np.array(brat_track))
                np.save('pred', np.array(predictedclass))
                Predictions(self.initial_predictions, self.final_predictions,
                            self.write_No_rel)

            #print results using MedaCy evaluation (similar to the sklearn evaluation above)
            print(
                "---------------------medacy Results --------------------------------"
            )
            evaluate.cv_evaluation(labels, evaluation_statistics)
Example #5
0
    def end_to_end_test(self):
        """Two-stage end-to-end evaluation: first train/evaluate a binary
        (or initial) model, then remove instances based on its predictions
        and train/evaluate a second model on the reduced data.

        NOTE(review): the order of binarize_labels calls is significant —
        each call appears to re-fit self.data_model.encoder, and
        labels/labels1 are read from encoder.classes_ at specific points
        between those calls. Do not reorder these statements.
        """
        if self.data_model.multilabel:
            # Multilabel mode uses the "true_*" splits and fixed yes/no labels.
            x_train = self.data_model.true_train_x
            y_train = self.data_model.true_train_y
            binary_y_train = self.data_model.binarize_labels(y_train, True)

            x_test = self.data_model.true_test_x
            y_test = self.data_model.true_test_y
            binary_y_test = self.data_model.binarize_labels(y_test, True)
            labels = ['no', 'yes']
        else:
            x_train = self.data_model.train
            y_train = self.data_model.train_label
            binary_y_train = self.data_model.binarize_labels(y_train, True)

            x_test = self.data_model.x_test
            y_test = self.data_model.y_test
            binary_y_test = self.data_model.binarize_labels(y_test, True)

            labels = [str(i) for i in self.data_model.encoder.classes_]
        # Stage 1: train the initial model and evaluate.
        cv_model = self.model_without_Label(len(labels))

        cv_model = self.fit_Model(cv_model, x_train, binary_y_train)
        y_pred, y_true = evaluate.predict(cv_model, x_test, binary_y_test,
                                          labels)

        print(
            "---------------------  binary results ---------------------------------"
        )
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred, target_names=labels))

        # Stage 2: drop instances according to stage-1 predictions, rebuild
        # the data, and train a second model on what remains.
        x_train1, y_train1, x_test1, y_test1 = self.data_model.remove_instances(
            y_pred)
        if self.data_model.multilabel:
            # Switch to multi-class labels and collapse duplicate sentences
            # before re-vectorizing the text.
            self.data_model.binary_label = False
            df_train = model.reduce_duplicate_data(x_train1, y_train1)
            y_train2 = df_train.label.tolist()
            y_train1 = y_train2
            df_test = model.reduce_duplicate_data(x_test1, y_test1)
            y_test2 = df_test.label.tolist()
            y_test1 = y_test2
            train, test, word_index = self.data_model.vectorize_words(
                df_train.sentence, df_test.sentence)
            x_train1 = train
            x_test1 = test
        # labels1 is read between the two binarize calls on purpose: it
        # reflects the encoder fitted on the *training* labels.
        binary_y_train1 = self.data_model.binarize_labels(y_train1, True)
        labels1 = [str(i) for i in self.data_model.encoder.classes_]
        binary_y_test1 = self.data_model.binarize_labels(y_test1, True)
        cv_model1 = self.model_without_Label(len(labels1))

        if self.data_model.multilabel:
            cv_model1.fit(x_train1,
                          binary_y_train1,
                          epochs=self.epochs,
                          batch_size=self.batch_size)
            y_pred1, y_true1 = evaluate.multilabel_predict(
                cv_model1, x_test1, binary_y_test1)
        else:
            cv_model1 = self.fit_Model(cv_model1, np.array(x_train1),
                                       np.array(binary_y_train1))
            y_pred1, y_true1 = evaluate.predict(cv_model1, np.array(x_test1),
                                                np.array(binary_y_test1),
                                                labels1)
            print(
                "---------------------  Final results ---------------------------------"
            )
            print(confusion_matrix(y_true1, y_pred1))
        print(classification_report(y_true1, y_pred1, target_names=labels1))
Example #6
0
    def cross_validate(self, num_folds=5):
        """
        Perform cross validation for the segment CNN.

        :param num_folds: number of folds for cross validation (default = 5)
        :raises ValueError: if num_folds <= 1
        """
        Pre_data = self.data_model.preceding
        Mid_data = self.data_model.middle
        Suc_data = self.data_model.succeeding
        C1_data = self.data_model.concept1
        C2_data = self.data_model.concept2
        Track = self.data_model.train_track
        Y_data = self.data_model.train_label

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
        skf.get_n_splits(C1_data, Y_data)
        evaluation_statistics = {}
        fold = 1
        originalclass = []
        predictedclass = []
        # to track the entity pairs for each relation
        brat_track = []

        # The binarized targets and label names are fold-invariant; compute
        # them once instead of on every iteration (this is the only
        # binarize_labels call in this method, so the encoder state is stable).
        binary_Y = self.data_model.binarize_labels(Y_data, True)
        labels = [str(i) for i in self.data_model.encoder.classes_]

        for train_index, test_index in skf.split(C1_data, Y_data):
            pre_train, pre_test = Pre_data[train_index], Pre_data[test_index]
            mid_train, mid_test = Mid_data[train_index], Mid_data[test_index]
            suc_train, suc_test = Suc_data[train_index], Suc_data[test_index]
            c1_train, c1_test = C1_data[train_index], C1_data[test_index]
            c2_train, c2_test = C2_data[train_index], C2_data[test_index]
            # Only the test-fold track info is needed for BRAT output.
            track_test = Track[test_index]
            y_train, y_test = binary_Y[train_index], binary_Y[test_index]

            cv_model = self.build_segment_cnn(
                len(self.data_model.encoder.classes_))
            cv_model.fit([pre_train, mid_train, suc_train, c1_train, c2_train],
                         y_train,
                         epochs=self.epochs,
                         batch_size=self.batch_size)
            y_pred, y_true = evaluate.predict(
                cv_model, [pre_test, mid_test, suc_test, c1_test, c2_test],
                y_test, labels)
            fold_statistics = evaluate.cv_evaluation_fold(
                y_pred, y_true, labels)
            originalclass.extend(y_true)
            predictedclass.extend(y_pred)
            brat_track.extend(track_test)

            print(
                "--------------------------- Results ------------------------------------"
            )
            print(classification_report(y_true, y_pred, labels=labels))
            # print(confusion_matrix(y_true, y_pred))
            evaluation_statistics[fold] = fold_statistics
            fold += 1
        if self.data_model.write_Predictions:
            # save files in numpy to write predictions in BRAT format
            np.save('track', np.array(brat_track))
            np.save('pred', np.array(predictedclass))
            Predictions(self.initial_predictions, self.final_predictions,
                        self.write_No_rel)

        print("--------------------- Results --------------------------------")
        print(
            classification_report(np.array(originalclass),
                                  np.array(predictedclass),
                                  target_names=labels))
        print(
            confusion_matrix(np.array(originalclass),
                             np.array(predictedclass)))

        # print results using MedaCy evaluation (similar to the sklearn evaluation above)
        print(
            "---------------------medacy Results --------------------------------"
        )
        evaluate.cv_evaluation(labels, evaluation_statistics)
Example #7
0
    def end_to_end_test(self):
        """Two-stage end-to-end evaluation for the segment CNN: train and
        evaluate a first (binary) model, filter instances by its predictions,
        then train and evaluate a second model on the remaining data.

        NOTE(review): binarize_labels appears to re-fit
        self.data_model.encoder on each call, and labels/labels1 are read
        from encoder.classes_ at specific points between calls — do not
        reorder these statements.
        """
        # Training-data segments for the first-stage model.
        pre_train = self.data_model.preceding
        mid_train = self.data_model.middle
        suc_train = self.data_model.succeeding
        c1_train = self.data_model.concept1
        c2_train = self.data_model.concept2
        y_train = self.data_model.train_label
        binary_y_train = self.data_model.binarize_labels(y_train, True)

        labels = [str(i) for i in self.data_model.encoder.classes_]

        # Test-data segments.
        pre_test = self.data_model.test_preceding
        mid_test = self.data_model.test_middle
        suc_test = self.data_model.test_succeeding
        c1_test = self.data_model.test_concept1
        c2_test = self.data_model.test_concept2
        y_test = self.data_model.y_test
        binary_y_test = self.data_model.binarize_labels(y_test, True)

        # Stage 1: train the segment CNN and evaluate on the test split.
        cv_model = self.build_segment_cnn(len(
            self.data_model.encoder.classes_))
        cv_model.fit([pre_train, mid_train, suc_train, c1_train, c2_train],
                     binary_y_train,
                     epochs=self.epochs,
                     batch_size=self.batch_size)
        y_pred, y_true = evaluate.predict(
            cv_model, [pre_test, mid_test, suc_test, c1_test, c2_test],
            binary_y_test, labels)
        print(
            "---------------------  binary results ---------------------------------"
        )
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred, target_names=labels))

        # Stage 2: drop instances according to stage-1 predictions and
        # retrain on the reduced dataframes.
        df_train, df_test = self.data_model.remove_instances(y_pred)
        # labels1 is read between the two binarize calls on purpose: it
        # reflects the encoder fitted on the *training* labels.
        binary_y_train1 = self.data_model.binarize_labels(
            df_train.label.tolist(), True)
        labels1 = [str(i) for i in self.data_model.encoder.classes_]
        binary_y_test1 = self.data_model.binarize_labels(
            df_test.true.tolist(), True)
        cv_model1 = self.build_segment_cnn(len(labels1))
        cv_model1.fit([
            df_train.preceding.tolist(),
            df_train.middle.tolist(),
            df_train.succeeding.tolist(),
            df_train.c1.tolist(),
            df_train.c2.tolist()
        ],
                      binary_y_train1,
                      epochs=self.epochs,
                      batch_size=self.batch_size)
        y_pred1, y_true1 = evaluate.predict(cv_model1, [
            df_test.preceding.tolist(),
            df_test.middle.tolist(),
            df_test.succeeding.tolist(),
            df_test.c1.tolist(),
            df_test.c2.tolist()
        ], binary_y_test1, labels1)
        print(
            "---------------------  Final results ---------------------------------"
        )
        print(confusion_matrix(y_true1, y_pred1))
        print(classification_report(y_true1, y_pred1, target_names=labels1))