Ejemplo n.º 1
0
    def draw_roc_curve_saved_model(self):
        """Draw ROC curves for the saved CRF model on a freshly built test set.

        Loads the model from ``current_crf_model.pkl``, reads the dataset with
        ``nr_of_files=1000``, combines the entries returned by
        ``filter_for_filled_tags`` with three times as many entries from
        ``obtain_default_tags``, shuffles and splits the result, generates
        features and labels for the test split and passes them to
        ``Evaluator.draw_roc_proba``.

        The time spent preparing the data is logged before the curve is
        drawn, in line with the other timed operations of this class.
        """
        self.logger.println("drawing roc curve from saved model")
        start_time = timeit.default_timer()

        # NOTE(review): the ".pkl" suffix suggests a pickle file; only load
        # model files from a trusted location -- confirm in CrfSuite.
        crf = CrfSuite().load_model("current_crf_model.pkl")

        dataset = Dataset()
        data = dataset.read(nr_of_files=1000)
        nr_of_filled_lines, filled_data = dataset.filter_for_filled_tags(data)
        default_data = dataset.obtain_default_tags(nr_of_filled_lines * 3,
                                                   data)
        data = dataset.shuffle_data(filled_data + default_data)
        train_set, test_set = dataset.split_dataset(data)

        w2v_model = WeModel().read()

        # The encoding is still derived from the train split, exactly as in
        # train_model, so the feature generator is configured the same way.
        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        w2v_model = None  # drop this frame's reference to the embeddings

        # NOTE(review): train-set features/labels used to be generated here
        # and thrown away. They are no longer computed, on the assumption
        # that FeatureGenerator keeps no state between calls -- confirm.
        test_features = f_generator.generate_features_docs(test_set)
        y_test = f_generator.generate_true_outcome(test_set)
        f_generator = None

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("roc curve data preparation took",
                                     elapsed_seconds)

        Evaluator().draw_roc_proba(crf, test_features, y_test)
Ejemplo n.º 2
0
    def train_model(self, nr_of_files=-1):
        """Train a CRF model, save it and report how well it performs.

        Steps, in order: read and rebalance the dataset, split it, train and
        save word embeddings on the whole (shuffled) data, build features for
        both splits, train and save the CRF model, print classification
        reports and f1 scores for both splits, log the elapsed time and
        finally run the ROC analysis for both splits.

        :param nr_of_files: forwarded unchanged to ``Dataset.read``; the
            meaning of the default ``-1`` is defined there (presumably
            "all files" -- confirm).
        """
        self.logger.println("train model called")
        start_time = timeit.default_timer()
        crf_suite = CrfSuite()

        # --- data preparation -------------------------------------------
        dataset = Dataset()
        raw_data = dataset.read(nr_of_files=nr_of_files)
        nr_of_filled_lines, filled_part = dataset.filter_for_filled_tags(
            raw_data)
        default_part = dataset.obtain_default_tags(nr_of_filled_lines * 3,
                                                   raw_data)
        data = dataset.shuffle_data(filled_part + default_part)
        train_set, test_set = dataset.split_dataset(data)

        # --- word embeddings --------------------------------------------
        embedding_trainer = WeModel()
        # optionally load a pretrained model here
        w2v_model = embedding_trainer.train(data)
        embedding_trainer.save(w2v_model)
        del embedding_trainer

        word2count, word2idx = dataset.encode_dataset(train_set)

        # --- features and labels ----------------------------------------
        feature_builder = FeatureGenerator(w2v_model, word2count, word2idx)
        del w2v_model
        train_features = feature_builder.generate_features_docs(train_set)
        y_train = feature_builder.generate_true_outcome(train_set)

        test_features = feature_builder.generate_features_docs(test_set)
        y_test = feature_builder.generate_true_outcome(test_set)
        del feature_builder

        # --- training and prediction ------------------------------------
        model = crf_suite.train_model(train_features, y_train)
        crf_suite.save_model(model)
        y_train_pred = crf_suite.test_model(model, train_features)
        y_test_pred = crf_suite.test_model(model, test_features)

        # --- reporting --------------------------------------------------
        print("printing training results")
        train_report_truth = dataset.docs2lines(y_train)
        crf_suite.print_classification_report(train_report_truth,
                                              y_train_pred)
        train_score_truth = dataset.docs2lines(y_train)
        score_train = crf_suite.score_model(train_score_truth, y_train_pred)
        print("training f1 score: %s" % score_train)

        print("printing test results")
        test_report_truth = dataset.docs2lines(y_test)
        crf_suite.print_classification_report(test_report_truth, y_test_pred)
        test_score_truth = dataset.docs2lines(y_test)
        score_test = crf_suite.score_model(test_score_truth, y_test_pred)
        print("test f1 score: %s" % score_test)

        # The time is logged before the ROC analysis, which is therefore not
        # part of the reported duration.
        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("train model operation took",
                                     elapsed_seconds)

        roc_evaluator = Evaluator()
        train_roc_truth = dataset.docs2lines(y_train)
        roc_evaluator.perform_roc_analysis(train_roc_truth, y_train_pred)
        test_roc_truth = dataset.docs2lines(y_test)
        roc_evaluator.perform_roc_analysis(test_roc_truth, y_test_pred)
Ejemplo n.º 3
0
    def optimise_model(self, argv):
        """Run the CRF optimisation on the complete dataset and time it.

        Unlike ``train_model`` the data is neither rebalanced nor split: the
        embeddings, the word encoding and the features are all built from
        everything ``Dataset.read`` returns.

        :param argv: forwarded to ``Dataset.read`` as ``nr_of_files``.
        """
        self.logger.println("optimise model called")
        started_at = timeit.default_timer()

        crf_suite = CrfSuite()

        corpus_reader = Dataset()
        corpus = corpus_reader.read(nr_of_files=argv)

        embedding_trainer = WeModel()
        # optionally load a pretrained model here
        embeddings = embedding_trainer.train(corpus)
        embedding_trainer.save(embeddings)

        word2count, word2idx = corpus_reader.encode_dataset(corpus)

        feature_builder = FeatureGenerator(embeddings, word2count, word2idx)
        features = feature_builder.generate_features_docs(corpus)
        labels = feature_builder.generate_true_outcome(corpus)

        crf_suite.optimise_model(features, labels)

        duration = timeit.default_timer() - started_at
        self.logger.print_time_taken("optimise model operation took",
                                     duration)
Ejemplo n.º 4
0
    def train_model_learning_curve(self, arg):
        """Plot the CRF learning curve for the train split of the dataset.

        Reads and rebalances the dataset the same way ``train_model`` does,
        trains word embeddings on the whole (shuffled) data, builds features
        and labels for the train split only and passes them to
        ``CrfSuite.plot_learning_curve``. The elapsed time is logged before
        the figure is shown.

        :param arg: forwarded to ``Dataset.read`` as ``nr_of_files``.
        """
        # The log messages name this operation explicitly; they used to be
        # the same texts as in train_model, which made the logs ambiguous.
        self.logger.println("train model learning curve called")
        start_time = timeit.default_timer()

        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=arg)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = dataset.shuffle_data(data1 + data2)
        # Only the train split is needed for the learning curve.
        train_set, _ = dataset.split_dataset(data)

        we_model = WeModel()
        # optionally load a pretrained model here
        # (WeModel.load_pretrained_model) instead of training one
        w2v_model = we_model.train(data)
        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        cs.plot_learning_curve(train_features, y_train)

        # Stop the clock before plt.show(): with an interactive backend that
        # call may block until the window is closed, which would add the
        # user's viewing time to the measurement.
        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken(
            "train model learning curve operation took", elapsed_seconds)

        plt.show()
    def get_feature_generator_results(self):
        """Run the feature generator on a tiny fixed sample.

        The sample consists of two identical sequences of the words
        "this is a test", each word given as a 4-tuple whose remaining three
        fields are empty strings. The sample is wrapped in one more list
        before it is passed to the dataset encoder and the feature generator.

        Reads ``self.w2v_model``.

        :return: tuple ``(sample, features, outcome)`` so that callers can
            run their checks on the generator output.
        """
        words = ("this", "is", "a", "test")
        # Two independent, equal sequences (no shared list objects).
        sample = [[(word, "", "", "") for word in words] for _ in range(2)]

        encoder = Dataset()
        word2count, word2idx = encoder.encode_dataset([sample])

        feature_builder = FeatureGenerator(self.w2v_model, word2count,
                                           word2idx)

        features = feature_builder.generate_features_docs([sample])
        outcome = feature_builder.generate_true_outcome([sample])

        # run tests on features and outcome
        return sample, features, outcome
    def perform_bootstrapping(self, dataset, sample_size, iterations):
        """
        Bootstraps a sample n times and collects the scores of each entity.

        In every round the data is resampled with ``self.resample_data``
        (the leftovers serve as out-of-bag test set), word embeddings and a
        CRF model are trained, the out-of-bag set is predicted and the result
        of ``self.entity_scorer`` is stacked onto the score array of each
        entity.

        Apart from the iteration counter nothing is printed and nothing is
        plotted; callers receive the raw per-iteration scores and aggregate
        them themselves.

        :param dataset: the data to resample from; it is also passed as-is to
            ``WeModel.train`` in every round.
        :param sample_size: forwarded to ``self.resample_data``.
        :param iterations: number of bootstrap rounds; if no round runs, the
            returned arrays are empty with shape (0, 3).
        :return: tuple ``(emp_pos_scores, emp_comp_scores, edu_inst_scores,
            edu_major_scores)`` of float64 arrays, one stacked entry per
            round. Mind the order: EDU-INST comes before EDU-MAJOR. The three
            columns are presumably precision, recall and f1 -- confirm
            against ``entity_scorer``.
        """
        # Scoring order per round: POS, COMP, MAJOR, INST.
        entities = ("EMP-POS", "EMP-COMP", "EDU-MAJOR", "EDU-INST")
        scores = {
            entity: np.empty(shape=(0, 3), dtype='float64')
            for entity in entities
        }

        for x in range(iterations):
            print("iteration nr %s" % x)
            sampled_train_set, oob_test_set = self.resample_data(
                dataset, sample_size, return_leftovers=True)
            cs = CrfSuite()
            ds = Dataset()

            # NOTE(review): the embeddings are retrained on the full dataset
            # (not only on the sample) in every round; hoist this out of the
            # loop if one shared model is acceptable.
            # optionally load a pretrained model here
            w2v_model = WeModel().train(dataset)
            word2count, word2idx = ds.encode_dataset(sampled_train_set)

            f_generator = FeatureGenerator(w2v_model, word2count, word2idx)

            train_features = f_generator.generate_features_docs(
                sampled_train_set)
            y_train = f_generator.generate_true_outcome(sampled_train_set)

            test_features = f_generator.generate_features_docs(oob_test_set)
            y_test = f_generator.generate_true_outcome(oob_test_set)

            trainer = cs.train_model(train_features, y_train)
            y_test_pred = cs.test_model(trainer, test_features)

            # NOTE(review): the train-set prediction, both f1 scores and the
            # binarised labels for the ROC curves were computed here but
            # never used; they are no longer computed. This assumes
            # CrfSuite.test_model / score_model have no side effects the
            # caller relies on -- confirm.

            # Flatten the true labels once and reuse them for all entities.
            y_test_lines = ds.docs2lines(y_test)
            for entity in entities:
                scores[entity] = np.vstack([
                    scores[entity],
                    self.entity_scorer(y_test_lines, y_test_pred, entity)
                ])

            # Release the large per-round objects before the next round.
            w2v_model = None
            f_generator = None
            train_features = test_features = None

        # Order kept for existing callers: INST before MAJOR.
        return (scores["EMP-POS"], scores["EMP-COMP"], scores["EDU-INST"],
                scores["EDU-MAJOR"])