Example 1
    def draw_roc_curve_saved_model(self):
        self.logger.println("drawing roc curve from saved model")
        start_time = timeit.default_timer()
        cs = CrfSuite()
        crf = cs.load_model("current_crf_model.pkl")

        dataset = Dataset()
        data = dataset.read(nr_of_files=1000)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.read()
        we_model = None

        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        w2v_model = None
        # only the test split is needed here; the training split above is
        # used solely to build the vocabulary
        test_features = f_generator.generate_features_docs(test_set)
        y_test = f_generator.generate_true_outcome(test_set)
        f_generator = None

        evaluator = Evaluator()
        evaluator.draw_roc_proba(crf, test_features, y_test)

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("draw roc curve operation took",
                                     elapsed_seconds)
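Note: this example assumes a CRF previously persisted by CrfSuite.save_model (Example 7), which is a thin wrapper over joblib. A minimal sketch of that round-trip, using only the calls shown in Example 7:

    import joblib

    # what CrfSuite.save_model does after training
    joblib.dump(crf, "current_crf_model.pkl")

    # what CrfSuite.load_model does before the ROC analysis above
    crf = joblib.load("current_crf_model.pkl")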
Example 2
    def train_model(self, nr_of_files=-1):
        self.logger.println("train model called")
        start_time = timeit.default_timer()
        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=nr_of_files)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.train(data)  # optionally load a pretrained model here
        we_model.save(w2v_model)
        we_model = None

        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        w2v_model = None
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        test_features = f_generator.generate_features_docs(test_set)
        y_test = f_generator.generate_true_outcome(test_set)
        f_generator = None

        model = cs.train_model(train_features, y_train)
        cs.save_model(model)
        y_train_pred = cs.test_model(model, train_features)
        y_test_pred = cs.test_model(model, test_features)

        print("printing training results")
        cs.print_classification_report(dataset.docs2lines(y_train),
                                       y_train_pred)
        score_train = cs.score_model(dataset.docs2lines(y_train), y_train_pred)
        print("training f1 score: %s" % score_train)

        print("printing test results")
        cs.print_classification_report(dataset.docs2lines(y_test), y_test_pred)
        score_test = cs.score_model(dataset.docs2lines(y_test), y_test_pred)
        print("test f1 score: %s" % score_test)

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("train model operation took",
                                     elapsed_seconds)

        evaluator = Evaluator()
        evaluator.perform_roc_analysis(dataset.docs2lines(y_train),
                                       y_train_pred)
        evaluator.perform_roc_analysis(dataset.docs2lines(y_test), y_test_pred)
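The scoring and reporting calls above first flatten the document-grouped labels with Dataset.docs2lines. Its source is not shown in this listing; a plausible sketch, assuming it simply concatenates each document's list of lines:

    from itertools import chain

    def docs2lines(docs):
        # hypothetical stand-in for Dataset.docs2lines: flatten a list of
        # documents (each a list of lines) into one flat list of lines
        return list(chain.from_iterable(docs))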
Example 3
    def optimise_model(self, argv):
        self.logger.println("optimise model called")
        start_time = timeit.default_timer()

        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=argv)

        we_model = WeModel()
        w2v_model = we_model.train(data)  # optionally load a pretrained model here
        we_model.save(w2v_model)

        word2count, word2idx = dataset.encode_dataset(data)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(data)
        y_train = f_generator.generate_true_outcome(data)

        cs.optimise_model(train_features, y_train)

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("optimise model operation took",
                                     elapsed_seconds)
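The search itself happens in CrfSuite.optimise_model (Example 7), which draws c1 and c2 from exponential priors concentrated near zero. A quick, self-contained look at what those priors sample:

    import scipy.stats

    c1_prior = scipy.stats.expon(scale=0.03)
    print(c1_prior.mean())        # 0.03
    print(c1_prior.rvs(size=5))   # small positive draws, mostly below ~0.1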
Example 4
    def train_model_learning_curve(self, arg):
        self.logger.println("train model learning curve called")
        start_time = timeit.default_timer()

        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=arg)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.train(data)
        # alternatively: w2v_model = we_model.load_pretrained_model()

        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        cs.plot_learning_curve(train_features, y_train)
        plt.show()

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("train model learning curve operation took",
                                     elapsed_seconds)
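CrfSuite.plot_learning_curve (Example 7) evaluates five training-set sizes, each scored over five random 80/20 splits via ShuffleSplit. A minimal illustration of that cross-validation scheme on dummy data:

    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    X_dummy = np.arange(10).reshape(-1, 1)
    for train_idx, test_idx in cv.split(X_dummy):
        print(train_idx, test_idx)  # five random 80/20 splits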
Example 5
    def get_feature_generator_results(self):
        # fixture: two identical lines of 4-tuple tokens; the first tuple
        # slot holds the surface word, the remaining slots are left empty
        doc = [[("this", "", "", ""), ("is", "", "", ""), ("a", "", "", ""),
                ("test", "", "", "")],
               [("this", "", "", ""), ("is", "", "", ""), ("a", "", "", ""),
                ("test", "", "", "")]]

        dataset = Dataset()
        word2count, word2idx = dataset.encode_dataset([doc])

        f_generator = FeatureGenerator(self.w2v_model, word2count, word2idx)

        X = f_generator.generate_features_docs([doc])
        y = f_generator.generate_true_outcome([doc])

        # run tests on X and y
        return doc, X, y
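Building such a fixture from raw text is mechanical. A hypothetical helper (to_fixture is not part of the codebase), assuming only that the first tuple slot holds the surface word:

    def to_fixture(sentence):
        # wrap each word in the 4-tuple token shape used above; the last
        # three slots are left blank because their meaning is not shown
        # in this listing
        return [(word, "", "", "") for word in sentence.split()]

    doc = [to_fixture("this is a test"), to_fixture("this is a test")]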
Example 6
    def perform_bootstrapping(self, dataset, sample_size, iterations):
        """
        Bootstraps a sample n times and averages precision, recall, F1, TPR
        and FPR for each entity. Prints per-entity precision, recall and F1
        and plots per-entity ROC curves (that reporting is currently
        commented out below; the method returns the raw score arrays).
        """
        training_scores = []
        test_scores = []

        emp_pos_scores = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_scores = np.empty(shape=(0, 3), dtype='float64')
        edu_major_scores = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_scores = np.empty(shape=(0, 3), dtype='float64')

        mean_fpr = np.linspace(0, 1, 100)
        lb = LabelBinarizer()

        emp_pos_tpr = np.empty(shape=(0, 3), dtype='float64')
        emp_pos_fpr = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_tpr = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_fpr = np.empty(shape=(0, 3), dtype='float64')
        edu_major_tpr = np.empty(shape=(0, 3), dtype='float64')
        edu_major_fpr = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_tpr = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_fpr = np.empty(shape=(0, 3), dtype='float64')

        for x in range(iterations):
            print("iteration nr %s" % x)
            sampled_train_set, oob_test_set = self.resample_data(
                dataset, sample_size, return_leftovers=True)
            cs = CrfSuite()
            ds = Dataset()
            we_model = WeModel()
            w2v_model = we_model.train(dataset)  # optionally load a pretrained model here
            word2count, word2idx = ds.encode_dataset(sampled_train_set)

            f_generator = FeatureGenerator(w2v_model, word2count, word2idx)

            train_features = f_generator.generate_features_docs(
                sampled_train_set)
            y_train = f_generator.generate_true_outcome(sampled_train_set)

            test_features = f_generator.generate_features_docs(oob_test_set)
            y_test = f_generator.generate_true_outcome(oob_test_set)

            trainer = cs.train_model(train_features, y_train)
            y_train_pred = cs.test_model(trainer, train_features)
            y_test_pred = cs.test_model(trainer, test_features)

            score_train = cs.score_model(ds.docs2lines(y_train), y_train_pred)
            score_test = cs.score_model(ds.docs2lines(y_test), y_test_pred)

            y_true_combined = lb.fit_transform(
                list(chain.from_iterable(ds.docs2lines(y_test))))
            y_pred_combined = lb.transform(
                list(chain.from_iterable(y_test_pred)))

            class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
            """
            # fpr and tpr for one class
            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EMP-POS"]], y_pred_combined[:, class_indices["B-EMP-POS"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EMP-POS"]], y_pred_combined[:, class_indices["I-EMP-POS"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            emp_pos_tpr = np.vstack([emp_pos_tpr, temp_tpr.mean(axis=0)])
            emp_pos_fpr = np.vstack([emp_pos_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EMP-COMP"]], y_pred_combined[:, class_indices["B-EMP-COMP"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EMP-COMP"]], y_pred_combined[:, class_indices["I-EMP-COMP"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            emp_comp_tpr = np.vstack([emp_comp_tpr, temp_tpr.mean(axis=0)])
            emp_comp_fpr = np.vstack([emp_comp_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EDU-MAJOR"]], y_pred_combined[:, class_indices["B-EDU-MAJOR"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EDU-MAJOR"]], y_pred_combined[:, class_indices["I-EDU-MAJOR"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            edu_major_tpr = np.vstack([edu_major_tpr, temp_tpr.mean(axis=0)])
            edu_major_fpr = np.vstack([edu_major_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EDU-INST"]], y_pred_combined[:, class_indices["B-EDU-INST"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EDU-INST"]], y_pred_combined[:, class_indices["I-EDU-INST"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            edu_inst_tpr = np.vstack([edu_inst_tpr, temp_tpr.mean(axis=0)])
            edu_inst_fpr = np.vstack([edu_inst_fpr, temp_fpr.mean(axis=0)])
            """

            emp_pos_scores = np.vstack([
                emp_pos_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EMP-POS")
            ])
            emp_comp_scores = np.vstack([
                emp_comp_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EMP-COMP")
            ])
            edu_major_scores = np.vstack([
                edu_major_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EDU-MAJOR")
            ])
            edu_inst_scores = np.vstack([
                edu_inst_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EDU-INST")
            ])

            w2v_model = None
            train_features = test_features = None
        """
        print("EMP-POS")
        print("precision %s" % np.mean(emp_pos_scores[:,0]))
        print("recall %s" % np.mean(emp_pos_scores[:,1]))
        print("f1 %s" % np.mean(emp_pos_scores[:,2]))

        print("EMP-COMP")
        print("precision %s" % np.mean(emp_comp_scores[:,0]))
        print("recall %s" % np.mean(emp_comp_scores[:,1]))
        print("f1 %s" % np.mean(emp_comp_scores[:,2]))

        print("EDU-MAJOR")
        print("precision %s" % np.mean(edu_major_scores[:,0]))
        print("recall %s" % np.mean(edu_major_scores[:,1]))
        print("f1 %s" % np.mean(edu_major_scores[:,2]))

        print("EDU-INST")
        print("precision %s" % np.mean(edu_inst_scores[:,0]))
        print("recall %s" % np.mean(edu_inst_scores[:,1]))
        print("f1 %s" % np.mean(edu_inst_scores[:,2]))

        emp_pos_tpr = emp_pos_tpr.mean(axis=0)
        emp_pos_fpr = emp_pos_fpr.mean(axis=0)

        emp_comp_tpr = emp_comp_tpr.mean(axis=0)
        emp_comp_fpr = emp_comp_fpr.mean(axis=0)

        edu_major_tpr = edu_major_tpr.mean(axis=0)
        edu_major_fpr = edu_major_fpr.mean(axis=0)

        edu_inst_tpr = edu_inst_tpr.mean(axis=0)
        edu_inst_fpr = edu_inst_fpr.mean(axis=0)

        lw=2
        plt.subplot(221)
        plt.plot(emp_pos_fpr, emp_pos_tpr, color='g', linestyle='--', label='EMP-POS', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(222)
        plt.plot(emp_comp_fpr, emp_comp_tpr, color='g', linestyle='--', label='EMP-COMP', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(223)
        plt.plot(edu_major_fpr, edu_major_tpr, color='g', linestyle='--', label='EDU-MAJOR', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(224)
        plt.plot(edu_inst_fpr, edu_inst_tpr, color='g', linestyle='--', label='EDU-INST', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.show()
        """

        return emp_pos_scores, emp_comp_scores, edu_inst_scores, edu_major_scores
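resample_data is not shown in this listing. A plausible sketch of its contract, inferred from the call site (bootstrap with replacement, out-of-bag leftovers as the test set):

    import random

    def resample_data(docs, sample_size, return_leftovers=False):
        # hypothetical stand-in: draw sample_size documents with
        # replacement; out-of-bag documents are those never drawn
        picked = [random.randrange(len(docs)) for _ in range(sample_size)]
        sample = [docs[i] for i in picked]
        if not return_leftovers:
            return sample
        oob = [doc for i, doc in enumerate(docs) if i not in set(picked)]
        return sample, oob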
Example 7
class CrfSuite(Tags):
    __separator = "/"
    __crf_model_name = "current_crf_model.pkl"

    def __init__(self):
        self.logger = Logger()
        self.logger.println("CrfSuite created")

    def train_model(self, X, y):
        self.logger.println("transforming data to train model")
        X_combined = list(chain.from_iterable(X))
        y_combined = list(chain.from_iterable(y))

        self.logger.println("crf trainer init")
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.35,
                                   c2=0.35,
                                   max_iterations=125,
                                   all_possible_transitions=True,
                                   verbose=False)
        crf.fit(X_combined, y_combined)
        return crf

    def save_model(self, model, name=__crf_model_name):
        joblib.dump(model, name)

    def load_model(self, name=__crf_model_name):
        return joblib.load(name)

    def score_model(self, y_true, y_pred):
        lb = LabelBinarizer()

        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])

        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

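        # weighted F1 over the binarized tag columns, restricted to the
        # non-'O' labels so the dominant outside tag does not inflate it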
        return f1_score(y_true_combined,
                        y_pred_combined,
                        average="weighted",
                        labels=[class_indices[cls] for cls in tagset])

    def print_classification_report(self, y_true, y_pred):
        lb = LabelBinarizer()

        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])

        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        # TODO return the f1 score or another metric here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
        print(
            classification_report(
                y_true_combined,
                y_pred_combined,
                labels=[class_indices[cls] for cls in tagset],
                target_names=tagset))

    def load_tagger(self):
        self.__trained_tagger = pycrfsuite.Tagger()
        self.__trained_tagger.open('test_NER.crfsuite')

        we_model = WeModel()
        self.w2v_model = we_model.read()
        dataset = Dataset()
        data = dataset.read(nr_of_files=-1)
        word2count, word2idx = dataset.encode_dataset(data)
        self.f_generator = FeatureGenerator(self.w2v_model, word2count,
                                            word2idx)

    # doc: in format of tagged tuples
    def tag_doc(self, doc):
        feature_input = self.f_generator.generate_features_docs([doc])
        model = self.load_model()
        """
        xseq = []
        for line_idx, line in enumerate(feature_input[0]):
            for token_idx, token in enumerate(line):
                xseq.append(token)
        """

        predicted_tags = model.predict(feature_input[0])

        # TODO change this to take in doc and not xseq (convert predicted tags
        # to the structure of doc)
        return self.interpret_predicted_tags(doc, predicted_tags)

    def interpret_predicted_tags(self, doc, tags):
        dataset = Dataset()
        identified_entities = []
        doc = dataset.docs2lines(doc)
        tags = dataset.docs2lines(tags)
        for tag_idx, tag in enumerate(tags):
            if tag in Tags.start_tagset:
                entity_found = ""
                tag_idx_forward = tag_idx
                # walk forward until the next outside tag; note this merges
                # directly adjacent entities into a single span
                while True:
                    if tag_idx_forward >= len(tags) or tags[
                            tag_idx_forward] == self._Tags__outside_tag:
                        break
                    entity_found = entity_found + " " + doc[tag_idx_forward][0]
                    tag_idx_forward += 1

                identified_entities.append(
                    (entity_found.strip(), tags[tag_idx]))

        return identified_entities

    # use an existing model to tag data
    def test_model(self, model, features):
        X_features = list(chain.from_iterable(features))
        y_pred = model.predict(X_features)
        return y_pred

    # hyperparameter optimisation
    def optimise_model(self, X, y):
        # prepare data structure
        xseq = []
        yseq = []
        # transform data structure to group tokens by lines
        for doc_x, doc_y in zip(X, y):
            for line_idx, line in enumerate(doc_x):
                xseq.append(line)
                yseq.append(doc_y[line_idx])

        # define fixed parameters and parameters to search
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=200,
                                   all_possible_transitions=True,
                                   verbose=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.03),
            'c2': scipy.stats.expon(scale=0.03),
        }

        # copy before removing 'O': Tags.tag_list is a shared class attribute
        labels = list(Tags.tag_list)
        labels.remove('O')
        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted',
                                labels=labels)

        # search
        rs = RandomizedSearchCV(crf,
                                params_space,
                                cv=5,
                                verbose=1,
                                n_jobs=2,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(xseq, yseq)

        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                            1000000))

        # RandomizedSearchCV.grid_scores_ was removed from scikit-learn;
        # cv_results_ exposes the same information
        _x = [params['c1'] for params in rs.cv_results_['params']]
        _y = [params['c2'] for params in rs.cv_results_['params']]
        _c = rs.cv_results_['mean_test_score']

        fig = plt.figure()
        fig.set_size_inches(12, 12)
        ax = plt.gca()
        ax.set_yscale('log')
        ax.set_xscale('log')
        ax.set_xlabel('C1')
        ax.set_ylabel('C2')
        ax.set_title(
            "Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})"
            .format(min(_c), max(_c)))

        ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0, 0, 0])

        print("Dark blue => {:0.4}, dark red => {:0.4}".format(
            min(_c), max(_c)))
        plt.show()

    def plot_learning_curve(self, X, y):
        train_sizes = np.linspace(.1, 1.0, 5)
        n_jobs = 8
        title = "Learning Curves"
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        plt.figure()
        plt.title(title)
        plt.ylim(0.01, 1.01)
        plt.xlabel("Training examples")
        plt.ylabel("Score")

        X_lines = []
        y_lines = []
        for doc_x, doc_y in zip(X, y):
            for line_idx, line in enumerate(doc_x):
                X_lines.append(line)
                y_lines.append(doc_y[line_idx])

        estimator = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                         c1=0.001,
                                         c2=0.001,
                                         max_iterations=110,
                                         all_possible_transitions=True,
                                         verbose=True)
        custom_scorer = make_scorer(self.score_model, greater_is_better=True)

        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X_lines,
            y_lines,
            cv=cv,
            scoring=custom_scorer,
            n_jobs=n_jobs,
            train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-validation score")

        plt.legend(loc="best")
        return plt
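
plot_learning_curve returns the matplotlib module so the caller decides when to render, as Example 4 does with plt.show(). A hedged usage sketch, assuming train_features and y_train are built as in Example 4:

    cs = CrfSuite()
    plot = cs.plot_learning_curve(train_features, y_train)
    plot.savefig("learning_curve.png")  # or plot.show(), as in Example 4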