Example #1
def test_crf_score(xseq, yseq, algorithm):
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])

    score = crf.score([xseq], [yseq])
    if algorithm != 'ap':
        assert score == 1.0
    else:  # Averaged Perceptron is regularized too much
        assert score > 0.8
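The test above relies on pytest fixtures named xseq, yseq, and algorithm that are not shown. A minimal sketch of what they could look like; the feature dicts and labels here are illustrative, not the library's actual test data:

import pytest
from sklearn_crfsuite import CRF


@pytest.fixture
def xseq():
    # One training sequence: a list of per-token feature dicts.
    return [{'walk': 1.0, 'shop': 0.5}, {'walk': 1.0}, {'clean': 1.0}]


@pytest.fixture
def yseq():
    # Matching labels, one per token.
    return ['sunny', 'sunny', 'rainy']


@pytest.fixture(params=['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'])
def algorithm(request):
    # The training algorithms sklearn-crfsuite accepts.
    return request.param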
Example #3
class CRFEvaluateStep(Step):
    """
    Step to evaluate testing data against a CRF model,
    stored on file
    """
    def __init__(self, model_file_path):
        self.model_file_path = path.abspath(path.expanduser(model_file_path))
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """
        Runs the CRF model, storing to pickle in the end
        """
        st = time.time()

        x = []
        y = []

        # For prediction, CRF does not implement batching, so we pass a list
        for batch in batches:
            b = list(batch)
            x.extend(b[0])
            y.extend(b[1])

        accuracy = self.model.score(x, y)
        y_pred = self.model.predict(x)
        f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
        accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
        classification_report = metrics.flat_classification_report(
            y, y_pred, labels=self.model.classes_)
        print("*" * 80)
        print("MODEL EVALUATION")
        print("*" * 80)
        print("Token-wise accuracy score on Test Data:")
        print(round(accuracy, 3))
        print("F1 score on Test Data:")
        print(round(f1_score, 3))
        print(
            "Sequence accurancy score (% of sentences scored 100% correctly):")
        print(round(accuracy_sentence, 3))
        print("Class-wise classification report:")
        print(classification_report)
        et = time.time()
        print(f"Evaluation finished in {round(et-st, 2)} seconds.")
Example #4
def test_crf_model_filename(xseq, yseq, tmpdir):
    path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(path)

    # model file is created at a specified location
    crf = CRF(model_filename=path)
    crf.fit([xseq], [yseq])
    assert os.path.exists(path)

    # it is possible to load the model just by passing a file name
    crf2 = CRF(model_filename=path)
    assert crf2.score([xseq], [yseq]) == 1.0

    # crf is picklable
    data = pickle.dumps(crf, protocol=pickle.HIGHEST_PROTOCOL)
    crf3 = pickle.loads(data)
    assert crf3.score([xseq], [yseq]) == 1.0
Example #6
    def train(self, test_size=0.25, dumper=None):

        dataset, word_dictionary = self.datasource()

        if self.tokenizer:
            self.model.synonyms = self.tokenizer.synonyms
            self.tokenizer.word_dictionary = word_dictionary
        self.model.word_dictionary = word_dictionary
        self.dataset = dataset

        best_classifier = None
        max_accuracy = 0
        clf = None

        # print('Dataset %s' % len(self.dataset))
        if len(self.dataset) == 0:
            return

        train_set, test_set = train_test_split(self.dataset,
                                               test_size=test_size,
                                               random_state=10)

        if not train_set or self.is_overfitting:
            train_set = self.dataset
            test_set = self.dataset

        taggers = list()

        if self.classifiers[0].__name__ == 'CRF':
            from sklearn_crfsuite import metrics
            X_train, y_train = self.crf_transform_to_dataset(train_set)
            X_test, y_test = self.crf_transform_to_dataset(test_set)

            if dumper:
                self.dumper = dumper
                dumper(X_train,
                       self.__class__.__name__.lower() + 'X_train.txt')
                dumper(X_test, self.__class__.__name__.lower() + 'X_test.txt')

            print('Train_set %s' % len(X_train))
            print('Test_set %s' % len(X_test))
            # print(len(X_train), len(y_train))

            clf = CRF()
            clf.fit(X_train, y_train)

            accuracy = clf.score(X_test, y_test)
            max_accuracy = accuracy

            # Print F1 score of each label
            if self.is_overfitting:
                y_pred = clf.predict(X_test)
                classes = list(clf.classes_)
                labels = []
                for label in classes:
                    if label[:1] != '_':
                        labels.append(label)
                print(
                    metrics.flat_classification_report(y_test,
                                                       y_pred,
                                                       labels=labels,
                                                       digits=3))
            else:
                accuracy = clf.score(X_test, y_test)
                max_accuracy = accuracy

        else:

            for feature_extraction in self.feature_extractions:

                X_train, y_train = self.classify_transform_to_dataset(
                    train_set)
                X_test, y_test = self.classify_transform_to_dataset(test_set)
                for classifier in self.classifiers:
                    steps = list()
                    steps.append(feature_extraction)
                    if self.model.use_tfidf:
                        steps.append(('tfidf', TfidfTransformer()))
                    steps.append(self.get_classifier(classifier))
                    clf = Pipeline(steps)

                    try:
                        clf.fit(X_train, y_train)
                    except Exception as e:
                        print('ERROR', e)
                        continue

                    y_pred = clf.predict(X_test)
                    classes = list(clf.classes_)
                    # print(classes)
                    from sklearn import metrics
                    print(
                        metrics.classification_report(y_test,
                                                      y_pred,
                                                      target_names=classes,
                                                      digits=3))
                    accuracy = clf.score(X_test, y_test)

                    # for y1,y2,x in zip(y_test,y_pred,X_test):
                    #     if y1!=y2:
                    #         print('Sentence: ',x)
                    #         print('True label',y1)
                    #         print('Predict label',y2)


                    print('feature extraction %s, classifier %s, accuracy: %s' % \
                          (feature_extraction[0], classifier.__name__, accuracy))

                    if accuracy >= max_accuracy:
                        max_accuracy = accuracy
                        best_classifier = clf

        if not best_classifier:
            best_classifier = clf

        feature_extraction = 'dict' if best_classifier.__class__.__name__ == 'CRF' \
            else best_classifier.steps[0][0]

        classifier_name = best_classifier.__class__.__name__ if best_classifier.__class__.__name__ == 'CRF' \
            else best_classifier.steps[-1][1].__class__.__name__


        print('Best model: feature extraction %s, classifier %s, accuracy: %s' % \
              (feature_extraction, classifier_name, max_accuracy))

        self.model.pipeline = best_classifier

        if feature_extraction == 'count':
            self.model.pipeline.steps[0][1].tokenizer = None

        self.model.build_version = time.time()

        self.dataset = list(zip(X_train, y_train))  # zip() is lazy in Python 3
        self.taggers = taggers

        return self.model
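The method above calls a crf_transform_to_dataset helper that is not shown. A hedged sketch of what such a helper usually does for sklearn-crfsuite, assuming the dataset holds (word, tag) sentences; the feature set is illustrative:

def crf_transform_to_dataset(self, tagged_sentences):
    # sklearn-crfsuite expects one feature dict per token (X) and one
    # label list per sentence (y); the features chosen here are illustrative.
    X, y = [], []
    for sentence in tagged_sentences:
        words = [word for word, _tag in sentence]
        X.append([{
            'word.lower()': word.lower(),
            'word.istitle()': word.istitle(),
            'prev_word': words[i - 1].lower() if i > 0 else '<BOS>',
            'next_word': words[i + 1].lower() if i + 1 < len(words) else '<EOS>',
        } for i, word in enumerate(words)])
        y.append([tag for _word, tag in sentence])
    return X, y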
Example #7
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

report = flat_classification_report(y_pred=pred, y_true=y)

crf.fit(X, y)

import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
def pos_tagger(sent):
    doc = nlp(sent)
    sent_list = []
    for token in doc:
        sent_list.append((token.text, token.tag_))
    return sent_list

print(report)

print(crf.score(X, y))

x = crf.predict_single(
    sent2features(pos_tagger("Jim bought 300 shares of Acme Corp. in 2006")))
print(x)
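predict_single above feeds the (token, POS-tag) pairs from pos_tagger through a sent2features helper that is not shown. A minimal sketch under that assumption; the feature names are illustrative:

def word2features(sent, i):
    # Illustrative per-token features built from a (text, POS-tag) pair.
    word, postag = sent[i]
    return {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'BOS': i == 0,
        'EOS': i == len(sent) - 1,
    }


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]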

Example #8

print(len(tagged_sentences))
total = int(len(tagged_sentences) * 0.80)
print(total)
#train=tagged_sentences

X1_train, y1_train = transform_to_dataset(tagged_sentences[:total])
print(len(tagged_sentences[:total]))
print(len(tagged_sentences[total:]))

X_test, y_test = transform_to_dataset(tagged_sentences[total:])
# declare the model using the AROW algorithm
model = CRF(
    algorithm='arow',  # Adaptive Regularization Of Weight Vectors (AROW).
    # Used to POS-tag words from the features supplied to the model.
    max_iterations=100,
    all_possible_transitions=True)
# training
model.fit(X1_train, y1_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(model.score(X1_train, y1_train))
# Save the model

#from joblib import dump, load
#dump(model, 'model.joblib')
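The commented-out lines hint at persisting the trained model; spelled out, the joblib round trip looks like this:

from joblib import dump, load

dump(model, 'model.joblib')   # persist the trained CRF
model = load('model.joblib')  # reload it in a later session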
Example #9
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# training
model.fit(X1_train, y1_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred,
                            average='weighted', labels=labels))

# Save the model

#from joblib import dump, load
#dump(model, 'model.joblib')
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
##print(metrics.flat_classification_report(
##    y_test, y_pred, labels=sorted_labels, digits=3
##))

print(model.score(X_test, y_test))
model.fit(X_train, y_train)

from joblib import dump, load
dump(model, 'model.joblib')
Example #10

            # (opening reconstructed to mirror the train block below;
            # the args.dev attribute name is an assumption)
            with codecs.open(args.dev, encoding="utf-8") as f_dev:
                dev_samples = [[l.split("\t") for l in sentence.split("\n")]
                               for sentence in f_dev.read().split("\n\n")
                               if sentence != ""]

            with codecs.open(args.train, encoding="utf-8") as f_train:
                train_samples = [[l.split("\t") for l in sentence.split("\n")]
                                 for sentence in f_train.read().split("\n\n")
                                 if sentence != ""]

            X_train = [
                sent2features(s, args.prev_context, args.next_context)
                for s in train_samples
            ]
            y_train = [sent2labels(s) for s in train_samples]
            X_dev = [
                sent2features(s, args.prev_context, args.next_context)
                for s in dev_samples
            ]
            y_dev = [sent2labels(s) for s in dev_samples]
            crf.fit(X_train, y_train)
            y_pred = crf.predict(X_dev)
            print "F-score", flat_f1_score(y_dev, y_pred, average='weighted')
            print "Accuracy:", crf.score(X_dev, y_dev)
            with codecs.open(args.model + ".crf.pickle", "wb") as f:
                pickle.dump((crf, args.prev_context, args.next_context), f)

        else:
            raise NotImplementedError

    else:
        raise NotImplementedError
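The pickle written above bundles the model with its context-window sizes; loading it back mirrors the dump (a sketch following the args.model naming convention used above):

import pickle

with open(args.model + ".crf.pickle", "rb") as f:
    crf, prev_context, next_context = pickle.load(f)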
Example #11
    y_dev = [sent2label(sent) for sent in dev_sent]

    X_test = [
        chunking_sent2features(sent=sent, mode='test') for sent in test_sents
    ]
    y_test = [sent2label(sent) for sent in test_sents]

    # Build the model
    print("=======================")
    print("Build the model ...")
    model = CRF(algorithm='lbfgs',
                all_possible_transitions=True,
                c1=0.5,
                c2=0.005,
                max_iterations=MAX_ITER,
                delta=DELTA,
                period=30,
                verbose=True)
    print("Training ...")
    model.fit(X_train, y_train, X_dev, y_dev)
    print("Done training! Saving the model to /models/" + MODEL_NAME)

    pickle.dump(model, open(MODEL_NAME + ".pkl", "wb"), protocol=2)
    print("Done!!!")

    print("=======================")
    print("Testing ....")
    score = model.score(X_test, y_test)
    print(score)
    print("Done!!!")
X_train, y_train = transform_to_dataset(tagged_sentences)

model = CRF(
    algorithm='arow',  # Adaptive Regularization Of Weight Vectors (AROW).
    # Used to POS-tag words from the features supplied to the model.
    max_iterations=100,
    all_possible_transitions=True)
# training
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(model.score(X_test, y_test))
# Save the model
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
#print(metrics.flat_classification_report(
#   y_test, y_pred, labels=sorted_labels, digits=3
#))

#print (model.classes_)
from collections import Counter


def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))