def test_crf_score(xseq, yseq, algorithm):
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])

    score = crf.score([xseq], [yseq])
    if algorithm != 'ap':
        assert score == 1.0
    else:  # Averaged Perceptron is regularized too much
        assert score > 0.8
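# The test above relies on pytest fixtures (xseq, yseq, algorithm) defined
# elsewhere. A minimal sketch of what they might look like, assuming one toy
# sequence and parametrization over crfsuite's training algorithms; the
# feature values and labels here are illustrative, not the project's actual
# conftest.
import pytest

@pytest.fixture
def xseq():
    # One sequence: a list of per-token feature dicts.
    return [{'walk': 1.0}, {'shop': 0.5, 'walk': 0.5}, {'clean': 1.0}]

@pytest.fixture
def yseq():
    # The matching label sequence, one label per token.
    return ['sunny', 'sunny', 'rainy']

@pytest.fixture(params=['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'])
def algorithm(request):
    # crfsuite's five training algorithms.
    return request.param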
class CRFEvaluateStep(Step):
    """Step to evaluate testing data against a CRF model stored on file."""

    def __init__(self, model_file_path):
        self.model_file_path = path.abspath(path.expanduser(model_file_path))
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """Runs the CRF model on the batches and prints evaluation metrics."""
        st = time.time()
        x = []
        y = []
        # For prediction, CRF does not implement batching, so we collect
        # all batches into flat lists first.
        for batch in batches:
            b = list(batch)
            x.extend(b[0])
            y.extend(b[1])

        accuracy = self.model.score(x, y)
        y_pred = self.model.predict(x)
        f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
        accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
        classification_report = metrics.flat_classification_report(
            y, y_pred, labels=self.model.classes_)

        print("*" * 80)
        print("MODEL EVALUATION")
        print("*" * 80)
        print("Token-wise accuracy score on Test Data:")
        print(round(accuracy, 3))
        print("F1 score on Test Data:")
        print(round(f1_score, 3))
        print("Sequence accuracy score (% of sentences scored 100% correctly):")
        print(round(accuracy_sentence, 3))
        print("Class-wise classification report:")
        print(classification_report)

        et = time.time()
        print(f"Evaluation finished in {round(et - st, 2)} seconds.")
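# A minimal usage sketch for the step above, assuming a previously trained
# model file on disk and a pipeline that yields (features, labels) batches;
# the batch shape, file path, and driver code are illustrative assumptions,
# not a documented API.
def make_batches():
    # Each batch pairs a list of feature-dict sequences with a list of
    # label sequences.
    yield ([[{'bias': 1.0}, {'bias': 1.0}]], [['B-ORG', 'O']])

step = CRFEvaluateStep('~/models/ner.crfsuite')
step.run(make_batches())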
def test_crf_model_filename(xseq, yseq, tmpdir):
    path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(path)

    # model file is created at a specified location
    crf = CRF(model_filename=path)
    crf.fit([xseq], [yseq])
    assert os.path.exists(path)

    # it is possible to load the model just by passing a file name
    crf2 = CRF(model_filename=path)
    assert crf2.score([xseq], [yseq]) == 1.0

    # crf is picklable
    data = pickle.dumps(crf, protocol=pickle.HIGHEST_PROTOCOL)
    crf3 = pickle.loads(data)
    assert crf3.score([xseq], [yseq]) == 1.0
def train(self, test_size=0.25, dumper=None):
    dataset, word_dictionary = self.datasource()
    if self.tokenizer:
        self.model.synonyms = self.tokenizer.synonyms
        self.tokenizer.word_dictionary = word_dictionary
    self.model.word_dictionary = word_dictionary
    self.dataset = dataset

    best_classifier = None
    max_accuracy = 0
    clf = None
    if len(self.dataset) == 0:
        return

    train_set, test_set = train_test_split(self.dataset,
                                           test_size=test_size,
                                           random_state=10)
    if not train_set or self.is_overfitting:
        train_set = self.dataset
        test_set = self.dataset

    taggers = list()
    if self.classifiers[0].__name__ == 'CRF':
        from sklearn_crfsuite import metrics

        X_train, y_train = self.crf_transform_to_dataset(train_set)
        X_test, y_test = self.crf_transform_to_dataset(test_set)
        if dumper:
            self.dumper = dumper
            dumper(X_train, self.__class__.__name__.lower() + 'X_train.txt')
            dumper(X_test, self.__class__.__name__.lower() + 'X_test.txt')
        print('Train_set %s' % len(X_train))
        print('Test_set %s' % len(X_test))

        clf = CRF()
        clf.fit(X_train, y_train)
        accuracy = clf.score(X_test, y_test)
        max_accuracy = accuracy

        if self.is_overfitting:
            # Print the F1 score of each label, skipping internal
            # labels that start with an underscore
            y_pred = clf.predict(X_test)
            labels = [label for label in clf.classes_
                      if not label.startswith('_')]
            print(metrics.flat_classification_report(
                y_test, y_pred, labels=labels, digits=3))
    else:
        for feature_extraction in self.feature_extractions:
            X_train, y_train = self.classify_transform_to_dataset(train_set)
            X_test, y_test = self.classify_transform_to_dataset(test_set)
            for classifier in self.classifiers:
                steps = list()
                steps.append(feature_extraction)
                if self.model.use_tfidf:
                    steps.append(('tfidf', TfidfTransformer()))
                steps.append(self.get_classifier(classifier))
                clf = Pipeline(steps)
                try:
                    clf.fit(X_train, y_train)
                except Exception as e:
                    print('ERROR', e)
                    continue

                y_pred = clf.predict(X_test)
                classes = list(clf.classes_)
                from sklearn import metrics
                print(metrics.classification_report(
                    y_test, y_pred, target_names=classes, digits=3))
                accuracy = clf.score(X_test, y_test)
                print('feature extraction %s, classifier %s, accuracy: %s' %
                      (feature_extraction[0], classifier.__name__, accuracy))
                if accuracy >= max_accuracy:
                    max_accuracy = accuracy
                    best_classifier = clf

    if not best_classifier:
        best_classifier = clf
    feature_extraction = ('dict'
                          if best_classifier.__class__.__name__ == 'CRF'
                          else best_classifier.steps[0][0])
    classifier_name = (best_classifier.__class__.__name__
                       if best_classifier.__class__.__name__ == 'CRF'
                       else best_classifier.steps[-1][1].__class__.__name__)
    print('Best model: feature extraction %s, classifier %s, accuracy: %s' %
          (feature_extraction, classifier_name, max_accuracy))

    self.model.pipeline = best_classifier
    if feature_extraction == 'count':
        self.model.pipeline.steps[0][1].tokenizer = None
    self.model.build_version = time.time()
    self.dataset = zip(X_train, y_train)
    self.taggers = taggers
    return self.model
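# crf_transform_to_dataset is not shown above. A plausible sketch, written as
# a free function for brevity, assuming each sample in the set is a list of
# (token, tag) pairs; the feature layout is an illustrative assumption.
def crf_transform_to_dataset(tagged_samples):
    # Turn [(token, tag), ...] sequences into parallel lists of per-token
    # feature dicts (X) and tag lists (y).
    X, y = [], []
    for sample in tagged_samples:
        X.append([{'word': token, 'is_title': token.istitle()}
                  for token, _ in sample])
        y.append([tag for _, tag in sample])
    return X, y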
import spacy
import en_core_web_sm

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

# Cross-validated predictions and a per-label report
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

crf.fit(X, y)
print(crf.score(X, y))

nlp = en_core_web_sm.load()

def pos_tagger(sent):
    # POS-tag a sentence with spaCy, returning (token, tag) pairs.
    doc = nlp(sent)
    return [(token.text, token.tag_) for token in doc]

x = crf.predict_single(
    sent2features(pos_tagger("Jim bought 300 shares of Acme Corp. in 2006")))
print(x)
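# sent2features is assumed to be defined elsewhere in the script. A minimal
# sketch in the style of the sklearn-crfsuite tutorial, operating on the
# (token, tag) pairs produced by pos_tagger above; the exact feature set is
# an assumption.
def word2features(sent, i):
    word, postag = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        prev_word, prev_postag = sent[i - 1]
        features.update({'-1:word.lower()': prev_word.lower(),
                         '-1:postag': prev_postag})
    else:
        features['BOS'] = True  # beginning of sentence
    if i == len(sent) - 1:
        features['EOS'] = True  # end of sentence
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]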
print(len(tagged_sentences))
total = int(len(tagged_sentences) * 0.80)
print(total)

X1_train, y1_train = transform_to_dataset(tagged_sentences[:total])
print(len(tagged_sentences[:total]))
print(len(tagged_sentences[total:]))
X_test, y_test = transform_to_dataset(tagged_sentences[total:])

# Declare the model using the AROW algorithm:
# Adaptive Regularization Of Weight vectors, used here to POS-tag words
# from the features provided to the model.
model = CRF(algorithm='arow',
            max_iterations=100,
            all_possible_transitions=True)

# Training
model.fit(X1_train, y1_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(model.score(X1_train, y1_train))

# Save the model
# from joblib import dump, load
# dump(model, 'model.joblib')
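# transform_to_dataset is defined elsewhere. A plausible sketch in the usual
# POS-tagging tutorial style, assuming each tagged sentence is a list of
# (word, tag) pairs; the features() helper and its feature set are assumptions.
def features(sentence, index):
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': word[:1].upper() == word[:1],
        'suffix-3': word[-3:],
    }

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        words = [w for w, _ in tagged]
        X.append([features(words, i) for i in range(len(words))])
        y.append([t for _, t in tagged])
    return X, y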
model = CRF(c2=0.1,
            max_iterations=100,
            all_possible_transitions=True)

# Training
model.fit(X1_train, y1_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))

# score() expects the true labels, not the predictions
print(model.score(X_test, y_test))

# Save the model
from joblib import dump, load
dump(model, 'model.joblib')
            for sentence in f_dev.read().split("\n\n") if sentence != ""]

        with codecs.open(args.train, encoding="utf-8") as f_train:
            train_samples = [[l.split("\t") for l in sentence.split("\n")]
                             for sentence in f_train.read().split("\n\n")
                             if sentence != ""]

        X_train = [
            sent2features(s, args.prev_context, args.next_context)
            for s in train_samples
        ]
        y_train = [sent2labels(s) for s in train_samples]
        X_dev = [
            sent2features(s, args.prev_context, args.next_context)
            for s in dev_samples
        ]
        y_dev = [sent2labels(s) for s in dev_samples]

        crf.fit(X_train, y_train)
        y_pred = crf.predict(X_dev)
        print("F-score:", flat_f1_score(y_dev, y_pred, average='weighted'))
        print("Accuracy:", crf.score(X_dev, y_dev))

        with codecs.open(args.model + ".crf.pickle", "wb") as f:
            pickle.dump((crf, args.prev_context, args.next_context), f)
    else:
        raise NotImplementedError
else:
    raise NotImplementedError
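# The pickle above stores the model together with its context-window sizes,
# so both can be restored for prediction. A minimal loading sketch, assuming
# the sent2features helper from the training script; the file name and the
# sample below are illustrative.
import pickle

with open("model.crf.pickle", "rb") as f:
    crf, prev_context, next_context = pickle.load(f)

# A new sample in the same per-token, tab-split field format as the training data.
sample = [["Jim", "NNP"], ["bought", "VBD"], ["shares", "NNS"]]
print(crf.predict_single(sent2features(sample, prev_context, next_context)))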
y_dev = [sent2label(sent) for sent in dev_sent]
X_test = [
    chunking_sent2features(sent=sent, mode='test') for sent in test_sents
]
y_test = [sent2label(sent) for sent in test_sents]

# Build the model
print("=======================")
print("Build the model ...")
model = CRF(algorithm='lbfgs',
            all_possible_transitions=True,
            c1=0.5,
            c2=0.005,
            max_iterations=MAX_ITER,
            delta=DELTA,
            period=30,
            verbose=True)

print("Training ...")
model.fit(X_train, y_train, X_dev, y_dev)

print("Done training! Saving the model to /models/" + MODEL_NAME)
pickle.dump(model, open(MODEL_NAME + ".pkl", "wb"), protocol=2)
print("Done!!!")

print("=======================")
print("Testing ....")
score = model.score(X_test, y_test)
print(score)
print("Done!!!")
X_train, y_train = transform_to_dataset(tagged_sentences)

# Declare the model using the AROW algorithm
# (Adaptive Regularization Of Weight vectors),
# used here to POS-tag words from the features provided to the model.
model = CRF(algorithm='arow',
            max_iterations=100,
            all_possible_transitions=True)

# Training
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
labels = list(model.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(model.score(X_test, y_test))

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))

from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
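# print_transitions can be fed the learned transition weights;
# CRF.transition_features_ is the attribute sklearn-crfsuite exposes for
# this, as in the library's tutorial.
common = Counter(model.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(common[:10])

print("\nTop unlikely transitions:")
print_transitions(common[-10:])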