def test_ensure_safe(self):
        """Smoke-test fitting and predicting on empty sequences.

        Every example is passed through ``_ensure_safe`` before a fresh
        ``CRF`` is fitted on it and asked for a single prediction.
        """
        degenerate_cases = (
            ([[]], [[]]),
            ([[], []], [[], []]),
        )

        # Nothing is asserted on purpose: according to the original note the
        # run segfaults otherwise, so completing the loop is the check.
        for raw_x, raw_y in degenerate_cases:
            safe_x, safe_y = _ensure_safe(raw_x, raw_y)
            fitted = CRF().fit(safe_x, safe_y)
            fitted.predict_single([""])
Beispiel #2
0
# Configure the CRF estimator (L-BFGS, c1 = c2 = 0.1, at most 100 iterations).
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# Cross-validated predictions over X / y using 5 folds.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

# Per-label report comparing the cross-validated predictions with y.
report = flat_classification_report(y_true=y, y_pred=pred)

# Fit on the full data so the estimator itself can be scored and queried.
crf.fit(X, y)

import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
def pos_tagger(sent):
    """Return a list of ``(token text, tag)`` pairs for *sent*.

    The sentence is run through the module-level ``nlp`` pipeline; each
    token of the result contributes its ``text`` and ``tag_`` attributes.
    """
    return [(tok.text, tok.tag_) for tok in nlp(sent)]

print(report)

crf.score(X, y)

# Tag one sample sentence: POS-tag it, turn the tagged tokens into
# features with sent2features, then ask the fitted CRF for its labels.
tagged_tokens = pos_tagger("Jim bought 300 shares of Acme Corp. in 2006")
x = crf.predict_single(sent2features(tagged_tokens))
print(x)

Beispiel #3
0
from Sents2Features import string2Features
from sklearn_crfsuite import CRF
import configparser

# Read config.ini; its [Names] section supplies the model file name.
config = configparser.ConfigParser()
config.read('config.ini')
names = config['Names']

# NOTE(review): fit() is never called here -- presumably the estimator
# loads an already trained model from ``model_filename``; confirm.
crf = CRF(
    algorithm='lbfgs',
    c1=0.0001,
    c2=0.0001,
    max_iterations=100,
    all_possible_transitions=False,
    model_filename=names['model_name'],
)

# Convert one sentence to features and print the predicted labels.
sentence = 'Ég kenni stærðfræði í Háskóla Íslands öll virk kvöld'
print(crf.predict_single(string2Features(sentence)))
Beispiel #4
0
class Model:
    """Wrapper around a ``CRF`` estimator for sequence labelling.

    The constructor arguments are stored so that ``hyper_param_tune`` can
    rebuild the estimator with different ``c1`` / ``c2`` values.
    """

    def __init__(self, algo: str = 'lbfgs', min_freq: int = 0,
                 all_states: bool = False, max_iter: int = 100,
                 epsilon: float = 1e-5, delta: float = 1e-5):
        """
        create the underlying CRF estimator

        :param algo: optimization algorithm (lbfgs, l2sgd, ap, pa, arow)
        :param min_freq: threshold of ignoring feature
        :param all_states: if True, consider combinations
                           of missing features and labels
        :param max_iter: max iteration size
        :param epsilon: convergence tolerance, forwarded to the CRF as
                        ``epsilon`` (a stopping condition, not a learning
                        rate -- see the sklearn-crfsuite CRF reference)
        :param delta: stop training threshold
        """

        self._algo = algo
        self._min_freq = min_freq
        self._all_states = all_states
        self._max_iter = max_iter
        self._epsilon = epsilon
        self._delta = delta
        self.model = self._build_crf()

    def _build_crf(self, **overrides):
        """
        build a CRF from the stored constructor settings
        :param overrides: extra keyword arguments for CRF (e.g. c1, c2)
        :return: a new, unfitted CRF estimator
        """

        return CRF(algorithm=self._algo,
                   min_freq=self._min_freq,
                   all_possible_states=self._all_states,
                   max_iterations=self._max_iter,
                   epsilon=self._epsilon,
                   delta=self._delta,
                   **overrides)

    def _entity_labels(self) -> List[str]:
        """
        get the model's classes without the 'O' label
        :return: every entry of ``self.model.classes_`` except 'O'
        """

        # Filtering (instead of list.remove) does not raise when the
        # model has no 'O' class at all.
        return [label for label in self.model.classes_ if label != 'O']

    def train(self, features: List[List[Dict[str, Any]]],
              labels: List[List[str]]):
        """
        train CRF model using dataset features and labels
        :param features: features of sentences
        :param labels: labels of sentences
        :return:
        """

        self.model.fit(features, labels)

    def predict(self, features: List[Dict[str, Any]]) -> List[str]:
        """
        predict NE labels of a sentence
        :param features: features of a sentence
        :return: labels of a sentence
        """

        return self.model.predict_single(features)

    def predict_all(self, features: List[List[Dict[str, Any]]])\
            -> List[List[str]]:
        """
        predict NE labels of sentences
        :param features: features of sentences
        :return: labels of sentences
        """

        return self.model.predict(features)

    def label_types(self) -> List[str]:
        """
        get label types of dataset
        :return: sorted, de-duplicated label types; the first two
                 characters of each non-'O' class (e.g. the 'B-' of
                 'B-PER') are stripped
        """

        return sorted({label[2:] for label in self._entity_labels()})

    def hyper_param_tune(self,
                         train_features: List[List[Dict[str, Any]]],
                         train_labels: List[List[str]],
                         dev_features: List[List[Dict[str, Any]]],
                         dev_labels: List[List[str]]) -> None:
        """
        execute hyper parameter tuning with grid search

        Every (c1, c2) pair from a fixed 3 x 3 grid is trained on the
        training data and scored on the dev data with the weighted F1
        over all non-'O' labels.  The best-scoring estimator is kept in
        ``self.model``; on ties the earliest candidate wins.

        :param train_features: features of training sentences
        :param train_labels: labels of training sentences
        :param dev_features: features of development sentences
        :param dev_labels: labels of development sentences
        :return: None
        """

        c1 = [0.01, 0.05, 0.1]
        c2 = [0.01, 0.05, 0.1]

        best_score = None
        best_model = None
        for c1_ in c1:
            for c2_ in c2:
                self.model = self._build_crf(c1=c1_, c2=c2_)
                self.train(train_features, train_labels)
                predicted = self.predict_all(dev_features)
                score = flat_f1_score(dev_labels, predicted,
                                      average='weighted',
                                      labels=self._entity_labels())
                # Always accept the first candidate so that self.model is
                # never left as None when every score is 0.
                if best_model is None or score > best_score:
                    best_score = score
                    best_model = self.model
        self.model = best_model