Python CRF.fit Exemples, sklearn_crfsuite.CRF.fit Python Exemples

Exemple #1

0

Afficher le fichier

def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    """Train a CRF tagger, persist it, and write predictions for a test corpus.

    The corpus at ``path_train`` is loaded (``take_first`` is forwarded to
    ``load_corpus``), featurised and used to fit an averaged-perceptron CRF.
    When ``dev_size`` is truthy a split of that size is held out and handed
    to ``fit`` as development data. The fitted feature extractor and model
    are dumped together to ``path_crf``; predictions for the corpus at
    ``path_test`` are written to ``path_pred``.
    """
    print("loading train corpus..")
    _, train_raw, train_labels = load_corpus(path_train, take_first=take_first)

    print("extracting features from train corpus..")
    extractor = TaggerFeatureExtractor()
    train_feats = extractor.fit_transform(tqdm(train_raw))

    print("training..")
    tagger = CRF(algorithm='ap', verbose=True, max_iterations=10)
    # No development data unless a split size was requested.
    dev_feats = dev_labels = None
    if dev_size:
        train_feats, dev_feats, train_labels, dev_labels = train_test_split(
            train_feats, train_labels, test_size=dev_size)
    tagger.fit(train_feats, train_labels, dev_feats, dev_labels)

    print("saving..")
    bundle = {'fe': extractor, 'crf': tagger}
    joblib.dump(bundle, path_crf, compress=2)

    print("loading test corpus..")
    corpus, test_raw, _ = load_corpus(path_test)
    print("extracting features from test corpus..")
    test_feats = extractor.transform(test_raw)
    print("predicting..")
    predictions = tagger.predict(tqdm(test_feats))

    print("saving results..")
    conll.write_sents(y_pred_to_sents_pred(corpus, predictions), path_pred)

Exemple #2

0

Afficher le fichier

Fichier : crf_tagger.py Projet : RikVN/Neural_DRS

def train(train_file, test_file, min_freq, model_file):
    """Train a CRF tagger on CoNLL data, optionally evaluating and saving it.

    Args:
        train_file: CoNLL file with the training documents. Column 0 of each
            line is used as the token and column 2 as the label.
        test_file: Optional held-out CoNLL file. When truthy, the flat
            accuracy on it is printed.
        min_freq: Forwarded to ``CRF(min_freq=...)``.
        model_file: Optional output path. When truthy, the fitted model is
            written there with ``joblib.dump``.
    """
    # Read in initial training data
    conll_data_train = read_conll_data(train_file)
    train_sents = [[line[0] for line in doc] for doc in conll_data_train]
    train_labels = [[line[2] for line in doc] for doc in conll_data_train]

    # Featurize and create instance from list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # Create and train CRF model
    # For different parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # Test the model on held out test set if wanted. The function's own
    # parameter is checked rather than a module-level ``args`` object, so the
    # function works when called without command-line parsing.
    if test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[line[0] for line in doc] for doc in conll_data_test]
        test_labels = [[line[2] for line in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)
        # Predicting and printing accuracy
        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        # Round after scaling so the percentage has no float artefacts
        # (e.g. 95.30000000000001).
        print("Accuracy: {0}%".format(round(acc * 100, 1)))
    # Save model to disk if wanted
    if model_file:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)

Exemple #3

0

Afficher le fichier

    def train_pos_tagger(self, path):
        """Train a CRF on the NLTK treebank corpus and pickle it to ``path``.

        The first 80% of the treebank tagged sentences are converted with
        ``self.transform_to_dataset`` and used for fitting. The fitted model
        is written to ``path`` and stored on ``self.classifier``.
        """
        # Just to make sure the corpus is available locally
        nltk.download('treebank')

        tagged_sentences = treebank.tagged_sents()

        train_size = int(.80 * len(tagged_sentences))
        training_sentences = tagged_sentences[:train_size]

        X_train, y_train = self.transform_to_dataset(training_sentences)

        model = CRF()

        print('Training started...')
        model.fit(X_train, y_train)
        print('Training finished.')

        # Save classifier to file; the context manager closes the handle even
        # when pickling fails.
        with open(path, 'wb') as model_pkl:
            pickle.dump(model, model_pkl)

        print("POSTagger saved.")

        self.classifier = model

Exemple #4

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_attributes(xseq, yseq):
    """Check that learned CRF attributes are None before ``fit`` and populated after."""
    crf = CRF()
    # Before fitting, every learned attribute is expected to be None.
    for name in (
        "tagger_",
        "size_",
        "classes_",
        "num_attributes_",
        "attributes_",
        "state_features_",
        "transition_features_",
    ):
        assert getattr(crf, name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {"sunny", "rainy"}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert "clean" in crf.attributes_

    # State features map (attribute, label) pairs to float weights.
    assert len(crf.state_features_) > 0
    for weight in crf.state_features_.values():
        assert isinstance(weight, float)
    state_keys_known = all(
        attr in crf.attributes_ and label in crf.classes_
        for (attr, label) in crf.state_features_.keys()
    )
    assert state_keys_known, crf.state_features_

    # Transition features map (label_from, label_to) pairs to float weights.
    assert len(crf.transition_features_) > 0
    for weight in crf.transition_features_.values():
        assert isinstance(weight, float)
    transition_keys_known = all(
        label_from in crf.classes_ and label_to in crf.classes_
        for (label_from, label_to) in crf.transition_features_.keys()
    )
    assert transition_keys_known, crf.transition_features_

Exemple #5

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf(xseq, yseq, algorithm):
    """Fit on a single sequence and check the model reproduces its labels."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    predicted = model.predict([xseq])

    if algorithm == "ap":
        # Averaged Perceptron is regularized too much; its output is not checked.
        return
    assert predicted == [yseq]

Exemple #6

0

Afficher le fichier

Fichier : train.py Projet : kittinan/thaiaddress

def train(file_path: str):
    """
    Train a CRF model on the addresses stored in ``file_path``.

    The file is read with ``jsonlines``; 25% of the records are held out
    (fixed ``random_state``) for validation. The weighted flat F1 score on
    the validation split, computed over every label except ``'O'``, is
    printed, and the fitted model is returned.
    """
    with jsonlines.open(file_path) as reader:
        addresses = list(reader)
    addresses_train, addresses_val = train_test_split(addresses,
                                                      test_size=0.25,
                                                      random_state=42)

    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Prediction score on validation set. The score used to be computed and
    # thrown away; report it so the evaluation is actually visible.
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(y_val,
                                     y_pred,
                                     average='weighted',
                                     labels=[l for l in LABELS if l != 'O'])
    print("Flat F1-Score on validation set = {}".format(f1_score))
    return crf

Exemple #7

0

Afficher le fichier

def train(file_path: str, model_path: str = None):
    """
    Fit a CRF on the addresses read from ``file_path`` and return it.

    A quarter of the addresses are held out (fixed ``random_state``) and the
    weighted flat F1 score over all labels except ``"O"`` is printed. When
    ``model_path`` is truthy the fitted model is also dumped there with
    joblib.
    """
    train_part, val_part = train_test_split(read_file(file_path),
                                            test_size=0.25,
                                            random_state=42)

    X_train, y_train = addresses_to_features(train_part)
    X_val, y_val = addresses_to_features(val_part)

    model = CRF(c1=0.2,
                c2=0.2,
                max_iterations=100,
                all_possible_transitions=True)
    model.fit(X_train, y_train)

    # Score the held-out addresses, ignoring the "O" label.
    scored_labels = [label for label in LABELS if label != "O"]
    val_score = metrics.flat_f1_score(y_val,
                                      model.predict(X_val),
                                      average="weighted",
                                      labels=scored_labels)
    print("Flat F1-Score on validation set = {}".format(val_score))

    if not model_path:
        return model

    joblib.dump(model, model_path)
    print("Save model to {}".format(model_path))
    return model

Exemple #8

0

Afficher le fichier

Fichier : crf.py Projet : y0shid0n/Japanese-Company-Lexicon

class CRFModel(object):
    """Wrapper around a ``CRF`` estimator that featurises sentences itself."""

    def __init__(self,
                 algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False):
        """Create the underlying ``CRF`` with the given hyper-parameters."""
        crf_settings = {
            'algorithm': algorithm,
            'c1': c1,
            'c2': c2,
            'max_iterations': max_iterations,
            'all_possible_transitions': all_possible_transitions,
        }
        self.model = CRF(**crf_settings)

    def train(self, sentences, tag_lists, tagged=False):
        """Featurise ``sentences`` and fit the model against ``tag_lists``.

        ``tagged`` selects ``sent2features_tagged`` instead of
        ``sent2features`` as the feature extractor.
        """
        extract = sent2features_tagged if tagged else sent2features
        self.model.fit([extract(sentence) for sentence in sentences], tag_lists)

    def test(self, sentences, tagged=False):
        """Return the predicted tag lists for ``sentences``.

        ``tagged`` selects the feature extractor exactly as in ``train``.
        """
        extract = sent2features_tagged if tagged else sent2features
        return self.model.predict([extract(sentence) for sentence in sentences])

Exemple #9

0

Afficher le fichier

Fichier : crf.py Projet : yoshitomo-matsubara/section-categorization

def parameter_tuning(args, dataset):
    """Grid-search ``c1``/``c2`` and return the CRF with the best validation F1.

    Candidate values come from ``experiment_util.get_param_list`` applied to
    ``args.c1`` and ``args.c2``. Each combination is fitted on
    ``dataset.training`` and scored with micro-averaged flat F1 on
    ``dataset.validation``. The best score and parameters are printed.
    Returns ``None`` when no combination scores above ``-inf``.
    """
    c1_values = experiment_util.get_param_list(args.c1)
    c2_values = experiment_util.get_param_list(args.c2)

    top_score, top_c1, top_c2 = -np.inf, -np.inf, -np.inf
    top_model = None

    for c1, c2 in ((a, b) for a in c1_values for b in c2_values):
        candidate = CRF(algorithm='lbfgs',
                        c1=c1,
                        c2=c2,
                        max_iterations=500,
                        all_possible_transitions=True,
                        verbose=args.debug)
        candidate.fit(dataset.training.list_of_feature_dicts,
                      dataset.training.list_of_labels)
        predictions = candidate.predict(
            dataset.validation.list_of_feature_dicts)
        score = metrics.flat_f1_score(dataset.validation.list_of_labels,
                                      predictions,
                                      average='micro')
        # Only a strictly better score replaces the current best.
        if not (score > top_score):
            continue
        top_score, top_c1, top_c2 = score, c1, c2
        top_model = candidate

    print('Best validation F1 score:', top_score, 'Best c1:', top_c1,
          'Best c2:', top_c2)
    return top_model

Exemple #10

0

Afficher le fichier

Fichier : CRF.py Projet : eshwag/NER

    def train1(self, data, y, tag):
        """Fit a CRF on features built from ``data``, save it, and report
        its performance on the training data.

        ``y`` and ``tag`` are accepted but not used by this method. The model
        is pickled to ``finalized_model.sav`` in the current working
        directory.
        """
        # Features in the form the conditional random field accepts.
        # NOTE(review): ``get`` is read as an attribute, not called;
        # presumably a property on ``Features`` - confirm.
        feaobj = Features(data, self.num_features)
        x_train, y_train = feaobj.get
        print("labelled data")
        crf = CRF(algorithm='lbfgs',
                  c1=0.1,
                  c2=0.1,
                  max_iterations=100,
                  all_possible_transitions=False)
        print(crf)
        crf.fit(x_train, y_train)

        # Save the trained model; the context manager guarantees the file is
        # flushed and closed.
        filename = 'finalized_model.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(crf, model_file)

        # Prediction on the training data itself
        pred = crf.predict(x_train)

        # Classification report and accuracy
        print('\n \n Prediction On Trained Data:\n \n',
              flat_classification_report(y_train, pred))
        print('Accuracy:', flat_accuracy_score(y_train, pred))

Exemple #11

0

Afficher le fichier

Fichier : training.py Projet : cdli-gh/Sumerian-Translation-Pipeline

def main():
    """Train an l2sgd CRF tagger from the CSV at ``args.input`` and pickle it
    to ``args.output``.

    5% of the tagged sentences are held out (fixed ``random_state``) and the
    model is fitted on the remaining 95%. ``TestData`` is then called with
    both splits.
    """
    df = pd.read_csv(args.input)
    tagged_sentence = Preparing_tagged_data(df)
    df = df[['ID', 'FORM', 'XPOSTAG']]
    # printing details
    printing_details(tagged_sentence)

    train_set, test_set = train_test_split(tagged_sentence,
                                           test_size=0.05,
                                           random_state=7)

    # Fit on the training split only. Fitting on ``tagged_sentence`` would
    # include the held-out sentences and make the test scores meaningless.
    X_train, y_train = prepareData(train_set)
    X_test, y_test = prepareData(test_set)

    crf = CRF(
        algorithm='l2sgd',
        c2=0.1,
        max_iterations=1000,
        all_possible_transitions=True)

    crf.fit(X_train, y_train)
    print(crf)

    print("Saving Model .....")
    # Save the Model to file in the current working directory
    Pkl_Filename = args.output
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(crf, file)

    print("Model Saved at "+ Pkl_Filename)
    print()
    print("Checking the Algoritham's Performance \n")
    TestData(crf, X_train, y_train, X_test, y_test)

Exemple #12

0

Afficher le fichier

    def build(sequences, labels, **kwargs):
        """
        Train a sequence classifier from feature sequences and their labels.

        :param sequences: Training sequences; every element of a sequence is
                          given as its features
        :type sequences: list of list of dict
        :param labels: Labels aligned with ``sequences``
        :type labels: list of list of str
        :param kwargs: Keyword arguments that override the default settings
                       passed to the underlying CRF
        :return: A sequence classifier wrapping the fitted CRF
        :rtype: SequenceClassifier
        """
        # Start from the module defaults, then let the caller's overrides win.
        settings = dict(
            algorithm=DEFAULT_ALGORITHM,
            c1=DEFAULT_C1,
            c2=DEFAULT_C2,
            max_iterations=DEFAULT_MAX_ITERATIONS,
            all_possible_transitions=DEFAULT_ALL_POSSIBLE_TRANSITIONS,
        )
        settings.update(kwargs)

        crf = CRF(**settings)
        crf.fit(sequences, labels)
        return SequenceClassifier(crf)

Exemple #13

0

Afficher le fichier

Fichier : pos_tagging.py Projet : GayatriPurandharT/NLUTProject

def crf_tag():
    """Train a CRF on the Brown 'news' sentences and print sample output.

    The first 90% of the tagged sentences are used for fitting and the rest
    for testing. Prints the first training example and its labels, the tags
    predicted for a fixed sample sentence, and the flat accuracy on the test
    part.
    """
    news_sents = brown.tagged_sents(categories='news')
    split_at = int(len(news_sents) * 0.9)

    X_train, y_train = transform_to_dataset(news_sents[:split_at])
    X_test, y_test = transform_to_dataset(news_sents[split_at:])

    print(X_train[0])
    print(y_train[0])

    tagger = CRF()
    tagger.fit(X_train, y_train)

    # Tag one hand-written sentence as a smoke test.
    words = ['I', 'am', 'a', 'student']
    word_feats = [feature_extract(words, position)
                  for position, _ in enumerate(words)]
    sample_tags = tagger.predict([word_feats])[0]
    print(list(zip(words, sample_tags)))

    test_pred = tagger.predict(X_test)
    print(metrics.flat_accuracy_score(y_test, test_pred))

Exemple #14

0

Afficher le fichier

class CRFBased:
    '''CRF based information retrieval.

    The model is similar to the Default model used in homework 2 and 3.
    It wraps a ``CRF`` estimator and the data loading and reporting around it.
    '''
    def __init__(self, load, n_train, n_test):
        '''Store the data source and sizes and create the unfitted CRF.

        ``n_train`` and ``n_test`` are converted with ``int``.
        '''
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        '''Return ``(X_train, y_train, X_test, y_test)`` from
        ``prepare_crf_dataset``.'''
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        '''Fit the CRF and remember every learned class except ``'O'``.'''
        self.crf.fit(X, y)
        # Filter instead of ``list.remove`` so that data without an 'O' class
        # does not raise ValueError after training has already finished.
        self.labels = [label for label in self.crf.classes_ if label != 'O']

    def predict(self, X):  # dataset = Train or Test
        '''Return the CRF predictions for ``X``.'''
        pred = self.crf.predict(X)
        return pred

    def evaluate(self, y_true, y_pred):
        '''Print a flat classification report over ``self.labels``.

        ``fit`` must have been called first, since it sets ``self.labels``.
        '''
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))

Exemple #15

0

Afficher le fichier

    def train(self, model_name, tagged_sentences):
        """Fit a CRF on the first 75% of ``tagged_sentences``, print its
        accuracy on the remaining 25%, and pickle it to ``model_name``."""
        # Split the dataset for training and testing
        cutoff = int(.75 * len(tagged_sentences))
        training_sentences = tagged_sentences[:cutoff]
        test_sentences = tagged_sentences[cutoff:]

        X_train, y_train = transform_to_dataset(training_sentences)
        X_test, y_test = transform_to_dataset(test_sentences)
        print(len(X_train))
        print(len(X_test))

        print("Training Started........")
        print("it will take time according to your dataset size..")
        model = CRF()
        model.fit(X_train, y_train)
        print("Training Finished!")

        print("Evaluating with Test Data...")
        y_pred = model.predict(X_test)
        print("Accuracy is: ")
        print(metrics.flat_accuracy_score(y_test, y_pred))

        # Close the file deterministically so the pickle is fully flushed
        # before "Model Saved!" is reported.
        with open(model_name, 'wb') as model_file:
            pickle.dump(model, model_file)
        print("Model Saved!")

Exemple #16

0

Afficher le fichier

def test_attributes(xseq, yseq):
    """Learned CRF attributes are None before ``fit`` and consistent after it."""
    crf = CRF()
    unfitted_names = ('tagger_', 'size_', 'classes_', 'num_attributes_',
                      'attributes_', 'state_features_', 'transition_features_')
    # An unfitted estimator exposes None for every learned attribute.
    for attr_name in unfitted_names:
        assert getattr(crf, attr_name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {'sunny', 'rainy'}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert 'clean' in crf.attributes_

    # (attribute, label) -> float weight
    assert len(crf.state_features_) > 0
    for coef in crf.state_features_.values():
        assert isinstance(coef, float)
    state_ok = all(
        attr in crf.attributes_ and label in crf.classes_
        for (attr, label) in crf.state_features_.keys())
    assert state_ok, crf.state_features_

    # (label_from, label_to) -> float weight
    assert len(crf.transition_features_) > 0
    for coef in crf.transition_features_.values():
        assert isinstance(coef, float)
    transitions_ok = all(
        label_from in crf.classes_ and label_to in crf.classes_
        for (label_from, label_to) in crf.transition_features_.keys())
    assert transitions_ok, crf.transition_features_

Exemple #17

0

Afficher le fichier

class CRFNER(object):
    """CRF-based named-entity recogniser trained from documents and a gazetteer."""

    def __init__(self, gazetteer, fraction=0.7):
        """Store the gazetteer and ``fraction``; no model is built here."""
        self.gazateer = gazetteer
        self.fraction = fraction

    def train(self, documents):
        """Format ``documents``, split them, extract features and fit the CRF.

        The formatted data, the four train/test arrays and the fitted model
        are all kept on the instance.
        """
        self.data = ner_processing.NERFormatter(self.gazateer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)

        extracted = crf_processing.feature_extraction(d_train, d_test)
        self.X_train, self.X_test, self.y_train, self.y_test = extracted

        crf_settings = {
            'algorithm': 'lbfgs',
            'c1': 0.31,
            'c2': 0.02,
            'max_iterations': 100,
            'all_possible_transitions': True,
        }
        self.model = CRF(**crf_settings)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Tag a single raw sentence and return the model's predictions.

        The sentence is tokenised and POS-tagged with NLTK, wrapped in a
        data frame, and converted to features. The features are also stored
        on ``self.X``.
        """
        tokens = nltk.word_tokenize(sentence)
        zeros = [0] * len(tokens)
        pos_tags = [pair[-1] for pair in nltk.pos_tag(tokens)]

        frame = pd.DataFrame({
            'word': tokens,
            'sentence_no': zeros,
            'category': zeros,
            'POS': pos_tags,
        })

        getter = crf_processing.SentenceGetter(frame)
        # The returned sentence is not used; the call is kept as-is.
        getter.get_next()

        self.X = [crf_processing.sent2features(s) for s in getter.sentences]
        return self.model.predict(self.X)

    def report(self):
        """Print the weighted F1 score and a classification report on the
        held-out split."""
        class_names = list(self.model.classes_)
        predictions = self.model.predict(self.X_test)

        weighted_f1 = metrics.flat_f1_score(self.y_test,
                                            predictions,
                                            average='weighted',
                                            labels=class_names)
        print('F1 score {}'.format(weighted_f1))

        # Order labels by everything after the first character, then by the
        # first character itself.
        def by_suffix_then_prefix(name):
            return (name[1:], name[0])

        ordered_names = sorted(class_names, key=by_suffix_then_prefix)
        full_report = metrics.flat_classification_report(self.y_test,
                                                         predictions,
                                                         labels=ordered_names,
                                                         digits=3)
        print(full_report)

Exemple #18

0

Afficher le fichier

Fichier : CRF_Teste.py Projet : rui-pduarte/NLPyPort

def test_crf(train_file,test_file,model_name=""):
    """Train a CRF on ``train_file`` and return its predictions for ``test_file``.

    Both files are tab-separated without a header. Columns 0-3 of the
    training file and columns 0-2 of the test file are used.

    Args:
        train_file: Path to the training TSV.
        test_file: Path to the TSV to predict on.
        model_name: When non-empty, the fitted model is saved to
            ``model_name + ".pickle"`` through ``save_model``.

    Returns:
        The predicted labels of the first (only) sequence, as strings.
    """
    train_frame = pandas.read_csv(train_file, sep="\t", header=None)
    train_rows = fromListToTuple(train_frame.iloc[:, [0, 1, 2, 3]].values)
    X_train, y_train = prepareData([train_rows], 'train', [True, True])

    # Read and featurise the test file once, before training, so that a bad
    # test file fails fast. It used to be loaded a second time after ``fit``.
    test_frame = pandas.read_csv(test_file, sep="\t", header=None)
    test_rows = fromListToTuple(test_frame.iloc[:, [0, 1, 2]].values)
    X_teste, _ = prepareData([test_rows], 'predict', [True, True])

    crf = CRF(
            algorithm='lbfgs',
            c1=0.0625,
            c2=0.5,
            max_iterations=100,
            all_possible_transitions=False,
            all_possible_states=True,
            verbose=True
        )
    crf.fit(X_train, y_train)
    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    return [str(label) for label in y_pred[0]]

Exemple #19

0

Afficher le fichier

def training_crf(training_cue, data, dataset):
    """Cross-validate and fit a CRF, pickle it, and return labels, predictions and model.

    Features and labels are built per sentence with ``sent2features`` and
    ``sent2labels``. 5-fold cross-validated predictions are computed first,
    then the model is fitted on everything and pickled under
    ``<cwd>/models/<dataset>/``. The file name depends on whether
    ``training_cue`` equals ``'cue'``.

    Returns a tuple ``(y, pred, crf)``.
    """
    frases = get_frase(data).get_frase

    get_negaciones(data)

    X = [sent2features(frase, training_cue) for frase in frases]
    y = [sent2labels(frase, training_cue) for frase in frases]

    crf_settings = dict(algorithm='lbfgs',
                        c1=0.1,
                        c2=0.1,
                        max_iterations=100,
                        all_possible_transitions=True,
                        verbose=True)
    crf = CRF(**crf_settings)

    pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

    crf.fit(X, y)

    # Only the trailing file name differs between the two model kinds.
    if training_cue == 'cue':
        model_suffix = '/crf_cue_model.pkl'
    else:
        model_suffix = '/crf_sco_model.pkl'
    model_filename = os.getcwd() + '/models/' + dataset + model_suffix

    with open(model_filename, 'wb') as file_model:
        pickle.dump(crf, file_model)

    return (y, pred, crf)

Exemple #20

0

Afficher le fichier

def test_crf(xseq, yseq, algorithm):
    """A CRF fitted on one sequence predicts that sequence's labels back."""
    estimator = CRF(algorithm=algorithm)
    estimator.fit([xseq], [yseq])
    predicted = estimator.predict([xseq])

    # Averaged Perceptron is regularized too much, so it is exempt.
    exact_match_expected = algorithm != 'ap'
    if exact_match_expected:
        assert predicted == [yseq]

Exemple #21

0

Afficher le fichier

Fichier : test_sklearn_crfsuite.py Projet : suryalistic/eli5

def test_sklearn_crfsuite(xseq, yseq):
    """Check the text, HTML and (when available) dataframe renderings of
    ``explain_weights`` for a fitted CRF."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    for fragment in (
        "y='sunny' top features",
        "y='rainy' top features",
        "Transition features",
        "sunny   -0.130    0.696",
        u'+0.124  солнце:не светит',
    ):
        assert fragment in text

    assert u'солнце:не светит' in html
    compact_html = html.replace(' ', '').replace("\n", '')
    assert '<th>rainy</th><th>sunny</th>' in compact_html

    # The dataframe formatters are optional; without them there is nothing
    # more to check.
    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        return

    from .test_formatters_as_dataframe import check_targets_dataframe
    frames = format_as_dataframes(expl)
    check_targets_dataframe(frames['targets'], expl)
    df_transition = frames['transition_features']
    transition = expl.transition_features
    print(df_transition)
    assert list(transition.class_names) == ['rainy', 'sunny']
    # (dataframe column, dataframe row) -> (coef row, coef column)
    checks = (
        ('rainy', 'rainy', 0, 0),
        ('sunny', 'rainy', 0, 1),
        ('rainy', 'sunny', 1, 0),
    )
    for column, row, i, j in checks:
        assert np.isclose(df_transition[column][row], transition.coef[i, j])

Exemple #22

0

Afficher le fichier

Fichier : training.py Projet : LiamWoodRoberts/Dataturks-NER-Tools

def train_crf(labelled_files, save=True, eval=True):
    """Fit a CRF on labelled files, optionally evaluating and saving it.

    With ``eval`` truthy, 10% of the data is held out (fixed
    ``random_state``), the model is fitted on the rest, and a classification
    report is printed and logged. Otherwise the model is fitted on all data.
    With ``save`` truthy the model is passed to ``save_crf``. Returns the
    fitted model.

    Note: the ``eval`` parameter shadows the builtin inside this function.
    """
    x, y, _ = format_labelled_data(labelled_files)

    crf_settings = dict(algorithm='lbfgs',
                        c1=0.1,
                        c2=0.1,
                        max_iterations=100,
                        all_possible_transitions=False)
    crf = CRF(**crf_settings)

    if not eval:
        crf.fit(x, y)
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.1, random_state=42)
        crf.fit(x_train, y_train)
        pred = crf.predict(x_test)
        report = classification_report(y_test, pred)

        print("Test Results:\n")
        line(60)
        print(report)
        line(60)
        log_results(y_test, pred)
        line(60)

    if save:
        save_crf(crf)

    return crf

Exemple #23

0

Afficher le fichier

def entity_crf_train(my_subjects):
    """Substitute random subjects into the training data, fit a CRF, and
    pickle the model and the subject list.

    ``X`` and ``y`` are not parameters or locals; they are read from the
    enclosing (presumably module-level) scope and are modified in place.

    :param my_subjects: sequence of subject strings; each one is split on
        whitespace before being inserted.
    """
    for i in range(0, len(X)):
        # NOTE(review): the index range is fixed from the length of X[i]
        # before any substitution. After a multi-token (or empty) subject is
        # inserted, later j values no longer line up with the original
        # positions - confirm this is intended.
        for j in range(0, len(X[i])):
            # Any element containing 'sub' is treated as a subject placeholder.
            if 'sub' in X[i][j]:
                # Pick a random subject and splice its tokens in place of the
                # placeholder, with one 'subject' label per token.
                subj = my_subjects[np.random.randint(len(my_subjects))]
                subj = subj.split()
                X[i] = X[i][:j] + subj + X[i][j + 1:]
                y[i] = y[i][:j] + ['subject'] * len(subj) + y[i][j + 1:]
        # Keep at most the first 10 positions of every sequence.
        X[i] = X[i][0:10]
        y[i] = y[i][0:10]

    crf = CRF(c1=0.1,
              c2=0.01,
              max_iterations=200,
              all_possible_transitions=True)

    print(".....Training entity extraction model.....")
    crf.fit(X, y)
    print(".....Trained entity extraction model.....")

    # Output goes to 'full_model/' under the parent of the current working
    # directory.
    working_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    with open(working_directory + '/full_model/crf_model.pkl',
              'wb') as pickle_file:
        pickle.dump(crf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(working_directory + '/full_model/subjects.pkl',
              'wb') as pickle_file:
        pickle.dump(my_subjects, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

Exemple #24

0

Afficher le fichier

Fichier : crf_model.py Projet : ienoob/neo_nlp_project

class CRFNerModel(object):
    """CRF-based NER model with pickle persistence and entity-span extraction."""

    def __init__(self, is_save=False):
        """Create the unfitted CRF.

        :param is_save: when true, ``fit`` also pickles the model to
            ``self.save_model`` ("crf.model").
        """
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        """Fit the CRF and, if ``is_save`` is set, dump it to disk."""
        self.crf.fit(train_x, train_y)

        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        """Predict tags for one input sequence.

        ``input_x`` is turned into a list, featurised with ``sent2features``
        and wrapped as a single-sequence batch. The result is a list
        containing one list of tags.
        """
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        """Pickle the CRF to ``self.save_model``."""
        model_data = pickle.dumps(self.crf)
        with open(self.save_model, "wb") as f:
            f.write(model_data)

    def load_model(self):
        """Replace the CRF with the one pickled at ``self.save_model``.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(self.save_model, "rb") as f:
            model_data = f.read()
        self.crf = pickle.loads(model_data)

    def predict_list(self, input_list):
        """Predict tags for already-featurised sequences."""
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        """Return the entity spans found in ``input_x``.

        Each span is a tuple ``(start, end, label, input_x[start:end])`` with
        ``end`` exclusive. An entity that is still open when the sequence
        ends is included.
        """
        tags = self.predict(input_x)[0]
        return self._decode_spans(tags, input_x)

    @staticmethod
    def _decode_spans(tags, input_x):
        """Turn a tag sequence into ``(start, end, label, slice)`` spans.

        Tags are "O" or "<prefix>-<label>". A "B" prefix opens a new span,
        closing any open one. "O" closes the open span. Any other prefix
        continues the span if its label matches and otherwise discards it.
        """
        spans = []
        start = None
        label = None
        for i, tag in enumerate(tags):
            if tag == "O":
                if start is not None:
                    spans.append((start, i, label, input_x[start:i]))
                start = None
                label = None
                continue

            # Split on the first '-' only, so labels that contain a hyphen
            # stay intact.
            prefix, _, tag_label = tag.partition("-")
            if prefix == "B":
                if start is not None:
                    spans.append((start, i, label, input_x[start:i]))
                start = i
                label = tag_label
            elif label != tag_label:
                # Continuation tag that does not match the open span: the
                # span is dropped.
                start = None
                label = None

        # Flush a span that runs to the end of the sequence. Without this the
        # last entity is silently lost.
        if start is not None:
            end = len(tags)
            spans.append((start, end, label, input_x[start:end]))
        return spans

Exemple #25

0

Afficher le fichier

def train_crf(trainx, trainy):
    """Fit and return an L-BFGS CRF on ``trainx`` / ``trainy``."""
    # The call form of print is valid on both Python 2 and Python 3; the
    # statement form is a SyntaxError on Python 3.
    print("training CRF...")
    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(trainx, trainy)
    return crf

Exemple #26

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_score(xseq, yseq, algorithm):
    """``score`` on the training sequence is perfect, or above 0.8 for 'ap'."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    accuracy = model.score([xseq], [yseq])

    if algorithm == "ap":
        # Averaged Perceptron is regularized too much for a perfect score.
        assert accuracy > 0.8
    else:
        assert accuracy == 1.0

Exemple #27

0

Afficher le fichier

def train_crf(x, y):
    '''Fit an L-BFGS CRF on features x and labels y and return it.'''
    settings = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': False,
    }
    model = CRF(**settings)
    model.fit(x, y)
    return model

Exemple #28

0

Afficher le fichier

def test_crf_score(xseq, yseq, algorithm):
    """The fitted CRF scores its own training sequence highly."""
    estimator = CRF(algorithm=algorithm)
    estimator.fit([xseq], [yseq])
    result = estimator.score([xseq], [yseq])

    perfect_expected = algorithm != 'ap'
    if perfect_expected:
        assert result == 1.0
    else:
        # Averaged Perceptron is regularized too much.
        assert result > 0.8

Exemple #29

0

Afficher le fichier

def test_sklearn_crfsuite_feature_re(xseq, yseq):
    """``feature_re`` keeps matching features and drops the others in every rendering."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    filtered = explain_weights(crf, feature_re=u'(солн|clean)')
    renderings = format_as_all(filtered, crf)
    for rendered in renderings:
        assert u'солн' in rendered
        assert u'clean' in rendered
        assert 'walk' not in rendered

Exemple #30

0

Afficher le fichier

def train_crf(x, y):
    """Return an L-BFGS CRF fitted on ``x`` and ``y``."""
    estimator = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=False,
    )
    estimator.fit(x, y)
    return estimator

Exemple #31

0

Afficher le fichier

def CRF_model(X_train, y_train):
    """Fit an L-BFGS CRF with all possible transitions enabled and return it."""
    hyper_params = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    fitted = CRF(**hyper_params)
    fitted.fit(X_train, y_train)
    return fitted

Exemple #32

0

Afficher le fichier

def test_sklearn_targets(xseq, yseq, targets):
    """Renamed targets appear in every rendering, in the order given by ``targets``."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    res = explain_weights(crf, target_names={'sunny': u'☀'}, targets=targets)
    for rendered in format_as_all(res, crf):
        assert u'☀' in rendered
        rainy_first = targets[0] == 'rainy'
        rainy_at = rendered.index('rainy')
        sunny_at = rendered.index(u'☀')
        if rainy_first:
            assert rainy_at < sunny_at
        else:
            assert rainy_at > sunny_at

Exemple #33

0

Afficher le fichier

def train(model_name, xtrain, ytrain):
    """Fit a lightly regularised L-BFGS CRF and return it.

    ``model_name`` is passed to the CRF as ``model_filename``. Progress
    markers are printed before and after fitting.
    """
    print('hallo')
    settings = {
        'algorithm': 'lbfgs',
        'c1': 0.0001,
        'c2': 0.0001,
        'max_iterations': 100,
        'all_possible_transitions': False,
        'model_filename': model_name,
    }
    model = CRF(**settings)
    model.fit(xtrain, ytrain)
    print('hallo2')
    return model

Exemple #34

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Verbose fitting works with and without development data."""
    model = CRF(algorithm, verbose=True)

    # Development data is either one copy of the sequence or absent.
    X_dev, y_dev = ([xseq], [yseq]) if use_dev else (None, None)

    model.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    predicted = model.predict([xseq])

    if algorithm == "ap":
        # Averaged Perceptron is regularized too much; its output is not checked.
        return
    assert predicted == [yseq]

Exemple #35

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_pickling(xseq, yseq, algorithm):
    """A fitted CRF survives a pickle round trip with its score and algorithm intact."""
    original = CRF(algorithm=algorithm)
    original.fit([xseq], [yseq])

    payload = pickle.dumps(original, protocol=pickle.HIGHEST_PROTOCOL)
    restored = pickle.loads(payload)

    restored_score = restored.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much.
        assert restored_score > 0.8
    else:
        assert restored_score == 1.0
    assert restored.algorithm == algorithm

Exemple #36

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_marginals(xseq, yseq, algorithm):
    """``predict_marginals`` returns one distribution over all labels per position."""
    crf = CRF(algorithm)
    crf.fit([xseq], [yseq])

    batch_marginals = crf.predict_marginals([xseq])
    assert len(batch_marginals) == 1
    seq_marginals = batch_marginals[0]
    assert len(seq_marginals) == len(yseq)

    known_labels = crf.tagger_.labels()
    for distribution in seq_marginals:
        assert isinstance(distribution, dict)
        assert set(distribution.keys()) == set(known_labels)
        # Probabilities at one position sum to 1.
        total = sum(distribution.values())
        assert abs(total - 1.0) < 1e-6

Exemple #37

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_model_filename(xseq, yseq, tmpdir):
    """``model_filename`` controls where the model is written and lets it be reloaded."""
    model_path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(model_path)

    # Fitting writes the model file to the requested location.
    trained = CRF(model_filename=model_path)
    trained.fit([xseq], [yseq])
    assert os.path.exists(model_path)

    # A new estimator given only the file name can use the saved model.
    reloaded = CRF(model_filename=model_path)
    assert reloaded.score([xseq], [yseq]) == 1.0

    # The fitted estimator can also be pickled and restored.
    payload = pickle.dumps(trained, protocol=pickle.HIGHEST_PROTOCOL)
    unpickled = pickle.loads(payload)
    assert unpickled.score([xseq], [yseq]) == 1.0

Exemple #38

0

Afficher le fichier

Fichier : CRF.py Projet : sardarr/BeliefTagger

def main(arg):
    """Build train/test data sets and fit the model selected by
    ``arg['model_name']``.

    ``training_sentences`` and ``test_sentences`` are read from the enclosing
    scope. For ``"crf"`` a default CRF is fitted. For ``"SVM"`` the feature
    dicts are vectorised with a ``DictVectorizer``.

    NOTE(review): this excerpt appears to be truncated. ``X_tr``, ``X_ts``
    and ``sentence`` are assigned but not used in the visible part.
    """
    X_train, y_train = transform_to_dataset(training_sentences, arg)
    X_test, y_test = transform_to_dataset(test_sentences, arg)
    print(len(X_train))
    print(len(X_test))
    print(X_train[0])

    if arg['model_name'] == "crf":
        model = CRF()
        model.fit(X_train, y_train)
    elif arg['model_name'] == "SVM":
        v = DictVectorizer(sparse=False)
        X_tr = v.fit_transform(X_train)
        # Reuse the vocabulary learned from the training data. Refitting on
        # the test data would produce columns that do not match the training
        # matrix.
        X_ts = v.transform(X_test)

    sentence = ['I', 'am', 'Bob', '!']

Exemple #39

0

Afficher le fichier

Fichier : test_crf.py Projet : mansweet/sklearn-crfsuite

def test_crf_dev_bad_arguments(xseq, yseq):
    """``fit`` raises ValueError when a third positional argument is given without a fourth."""
    model = CRF()
    train_x, train_y = [xseq] * 20, [yseq] * 20
    with pytest.raises(ValueError):
        model.fit(train_x, train_y, train_x)