Example 1
 def _start_easy(self, X_train, y_train):
     crf = sklearn_crfsuite.CRF(
         algorithm=self.algorithm,
         c1=self.c1,
         c2=self.c2,
         max_iterations=self.max_iterations,
         all_possible_transitions=self.all_possible_transitions)
     crf.fit(X_train, y_train)
     return crf
Example 2
 def __init__(self, client_name):
     try:
         self.classifier = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=200,
                                                all_possible_transitions=True)
         self.client_name = client_name
         self.module_name = 'feature_extractors.' + self.client_name + '_layout_feature_extractor_crf'
         self.classifier_model = LAYOUT_MODELS.get(client_name)
     except Exception:
         raise FileNotFoundError("Caught Exception in getting loaded model")
Example 3
def l2sgd(train_X, train_Y, test_X, test_Y):
    algorithms = ['l2sgd']
    min_frequencies = [0, 0.02]
    all_states = [True, False]
    all_transitions = [True, False]
    c2s = [0, 0.01, 0.05, 0.1]

    i = 1
    N = len(algorithms) * len(min_frequencies) * len(all_states) * len(all_transitions) * len(c2s)
    start = time.time()

    results = []

    for algo in algorithms:
        for min_freq in min_frequencies:
            for all_state in all_states:
                for all_transition in all_transitions:
                    for c2 in c2s:
                        print(round(100 * i / N), '%')
                        print('Time elapsed: {} s'.format(round(time.time() - start)))
                        i += 1
                        params = {'algo': algo,
                                  'min_freq': min_freq,
                                  'all_state': all_state,
                                  'all_transition': all_transition,
                                  'c2': c2}

                        print(params)
                        try:
                            crf = sklearn_crfsuite.CRF(
                                algorithm=algo,
                                c2=c2,
                                max_iterations=1000,
                                all_possible_transitions=all_transition,
                                all_possible_states=all_state,
                                min_freq=min_freq
                            )

                            crf.fit(train_X, train_Y)
                            pred_Y = crf.predict(test_X)

                            f1 = metrics.flat_f1_score(test_Y, pred_Y, average='weighted',
                                                       labels=['per', 'org', 'misc', 'loc', 'notpropn'])
                            res = metrics.flat_classification_report(test_Y, pred_Y,
                                                                     labels=['per', 'org', 'misc', 'loc',
                                                                             'notpropn'], digits=4)
                            results.append((f1, params))
                            print(res)
                            print()

                        except Exception:
                            print('Invalid parameter combination.')
                            continue

    with open('results/l2sgd', 'wb') as file:
        pickle.dump(results, file)
Example 4
def main():

    parser = argparse.ArgumentParser(description="")
    opt = parse_arguments(parser)
    corpora = utilidades.Corpora()

    contextual_features = corpora.load_embeddings(opt.data_path + '/' +
                                                  opt.contextual_features)

    if opt.train == 'yes':

        dataset = load_dataset(opt.data_path + '/' + opt.corpus + '/train.txt')

        X_train = [
            batch2features(b, contextual_features) for b in dataset if b
        ]
        y_train = [batch2labels(b) for b in dataset if b]

        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        print('training model')
        crf.fit(X_train, y_train)

        print('saving model')
        pickle.dump(crf, open(opt.save_model_in, 'wb'))

    elif opt.test == 'yes':

        if opt.dev == 'yes':
            dataset_test = load_dataset(opt.data_path + '/' + opt.corpus +
                                        '/dev.txt')
        else:
            dataset_test = load_dataset(opt.data_path + '/' + opt.corpus +
                                        '/test.txt')

        X_test = [
            batch2features(b, contextual_features) for b in dataset_test if b
        ]
        y_test = [batch2labels(b) for b in dataset_test if b]

        crf = pickle.load(open(opt.saved_model, 'rb'))
        labels = list(crf.classes_)
        y_pred = crf.predict(X_test)

        print(
            metrics.flat_f1_score(y_test,
                                  y_pred,
                                  average='weighted',
                                  labels=labels))

        print('writing results')
        print(len(dataset_test), len(y_pred))
        write_results(opt.path_results, dataset_test, y_pred)
Example 5
    def train(self, max_iteration=100):
        """
        Trains Minitagger on the given train data. If test data is given, it reports the accuracy of the trained model
        and the F1_score (macro average of f1_score of each label)
        @type train_sequence: SequenceData
        @param train_sequence: the training data set
        @type test_sequence: SequenceData
        @param test_sequence: the test data set
        """
        # keep the training start timestamp
        start = time.time()

        if not self.quiet:
            print("Number of sentences train: ",
                  len(self.train_labels_sequence))
            print("Number of sentences test: ", len(self.test_labels_sequence))
            print("{0} feature types".format(
                self.feature_extractor.num_feature_types()))
            print("\"{0}\" feature template".format(
                self.feature_extractor.feature_template))

        crf = sklearn_crfsuite.CRF(
            algorithm=self.algorithm,
            c1=self.c1,
            c2=self.c2,
            epsilon=self.epsilon,
            max_iterations=max_iteration,
            all_possible_transitions=self.all_possible_transition,
            all_possible_states=self.all_possible_state,
            verbose=not self.quiet)

        if not self.quiet:
            print("Train model")
        crf.fit(self.train_features, self.train_labels_sequence)
        #X_dev=self.validation_features,
        #y_dev=self.validation_labels_sequence)

        if not self.quiet:
            print("Predict")
        y_pred = crf.predict(self.test_features)
        self.test_sequence.save_prediction_to_file(y_pred,
                                                   self.prediction_path)
        exact_score, inexact_score, conllEval = report_fscore_from_file(
            self.prediction_path + "/predictions.txt",
            wikiner=self.wikiner,
            quiet=True)
        if not self.quiet:
            self.display_results("Conll", conllEval)
            self.display_results("Exact", exact_score)
            self.display_results("Inexact", inexact_score)

        self.save_results(conllEval, exact_score, inexact_score)
        self.__save_model(crf)
        print("Training time:",
              str(datetime.timedelta(seconds=time.time() - start)))
        return exact_score, inexact_score, conllEval
Example 6
def load_CRF_model():
    """
    Load in a trained CRF model from a file.

    :return: CRF model object as defined by sklearn_crfsuite
    """
    crf = sklearn_crfsuite.CRF(model_filename=parser_tokens.MODEL_PATH +
                               parser_tokens.MODEL_FILE,
                               verbose=True)
    return crf
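
A hedged usage sketch for the loader above (not part of the original example): in sklearn_crfsuite, a CRF constructed with model_filename pointing at an existing .crfsuite file can generally tag new data without calling fit. The sentences and the sent2features helper below are assumptions for illustration.

crf = load_CRF_model()
# new_sents and sent2features are hypothetical; substitute the project's own feature pipeline
X_new = [sent2features(s) for s in new_sents]
y_pred = crf.predict(X_new)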
Example 7
def crf_train():
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.05,
        c2=0.005,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(train_X_crf, train_y_crf)
    return crf
Example 8
def train_crf_model(X_train, y_train):

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    return crf
Example 9
 def __init__(self, **kwargs):
     self.cnt = 0
     if 'model' in kwargs:
         self.model = joblib.load(kwargs.get('model'))
     else:
         self.model = sklearn_crfsuite.CRF(algorithm='l2sgd',
                                           c2=1,
                                           max_iterations=100,
                                           all_possible_transitions=True,
                                           verbose=True)
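
A complementary sketch, not taken from the example above: after fitting, the model can be persisted with joblib so a later run can hand the file back through the 'model' keyword argument. The class name Tagger, the file name, and the training data are assumptions.

import joblib

tagger = Tagger()                       # hypothetical instance of the class above
tagger.model.fit(X_train, y_train)      # X_train / y_train assumed to exist
joblib.dump(tagger.model, 'crf_l2sgd.joblib')
# later: reload without retraining
tagger2 = Tagger(model='crf_l2sgd.joblib')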
Example 10
 def initialize_model(self):
     """
     初始化
     """
     algorithm = self.config.get('model', 'algorithm')
     c1 = float(self.config.get('model', 'c1'))
     c2 = float(self.config.get('model', 'c2'))
     max_iterations = int(self.config.get('model', 'max_iterations'))
     self.model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                                       max_iterations=max_iterations, all_possible_transitions=True)
Example 11
    def __init__(self, verbose=True):
        self.morph = pymorphy2.MorphAnalyzer()
        self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                        c1=0.1,
                                        c2=0.1,
                                        max_iterations=200,
                                        all_possible_transitions=True,
                                        verbose=verbose)

        self.fitted = False
Example 12
def trainCRF(trainFeatures, trainLabels):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(trainFeatures, trainLabels)
    return crf
Example 13
 def train(self):
     print "Training CRF ..."
     self.model = crfsuite.CRF(
         algorithm='lbfgs',
         max_iterations=5)
     self.model.fit(self.trn_feats, self.trn_tags)
     trn_tags_pred = self.model.predict(self.trn_feats)
     self.eval(trn_tags_pred, self.trn_tags)
     dev_tags_pred = self.model.predict(self.dev_feats)
     self.eval(dev_tags_pred, self.dev_tags)
Example 14
def main():
    print("Here")
    txt_train = codecs.open(".//train.txt", "rb", encoding="utf-8")
    txt_test = codecs.open(".//test.txt", "rb", encoding="utf-8")

    train_sents = readtxt(txt_train)
    test_sents = readtxt(txt_test)

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    #STEP3: Training
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    #STEP4: Evaluation
    labels = list(crf.classes_)
    print("it's here")
    y_pred = crf.predict(X_test)

    y_file = open(".//Task1.pkl", "wb")
    pickle.dump(y_pred, y_file)
    y_file.close()

    print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    #Inspect per-class results in more detail:
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    #Accuracy
    print(len(y_test), len(y_pred))
    t = 0
    total = 0
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            if y_test[i][j] == y_pred[i][j]:
                t += 1
                total += 1
            else:
                total += 1

    print(t, total, t / total)
Example 15
def objective(chain_len):
    global X_train, Y_train
    # X_train = params['X_train']
    # chg_pct = params['chg_pct']
    # chg_threshold = params['chg_threshold']
    # chain_len = 2 + params['chain_len']  # 2 ~ 10
    # training_start_date = params['training_start_date']
    # training_end_date = params['training_end_date']

    # Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    # if Y_train is None:  # failed to load Y
    #     best_sub_params ={'c1': np.nan, 'c2': np.nan}
    #     best_cv_score = 0
    #     return {'loss':9999, 'status':STATUS_FAIL, 'c1':np.nan, 'c2':np.nan}

    # Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    # Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X_train = all_data[tmp_columns]
    chain_Y_train = all_data['Y']

    chain_X_train = Xpoint2Set(chain_X_train, chain_len)
    chain_Y_train = Ypoint2Set(chain_Y_train, chain_len)

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        # c1=0.1,
        # c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    labels = Y_train['Y'].astype('str').unique()
    val_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

    rs_cv = RandomizedSearchCV(crf, params_space, cv=3, verbose=0, n_jobs=-1, n_iter=50,
                               scoring=val_scorer)  # searching
    rs_cv.fit(chain_X_train, chain_Y_train)

    best_cv_score = rs_cv.best_score_
    best_sub_params = rs_cv.best_params_

    obj_result = {'cv_score':best_cv_score}
    obj_result.update(best_sub_params)  # c1 c2

    return obj_result
Example 16
def cross_validation(val_ratio=0.2, verbose=False):
    with open(DATA_ROOT + 'posts_manual_tokenized.txt') as f:
        post_list = split_train_text(f.read())
    with open(DATA_ROOT + 'answers_manual_tokenized.txt') as f:
        post_list.extend(split_train_text(f.read()))
    post_list = [post_list[i] for i in DATA_SEQ]
    val_length = int(val_ratio * len(post_list))
    staart = 0
    ennd = val_length
    cross_round = 1
    score_all = []
    #pdb.set_trace()
    while (ennd <= len(post_list)):
        train_list = post_list[:staart] + post_list[ennd:]
        val_list = post_list[staart:ennd]
        x_train, y_train = [], []
        x_val, y_val = [], []
        list2xy(train_list, x_train, y_train)
        list2xy(val_list, x_val, y_val)
        # build model
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=CONFIG['single']['c1'],
                                   c2=CONFIG['single']['c2'],
                                   max_iterations=100,
                                   all_possible_transitions=True)
        # train model
        crf.fit(x_train, y_train)

        labels = list(crf.classes_)

        # Remove I label, too many
        labels.remove('I')
        # predict model
        y_pred = crf.predict(x_val)
        #print(y_pred)
        #pdb.set_trace()
        print("Cross validation round ", cross_round, " :")
        if (verbose):
            evaluate_nested(y_pred, y_val)
        score_all.append(evaluate_nested_score(y_pred, y_val))
        staart += val_length
        ennd += val_length
        cross_round += 1
    print(
        "average precision of CRF tokenization in {}-fold cross validation: {:0.4}"
        .format(int(1 / val_ratio),
                sum([score[0] for score in score_all]) / len(score_all)))
    print(
        "average recall of CRF tokenization in {}-fold cross validation:    {:0.4}"
        .format(int(1 / val_ratio),
                sum([score[1] for score in score_all]) / len(score_all)))
    print(
        "average f1 score of CRF tokenization in {}-fold cross validation:  {:0.4}"
        .format(int(1 / val_ratio),
                sum([score[2] for score in score_all]) / len(score_all)))
Example 17
def predict_labels_crf(docs=None):

    print('Initializing docs...')
    docs = docs or pickle.load(open(DOC_PKL, 'rb'))

    for pio in ['interventions']:  # ['participants', 'interventions', 'outcomes']
        print('Running crf for %s' % pio)
        test_fnames = glob(
            '%s/annotations/aggregated/hierarchical_labels/%s/test/gold/*.ann'
            % (TOP, pio))
        train_fnames = glob(
            '%s/annotations/aggregated/hierarchical_labels/%s/train/*.ann' %
            (TOP, pio))

        print('Reading labels for %d train and %d test docs' %
              (len(train_fnames), len(test_fnames)))
        test_labels = {
            os.path.basename(f).split('_')[0]: open(f).read().split(',')
            for f in test_fnames
        }
        train_labels = {
            os.path.basename(f).split('_')[0]: open(f).read().split(',')
            for f in train_fnames
        }

        train_pmids = sorted(train_labels.keys())
        test_pmids = test_labels.keys()

        assert len(set(test_pmids) & set(train_pmids)) == 0

        print('Computing test/train features')
        train_X = [doc2features(docs[p]) for p in train_pmids]
        test_X = [doc2features(docs[p]) for p in test_pmids]

        train_Y = [train_labels[p] for p in train_pmids]
        test_Y = [test_labels[p] for p in test_pmids]

        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.4,
                                   c2=0.005,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        print('Fitting model for %d pmids' % len(train_pmids))
        crf.fit(train_X, train_Y)
        labels = list(crf.classes_)
        pred_Y = crf.predict(test_X)

        labels.remove('0')
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(
            metrics.flat_classification_report(test_Y,
                                               pred_Y,
                                               labels=sorted_labels,
                                               digits=3))
Example 18
    def train_model(self):
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=self.C1,
                                   c2=self.C2,
                                   verbose=True,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        print('Begin training...')
        crf.fit(self.x_train, self.y_train)

        self.y_predict = crf.predict(self.x_test)
Example 19
 def initialize_model(self):
     """初始化"""
     algorithm = self.algorithm
     c1 = float(self.c1)
     c2 = float(self.c2)
     max_iterations = int(self.max_iterations)
     self.model = sklearn_crfsuite.CRF(algorithm=algorithm,
                                       c1=c1,
                                       c2=c2,
                                       max_iterations=max_iterations,
                                       all_possible_transitions=True)
Example 20
def get_crf(**kwargs):
    params = dict(
        algorithm='lbfgs',
        c1=0.001,
        c2=0.05,
        max_iterations=100,
        all_possible_transitions=True,
        verbose=False,
    )
    params.update(kwargs)
    return sklearn_crfsuite.CRF(**params)
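
Usage note for the factory above: because params.update(kwargs) is applied before construction, any keyword argument overrides the corresponding default, so a caller can tune a single hyperparameter without restating the whole constructor. For example:

crf = get_crf(c2=0.5, max_iterations=300)  # overrides c2 and max_iterations, keeps the other defaults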
Example 21
    def fit_model(self, X, y) -> object:
        if int(util.cf.get('Util', 'verbose')): print('Fitting the model')

        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X, y)

        return crf
Example 22
 def initialize_model(self):
     algorithm = model_config.get("algorithm")
     c1 = float(model_config.get("c1"))
     c2 = float(model_config.get("c2"))
     max_iterations = int(model_config.get("max_iterations"))
     self.model = sklearn_crfsuite.CRF(algorithm=algorithm,
                                       c1=c1,
                                       c2=c2,
                                       max_iterations=max_iterations,
                                       all_possible_transitions=True)
     print("-> 完成模型初始化")
Example 23
def train(train_corpus: str,
          dev_corpus: str,
          c1: float = 0.0,
          c2: float = 0.0,
          algorithm: str = 'lbfgs',
          max_iterations: int = 100,
          all_possible_transitions: bool = False,
          window_size: int = 1,
          model_filename: str = None,
          _run: Run = None,
          _log: logger = None):
    """
    running crf experiment
    """
    _run.add_resource(train_corpus)
    _run.add_resource(dev_corpus)
    train_sents, _ = get_tagged_sents_and_words(train_corpus)
    dev_sents, _ = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(s, window_size) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_dev = [sent2features(s, window_size) for s in dev_sents]
    y_dev = [sent2labels(s) for s in dev_sents]

    crf = sklearn_crfsuite.CRF(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iterations,
        all_possible_transitions=all_possible_transitions,
        model_filename=model_filename,
    )

    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_dev)
    overall, by_type = evaluate(y_dev, y_pred)
    _run.info[f'overall_f1'] = overall.f1_score
    _run.log_scalar('overall_f1', overall.f1_score)
    _run.info[f'overall_precision'] = overall.precision
    _run.log_scalar('overall_precision', overall.precision)
    _run.info[f'overall_recall'] = overall.recall
    _run.log_scalar('overall_recall', overall.recall)
    _log.info(f'Overall F1 score: {overall.f1_score}')
    for _, key in enumerate(sorted(by_type.keys())):
        for metric_key in by_type[key]._fields:
            metric_val = getattr(by_type[key], metric_key)
            _run.info[f'{key}-{metric_key}'] = metric_val
            _run.log_scalar(f'{key}-{metric_key}', metric_val)
            _log.info(f'{key}-{metric_key}: {metric_val}')
    if model_filename is not None:
        _log.info(f'saving to: {model_filename}.pkl')
        joblib.dump(crf, f'{model_filename}.pkl')
        _run.add_artifact(f'{model_filename}.pkl')
Example 24
def train_model(x_train, y_train):
    # get the CRF model
    model = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.2, max_iterations=100, all_possible_transitions=True)
    model.fit(x_train, y_train)
    # since label 'O' is the most common label,
    # it will make the result look better than the real one
    # then we will remove label 'O'
    labels = list(model.classes_)
    labels.remove('O')
    joblib.dump(model, MODEL_PATH)
    return model, labels
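
A hedged evaluation sketch for the function above (x_train, y_train, x_test and y_test are assumptions): the returned label list, with 'O' already removed, is typically passed to the flat metrics so the dominant 'O' class does not inflate the score.

from sklearn_crfsuite import metrics

model, labels = train_model(x_train, y_train)
y_pred = model.predict(x_test)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))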
Example 25
    def train(self, corpus):
        """

        :param corpus:
        :return:
        """
        assert corpus is not None, "No training data file given"
        assert 'utterances' in corpus, "Token mapping missing from training data"

        x_train, y_train, trained_utterances = self.prepare(
            corpus["utterances"])

        unq_labels = list(set(list(chain(*y_train))))

        # assert len(unq_labels) > 1, "Not enough unique labels for training"

        # best parameter selection for model-lbfgs
        def get_best_model(X_train, y_train):
            crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                       max_iterations=100,
                                       all_possible_transitions=True)
            params_space = {
                'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05),
            }

            # use the same metric for evaluation
            f1_scorer = make_scorer(metrics.flat_f1_score,
                                    average='weighted',
                                    labels=unq_labels)

            # search
            rs = RandomizedSearchCV(crf,
                                    params_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=3,
                                    n_iter=10,
                                    scoring=f1_scorer)
            rs.fit(X_train, y_train)

            return rs

        rs = get_best_model(x_train, y_train)

        self.model = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=rs.best_params_.get('c1'),  # 0.1,
            c2=rs.best_params_.get('c2'),
            max_iterations=100,
            all_possible_transitions=True)
        self.model.fit(x_train, y_train)

        return trained_utterances
Example 26
def train_crf(X_train, y_train):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.564,
                               c2=0.0279,
                               max_iterations=100,
                               all_possible_transitions=True)

    crf.fit(X_train, y_train)

    with open('crf_model.p', 'wb') as f:
        pickle.dump(crf, f)
Example 27
def CRF_trainer(X, y, X_Te):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    #kf=KFold(n_splits=5,shuffle=True)
    #predicted=cross_val_predict(crf, X, y, cv=kf)
    crf.fit(X, y)
    predicted = crf.predict(X_Te)
    return predicted
Example 28
def main():
    """Main function of script"""
    args = utils.read_arguments(__doc__)
    documents = utils.pickle_from_file(args['input_filename'])

    transformer = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True,
        use_syntactic=True,  # use_lexical=True
    )
    # Extract instances and labels. Each instance is a sentence, represented as
    # a list of feature dictionaries for each word. Labels are represented as
    # a list of word labels.
    instances = transformer.get_feature_dict(documents)
    labels = conll_feature_extractor.get_labels_from_documents(documents)

    x_train, x_test, y_train, y_test = train_test_split(instances,
                                                        labels,
                                                        test_size=0.33)

    classifier = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)

    if not args['search_grid']:
        classifier.fit(x_train, y_train)
        predictions = list(itertools.chain(*classifier.predict(x_test)))

        evaluation.log_report(predictions, list(itertools.chain(*y_test)))
    else:
        # label_names = list(classifier.classes_)
        # label_names.remove('O')
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        f1_scorer = metrics.make_scorer(
            suite_metrics.flat_f1_score,
            average='weighted')  #, labels=label_names)
        # search
        rs = RandomizedSearchCV(classifier,
                                params_space,
                                cv=3,
                                verbose=1,
                                n_jobs=-1,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(x_train, y_train)
        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        classifier = rs.best_estimator_
        predictions = list(itertools.chain(*classifier.predict(x_test)))
        evaluation.log_report(predictions, list(itertools.chain(*y_test)))
Example 29
    def train_crf(self):
        x_train = [self.create_x_train()]
        y_train = [self.create_y_label_for_train()]

        self.crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                              c1=0.1,
                                              c2=0.1,
                                              max_iterations=1000,
                                              all_possible_transitions=True)

        self.crf_model.fit(x_train, y_train)
Example 30
def train():
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)
    print(metrics.flat_f1_score(y_test, y_pred, average='weighted',
                                labels=lll))

    print(metrics.flat_classification_report(y_test, y_pred, labels=lll))
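
For reference, a minimal self-contained sketch of the pattern shared by the examples above, using toy data (the two-token sentence and its labels are invented for illustration, not taken from any example):

import sklearn_crfsuite
from sklearn_crfsuite import metrics

# one sentence = a list of per-token feature dicts; labels follow the same nesting
X_train = [[{'word.lower()': 'john', 'is_title': True},
            {'word.lower()': 'sleeps', 'is_title': False}]]
y_train = [['B-PER', 'O']]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_train)
print(metrics.flat_f1_score(y_train, y_pred, average='weighted',
                            labels=list(crf.classes_)))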