def _start_easy(self, X_train, y_train):
    """Build a CRF from this object's hyper-parameters and fit it.

    Args:
        X_train: Training feature sequences handed to ``CRF.fit``.
        y_train: Training label sequences handed to ``CRF.fit``.

    Returns:
        The fitted ``sklearn_crfsuite.CRF`` instance.
    """
    hyper_params = {
        'algorithm': self.algorithm,
        'c1': self.c1,
        'c2': self.c2,
        'max_iterations': self.max_iterations,
        'all_possible_transitions': self.all_possible_transitions,
    }
    model = sklearn_crfsuite.CRF(**hyper_params)
    model.fit(X_train, y_train)
    return model
def __init__(self, client_name):
    """Set up the CRF classifier and look up the client's layout model.

    Args:
        client_name: Client identifier; used to build the feature-extractor
            module name and as the key into ``LAYOUT_MODELS``.

    Raises:
        FileNotFoundError: If any step of the set-up fails.  The original
            exception is chained as ``__cause__`` so the real reason is
            not lost.
    """
    try:
        self.classifier = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=200,
            all_possible_transitions=True,
        )
        self.client_name = client_name
        self.module_name = (
            'feature_extractors.' + self.client_name
            + '_layout_feature_extractor_crf'
        )
        # ``.get`` yields None when no model is registered for the client.
        self.classifier_model = LAYOUT_MODELS.get(client_name)
    except Exception as err:
        # Keep the exception type callers already expect, but chain the
        # underlying error explicitly instead of hiding it.
        raise FileNotFoundError(
            "Caught Exception in getting loaded model") from err
def l2sgd(train_X, train_Y, test_X, test_Y):
    """Grid-search CRF settings for the ``l2sgd`` training algorithm.

    Every combination of ``min_freq``, ``all_possible_states``,
    ``all_possible_transitions`` and ``c2`` is trained on the training data
    and scored on the test data.  Progress, the parameters and the
    classification report are printed for each combination, and the list of
    ``(weighted_f1, params)`` tuples is pickled to ``results/l2sgd``.

    Args:
        train_X: Training feature sequences.
        train_Y: Training label sequences.
        test_X: Evaluation feature sequences.
        test_Y: Evaluation label sequences.
    """
    import itertools

    eval_labels = ['per', 'org', 'misc', 'loc', 'notpropn']
    algorithms = ['l2sgd']
    min_frequencies = [0, 0.02]
    all_states = [True, False]
    all_transitions = [True, False]
    c2s = [0, 0.01, 0.05, 0.1]

    # Same iteration order as the equivalent nested loops.
    grid = list(itertools.product(
        algorithms, min_frequencies, all_states, all_transitions, c2s))
    total = len(grid)

    start = time.time()
    results = []
    for step, combo in enumerate(grid, start=1):
        algo, min_freq, all_state, all_transition, c2 = combo
        print(round(100 * step / total), '%')
        print('Time elapsed: {} s'.format(round(time.time() - start)))
        params = {'algo': algo, 'min_freq': min_freq, 'all_state': all_state,
                  'all_transition': all_transition, 'c2': c2}
        print(params)
        try:
            crf = sklearn_crfsuite.CRF(
                algorithm=algo,
                c2=c2,
                max_iterations=1000,
                all_possible_transitions=all_transition,
                all_possible_states=all_state,
                min_freq=min_freq,
            )
            crf.fit(train_X, train_Y)
            pred_Y = crf.predict(test_X)
            f1 = metrics.flat_f1_score(test_Y, pred_Y, average='weighted',
                                       labels=eval_labels)
            res = metrics.flat_classification_report(
                test_Y, pred_Y, labels=eval_labels, digits=4)
        except Exception as exc:
            # Not a bare ``except``: Ctrl-C / SystemExit must still stop the
            # search.  Report the real reason alongside the generic message.
            print('Invalid parameter combination.')
            print('  reason: {!r}'.format(exc))
            continue
        results.append((f1, params))
        print(res)
        print()

    with open('results/l2sgd', 'wb') as file:
        pickle.dump(results, file)
def main():
    """Train a CRF tagger or evaluate a previously saved one.

    Behaviour is driven by the parsed command-line options:

    * ``opt.train == 'yes'``: fit a CRF on ``train.txt`` of the selected
      corpus and pickle it to ``opt.save_model_in``.
    * otherwise, ``opt.test == 'yes'``: load the pickled model from
      ``opt.saved_model``, predict on ``dev.txt`` (when ``opt.dev == 'yes'``)
      or ``test.txt``, print the weighted F1 and write the predictions.
    """
    parser = argparse.ArgumentParser(description="")
    opt = parse_arguments(parser)
    corpora = utilidades.Corpora()
    contextual_features = corpora.load_embeddings(
        opt.data_path + '/' + opt.contextual_features)

    if opt.train == 'yes':
        dataset = load_dataset(
            opt.data_path + '/' + opt.corpus + '/train.txt')
        # Empty batches are skipped for both features and labels.
        X_train = [
            batch2features(b, contextual_features) for b in dataset if b
        ]
        y_train = [batch2labels(b) for b in dataset if b]
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        print('treinando modelo')
        crf.fit(X_train, y_train)
        print('salvando modelo')
        # Context manager guarantees the model file is flushed and closed.
        with open(opt.save_model_in, 'wb') as model_file:
            pickle.dump(crf, model_file)
    elif opt.test == 'yes':
        split_file = '/dev.txt' if opt.dev == 'yes' else '/test.txt'
        dataset_test = load_dataset(
            opt.data_path + '/' + opt.corpus + split_file)
        X_test = [
            batch2features(b, contextual_features) for b in dataset_test if b
        ]
        y_test = [batch2labels(b) for b in dataset_test if b]
        # NOTE(security): unpickling executes arbitrary code; only load
        # model files that come from a trusted source.
        with open(opt.saved_model, 'rb') as model_file:
            crf = pickle.load(model_file)
        labels = list(crf.classes_)
        y_pred = crf.predict(X_test)
        print(
            metrics.flat_f1_score(y_test,
                                  y_pred,
                                  average='weighted',
                                  labels=labels))
        print('escrevendo resultados')
        print(len(dataset_test), len(y_pred))
        write_results(opt.path_results, dataset_test, y_pred)
def train(self, max_iteration=100):
    """
    Fits a CRF on the stored training data and scores it on the test data.

    The model is built from this object's hyper-parameters, fitted on
    ``self.train_features`` / ``self.train_labels_sequence`` and used to
    predict ``self.test_features``. Predictions are saved under
    ``self.prediction_path`` and scored from the resulting
    ``predictions.txt``; the scores and the model are then saved.

    @type max_iteration: int
    @param max_iteration: value passed to the CRF as ``max_iterations``
    @return: tuple ``(exact_score, inexact_score, conllEval)`` as produced
    by ``report_fscore_from_file``
    """
    # remember when training started so the elapsed time can be reported
    started_at = time.time()
    verbose = not self.quiet

    if verbose:
        print("Number of sentences train: ", len(self.train_labels_sequence))
        print("Number of sentences test: ", len(self.test_labels_sequence))
        print("{0} feature types".format(
            self.feature_extractor.num_feature_types()))
        print("\"{0}\" feature template".format(
            self.feature_extractor.feature_template))

    model = sklearn_crfsuite.CRF(
        algorithm=self.algorithm,
        c1=self.c1,
        c2=self.c2,
        epsilon=self.epsilon,
        max_iterations=max_iteration,
        all_possible_transitions=self.all_possible_transition,
        all_possible_states=self.all_possible_state,
        verbose=verbose,
    )

    if verbose:
        print("Train model")
    model.fit(self.train_features, self.train_labels_sequence)

    if verbose:
        print("Predict")
    predictions = model.predict(self.test_features)
    self.test_sequence.save_prediction_to_file(predictions,
                                               self.prediction_path)

    # scoring is done from the prediction file that was just written
    scores = report_fscore_from_file(
        self.prediction_path + "/predictions.txt",
        wikiner=self.wikiner,
        quiet=True)
    exact_score, inexact_score, conllEval = scores

    if verbose:
        self.display_results("Conll", conllEval)
        self.display_results("Exact", exact_score)
        self.display_results("Inexact", inexact_score)

    self.save_results(conllEval, exact_score, inexact_score)
    self.__save_model(model)

    elapsed = datetime.timedelta(seconds=time.time() - started_at)
    print("Training time:", str(elapsed))
    return exact_score, inexact_score, conllEval
def load_CRF_model():
    """
    Create a CRF object backed by the stored model file.

    The file location is ``parser_tokens.MODEL_PATH`` concatenated with
    ``parser_tokens.MODEL_FILE``.

    :return: ``sklearn_crfsuite.CRF`` object constructed with that
        ``model_filename`` and ``verbose=True``
    """
    model_location = parser_tokens.MODEL_PATH + parser_tokens.MODEL_FILE
    return sklearn_crfsuite.CRF(model_filename=model_location, verbose=True)
def crf_train():
    """Fit an L-BFGS CRF on the module-level training data.

    Reads ``train_X_crf`` and ``train_y_crf`` from the enclosing module.

    Returns:
        The fitted ``sklearn_crfsuite.CRF`` model.
    """
    settings = {
        'algorithm': 'lbfgs',
        'c1': 0.05,
        'c2': 0.005,
        'max_iterations': 100,
        'all_possible_transitions': True,
    }
    model = sklearn_crfsuite.CRF(**settings)
    model.fit(train_X_crf, train_y_crf)
    return model
def train_crf_model(X_train, y_train):
    """Fit an L-BFGS CRF (c1 = c2 = 0.1, 100 iterations) on the given data.

    Args:
        X_train: Training feature sequences.
        y_train: Training label sequences.

    Returns:
        The fitted ``sklearn_crfsuite.CRF`` model.
    """
    model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model.fit(X_train, y_train)
    return model
def __init__(self, **kwargs):
    """Initialise the counter and the underlying model.

    Args:
        **kwargs: When a ``model`` entry is present its value is passed to
            ``joblib.load`` and the result becomes ``self.model``;
            otherwise a fresh, untrained ``l2sgd`` CRF is created.
    """
    self.cnt = 0
    if 'model' not in kwargs:
        # No stored model supplied: start from an untrained CRF.
        self.model = sklearn_crfsuite.CRF(
            algorithm='l2sgd',
            c2=1,
            max_iterations=100,
            all_possible_transitions=True,
            verbose=True,
        )
        return
    self.model = joblib.load(kwargs['model'])
def initialize_model(self):
    """
    Initialise ``self.model`` from the ``model`` section of ``self.config``.

    ``algorithm`` is used as read; ``c1`` and ``c2`` are converted to float
    and ``max_iterations`` to int.
    """
    def option(name):
        # All settings live in the 'model' section of the configuration.
        return self.config.get('model', name)

    self.model = sklearn_crfsuite.CRF(
        algorithm=option('algorithm'),
        c1=float(option('c1')),
        c2=float(option('c2')),
        max_iterations=int(option('max_iterations')),
        all_possible_transitions=True,
    )
def __init__(self, verbose=True):
    """Create the morphological analyser and an untrained CRF.

    Args:
        verbose: Forwarded to the CRF constructor as ``verbose``.
    """
    self.fitted = False
    self.morph = pymorphy2.MorphAnalyzer()
    crf_settings = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 200,
        'all_possible_transitions': True,
        'verbose': verbose,
    }
    self.crf = sklearn_crfsuite.CRF(**crf_settings)
def trainCRF(trainFeatures, trainLabels):
    """Fit an L-BFGS CRF (c1 = c2 = 0.1, 100 iterations).

    Args:
        trainFeatures: Training feature sequences.
        trainLabels: Training label sequences.

    Returns:
        The fitted ``sklearn_crfsuite.CRF`` model.
    """
    settings = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model = sklearn_crfsuite.CRF(**settings)
    model.fit(trainFeatures, trainLabels)
    return model
def train(self):
    """Fit a short-run L-BFGS CRF and evaluate it on train and dev data.

    The model is stored on ``self.model``, fitted on ``self.trn_feats`` /
    ``self.trn_tags`` and then evaluated with ``self.eval`` first on the
    training set and then on the development set.
    """
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare print statement is not).
    print("Training CRF ...")
    self.model = crfsuite.CRF(algorithm='lbfgs', max_iterations=5)
    self.model.fit(self.trn_feats, self.trn_tags)

    trn_tags_pred = self.model.predict(self.trn_feats)
    self.eval(trn_tags_pred, self.trn_tags)

    dev_tags_pred = self.model.predict(self.dev_feats)
    self.eval(dev_tags_pred, self.dev_tags)
def main():
    """Train a CRF on ``train.txt``, evaluate on ``test.txt`` and report.

    Steps: read both corpora, build features/labels, fit the CRF, pickle the
    predictions to ``Task1.pkl``, print the weighted F1, the per-class
    report and the token-level accuracy.
    """
    print("Here")
    # Context managers close the corpus files even if parsing fails.
    # NOTE(review): assumes readtxt() fully consumes the file before
    # returning (its result is iterated twice below) - confirm.
    with codecs.open(".//train.txt", "rb", encoding="utf-8") as txt_train:
        train_sents = readtxt(txt_train)
    with codecs.open(".//test.txt", "rb", encoding="utf-8") as txt_test:
        test_sents = readtxt(txt_test)

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    # STEP3: Training
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # STEP4: Evaluation
    labels = list(crf.classes_)
    print("it's here")
    y_pred = crf.predict(X_test)
    with open(".//Task1.pkl", "wb") as y_file:
        pickle.dump(y_pred, y_file)

    # The score used to be computed and thrown away; report it.
    print(metrics.flat_f1_score(y_test, y_pred, average='weighted',
                                labels=labels))

    # Group B-/I- variants of the same entity next to each other.
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    # Inspect per-class results in more detail:
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    # Token-level accuracy
    print(len(y_test), len(y_pred))
    correct = 0
    total = 0
    for gold_seq, pred_seq in zip(y_test, y_pred):
        for gold, pred in zip(gold_seq, pred_seq):
            total += 1
            if gold == pred:
                correct += 1
    # Guard against an empty test set instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(correct, total, accuracy)
def objective(chain_len):
    """Randomised search for the best CRF ``c1``/``c2`` at one chain length.

    Uses the module-level ``X_train`` and ``Y_train`` frames, joined on
    ``date``.  Every column of ``X_train`` except ``date`` is a feature and
    the ``Y`` column is the target; both are converted into chains of
    length ``chain_len`` by ``Xpoint2Set`` / ``Ypoint2Set``.

    Args:
        chain_len: Chain length forwarded to ``Xpoint2Set``/``Ypoint2Set``.

    Returns:
        dict with ``cv_score`` (best cross-validation score) plus the best
        parameters found by the search.
    """
    feature_columns = X_train.columns.tolist()
    feature_columns.remove('date')

    merged = X_train.merge(Y_train, on='date', how='inner')
    point_features = merged[feature_columns]
    point_targets = merged['Y']
    chain_X_train = Xpoint2Set(point_features, chain_len)
    chain_Y_train = Ypoint2Set(point_targets, chain_len)

    # c1 / c2 are deliberately left unset: they are what the search samples.
    estimator = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True,
    )
    search_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    label_values = Y_train['Y'].astype('str').unique()
    scorer = make_scorer(metrics.flat_f1_score,
                         average='weighted',
                         labels=label_values)

    search = RandomizedSearchCV(estimator,
                                search_space,
                                cv=3,
                                verbose=0,
                                n_jobs=-1,
                                n_iter=50,
                                scoring=scorer)
    search.fit(chain_X_train, chain_Y_train)

    outcome = {'cv_score': search.best_score_}
    outcome.update(search.best_params_)  # adds c1 and c2
    return outcome
def cross_validation(val_ratio=0.2, verbose=False):
    """Run contiguous-fold cross validation of the CRF tokenizer.

    Posts and answers are loaded from ``DATA_ROOT``, reordered by
    ``DATA_SEQ`` and cut into consecutive validation folds of
    ``int(val_ratio * n)`` items.  For each fold a CRF is trained on the
    remaining items and scored with ``evaluate_nested_score``; the averages
    of the first three score components are printed as precision, recall
    and f1.

    Args:
        val_ratio: Fraction of the data used as validation set per fold.
        verbose: When true, also call ``evaluate_nested`` for every fold.

    Raises:
        ValueError: If ``val_ratio`` does not produce at least one
            non-empty fold.  Previously a fold size of zero looped forever
            and an oversized fold ended in a division by zero.
    """
    with open(DATA_ROOT + 'posts_manual_tokenized.txt') as f:
        post_list = split_train_text(f.read())
    with open(DATA_ROOT + 'answers_manual_tokenized.txt') as f:
        post_list.extend(split_train_text(f.read()))
    post_list = [post_list[i] for i in DATA_SEQ]

    val_length = int(val_ratio * len(post_list))
    if not 0 < val_length <= len(post_list):
        raise ValueError(
            'val_ratio={!r} gives a fold size of {} for {} items; '
            'at least one non-empty fold is required'.format(
                val_ratio, val_length, len(post_list)))

    score_all = []
    # Fold starts 0, v, 2v, ... for as long as a full fold still fits.
    fold_starts = range(0, len(post_list) - val_length + 1, val_length)
    for cross_round, fold_start in enumerate(fold_starts, start=1):
        fold_end = fold_start + val_length
        train_list = post_list[:fold_start] + post_list[fold_end:]
        val_list = post_list[fold_start:fold_end]

        x_train, y_train = [], []
        x_val, y_val = [], []
        # list2xy fills the supplied lists in place.
        list2xy(train_list, x_train, y_train)
        list2xy(val_list, x_val, y_val)

        # build model
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=CONFIG['single']['c1'],
                                   c2=CONFIG['single']['c2'],
                                   max_iterations=100,
                                   all_possible_transitions=True)
        # train model
        crf.fit(x_train, y_train)
        # predict model
        y_pred = crf.predict(x_val)

        print("Cross validation round ", cross_round, " :")
        if verbose:
            evaluate_nested(y_pred, y_val)
        score_all.append(evaluate_nested_score(y_pred, y_val))

    n_folds = int(1 / val_ratio)
    for index, metric_name in enumerate(('precision', 'recall', 'f1 score')):
        average = sum(score[index] for score in score_all) / len(score_all)
        print(
            "average {} of CRF tokenization in {}-fold corss validation: {:0.4}"
            .format(metric_name, n_folds, average))
def predict_labels_crf(docs=None):
    """Train and evaluate a CRF on the hierarchical-label annotations.

    For each annotation type handled (currently only ``interventions``) the
    train and gold-test ``.ann`` files under ``TOP`` are read, features are
    computed with ``doc2features``, a CRF is fitted and a per-label
    classification report (excluding label ``'0'``) is printed.

    Args:
        docs: Mapping from document id to document.  When falsy, the
            mapping is unpickled from ``DOC_PKL``.
    """
    def read_labels(fnames):
        # Document id = file name up to the first underscore;
        # labels = comma-separated file content.
        labels_by_id = {}
        for fname in fnames:
            with open(fname) as ann_file:
                doc_id = os.path.basename(fname).split('_')[0]
                labels_by_id[doc_id] = ann_file.read().split(',')
        return labels_by_id

    print('Initializing docs...')
    if not docs:
        # Pickles must be read in binary mode; text mode fails on Python 3.
        # NOTE(security): only unpickle files from a trusted source.
        with open(DOC_PKL, 'rb') as pkl_file:
            docs = pickle.load(pkl_file)

    for pio in ['interventions'
                ]:  # ['participants', 'interventions', 'outcomes']:
        print('Running crf for %s' % pio)
        test_fnames = glob(
            '%s/annotations/aggregated/hierarchical_labels/%s/test/gold/*.ann'
            % (TOP, pio))
        train_fnames = glob(
            '%s/annotations/aggregated/hierarchical_labels/%s/train/*.ann' %
            (TOP, pio))
        print('Reading labels for %d train and %d test docs' %
              (len(train_fnames), len(test_fnames)))
        test_labels = read_labels(test_fnames)
        train_labels = read_labels(train_fnames)

        train_pmids = sorted(train_labels.keys())
        test_pmids = list(test_labels.keys())
        # Train and test documents must not overlap.
        assert len(set(test_pmids) & set(train_pmids)) == 0

        print('Computing test/train features')
        train_X = [doc2features(docs[p]) for p in train_pmids]
        test_X = [doc2features(docs[p]) for p in test_pmids]
        train_Y = [train_labels[p] for p in train_pmids]
        test_Y = [test_labels[p] for p in test_pmids]

        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.4,
                                   c2=0.005,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        print('Fitting model for %d pmids' % len(train_pmids))
        crf.fit(train_X, train_Y)
        pred_Y = crf.predict(test_X)

        # Leave label '0' out of the report; filtering (rather than
        # list.remove) does not fail when '0' never occurred in training.
        labels = [label for label in crf.classes_ if label != '0']
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(
            metrics.flat_classification_report(test_Y,
                                               pred_Y,
                                               labels=sorted_labels,
                                               digits=3))
def train_model(self):
    """Fit a CRF on the training split and predict the test split.

    Uses ``self.C1`` / ``self.C2`` as regularisation coefficients, trains on
    ``self.x_train`` / ``self.y_train`` and stores the predictions for
    ``self.x_test`` in ``self.y_predict``.
    """
    settings = {
        'algorithm': 'lbfgs',
        'c1': self.C1,
        'c2': self.C2,
        'verbose': True,
        'max_iterations': 100,
        'all_possible_transitions': True,
    }
    model = sklearn_crfsuite.CRF(**settings)
    print('Begin training...')
    model.fit(self.x_train, self.y_train)
    self.y_predict = model.predict(self.x_test)
def initialize_model(self):
    """Initialise ``self.model`` from this object's hyper-parameters.

    ``self.c1`` and ``self.c2`` are converted to float and
    ``self.max_iterations`` to int before being passed to the CRF.
    """
    self.model = sklearn_crfsuite.CRF(
        algorithm=self.algorithm,
        c1=float(self.c1),
        c2=float(self.c2),
        max_iterations=int(self.max_iterations),
        all_possible_transitions=True,
    )
def get_crf(**kwargs):
    """Return an untrained CRF with default settings, overridable by keyword.

    Args:
        **kwargs: Constructor arguments that replace or extend the defaults.

    Returns:
        A new ``sklearn_crfsuite.CRF`` instance.
    """
    settings = {
        'algorithm': 'lbfgs',
        'c1': 0.001,
        'c2': 0.05,
        'max_iterations': 100,
        'all_possible_transitions': True,
        'verbose': False,
    }
    # Caller-supplied values take precedence over the defaults.
    for option, value in kwargs.items():
        settings[option] = value
    return sklearn_crfsuite.CRF(**settings)
def fit_model(self, X, y) -> object:
    """Fit an L-BFGS CRF on ``X`` / ``y`` and return it.

    A progress message is printed first when the ``verbose`` option of the
    ``Util`` configuration section is a non-zero integer.
    """
    verbose = int(util.cf.get('Util', 'verbose'))
    if verbose:
        print('Fitting the model')

    model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model.fit(X, y)
    return model
def initialize_model(self):
    """Initialise ``self.model`` from the module-level ``model_config``.

    ``c1`` and ``c2`` are converted to float and ``max_iterations`` to int;
    a confirmation message is printed once the model has been created.
    """
    settings = {
        'algorithm': model_config.get("algorithm"),
        'c1': float(model_config.get("c1")),
        'c2': float(model_config.get("c2")),
        'max_iterations': int(model_config.get("max_iterations")),
        'all_possible_transitions': True,
    }
    self.model = sklearn_crfsuite.CRF(**settings)
    print("-> 完成模型初始化")
def train(train_corpus: str,
          dev_corpus: str,
          c1: float = 0.0,
          c2: float = 0.0,
          algorithm: str = 'lbfgs',
          max_iterations: int = 100,
          all_possible_transitions: bool = False,
          window_size: int = 1,
          model_filename: str = None,
          _run: Run = None,
          _log: logger = None):
    """Run one CRF experiment: train, score on dev and record the metrics.

    Both corpora are registered as run resources.  The CRF is fitted on the
    training corpus and evaluated on the dev corpus; overall and per-type
    metrics are stored in ``_run.info``, logged as scalars and (except
    overall precision and recall) written to ``_log``.  When
    ``model_filename`` is given, the model is dumped to
    ``<model_filename>.pkl`` and attached to the run as an artifact.
    """
    for corpus_path in (train_corpus, dev_corpus):
        _run.add_resource(corpus_path)

    train_sents, _ = get_tagged_sents_and_words(train_corpus)
    dev_sents, _ = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(sent, window_size) for sent in train_sents]
    y_train = [sent2labels(sent) for sent in train_sents]
    X_dev = [sent2features(sent, window_size) for sent in dev_sents]
    y_dev = [sent2labels(sent) for sent in dev_sents]

    crf = sklearn_crfsuite.CRF(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iterations,
        all_possible_transitions=all_possible_transitions,
        model_filename=model_filename,
    )
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_dev)
    overall, by_type = evaluate(y_dev, y_pred)

    # (recorded name, attribute on the overall result)
    overall_metrics = (
        ('overall_f1', 'f1_score'),
        ('overall_precision', 'precision'),
        ('overall_recall', 'recall'),
    )
    for record_name, attribute in overall_metrics:
        value = getattr(overall, attribute)
        _run.info[record_name] = value
        _run.log_scalar(record_name, value)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    for key in sorted(by_type):
        type_scores = by_type[key]
        for metric_key in type_scores._fields:
            metric_val = getattr(type_scores, metric_key)
            record_name = f'{key}-{metric_key}'
            _run.info[record_name] = metric_val
            _run.log_scalar(record_name, metric_val)
            _log.info(f'{key}-{metric_key}: {metric_val}')

    if model_filename is None:
        return
    pickle_path = f'{model_filename}.pkl'
    _log.info(f'saving to: {pickle_path}')
    joblib.dump(crf, pickle_path)
    _run.add_artifact(pickle_path)
def train_model(x_train, y_train):
    """Fit a CRF, persist it to ``MODEL_PATH`` and return it with its labels.

    Args:
        x_train: Training feature sequences.
        y_train: Training label sequences.

    Returns:
        Tuple ``(model, labels)`` where ``labels`` are the classes learned
        by the model without the ``'O'`` label.
    """
    # get the CRF model
    model = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                 c1=0.1,
                                 c2=0.2,
                                 max_iterations=100,
                                 all_possible_transitions=True)
    model.fit(x_train, y_train)

    # 'O' is by far the most common label and would make the scores look
    # better than they really are, so it is left out of the label list.
    # Filtering (instead of list.remove) also works when the training data
    # happens to contain no 'O' label at all.
    labels = [label for label in model.classes_ if label != 'O']

    joblib.dump(model, MODEL_PATH)
    return model, labels
def train(self, corpus):
    """
    Tune and train the CRF on the utterances of a corpus.

    A randomised search over ``c1`` / ``c2`` (3-fold CV, weighted flat F1)
    picks the regularisation strengths; a fresh CRF using them is then
    fitted on the full training data and stored in ``self.model``.

    :param corpus: mapping holding the training data under ``utterances``
    :return: the trained utterances as returned by ``self.prepare``
    """
    assert corpus is not None, "No training data file given"
    assert 'utterances' in corpus, "Token mapping missing from training data"

    x_train, y_train, trained_utterances = self.prepare(
        corpus["utterances"])
    unq_labels = list(set(chain.from_iterable(y_train)))

    def get_best_model(X_train, y_train):
        # Randomised hyper-parameter search for the lbfgs model.
        candidate = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                         max_iterations=100,
                                         all_possible_transitions=True)
        search_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        # score candidates with the metric that is used for evaluation
        scorer = make_scorer(metrics.flat_f1_score,
                             average='weighted',
                             labels=unq_labels)
        search = RandomizedSearchCV(candidate,
                                    search_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=3,
                                    n_iter=10,
                                    scoring=scorer)
        search.fit(X_train, y_train)
        return search

    best = get_best_model(x_train, y_train).best_params_
    self.model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=best.get('c1'),
        c2=best.get('c2'),
        max_iterations=100,
        all_possible_transitions=True,
    )
    self.model.fit(x_train, y_train)
    return trained_utterances
def train_crf(X_train, y_train):
    """Fit a CRF on the given data and pickle it to ``crf_model.p``.

    Args:
        X_train: Training feature sequences.
        y_train: Training label sequences.
    """
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.564,
                               c2=0.0279,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)
    # pickle writes bytes, so the file must be opened in binary mode; text
    # mode ('w') raises TypeError on Python 3.
    with open('crf_model.p', 'wb') as f:
        pickle.dump(crf, f)
def CRF_trainer(X, y, X_Te):
    """Fit a CRF on ``X`` / ``y`` and return its predictions for ``X_Te``.

    Args:
        X: Training feature sequences.
        y: Training label sequences.
        X_Te: Feature sequences to predict.

    Returns:
        The predicted label sequences for ``X_Te``.
    """
    model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model.fit(X, y)
    return model.predict(X_Te)
def main():
    """Train a CRF on the pickled documents and log an evaluation report.

    Documents are turned into per-sentence lists of feature dictionaries and
    word labels, then split 67/33 into train and test parts.  Depending on
    the ``search_grid`` argument the classifier is either fitted directly
    or selected by a randomised search over ``c1`` / ``c2``; in both cases
    the flattened test predictions are reported against the flattened gold
    labels.
    """
    args = utils.read_arguments(__doc__)
    documents = utils.pickle_from_file(args['input_filename'])

    transformer = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True,
        use_syntactic=True,
    )

    # One instance per sentence: a list with a feature dict for every word,
    # paired with the list of that sentence's word labels.
    instances = transformer.get_feature_dict(documents)
    labels = conll_feature_extractor.get_labels_from_documents(documents)

    x_train, x_test, y_train, y_test = train_test_split(instances,
                                                        labels,
                                                        test_size=0.33)

    classifier = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)

    if args['search_grid']:
        search_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        scorer = metrics.make_scorer(suite_metrics.flat_f1_score,
                                     average='weighted')
        search = RandomizedSearchCV(classifier,
                                    search_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=-1,
                                    n_iter=50,
                                    scoring=scorer)
        search.fit(x_train, y_train)
        print('best params:', search.best_params_)
        print('best CV score:', search.best_score_)
        classifier = search.best_estimator_
    else:
        classifier.fit(x_train, y_train)

    # Both paths end the same way: flatten per-sentence lists and report.
    predicted = list(itertools.chain(*classifier.predict(x_test)))
    expected = list(itertools.chain(*y_test))
    evaluation.log_report(predicted, expected)
def train_crf(self):
    """Fit ``self.crf_model`` on a single training sequence.

    The features from ``self.create_x_train()`` and the labels from
    ``self.create_y_label_for_train()`` are each wrapped in a one-element
    list before being passed to ``fit``.
    """
    features = [self.create_x_train()]
    targets = [self.create_y_label_for_train()]
    settings = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 1000,
        'all_possible_transitions': True,
    }
    self.crf_model = sklearn_crfsuite.CRF(**settings)
    self.crf_model.fit(features, targets)
def train():
    """Fit a CRF on the module-level data and print its test scores.

    Reads ``X_train``, ``y_train``, ``X_test``, ``y_test`` and the label
    list ``lll`` from the enclosing module; prints the weighted flat F1
    followed by the flat classification report.
    """
    model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted_f1 = metrics.flat_f1_score(y_test,
                                        predictions,
                                        average='weighted',
                                        labels=lll)
    print(weighted_f1)
    report = metrics.flat_classification_report(y_test,
                                                predictions,
                                                labels=lll)
    print(report)