class CRFModel(object):
    """Wrapper around a ``CRF`` estimator that featurizes sentences itself."""

    def __init__(self, algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
                 all_possible_transitions=False):
        """Create the wrapped ``CRF`` with the given hyper-parameters."""
        crf_options = dict(
            algorithm=algorithm,
            c1=c1,
            c2=c2,
            max_iterations=max_iterations,
            all_possible_transitions=all_possible_transitions,
        )
        self.model = CRF(**crf_options)

    def train(self, sentences, tag_lists, tagged=False):
        """Fit the model on ``sentences`` against ``tag_lists``.

        When ``tagged`` is true, features come from ``sent2features_tagged``;
        otherwise from ``sent2features``.
        """
        extractor = sent2features_tagged if tagged else sent2features
        self.model.fit([extractor(sentence) for sentence in sentences], tag_lists)

    def test(self, sentences, tagged=False):
        """Return the predicted tag lists for ``sentences``.

        ``tagged`` selects the feature extractor exactly as in ``train``.
        """
        extractor = sent2features_tagged if tagged else sent2features
        return self.model.predict([extractor(sentence) for sentence in sentences])
def crf_tag():
    """Train a default ``CRF`` on the Brown 'news' sentences and print a demo.

    The first 90% of the tagged sentences are used for fitting and the rest
    for scoring. Prints the first training sample and its labels, the tags
    predicted for a fixed four-word sentence, and the flat accuracy on the
    held-out part.
    """
    tagged_sents = brown.tagged_sents(categories='news')
    split_at = int(len(tagged_sents) * 0.9)

    X_train, y_train = transform_to_dataset(tagged_sents[:split_at])
    X_test, y_test = transform_to_dataset(tagged_sents[split_at:])
    print(X_train[0])
    print(y_train[0])

    model = CRF()
    model.fit(X_train, y_train)

    # Tag one hand-written sentence as a quick sanity check.
    raw_sent = ['I', 'am', 'a', 'student']
    sent_feat = []
    for index, _ in enumerate(raw_sent):
        sent_feat.append(feature_extract(raw_sent, index))
    predicted_tags = model.predict([sent_feat])[0]
    print(list(zip(raw_sent, predicted_tags)))

    y_pred = model.predict(X_test)
    print(metrics.flat_accuracy_score(y_test, y_pred))
class CRFNER(object):
    """CRF-based named-entity recognizer.

    ``train`` formats documents with ``ner_processing.NERFormatter`` (using
    the gazetteer given at construction), extracts features with
    ``crf_processing.feature_extraction`` and fits a ``CRF``. ``predict``
    tags one raw sentence and ``report`` prints scores on the held-out split
    produced during training.
    """

    def __init__(self, gazetteer, fraction=0.7):
        """Store the gazetteer and the ``fraction`` setting."""
        # The attribute keeps its original spelling so existing readers of
        # ``self.gazateer`` keep working.
        self.gazateer = gazetteer
        # Stored only; no method of this class reads it.
        self.fraction = fraction

    def train(self, documents):
        """Build the train/test split from ``documents`` and fit the CRF."""
        self.data = ner_processing.NERFormatter(self.gazateer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)
        (self.X_train, self.X_test,
         self.y_train, self.y_test) = crf_processing.feature_extraction(d_train, d_test)
        self.model = CRF(algorithm='lbfgs',
                         c1=0.31,
                         c2=0.02,
                         max_iterations=100,
                         all_possible_transitions=True)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Tag a single raw sentence and return the model's prediction.

        The sentence is tokenized and POS-tagged with NLTK, placed in a
        one-sentence DataFrame (``sentence_no`` and ``category`` are zero
        placeholders) and converted to CRF features. Requires ``train`` to
        have been called first.
        """
        tokens = nltk.word_tokenize(sentence)
        placeholder = [0] * len(tokens)
        df_pred = pd.DataFrame({
            'word': tokens,
            'sentence_no': placeholder,
            'category': placeholder,
            'POS': [tagged[-1] for tagged in nltk.pos_tag(tokens)],
        })
        getter = crf_processing.SentenceGetter(df_pred)
        # NOTE(review): the return value was never used; the call is kept in
        # case ``get_next`` has side effects on the getter - confirm and drop.
        getter.get_next()
        self.X = [crf_processing.sent2features(s) for s in getter.sentences]
        return self.model.predict(self.X)

    def report(self):
        """Print the weighted F1 score and a per-label report on the test split."""
        labels = list(self.model.classes_)
        y_pred = self.model.predict(self.X_test)
        print('F1 score {}'.format(
            metrics.flat_f1_score(self.y_test,
                                  y_pred,
                                  average='weighted',
                                  labels=labels)))
        # Sort by the label text after its first character, then by that
        # first character, so labels sharing a suffix are listed together.
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(
            metrics.flat_classification_report(self.y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3))
class CRFNerModel(object):
    """CRF sequence tagger with optional pickling and entity-span extraction."""

    def __init__(self, is_save=False):
        """Create the CRF; when ``is_save`` is true, ``fit`` also pickles it."""
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        """Fit the CRF and, if enabled, write it to ``self.save_model``."""
        self.crf.fit(train_x, train_y)
        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        """Predict tags for one sequence; returns a one-element list of tag lists."""
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        """Pickle the CRF to ``self.save_model``."""
        with open(self.save_model, "wb") as f:
            pickle.dump(self.crf, f)

    def load_model(self):
        """Replace ``self.crf`` with the object pickled in ``self.save_model``."""
        # SECURITY: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        with open(self.save_model, "rb") as f:
            self.crf = pickle.load(f)

    def predict_list(self, input_list):
        """Predict tags for already-featurized sequences."""
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        """Return entity spans found in ``input_x``.

        Each entry is ``(start, end, label, input_x[start:end])`` with ``end``
        exclusive. A span opens at a ``B-<label>`` tag and is closed by an
        ``O`` tag, by the next ``B-`` tag, or by the end of the sequence. A
        continuation tag whose label differs from the open span discards
        that span.

        Raises:
            ValueError: if a non-``O`` tag contains no ``-`` separator.
        """
        entities = []
        tags = self.predict(input_x)[0]
        start = None
        label = None
        for i, tag in enumerate(tags):
            if tag == "O":
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                    start = None
                    label = None
                continue
            # Split only on the first hyphen so labels that themselves
            # contain hyphens (e.g. "B-geo-loc") are handled.
            prefix, tag_label = tag.split("-", 1)
            if prefix == "B":
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                start = i
                label = tag_label
            elif label != tag_label:
                start = None
                label = None
        # Flush a span that is still open when the sequence ends; without
        # this, entities covering the last token are silently lost.
        if start is not None:
            end = len(tags)
            entities.append((start, end, label, input_x[start:end]))
        return entities
def test_accuracy(training_dir_path, test_dir_path, is_Convo_label):
    """Train an l2sgd CRF and return ``(test_accuracy, train_accuracy)`` as strings.

    ``is_Convo_label`` selects the module-level ``labels`` mapping (true) or
    ``type_labels`` (false) for converting tags before scoring, and is passed
    through to ``get_conversation_data``. A classification report is printed
    only when ``is_Convo_label`` is true.
    """
    tag_to_id = labels if is_Convo_label else type_labels

    def encode(tag_rows):
        # Flatten the per-sequence tag lists and map every tag through the
        # selected label mapping.
        return np.array([tag_to_id[tag] for row in tag_rows for tag in row])

    # Get the training data
    x_train, y_train = get_conversation_data(training_dir_path, True,
                                             is_Convo_label)
    print("Loaded Training Data")

    # Get the testing data
    x_test, y_test = get_conversation_data(test_dir_path, False,
                                           is_Convo_label)
    print("Loaded Testing Data")

    crf = CRF(algorithm='l2sgd',
              c2=0.001,
              max_iterations=100,
              all_possible_transitions=False)
    crf.fit(x_train, y_train)

    predictions = encode(crf.predict(x_test))
    truths = encode(y_test)

    if is_Convo_label:
        print(classification_report(
            truths, predictions,
            target_names=['REQ', 'ANSW', 'COMPLIM', 'ANNOU', 'THK', 'RESPOS',
                          'APOL', 'RCPT']))

    test_ = str(accuracy_score(truths, predictions))

    # Score again on the training directory, loaded with the same flag
    # (False) that was used for the test data.
    x_test, y_test = get_conversation_data(training_dir_path, False,
                                           is_Convo_label)
    predictions = encode(crf.predict(x_test))
    truths = encode(y_test)

    sf = crf.state_features_
    print(type(sf))

    train_ = str(accuracy_score(truths, predictions))
    return test_, train_
def test_crf(xseq, yseq, algorithm):
    """Fit a CRF on one sequence and check that it reproduces the labels."""
    crf = CRF(algorithm)
    crf.fit([xseq], [yseq])
    predicted = crf.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
def train(file_path: str):
    """
    Train a CRF model from the JSON-lines file at ``file_path``.

    The records are split 75/25 (fixed ``random_state``) into training and
    validation parts, a CRF is fitted on the training part, and the weighted
    flat F1 score on the validation part (ignoring the ``'O'`` label) is
    printed. Returns the fitted ``CRF``.
    """
    with jsonlines.open(file_path) as reader:
        addresses = list(reader)

    addresses_train, addresses_val = train_test_split(addresses,
                                                      test_size=0.25,
                                                      random_state=42)
    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Prediction score on the validation set. The score used to be computed
    # and thrown away; report it so the validation pass is useful.
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(
        y_val,
        y_pred,
        average='weighted',
        labels=[label for label in LABELS if label != 'O'])
    print("Flat F1-Score on validation set = {}".format(f1_score))
    return crf
def parameter_tuning(args, dataset):
    """Grid-search ``c1``/``c2`` and return the CRF with the best validation F1.

    Candidate values come from ``experiment_util.get_param_list`` applied to
    ``args.c1`` and ``args.c2``. Every pair is fitted on ``dataset.training``
    and scored with micro-averaged flat F1 on ``dataset.validation``; only a
    strictly higher score replaces the current best. Returns ``None`` when
    no pair is evaluated.
    """
    c1_candidates = experiment_util.get_param_list(args.c1)
    c2_candidates = experiment_util.get_param_list(args.c2)

    best_score, best_c1, best_c2 = -np.inf, -np.inf, -np.inf
    best_model = None

    grid = [(c1, c2) for c1 in c1_candidates for c2 in c2_candidates]
    for c1, c2 in grid:
        candidate = CRF(algorithm='lbfgs',
                        c1=c1,
                        c2=c2,
                        max_iterations=500,
                        all_possible_transitions=True,
                        verbose=args.debug)
        candidate.fit(dataset.training.list_of_feature_dicts,
                      dataset.training.list_of_labels)
        predicted = candidate.predict(dataset.validation.list_of_feature_dicts)
        score = metrics.flat_f1_score(dataset.validation.list_of_labels,
                                      predicted,
                                      average='micro')
        if not score > best_score:
            continue
        best_score, best_c1, best_c2 = score, c1, c2
        best_model = candidate

    print('Best validation F1 score:', best_score, 'Best c1:', best_c1,
          'Best c2:', best_c2)
    return best_model
def train1(self, data, y, tag):
    """Fit a CRF on features built from ``data``, pickle it, and print scores.

    The model is written to ``finalized_model.sav`` in the current working
    directory. The classification report and accuracy are computed on the
    training data itself. ``y`` and ``tag`` are accepted but not used.
    """
    # Build the features in the form the conditional random field accepts.
    feaobj = Features(data, self.num_features)
    # NOTE(review): ``get`` is read as an attribute, not called - presumably
    # a property yielding (features, labels); confirm against ``Features``.
    x_train, y_train = feaobj.get
    print("labelled data")

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)
    print(crf)
    crf.fit(x_train, y_train)

    # Save the trained model; the context manager closes the file handle,
    # which the previous bare ``open`` call left dangling.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(crf, model_file)

    # Prediction on the training data
    pred = crf.predict(x_train)

    print('\n \n Prediction On Trained Data:\n \n',
          flat_classification_report(y_train, pred))
    print('Accuracy:', flat_accuracy_score(y_train, pred))
def train(train_file, test_file, min_freq, model_file):
    """Train a CRF tagger on CoNLL data, optionally evaluate it and save it.

    Each row's column 0 is used as the token and column 2 as the label.
    ``min_freq`` is forwarded to ``CRF``. Evaluation on ``test_file`` and
    saving to ``model_file`` are gated on the module-level ``args``.
    """
    # Read in the initial training data
    conll_data_train = read_conll_data(train_file)
    train_sents = [[row[0] for row in doc] for doc in conll_data_train]
    train_labels = [[row[2] for row in doc] for doc in conll_data_train]

    # Featurize and create instances from the list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # Create and train the CRF model. For the parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # NOTE(review): the two gates below read the global ``args`` rather than
    # the ``test_file`` / ``model_file`` parameters - confirm that callers
    # always pass ``args.test_file`` / ``args.model`` before switching.

    # Test the model on the held-out test set if wanted
    if args.test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[row[0] for row in doc] for doc in conll_data_test]
        test_labels = [[row[2] for row in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)

        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        # Round after scaling to percent: scaling an already rounded
        # fraction prints artefacts such as 91.60000000000001.
        print("Accuracy: {0}%".format(round(float(acc) * 100, 1)))

    # Save the model to disk if wanted
    if args.model:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)
def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    """Train an averaged-perceptron CRF tagger, save it, and tag a test corpus.

    The fitted feature extractor and CRF are dumped together to ``path_crf``.
    Predictions for the corpus at ``path_test`` are written to ``path_pred``.
    A truthy ``dev_size`` holds out that share of the training data and
    passes it to ``fit`` as the development set.
    """
    print("loading train corpus..")
    _, X_raw, y = load_corpus(path_train, take_first=take_first)

    print("extracting features from train corpus..")
    fe = TaggerFeatureExtractor()
    X = fe.fit_transform(tqdm(X_raw))

    print("training..")
    crf = CRF(algorithm='ap', verbose=True, max_iterations=10)
    X_dev = y_dev = None
    if dev_size:
        X, X_dev, y, y_dev = train_test_split(X, y, test_size=dev_size)
    crf.fit(X, y, X_dev, y_dev)

    print("saving..")
    bundle = {'fe': fe, 'crf': crf}
    joblib.dump(bundle, path_crf, compress=2)

    print("loading test corpus..")
    corpus, X_test_raw, y_test = load_corpus(path_test)

    print("extracting features from test corpus..")
    X_test = fe.transform(X_test_raw)

    print("predicting..")
    y_pred = crf.predict(tqdm(X_test))

    print("saving results..")
    conll.write_sents(y_pred_to_sents_pred(corpus, y_pred), path_pred)
def train_crf(labelled_files, save=True, eval=True):
    """Train a CRF on the labelled files and return it.

    With ``eval`` true, 10% of the data is held out (fixed ``random_state``),
    the model is fitted on the remainder and a report on the held-out part
    is printed and passed to ``log_results``. Otherwise the model is fitted
    on everything. With ``save`` true the model is handed to ``save_crf``.
    """
    x, y, _ = format_labelled_data(labelled_files)
    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)

    if not eval:
        crf.fit(x, y)
    else:
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)
        crf.fit(x_train, y_train)
        pred = crf.predict(x_test)
        report = classification_report(y_test, pred)

        print("Test Results:\n")
        line(60)
        print(report)
        line(60)
        log_results(y_test, pred)
        line(60)

    if save:
        save_crf(crf)
    return crf
class CRFBased:
    '''CRF based information retrieval.

    The model is similar to the Default model used in homework 2 and 3.
    '''

    def __init__(self, load, n_train, n_test):
        """Store the data source and split sizes and create the CRF."""
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        """Return ``(X_train, y_train, X_test, y_test)`` from ``prepare_crf_dataset``."""
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        """Fit the CRF and remember every learned label except ``'O'``."""
        self.crf.fit(X, y)
        # Filter instead of ``list.remove`` so that training data without
        # any 'O' label does not raise ValueError.
        self.labels = [label for label in self.crf.classes_ if label != 'O']

    def predict(self, X):
        """Return the CRF's predictions for ``X`` (train or test features)."""
        return self.crf.predict(X)

    def evaluate(self, y_true, y_pred):
        """Print a flat classification report over the non-``'O'`` labels.

        Requires ``fit`` to have been called, since it sets ``self.labels``.
        """
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))
def train(self, model_name, tagged_sentences):
    """Train a default CRF on the tagged sentences and pickle it.

    The first 75% of ``tagged_sentences`` are used for training and the rest
    for the accuracy check. The fitted model is pickled to the path
    ``model_name``.
    """
    # Split the dataset for training and testing
    cutoff = int(.75 * len(tagged_sentences))
    training_sentences = tagged_sentences[:cutoff]
    test_sentences = tagged_sentences[cutoff:]

    X_train, y_train = transform_to_dataset(training_sentences)
    X_test, y_test = transform_to_dataset(test_sentences)
    print(len(X_train))
    print(len(X_test))

    print("Training Started........")
    print("it will take time according to your dataset size..")
    model = CRF()
    model.fit(X_train, y_train)
    print("Training Finished!")

    print("Evaluating with Test Data...")
    y_pred = model.predict(X_test)
    print("Accuracy is: ")
    print(metrics.flat_accuracy_score(y_test, y_pred))

    # Use a context manager so the file handle is flushed and closed; the
    # previous bare ``open`` call leaked it.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model Saved!")
def test_crf(train_file, test_file, model_name=""):
    """Train a CRF on ``train_file`` and return predicted labels for ``test_file``.

    Both files are tab-separated without a header row. Columns 0-3 of the
    training file and columns 0-2 of the test file are used. When
    ``model_name`` is not empty the fitted model is saved as
    ``<model_name>.pickle``. Returns the predictions for the first (only)
    sequence as a list of strings.
    """
    useful_features = [True, True]

    train_frame = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(train_frame.iloc[:, [0, 1, 2, 3]].values)
    X_train, y_train = prepareData([X_dataset], 'train', useful_features)

    # The test data is loaded once, before the fit, so a bad test file fails
    # early; it used to be read and featurized a second time after training.
    test_frame = pandas.read_csv(test_file, sep="\t", header=None)
    X_test = fromListToTuple(test_frame.iloc[:, [0, 1, 2]].values)
    X_teste, y_teste = prepareData([X_test], 'predict', useful_features)

    crf = CRF(
        algorithm='lbfgs',
        c1=0.0625,
        c2=0.5,
        max_iterations=100,
        all_possible_transitions=False,
        all_possible_states=True,
        verbose=True,
    )
    crf.fit(X_train, y_train)

    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    return [str(label) for label in y_pred[0]]
def write_to_CoNLL(mdl_file_name, sentence2features, test_sentences, write_path):
    """Predict labels for ``test_sentences`` and append them to ``write_path``.

    A CRF is created with ``model_filename=mdl_file_name``. When that name
    ends in ``'2'``, a second CRF is created from the same name ending in
    ``'1'`` and features come from ``sent2features_second_guess``; otherwise
    ``sentence2features`` is applied directly. The output has one
    ``token<TAB>prediction`` line per token and a blank line after each
    sentence.
    """
    crf_settings = dict(algorithm='lbfgs',
                        c1=0.0001,
                        c2=0.0001,
                        max_iterations=100,
                        all_possible_transitions=False)
    cond_rand_mdl = CRF(model_filename=mdl_file_name, **crf_settings)

    if mdl_file_name[-1] == '2':
        first_pass_name = mdl_file_name[:-1] + '1'
        old_crf = CRF(model_filename=first_pass_name, **crf_settings)
        X_test_local = [
            sent2features_second_guess(s, sentence2features, old_crf)
            for s in test_sentences
        ]
    else:
        X_test_local = [sentence2features(s) for s in test_sentences]

    predictions = cond_rand_mdl.predict(X_test_local)

    with open(write_path, 'a') as f:
        for sent, preds in zip(test_sentences, predictions):
            for j, token in enumerate(sent):
                f.write('{}\t{}\n'.format(token[0], preds[j]))
            f.write('\n')
def train(file_path: str, model_path: str = None):
    """
    Train a CRF model on the addresses read from ``file_path``.

    Data is split 75/25 with a fixed seed. The weighted flat F1 score on the
    validation part (excluding the ``"O"`` label) is printed. When
    ``model_path`` is given the fitted model is dumped there with joblib.
    Returns the fitted ``CRF``.
    """
    addresses = read_file(file_path)
    train_part, val_part = train_test_split(addresses,
                                            test_size=0.25,
                                            random_state=42)
    X_train, y_train = addresses_to_features(train_part)
    X_val, y_val = addresses_to_features(val_part)

    crf = CRF(c1=0.2,
              c2=0.2,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Score the validation set on every label except "O".
    y_pred = crf.predict(X_val)
    scored_labels = [label for label in LABELS if label != "O"]
    f1_score = metrics.flat_f1_score(y_val,
                                     y_pred,
                                     average="weighted",
                                     labels=scored_labels)
    print("Flat F1-Score on validation set = {}".format(f1_score))

    if model_path:
        joblib.dump(crf, model_path)
        print("Save model to {}".format(model_path))

    return crf
def test_crf(xseq, yseq, algorithm):
    """Fit on a single sequence and check the model predicts its labels back."""
    crf = CRF(algorithm=algorithm)
    crf.fit([xseq], [yseq])
    predicted = crf.predict([xseq])
    if algorithm == 'ap':
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Fit in verbose mode, with a development set when ``use_dev`` is true.

    The training set is the same sequence twice. When used, the development
    set is that sequence once.
    """
    crf = CRF(algorithm=algorithm, verbose=True)

    X_dev = y_dev = None
    if use_dev:
        X_dev, y_dev = [xseq], [yseq]

    crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    predicted = crf.predict([xseq])
    if algorithm == 'ap':
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Verbose-mode fit on a doubled sequence, optionally with a dev set.

    With ``use_dev`` true the single sequence is also passed as the
    development data; otherwise both dev arguments are ``None``.
    """
    crf = CRF(algorithm, verbose=True)

    dev_features, dev_labels = ([xseq], [yseq]) if use_dev else (None, None)
    crf.fit(X=[xseq, xseq],
            y=[yseq, yseq],
            X_dev=dev_features,
            y_dev=dev_labels)

    predicted = crf.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
class CRFmodel(ModelBase):
    """``ModelBase`` implementation backed by an L-BFGS ``CRF``."""

    def __init__(self):
        """Create the wrapped CRF with fixed hyper-parameters."""
        # NOTE(review): ``ModelBase.__init__`` is not invoked here and
        # ``self.debug`` (read in ``fit``) is not set in this class -
        # presumably it comes from ModelBase; confirm.
        crf_options = {
            'algorithm': 'lbfgs',
            'c1': 0.1,
            'c2': 0.1,
            'max_iterations': 500,
            'all_possible_transitions': True,
        }
        self.model = CRF(**crf_options)

    def fit(self, X, Y):
        """Fit the CRF on ``X``/``Y``, announcing it first in debug mode."""
        if self.debug:
            print("training CRF...")
        self.model.fit(X, Y)

    def predict(self, X):
        """Return the CRF's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions
class CRFEvaluateStep(Step):
    """
    Step that evaluates test data against a CRF model stored on file.
    """

    def __init__(self, model_file_path):
        """Resolve ``model_file_path`` and create a CRF bound to that file."""
        expanded = path.expanduser(model_file_path)
        self.model_file_path = path.abspath(expanded)
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """
        Collect all batches, score the model on them and print the metrics.

        Element 0 of every batch is appended to the features and element 1
        to the labels.
        """
        st = time.time()

        # For prediction, CRF does not implement batching, so everything is
        # gathered into two flat lists first.
        x, y = [], []
        for batch in batches:
            parts = list(batch)
            x.extend(parts[0])
            y.extend(parts[1])

        accuracy = self.model.score(x, y)
        y_pred = self.model.predict(x)
        f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
        accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
        classification_report = metrics.flat_classification_report(
            y, y_pred, labels=self.model.classes_)

        banner = "*" * 80
        print(banner)
        print("MODEL EVALUATION")
        print(banner)
        print("Token-wise accuracy score on Test Data:")
        print(round(accuracy, 3))
        print("F1 score on Test Data:")
        print(round(f1_score, 3))
        print(
            "Sequence accurancy score (% of sentences scored 100% correctly):")
        print(round(accuracy_sentence, 3))
        print("Class-wise classification report:")
        print(classification_report)

        et = time.time()
        print(f"Evaluation finished in {round(et-st, 2)} seconds.")
def main():
    """Train a CRF on 80% of the dataset and print scores for the other 20%.

    Prints the weighted flat F1 score followed by the flat classification
    report.
    """
    features, targets = load_dataset(DATA_PATH)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.2)

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)

    print(flat_f1_score(y_test, y_pred, average='weighted'))
    print(flat_classification_report(y_test, y_pred))
def test_model(
    model: sklearn_crfsuite.CRF,
    test_path: typing.Union[str, Path],
    out_file: typing.Optional[typing.TextIO] = None,
):
    """Print an accuracy report for a model to a file.

    Args:
        model: Fitted CRF; its ``classes_`` define the reported labels.
        test_path: Path to a CoNLL-U file with the test sentences.
        out_file: Destination passed to ``print``; ``None`` means stdout.

    Raises:
        ImportError: If the optional ``conllu`` package is not installed.
    """
    try:
        import conllu
    except ImportError:
        _LOGGER.fatal("conllu package is required for testing")
        _LOGGER.fatal("pip install 'conllu>=4.4'")
        # A bare raise keeps the original traceback intact.
        raise

    _LOGGER.debug("Loading test file (%s)", test_path)
    # CoNLL-U files are UTF-8 by specification; do not depend on the
    # platform's default encoding.
    with open(test_path, "r", encoding="utf-8") as test_file:
        test_sents = conllu.parse(test_file.read())

    _LOGGER.debug("Getting features for %s test sentence(s)", len(test_sents))
    x_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    labels = list(model.classes_)
    y_pred = model.predict(x_test)

    print(
        "F1 score on the test set = {}".format(
            metrics.flat_f1_score(y_test,
                                  y_pred,
                                  average="weighted",
                                  labels=labels)),
        file=out_file,
    )
    print(
        "Accuracy on the test set = {}".format(
            metrics.flat_accuracy_score(y_test, y_pred)),
        file=out_file,
    )

    # Order labels by their text after the first character, then by that
    # first character.
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        "Test set classification report: {}".format(
            metrics.flat_classification_report(y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3)),
        file=out_file,
    )
class CRFModel(object):
    """Wrapper around a ``CRF`` that featurizes sentences with ``utils.sent2feature``."""

    def __init__(self, algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
                 all_possible_transitions=False):
        """Create the wrapped CRF.

        The arguments are forwarded to ``CRF``. They used to be ignored in
        favour of hard-coded values equal to the defaults, so calls that
        rely on the defaults behave exactly as before.
        """
        self.model = CRF(algorithm=algorithm,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iterations,
                         all_possible_transitions=all_possible_transitions)

    def train(self, sentences, tagLists):
        """Fit the model on ``sentences`` against ``tagLists``."""
        features = [utils.sent2feature(sent) for sent in sentences]
        self.model.fit(features, tagLists)

    def test(self, sentences):
        """Return the predicted tag lists for ``sentences``."""
        features = [utils.sent2feature(sent) for sent in sentences]
        return self.model.predict(features)
def train_crf_pos(corpus, corpus_name):
    """Train a CRF POS tagger on ``corpus``, pickle it, and return it.

    Required corpus structure:
    ``[[(w1,t1), (w2,t2),...(wn,tn)], [(w1,t1)(w2,t2),...(wm,tm)],...]``

    The first 90% of the sentences are used for training and the rest for
    the printed test accuracy. The model is pickled (protocol 4) to
    ``data/models/<corpus_name>_crf.pkl``.
    """
    train_frac = 0.9  # fraction of data for the training set
    split_idx = int(train_frac * len(corpus))

    # Extract the features and separate the labels from them
    X, y = [], []
    for sent in corpus:
        X.append(get_crf_features([pair[0] for pair in sent]))
    for sent in corpus:
        y.append([pair[1] for pair in sent])

    # Create the training and the test sets
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # L-BFGS training with L1 (c1) and L2 (c2) regularization
    model = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)
    model.fit(X_train, y_train)

    # Save the model
    model_path = os.path.join('data', 'models', corpus_name + '_crf.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(model, f, 4)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print("Test accuracy: %.4f" % metrics.flat_accuracy_score(y_test, y_pred))

    return model
def test_crf(train_file, test_file, model_name=""):
    """Fit a CRF on ``train_file``, save it, and append a report to a text file.

    The data is read as a header-less tab-separated file (columns 0-3) and
    prepared in ``'test'`` mode. The model is fitted and evaluated on that
    same data, saved as ``NP_Final_Macro.pickle``, and the classification
    report is appended to ``Results _.txt``. ``test_file`` and
    ``model_name`` are accepted but not used.
    """
    useful_features = [True, True]

    data = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(data.iloc[:, [0, 1, 2, 3]].values)
    X_teste, y_teste = prepareData([X_dataset], 'test', useful_features)

    X_teste = pd.DataFrame(X_teste).transpose()
    y_teste = pd.DataFrame(y_teste).transpose()
    print(X_teste.shape)
    print(y_teste.shape)

    # Convert each frame to nested lists once instead of before every use.
    x_rows = X_teste.values.tolist()
    y_rows = y_teste.values.tolist()

    crf = CRF(
        algorithm='lbfgs',
        c1=1.0,
        c2=1.0,
        max_iterations=100,
        all_possible_transitions=False,
        all_possible_states=True,
        verbose=True)
    crf.fit(x_rows, y_rows)
    y_pred = crf.predict(x_rows)

    labels = list(crf.classes_)
    save_model("NP_Final_Macro.pickle", crf)

    # The report keeps its single leading space.
    report = " " + str(
        metrics.flat_classification_report(y_rows,
                                           y_pred,
                                           labels=labels,
                                           digits=3))

    filename = "Results _.txt"
    print("$$$$$$$$$$$")
    print(filename)
    print("$$$$$$$$$$$")
    with open(filename, 'a') as f:
        f.write(report)
def train_seq(X_train, Y_train, X_dev, Y_dev):
    """Fit a CRF on the training data and print its scores on the dev data.

    Prints, in order: the weighted flat F1 score, the flat classification
    report, and the sequence accuracy. It then calls
    ``get_confusion_matrix`` on the dev predictions.
    """
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)

    # The dev data is used only for scoring, not during fitting.
    crf.fit(X_train, Y_train)

    labels = list(crf.classes_)
    dev_pred = crf.predict(X_dev)

    # Order labels by their text after the first character, then by that
    # first character.
    def label_order(name):
        return (name[1:], name[0])

    sorted_labels = sorted(labels, key=label_order)

    weighted_f1 = metrics.flat_f1_score(Y_dev,
                                        dev_pred,
                                        average='weighted',
                                        labels=labels)
    print(weighted_f1)

    report = metrics.flat_classification_report(Y_dev,
                                                dev_pred,
                                                labels=sorted_labels,
                                                digits=3)
    print(report)

    print(metrics.sequence_accuracy_score(Y_dev, dev_pred))
    get_confusion_matrix(Y_dev, dev_pred, labels=sorted_labels)
class CRFModel(object):
    """Wrapper around a ``CRF`` that featurizes sentences with ``sent2features``."""

    def __init__(self, solver="lbfgs", c1=0.1, c2=0.1, max_iter=100,
                 all_possible_transitions=False):
        """Create the wrapped CRF.

        ``solver`` and ``max_iter`` are passed on as the CRF's ``algorithm``
        and ``max_iterations`` arguments.
        """
        crf_options = {
            "algorithm": solver,
            "c1": c1,
            "c2": c2,
            "max_iterations": max_iter,
            "all_possible_transitions": all_possible_transitions,
        }
        self.model = CRF(**crf_options)

    def train(self, sentences, tag_lists):
        """Fit the model on ``sentences`` against ``tag_lists``."""
        self.model.fit([sent2features(sentence) for sentence in sentences],
                       tag_lists)

    def test(self, sentences):
        """Return the predicted tag lists for ``sentences``."""
        return self.model.predict(
            [sent2features(sentence) for sentence in sentences])
class TrainCRF():
    """Character-level CRF tagger configured from two index dictionaries."""

    def __init__(self, char2idx_path, tag2idx_path, algorithm='lbfgs', c1=0.1,
                 c2=0.1, max_iterations=100, all_possible_transitions=False):
        """Load the dictionaries and create the CRF with the given settings."""
        # Load the dictionaries.
        # char2idx: maps a character to its token id
        self.char2idx = load_dict(char2idx_path)
        # tag2idx: maps a tag to its token id
        self.tag2idx = load_dict(tag2idx_path)
        # idx2tag: maps a token id back to its tag
        self.idx2tag = {v: k for k, v in self.tag2idx.items()}
        # Number of hidden states (entity tags) and of observations
        # (characters).
        self.tag_size = len(self.tag2idx)
        self.vocab_size = max(self.char2idx.values()) + 1
        self.model = CRF(algorithm=algorithm,
                         c1=c1,
                         c2=c2,
                         max_iterations=max_iterations,
                         all_possible_transitions=all_possible_transitions)

    def train_crf(self, train_dic_path):
        """Fit the CRF on the records loaded from ``train_dic_path``.

        Each record's ``"text"`` is featurized with ``sent2features`` and its
        ``"label"`` is used as the target sequence.
        """
        train_dic = load_data(train_dic_path)
        features = []
        labels = []
        for dic in tqdm(train_dic):
            features.append(sent2features(dic["text"]))
            labels.append(dic["label"])
        self.model.fit(features, labels)

    def predict(self, setence):
        """Predict, print and return the tag lists for each item of ``setence``.

        ``setence`` is iterated and every item is featurized as one
        sentence. The prediction is still printed as before, and is now
        also returned so callers can use it.
        """
        features = [sent2features(s) for s in setence]
        pred_tag_lists = self.model.predict(features)
        print(pred_tag_lists)
        return pred_tag_lists
class GenericRetriever(Retriever):
    """Retriever that extracts B/I-tagged word chunks with a CRF."""

    def learn(self, config):
        """Fit the CRF on the XML-annotated texts returned by ``config.getTexts()``.

        Every ``<sentence>`` element becomes one training sequence of
        ``[word text, pos attribute, tag attribute]`` triples taken from its
        ``<word>`` children.
        """
        sentences = []
        for text in config.getTexts():
            for sent in ET.fromstring(text).findall('sentence'):
                sentences.append([
                    [wrd.text, wrd.attrib['pos'], wrd.attrib['tag']]
                    for wrd in sent.findall('word')
                ])

        X = [sent2features(s) for s in sentences]
        y = [sent2labels(s) for s in sentences]
        self.clf = CRF(algorithm='lbfgs',
                       c1=10,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.clf.fit(X, y)

    def retrieve(self, text):
        """Return the chunks predicted in ``text`` as space-joined strings.

        The text is lower-cased, tokenized and POS-tagged with NLTK before
        prediction. A label starting with ``B`` opens a chunk, one starting
        with ``I`` extends it, and anything else closes it.
        """
        tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
        pred = self.clf.predict([sent2features(tagged)])[0]

        resp = []
        acum = None
        for token, label in zip(tagged, pred):
            word = token[0]
            if label.startswith('B'):
                # Close a chunk that is still open; it used to be
                # overwritten when two B tags followed each other.
                if acum is not None:
                    resp.append(acum)
                acum = word
            elif label.startswith('I'):
                # An I tag with no open chunk starts a new one instead of
                # failing on ``None + str``.
                acum = word if acum is None else acum + " " + word
            elif acum is not None:
                resp.append(acum)
                acum = None

        # Flush a chunk that runs to the end of the text; it used to be lost.
        if acum is not None:
            resp.append(acum)
        return resp
class CRFPredictStep(Step):
    """
    Step that prints CRF predictions for specific featurized sentences.
    """

    def __init__(self, model_file_path):
        """Resolve ``model_file_path`` and create a CRF bound to that file."""
        expanded = path.expanduser(model_file_path)
        self.model_file_path = path.abspath(expanded)
        self.model = CRF(algorithm='l2sgd',
                         c2=0.1,
                         max_iterations=1000,
                         all_possible_transitions=True,
                         model_filename=self.model_file_path)

    def run(self, batches: Generator) -> None:
        """
        Predict for every item in ``batches`` and print one line per item.

        Each line has the form ``<words joined by spaces> => <prediction>``,
        where the words are the ``'word'`` entry of each feature dict.
        """
        features = list(batches)
        pred = self.model.predict(features)
        for sentence_features, tags in zip(features, pred):
            words = ' '.join(token['word'] for token in sentence_features)
            print(words, '=>', tags)
def test_predict_without_fit(xseq, algorithm):
    """Predicting with a CRF that was never fitted must raise an exception."""
    unfitted = CRF(algorithm)
    with pytest.raises(Exception):
        unfitted.predict([xseq])