def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    """Train a CRF tagger, save it, then tag the test corpus.

    The fitted feature extractor and CRF are dumped together to ``path_crf``
    and the predictions for ``path_test`` are written to ``path_pred``.
    When ``dev_size`` is truthy, that share of the training data is split off
    and handed to ``CRF.fit`` as the dev set; otherwise no dev set is used.
    """
    print("loading train corpus..")
    _corpus, raw_train, labels = load_corpus(path_train, take_first=take_first)

    print("extracting features from train corpus..")
    extractor = TaggerFeatureExtractor()
    features = extractor.fit_transform(tqdm(raw_train))

    print("training..")
    tagger = CRF(algorithm='ap', verbose=True, max_iterations=10)
    dev_features = dev_labels = None
    if dev_size:
        features, dev_features, labels, dev_labels = train_test_split(
            features, labels, test_size=dev_size)
    tagger.fit(features, labels, dev_features, dev_labels)

    print("saving..")
    joblib.dump({'fe': extractor, 'crf': tagger}, path_crf, compress=2)

    print("loading test corpus..")
    corpus, raw_test, _test_labels = load_corpus(path_test)

    print("extracting features from test corpus..")
    test_features = extractor.transform(raw_test)

    print("predicting..")
    predicted = tagger.predict(tqdm(test_features))

    print("saving results..")
    conll.write_sents(y_pred_to_sents_pred(corpus, predicted), path_pred)
def train(train_file, test_file, min_freq, model_file):
    '''Train a CRF tagger on CoNLL data, optionally evaluating and saving it.

    :param train_file: path of the CoNLL training file
    :param test_file: path of a held-out CoNLL file; evaluation is skipped
        when this is falsy
    :param min_freq: forwarded to ``CRF(min_freq=...)``
    :param model_file: where to dump the fitted model; saving is skipped
        when this is falsy
    '''
    # Column 0 of every line is used as the token, column 2 as the label.
    conll_data_train = read_conll_data(train_file)
    train_sents = [[line[0] for line in doc] for doc in conll_data_train]
    train_labels = [[line[2] for line in doc] for doc in conll_data_train]

    # Featurize and create instances from the list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # For the available parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # Decide from the function's own arguments rather than a global ``args``
    # namespace, so the function also works when called programmatically.
    if test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[line[0] for line in doc] for doc in conll_data_test]
        test_labels = [[line[2] for line in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)

        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        print("Accuracy: {0}%".format(float(round(acc, 3)) * 100))

    if model_file:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)
def train_pos_tagger(self, path):
    """Train a CRF POS tagger on the NLTK treebank sample and pickle it.

    The first 80% of ``treebank.tagged_sents()`` is used for training. The
    fitted model is pickled to ``path`` and stored on ``self.classifier``.
    """
    # Just to make sure the corpus is available locally
    nltk.download('treebank')
    tagged_sentences = treebank.tagged_sents()
    train_size = int(.80 * len(tagged_sentences))
    training_sentences = tagged_sentences[:train_size]
    X_train, y_train = self.transform_to_dataset(training_sentences)

    model = CRF()
    print('Training started...')
    model.fit(X_train, y_train)
    print('Training finished.')

    # The context manager closes the file even if pickling fails.
    with open(path, 'wb') as model_pkl:
        pickle.dump(model, model_pkl)
    print("POSTagger saved.")
    self.classifier = model
def test_attributes(xseq, yseq):
    """Model attributes are ``None`` before ``fit`` and populated after it."""
    crf = CRF()
    for attr_name in (
        "tagger_",
        "size_",
        "classes_",
        "num_attributes_",
        "attributes_",
        "state_features_",
        "transition_features_",
    ):
        assert getattr(crf, attr_name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {"sunny", "rainy"}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert "clean" in crf.attributes_

    # State features are keyed by (attribute, label) pairs.
    assert len(crf.state_features_) > 0
    assert all(isinstance(weight, float)
               for weight in crf.state_features_.values())
    assert all(
        name in crf.attributes_ and tag in crf.classes_
        for (name, tag) in crf.state_features_.keys()
    ), crf.state_features_

    # Transition features are keyed by (label_from, label_to) pairs.
    assert len(crf.transition_features_) > 0
    assert all(isinstance(weight, float)
               for weight in crf.transition_features_.values())
    assert all(
        src in crf.classes_ and dst in crf.classes_
        for (src, dst) in crf.transition_features_.keys()
    ), crf.transition_features_
def test_crf(xseq, yseq, algorithm):
    """Fit on a single sequence and check the labels are reproduced."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    predicted = model.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much, so no exact match
        # is asserted for it.
        return
    assert predicted == [yseq]
def train(file_path: str):
    """
    Train a CRF model from the JSON-lines file at ``file_path``.

    A quarter of the records is held out (fixed ``random_state``) and the
    weighted flat F1 score on it, ignoring the ``'O'`` label, is printed.
    Returns the fitted CRF.
    """
    with jsonlines.open(file_path) as reader:
        addresses = list(reader)

    addresses_train, addresses_val = train_test_split(
        addresses, test_size=0.25, random_state=42)
    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2, c2=0.2, max_iterations=100, all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Prediction score on the validation set. The score used to be computed
    # and silently dropped; report it so the evaluation is actually visible.
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(
        y_val, y_pred, average='weighted',
        labels=[l for l in LABELS if l != 'O'])
    print("Flat F1-Score on validation set = {}".format(f1_score))
    return crf
def train(file_path: str, model_path: str = None):
    """
    Fit a CRF on the addresses read from ``file_path``.

    A quarter of the data is held out for validation and the weighted flat
    F1 score (``'O'`` excluded) is printed. When ``model_path`` is truthy the
    fitted model is dumped there with joblib. Returns the fitted CRF.
    """
    records = read_file(file_path)
    train_part, val_part = train_test_split(
        records, test_size=0.25, random_state=42)
    X_train, y_train = addresses_to_features(train_part)
    X_val, y_val = addresses_to_features(val_part)

    crf = CRF(c1=0.2, c2=0.2, max_iterations=100, all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Score the held-out split on every label except 'O'.
    scored_labels = [l for l in LABELS if l != "O"]
    f1_score = metrics.flat_f1_score(
        y_val, crf.predict(X_val), average="weighted", labels=scored_labels)
    print("Flat F1-Score on validation set = {}".format(f1_score))

    if model_path:
        joblib.dump(crf, model_path)
        print("Save model to {}".format(model_path))
    return crf
class CRFModel(object):
    """Thin wrapper that featurizes sentences before calling the CRF."""

    def __init__(self, algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
                 all_possible_transitions=False):
        crf_options = dict(
            algorithm=algorithm,
            c1=c1,
            c2=c2,
            max_iterations=max_iterations,
            all_possible_transitions=all_possible_transitions,
        )
        self.model = CRF(**crf_options)

    def train(self, sentences, tag_lists, tagged=False):
        """Fit the CRF; ``tagged`` selects the ``sent2features_tagged`` extractor."""
        extract = sent2features_tagged if tagged else sent2features
        self.model.fit([extract(s) for s in sentences], tag_lists)

    def test(self, sentences, tagged=False):
        """Return the predicted tag list for every sentence."""
        extract = sent2features_tagged if tagged else sent2features
        return self.model.predict([extract(s) for s in sentences])
def parameter_tuning(args, dataset):
    """Grid-search ``c1``/``c2`` and return the model with the best validation F1.

    Candidate values come from ``experiment_util.get_param_list`` applied to
    ``args.c1`` and ``args.c2``. Every pair is fitted on the training split and
    scored with micro-averaged flat F1 on the validation split. The best score
    and its parameters are printed. Returns ``None`` when no pair is tried.
    """
    c1_candidates = experiment_util.get_param_list(args.c1)
    c2_candidates = experiment_util.get_param_list(args.c2)

    # (score, c1, c2, model) of the best run so far
    best = (-np.inf, -np.inf, -np.inf, None)
    for c1 in c1_candidates:
        for c2 in c2_candidates:
            candidate = CRF(algorithm='lbfgs', c1=c1, c2=c2,
                            max_iterations=500,
                            all_possible_transitions=True,
                            verbose=args.debug)
            candidate.fit(dataset.training.list_of_feature_dicts,
                          dataset.training.list_of_labels)
            predictions = candidate.predict(
                dataset.validation.list_of_feature_dicts)
            score = metrics.flat_f1_score(
                dataset.validation.list_of_labels, predictions,
                average='micro')
            # Strict comparison: on ties the earlier pair is kept.
            if score > best[0]:
                best = (score, c1, c2, candidate)

    best_valid_f1_score, best_c1, best_c2, best_model = best
    print('Best validation F1 score:', best_valid_f1_score,
          'Best c1:', best_c1, 'Best c2:', best_c2)
    return best_model
def train1(self, data, y, tag):
    """Train a CRF on features built from ``data`` and report the training fit.

    The fitted model is pickled to ``finalized_model.sav`` in the current
    working directory. A classification report and the accuracy on the
    training data itself are then printed. ``y`` and ``tag`` are accepted
    but not used here.
    """
    # Features in the form the conditional random field accepts
    feaobj = Features(data, self.num_features)
    x_train, y_train = feaobj.get
    print("labelled data")

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)
    print(crf)
    crf.fit(x_train, y_train)

    # Save the trained model; the context manager guarantees the handle is
    # flushed and closed (it used to be left open).
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(crf, model_file)

    # Prediction on the training data
    pred = crf.predict(x_train)

    print('\n \n Prediction On Trained Data:\n \n',
          flat_classification_report(y_train, pred))
    print('Accuracy:', flat_accuracy_score(y_train, pred))
def main():
    """Train a CRF tagger from the CSV at ``args.input`` and pickle it.

    5% of the tagged sentences are held out (fixed ``random_state``). The
    model is fitted on the remaining 95%, pickled to ``args.output`` and then
    evaluated through ``TestData`` on both splits.
    """
    df = pd.read_csv(args.input)
    tagged_sentence = Preparing_tagged_data(df)
    df = df[['ID', 'FORM', 'XPOSTAG']]

    # printing details
    printing_details(tagged_sentence)

    train_set, test_set = train_test_split(
        tagged_sentence, test_size=0.05, random_state=7)

    # Fit on the training split only. Previously the full data set was used,
    # so the held-out sentences leaked into training and inflated the
    # reported test performance.
    X_train, y_train = prepareData(train_set)
    X_test, y_test = prepareData(test_set)

    crf = CRF(
        algorithm='l2sgd',
        c2=0.1,
        max_iterations=1000,
        all_possible_transitions=True)
    crf.fit(X_train, y_train)
    print(crf)

    print("Saving Model .....")
    # Save the model to the requested output file
    Pkl_Filename = args.output
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(crf, file)
    print("Model Saved at " + Pkl_Filename)
    print()

    print("Checking the Algoritham's Performance \n")
    TestData(crf, X_train, y_train, X_test, y_test)
def build(sequences, labels, **kwargs):
    """
    Train a sequence classifier from feature sequences and their labels.

    :param sequences: A list of sequences, each member represented as features
    :type sequences: list of list of dict
    :param labels: The labels matching each sequence
    :type labels: list of list of str
    :param kwargs: overrides for the default arguments of the underlying CRF
    :return: A classifier trained on the provided data
    :rtype: SequenceClassifier
    """
    # Module defaults first; caller-supplied keywords win on conflict.
    crf_settings = dict(
        algorithm=DEFAULT_ALGORITHM,
        c1=DEFAULT_C1,
        c2=DEFAULT_C2,
        max_iterations=DEFAULT_MAX_ITERATIONS,
        all_possible_transitions=DEFAULT_ALL_POSSIBLE_TRANSITIONS,
    )
    crf_settings.update(kwargs)

    crf = CRF(**crf_settings)
    crf.fit(sequences, labels)
    return SequenceClassifier(crf)
def crf_tag():
    """Train a CRF tagger on the Brown 'news' sentences and print a demo.

    The first 90% of the sentences are used for training and the rest for
    testing. Prints the first training example, the tags predicted for a
    sample sentence, and the flat accuracy on the test split.
    """
    tagged = brown.tagged_sents(categories='news')
    split_at = int(len(tagged) * 0.9)

    X_train, y_train = transform_to_dataset(tagged[:split_at])
    X_test, y_test = transform_to_dataset(tagged[split_at:])
    print(X_train[0])
    print(y_train[0])

    model = CRF()
    model.fit(X_train, y_train)

    # Tag a hand-written sentence as a quick sanity check.
    raw_sent = ['I', 'am', 'a', 'student']
    sent_feat = [feature_extract(raw_sent, position)
                 for position, _ in enumerate(raw_sent)]
    sample_tags = model.predict([sent_feat])[0]
    print(list(zip(raw_sent, sample_tags)))

    predicted = model.predict(X_test)
    print(metrics.flat_accuracy_score(y_test, predicted))
class CRFBased:
    '''CRF based information retrieval.
    The model is similar to the Default model used in homework 2 and 3'''

    def __init__(self, load, n_train, n_test):
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        """Return ``(X_train, y_train, X_test, y_test)`` from ``prepare_crf_dataset``."""
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        """Fit the CRF and remember every learned label except ``'O'``."""
        self.crf.fit(X, y)
        # Filtering instead of ``list.remove`` keeps this from raising
        # ValueError when the training data contains no 'O' label at all.
        self.labels = [label for label in self.crf.classes_ if label != 'O']

    def predict(self, X):
        """Return the CRF predictions for ``X`` (train or test features)."""
        pred = self.crf.predict(X)
        return pred

    def evaluate(self, y_true, y_pred):
        """Print a flat classification report restricted to ``self.labels``."""
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))
def train(self, model_name, tagged_sentences):
    """Train a CRF on the first 75% of ``tagged_sentences`` and pickle it.

    The remaining 25% is used to print a flat accuracy score. The fitted
    model is pickled to the path given by ``model_name``.
    """
    # Split the dataset for training and testing
    cutoff = int(.75 * len(tagged_sentences))
    training_sentences = tagged_sentences[:cutoff]
    test_sentences = tagged_sentences[cutoff:]
    X_train, y_train = transform_to_dataset(training_sentences)
    X_test, y_test = transform_to_dataset(test_sentences)
    print(len(X_train))
    print(len(X_test))

    print("Training Started........")
    print("it will take time according to your dataset size..")
    model = CRF()
    model.fit(X_train, y_train)
    print("Training Finished!")

    print("Evaluating with Test Data...")
    y_pred = model.predict(X_test)
    print("Accuracy is: ")
    print(metrics.flat_accuracy_score(y_test, y_pred))

    # Close the file deterministically; the handle used to be left open.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model Saved!")
def test_attributes(xseq, yseq):
    """Check the fitted-model attributes before and after training."""
    crf = CRF()

    # Nothing is available on an unfitted estimator.
    unfitted = ('tagger_', 'size_', 'classes_', 'num_attributes_',
                'attributes_', 'state_features_', 'transition_features_')
    for attribute in unfitted:
        assert getattr(crf, attribute) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {'sunny', 'rainy'}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert 'clean' in crf.attributes_

    assert len(crf.state_features_) > 0
    assert all(isinstance(coef, float)
               for coef in crf.state_features_.values())
    assert all(
        feature in crf.attributes_ and cls in crf.classes_
        for (feature, cls) in crf.state_features_.keys()
    ), crf.state_features_

    assert len(crf.transition_features_) > 0
    assert all(isinstance(coef, float)
               for coef in crf.transition_features_.values())
    assert all(
        cls_from in crf.classes_ and cls_to in crf.classes_
        for (cls_from, cls_to) in crf.transition_features_.keys()
    ), crf.transition_features_
class CRFNER(object):
    """CRF-based named-entity recognizer built on a gazetteer."""

    def __init__(self, gazetteer, fraction=0.7):
        # ``fraction`` is only stored; nothing in this class reads it.
        self.gazateer = gazetteer
        self.fraction = fraction

    def train(self, documents):
        """Format ``documents``, split them and fit the CRF on the train part."""
        self.data = ner_processing.NERFormatter(self.gazateer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)
        (self.X_train, self.X_test,
         self.y_train, self.y_test) = crf_processing.feature_extraction(
            d_train, d_test)
        self.model = CRF(algorithm='lbfgs',
                         c1=0.31,
                         c2=0.02,
                         max_iterations=100,
                         all_possible_transitions=True)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Tokenize one sentence, convert it to CRF features and predict its tags."""
        tokens = nltk.word_tokenize(sentence)
        pos_tags = [tagged[-1] for tagged in nltk.pos_tag(tokens)]
        # Sentence number and category are filled with zero placeholders.
        zeros = [0] * len(tokens)
        frame = pd.DataFrame({
            'word': tokens,
            'sentence_no': zeros,
            'category': zeros,
            'POS': pos_tags,
        })
        getter = crf_processing.SentenceGetter(frame)
        # Result unused; the call is kept in case it advances getter state.
        getter.get_next()
        self.X = [crf_processing.sent2features(s) for s in getter.sentences]
        return self.model.predict(self.X)

    def report(self):
        """Print the weighted F1 score and a per-label report on the test split."""
        labels = list(self.model.classes_)
        y_pred = self.model.predict(self.X_test)
        weighted_f1 = metrics.flat_f1_score(self.y_test,
                                            y_pred,
                                            average='weighted',
                                            labels=labels)
        print('F1 score {}'.format(weighted_f1))
        # Order by the label text after its first character, then by that
        # first character.
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(
            metrics.flat_classification_report(self.y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3))
def test_crf(train_file,test_file,model_name=""):
    """Train a CRF on ``train_file`` and return its predictions for ``test_file``.

    Both files are read as header-less, tab-separated tables. Columns 0-3 of
    the training file and columns 0-2 of the test file are used. When
    ``model_name`` is non-empty the fitted model is saved to
    ``model_name + ".pickle"``. Returns the predicted labels of the first
    (only) test sequence as a list of strings.
    """
    train_frame = pandas.read_csv(train_file, sep="\t", header=None)
    train_rows = fromListToTuple(train_frame.iloc[:, [0, 1, 2, 3]].values)
    X_train, y_train = prepareData([train_rows], 'train', [True, True])

    # The test file is loaded and featurized once, before training, so a bad
    # path still fails early. It used to be parsed a second time after fit.
    test_frame = pandas.read_csv(test_file, sep="\t", header=None)
    test_rows = fromListToTuple(test_frame.iloc[:, [0, 1, 2]].values)
    X_teste, _ = prepareData([test_rows], 'predict', [True, True])

    crf = CRF(
        algorithm='lbfgs',
        c1=0.0625,
        c2=0.5,
        max_iterations=100,
        all_possible_transitions=False,
        all_possible_states=True,
        verbose=True
    )
    crf.fit(X_train, y_train)

    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    return [str(label) for label in y_pred[0]]
def training_crf(training_cue, data, dataset):
    """Cross-validate and fit a CRF, then pickle it under ``models/<dataset>``.

    Returns ``(y, pred, crf)``: the gold labels, the 5-fold cross-validated
    predictions and the model fitted on all the data. The file name is
    ``crf_cue_model.pkl`` when ``training_cue == 'cue'`` and
    ``crf_sco_model.pkl`` otherwise.
    """
    sentence_getter = get_frase(data)
    sentences = sentence_getter.get_frase
    get_negaciones(data)

    X = [sent2features(sentence, training_cue) for sentence in sentences]
    y = [sent2labels(sentence, training_cue) for sentence in sentences]

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=True,
              verbose=True)
    pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
    crf.fit(X, y)

    if training_cue == 'cue':
        model_basename = '/crf_cue_model.pkl'
    else:
        model_basename = '/crf_sco_model.pkl'
    model_filename = os.getcwd() + '/models/' + dataset + model_basename

    with open(model_filename, 'wb') as file_model:
        pickle.dump(crf, file_model)
    return (y, pred, crf)
def test_crf(xseq, yseq, algorithm):
    """Train on one sequence and expect the same labels back."""
    estimator = CRF(algorithm=algorithm)
    estimator.fit([xseq], [yseq])
    result = estimator.predict([xseq])
    # Averaged Perceptron is regularized too much, so it is exempt from the
    # exact-match check.
    exact_match_expected = algorithm != 'ap'
    if exact_match_expected:
        assert result == [yseq]
def test_sklearn_crfsuite(xseq, yseq):
    """Explain a fitted CRF and check the text, HTML and dataframe renderings."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    expected_in_text = (
        "y='sunny' top features",
        "y='rainy' top features",
        "Transition features",
        "sunny -0.130 0.696",
        u'+0.124 солнце:не светит',
    )
    for fragment in expected_in_text:
        assert fragment in text

    html_nospaces = html.replace(' ', '').replace("\n", '')
    assert u'солнце:не светит' in html
    assert '<th>rainy</th><th>sunny</th>' in html_nospaces

    # The dataframe checks only run when eli5 exposes the dataframe formatters.
    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        return

    from .test_formatters_as_dataframe import check_targets_dataframe
    df_dict = format_as_dataframes(expl)
    check_targets_dataframe(df_dict['targets'], expl)

    df_transition = df_dict['transition_features']
    transition = expl.transition_features
    print(df_transition)
    assert list(transition.class_names) == ['rainy', 'sunny']
    assert np.isclose(df_transition['rainy']['rainy'], transition.coef[0, 0])
    assert np.isclose(df_transition['sunny']['rainy'], transition.coef[0, 1])
    assert np.isclose(df_transition['rainy']['sunny'], transition.coef[1, 0])
def train_crf(labelled_files, save=True, eval=True):
    """Fit a CRF on the labelled files, optionally evaluating and saving it.

    With ``eval`` true, 10% of the data is held out (fixed ``random_state``),
    the model is fitted on the rest and a classification report is printed
    and logged. Otherwise the model is fitted on all the data. With ``save``
    true the model is passed to ``save_crf``. Returns the fitted CRF.
    """
    x, y, _ = format_labelled_data(labelled_files)
    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)

    if not eval:
        crf.fit(x, y)
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.1, random_state=42)
        crf.fit(x_train, y_train)
        predictions = crf.predict(x_test)
        report = classification_report(y_test, predictions)

        print("Test Results:\n")
        line(60)
        print(report)
        line(60)
        log_results(y_test, predictions)
        line(60)

    if save:
        save_crf(crf)
    return crf
def entity_crf_train(my_subjects):
    """Fill subject placeholders in the global ``X``/``y`` and train a CRF.

    Every token containing ``'sub'`` is replaced by the words of a randomly
    drawn entry of ``my_subjects``, with matching ``'subject'`` labels. Each
    sequence is then cut to its first 10 items. The fitted model and
    ``my_subjects`` are pickled into ``full_model/`` under the parent of the
    current working directory.
    """
    for row in range(len(X)):
        # The index range is fixed from the length before any substitution.
        for col in range(len(X[row])):
            if 'sub' not in X[row][col]:
                continue
            words = my_subjects[np.random.randint(len(my_subjects))].split()
            X[row] = X[row][:col] + words + X[row][col + 1:]
            y[row] = y[row][:col] + ['subject'] * len(words) + y[row][col + 1:]
        X[row] = X[row][0:10]
        y[row] = y[row][0:10]

    crf = CRF(c1=0.1,
              c2=0.01,
              max_iterations=200,
              all_possible_transitions=True)
    print(".....Training entity extraction model.....")
    crf.fit(X, y)
    print(".....Trained entity extraction model.....")

    parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    with open(parent_dir + '/full_model/crf_model.pkl', 'wb') as model_out:
        pickle.dump(crf, model_out, protocol=pickle.HIGHEST_PROTOCOL)
    with open(parent_dir + '/full_model/subjects.pkl', 'wb') as subjects_out:
        pickle.dump(my_subjects, subjects_out, protocol=pickle.HIGHEST_PROTOCOL)
class CRFNerModel(object):
    """CRF named-entity model with pickling helpers and span extraction."""

    def __init__(self, is_save=False):
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        """Fit the CRF and, when ``is_save`` is set, dump it to ``save_model``."""
        self.crf.fit(train_x, train_y)
        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        """Featurize one input sequence and return the CRF prediction for it."""
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        """Pickle the CRF to the ``save_model`` path."""
        model_data = pickle.dumps(self.crf)
        with open(self.save_model, "wb") as f:
            f.write(model_data)

    def load_model(self):
        """Replace ``self.crf`` with the model pickled at ``save_model``."""
        with open(self.save_model, "rb") as f:
            model_data = f.read()
        # NOTE: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        self.crf = pickle.loads(model_data)

    def predict_list(self, input_list):
        """Predict for already-featurized sequences."""
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        """Return ``(start, end, label, input_x[start:end])`` for each entity.

        An entity starts at a ``B-<label>`` tag and runs until an ``"O"`` tag,
        the next ``B-`` tag, or the end of the sequence. A non-``B`` tag whose
        label differs from the open entity discards that entity.
        """
        entities = []
        tags = self.predict(input_x)[0]
        start = None
        label = None
        for i, tag in enumerate(tags):
            if tag == "O":
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                start = None
                label = None
                continue
            # Split on the first hyphen only so labels that themselves
            # contain "-" do not break the unpacking.
            prefix, tag_label = tag.split("-", 1)
            if prefix == "B":
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                start = i
                label = tag_label
            elif label != tag_label:
                start = None
                label = None
        # Flush an entity that runs to the end of the sequence; it used to
        # be silently dropped.
        if start is not None:
            end = len(tags)
            entities.append((start, end, label, input_x[start:end]))
        return entities
def train_crf(trainx, trainy):
    """Fit an L-BFGS CRF on ``trainx``/``trainy`` and return it."""
    # Function-call form: the Python 2 print statement is a syntax error on
    # Python 3, while this prints the same text on both.
    print("training CRF...")
    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=True)
    crf.fit(trainx, trainy)
    return crf
def test_crf_score(xseq, yseq, algorithm):
    """``score`` on the training sequence is perfect (or close, for 'ap')."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    accuracy = model.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much for a perfect score.
        assert accuracy > 0.8
    else:
        assert accuracy == 1.0
def train_crf(x,y):
    '''Fit a CRF on the ``x`` features and ``y`` labels and return it.'''
    settings = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=False,
    )
    model = CRF(**settings)
    model.fit(x, y)
    return model
def test_crf_score(xseq, yseq, algorithm):
    """Score the model on the very sequence it was trained on."""
    estimator = CRF(algorithm=algorithm)
    estimator.fit([xseq], [yseq])
    result = estimator.score([xseq], [yseq])
    if algorithm == 'ap':
        # Averaged Perceptron is regularized too much to reach 1.0.
        assert result > 0.8
        return
    assert result == 1.0
def test_sklearn_crfsuite_feature_re(xseq, yseq):
    """``feature_re`` keeps the matching features and hides the others."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])
    explanation = explain_weights(crf, feature_re=u'(солн|clean)')
    # Every rendered format must show the matched features only.
    for rendered in format_as_all(explanation, crf):
        assert u'солн' in rendered
        assert u'clean' in rendered
        assert 'walk' not in rendered
def train_crf(x, y):
    """Return a CRF fitted on ``x`` and ``y`` with fixed L-BFGS settings."""
    crf_kwargs = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': False,
    }
    fitted = CRF(**crf_kwargs)
    fitted.fit(x, y)
    return fitted
def CRF_model(X_train,y_train):
    """Build a CRF, fit it on the training data and return it."""
    options = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model = CRF(**options)
    model.fit(X_train, y_train)
    return model
def test_sklearn_targets(xseq, yseq, targets):
    """Renamed targets appear in the output in the order given by ``targets``."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])
    res = explain_weights(crf,
                          target_names={'sunny': u'☀'},
                          targets=targets)
    for rendered in format_as_all(res, crf):
        assert u'☀' in rendered
        rainy_first = targets[0] == 'rainy'
        if rainy_first:
            assert rendered.index('rainy') < rendered.index(u'☀')
        else:
            assert rendered.index('rainy') > rendered.index(u'☀')
def train(model_name, xtrain, ytrain):
    """Fit a CRF, passing ``model_name`` as its ``model_filename``.

    Prints a marker line before and after training and returns the model.
    """
    print('hallo')
    crf_kwargs = dict(
        algorithm='lbfgs',
        c1=0.0001,
        c2=0.0001,
        max_iterations=100,
        all_possible_transitions=False,
        model_filename=model_name,
    )
    model = CRF(**crf_kwargs)
    model.fit(xtrain, ytrain)
    print('hallo2')
    return model
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Verbose training works both with and without a dev set."""
    crf = CRF(algorithm, verbose=True)
    X_dev, y_dev = ([xseq], [yseq]) if use_dev else (None, None)

    crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    predicted = crf.predict([xseq])

    if algorithm == "ap":
        # Averaged Perceptron is regularized too much for an exact match.
        return
    assert predicted == [yseq]
def test_crf_pickling(xseq, yseq, algorithm):
    """A fitted CRF survives a pickle round trip."""
    original = CRF(algorithm=algorithm)
    original.fit([xseq], [yseq])

    payload = pickle.dumps(original, protocol=pickle.HIGHEST_PROTOCOL)
    restored = pickle.loads(payload)

    score = restored.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much for a perfect score.
        assert score > 0.8
    else:
        assert score == 1.0
    assert restored.algorithm == algorithm
def test_crf_marginals(xseq, yseq, algorithm):
    """Marginals hold one distribution over all labels for every item."""
    crf = CRF(algorithm)
    crf.fit([xseq], [yseq])

    all_marginals = crf.predict_marginals([xseq])
    assert len(all_marginals) == 1
    sequence_marginals = all_marginals[0]
    assert len(sequence_marginals) == len(yseq)

    known_labels = crf.tagger_.labels()
    for distribution in sequence_marginals:
        assert isinstance(distribution, dict)
        assert set(distribution.keys()) == set(known_labels)
        # Probabilities of one position sum to one.
        assert abs(sum(distribution.values()) - 1.0) < 1e-6
def test_crf_model_filename(xseq, yseq, tmpdir):
    """``model_filename`` controls where the model is stored and loaded from."""
    model_path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(model_path)

    # fitting creates the model file at the specified location
    trained = CRF(model_filename=model_path)
    trained.fit([xseq], [yseq])
    assert os.path.exists(model_path)

    # a model can be loaded just by passing the file name
    from_file = CRF(model_filename=model_path)
    assert from_file.score([xseq], [yseq]) == 1.0

    # the estimator stays picklable
    unpickled = pickle.loads(
        pickle.dumps(trained, protocol=pickle.HIGHEST_PROTOCOL))
    assert unpickled.score([xseq], [yseq]) == 1.0
def main(arg):
    """Build the datasets and prepare the model chosen by ``arg['model_name']``.

    ``"crf"`` fits a CRF directly on the feature dicts. ``"SVM"`` vectorizes
    the feature dicts with a ``DictVectorizer``. The sentences come from the
    module-level ``training_sentences`` and ``test_sentences``.
    """
    X_train, y_train = transform_to_dataset(training_sentences, arg)
    X_test, y_test = transform_to_dataset(test_sentences, arg)
    print(len(X_train))
    print(len(X_test))
    print(X_train[0])

    if arg['model_name'] == "crf":
        model = CRF()
        model.fit(X_train, y_train)
    elif arg['model_name'] == "SVM":
        v = DictVectorizer(sparse=False)
        X_tr = v.fit_transform(X_train)
        # Only transform the test set. Refitting the vectorizer on it would
        # relearn the feature-to-column mapping, so the test matrix would no
        # longer line up with the training columns.
        X_ts = v.transform(X_test)

    sentence = ['I', 'am', 'Bob', '!']
def test_crf_dev_bad_arguments(xseq, yseq):
    """Passing ``X_dev`` without ``y_dev`` to ``fit`` raises ``ValueError``."""
    crf = CRF()
    X, y = [xseq] * 20, [yseq] * 20
    with pytest.raises(ValueError):
        crf.fit(X, y, X)