def train(texts):
    """Fit a CRF on ``texts`` and store it as ``model.crfsuite`` under ``get_dir()``.

    Every text is turned into a (feature sequence, label sequence) pair by
    ``text_to_feature_label`` and appended to the trainer as one sequence.
    """
    crf_trainer = pycrfsuite.Trainer()
    for sample in texts:
        features, labels = text_to_feature_label(sample)
        crf_trainer.append(features, labels)

    hyper_params = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 100000000,  # upper bound on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(hyper_params)

    model_path = os.path.join(get_dir(), 'model.crfsuite')
    crf_trainer.train(model_path)
def generate_model_split(X, y, train_split, test_split):
    """Train one CRF model per fold of a pre-computed split.

    For fold ``i`` the sequences selected by the indices in ``train_split[i]``
    are appended to a fresh trainer, and the resulting model is written to
    ``'crf_model/crf.model_<i>'``.

    Args:
        X: Indexable collection of feature sequences.
        y: Indexable collection of label sequences, aligned with ``X``.
        train_split: Iterable of index collections, one per fold.
        test_split: Not used by this function; kept so existing callers
            keep working.
    """
    # The settings are identical for every fold, so build them once.
    params = {
        'c1': 0.1,  # L1 penalty
        'c2': 0.01,  # L2 penalty
        'max_iterations': 200,
        'feature.possible_transitions': True,
    }

    for fold, train_indices in enumerate(train_split):
        trainer = pycrfsuite.Trainer(verbose=True)
        # Index X and y directly instead of materialising per-fold copies.
        for j in train_indices:
            trainer.append(X[j], y[j])
        trainer.set_params(params)
        trainer.train('crf_model/crf.model_' + str(fold))
def train(inp_dir):
    """Train the baseline CRF on the dialogues loaded from ``inp_dir``.

    Each dialogue contributes one sequence: its features come from
    ``create_features_for_dialogues`` and its labels are the ``act_tag`` of
    every utterance.  The model is written to ``"baseline_crf"``.
    """
    dialogues = get_data(inp_dir)
    crf_trainer = pycrfsuite.Trainer(verbose=True)

    for dialogue in dialogues:
        crf_trainer.append(
            create_features_for_dialogues(dialogue),
            [utterance.act_tag for utterance in dialogue],
        )

    settings = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(settings)
    crf_trainer.train("baseline_crf")
def trainModel(training_data, module, model_path, params_to_set):
    """Train a CRF from labelled token sequences and save it to ``model_path``.

    Args:
        training_data: Iterable of ``(raw, components)`` pairs, where
            ``components`` is a sequence of ``(token, label)`` pairs.
        module: Object exposing ``tokens2features(tokens)``; sequences for
            which it returns ``None`` are skipped.
        model_path: Destination passed to ``Trainer.train``.
        params_to_set: Trainer parameters.  An optional ``'algorithm'`` entry
            selects the training algorithm and is not forwarded as a regular
            parameter.  ``None`` is treated as an empty mapping.
    """
    # Work on a copy so the caller's dict is not modified.
    params = dict(params_to_set or {})
    # 'lbfgs' is CRFsuite's name for L-BFGS; the previous default 'lgbfs'
    # was a misspelling and is not a valid algorithm name.
    algorithm = params.pop('algorithm', 'lbfgs')

    trainer = pycrfsuite.Trainer(verbose=False, algorithm=algorithm, params=params)
    print("Trainer parameters:", trainer.get_params())

    for _, components in training_data:
        tokens, labels = list(zip(*components))
        features = module.tokens2features(tokens)
        if features is not None:
            trainer.append(features, labels)

    trainer.train(model_path)
def train(X, y, modelname='./model/train.model'):
    """Train a CRF on paired feature/label sequences and save it to ``modelname``."""
    crf = pycrfsuite.Trainer()
    for features, labels in zip(X, y):
        crf.append(features, labels)

    settings = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf.set_params(settings)
    crf.train(modelname)
def train(self, train_data, model_file):
    """Train a CRF model and record where it was saved.

    Args:
        train_data (list(list(list(str, str)))): Training data.
        model_file: Path handed to the trainer as the model output file and
            then to ``self.set_model_file``.
    """
    feature_seqs = [self._get_features(sentence) for sentence in train_data]
    label_seqs = [self._sent2labels(sentence) for sentence in train_data]

    crf_trainer = pycrfsuite.Trainer(verbose=self._verbose)
    crf_trainer.set_params(self._training_options)
    for features, labels in zip(feature_seqs, label_seqs):
        crf_trainer.append(features, labels)

    crf_trainer.train(model_file)
    self.set_model_file(model_file)
def train(self, X, Y, file):
    """Train a CRF on the paired sequences in ``X``/``Y`` and save it to ``file``."""
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(X, Y):
        crf_trainer.append(features, labels)

    options = {
        'c1': 1,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 200,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    crf_trainer.train(file)
def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
    """Train a CRF on every sentence of ``docs`` and load the result as a tagger.

    Args:
        docs: Documents whose ``sents`` are used as training sequences.
        algorithm: Training algorithm passed to ``pycrfsuite.Trainer``.
        params: Trainer parameters.
        path: Where the model is written; it is reopened from there into
            ``self.tagger``.
    """
    crf_trainer = pycrfsuite.Trainer(algorithm, verbose=False)
    crf_trainer.set_params(params)

    all_sentences = (sent for doc in docs for sent in doc.sents)
    for sent in all_sentences:
        tokens = list(sent)
        words = [token.text for token in tokens]
        crf_trainer.append(
            self.feature_extractor.extract(words),
            self._encoder.encode(tokens),
        )

    crf_trainer.train(path)

    self.tagger = pycrfsuite.Tagger()
    self.tagger.open(path)
def __init__(self, model_path, model_name, save_path=None, start_iter=0):
    """Set up the trainer/tagger pair and make sure the output directories exist.

    Args:
        model_path: Directory for model files; created if missing.
        model_name: Stored on the instance as ``self.model_name``.
        save_path: Optional extra directory; created if given and missing.
        start_iter: Initial value of ``self.iter``.
    """
    self.model_path = model_path
    self.model_name = model_name
    self.trainer = pycrfsuite.Trainer(verbose=False)
    self.tagger = pycrfsuite.Tagger()
    self.iter = start_iter
    # Must be assigned here: the directory check below reads self.save_path,
    # which would otherwise raise AttributeError.
    self.save_path = save_path

    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)
    if self.save_path is not None and not os.path.exists(self.save_path):
        os.makedirs(self.save_path)

    if st.DICTIONARY is True or st.SELF_ITER_N > 1:
        # Accumulators that are only created in dictionary / self-iteration mode.
        self.X_total = []
        self.y_total = []
def train(self, train_data, model_file):
    """Train a CRF from sentences of ``(token, label)`` pairs.

    Features for position ``i`` are computed by ``self._feature_func(tokens, i)``.
    The model is written to ``model_file``, which is then registered through
    ``self.set_model_file``.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=self._verbose)
    crf_trainer.set_params(self._training_options)

    for sentence in train_data:
        tokens, labels = zip(*sentence)
        per_token_features = []
        for position, _ in enumerate(tokens):
            per_token_features.append(self._feature_func(tokens, position))
        crf_trainer.append(per_token_features, labels)

    # Training writes the model to model_file.
    crf_trainer.train(model_file)
    # Remember which file holds the trained model.
    self.set_model_file(model_file)
def train(self, train_x, train_y, out_model):
    """Train a CRF on the non-empty sequence pairs and save it to ``out_model``.

    Pairs where either the feature or the label sequence is falsy are
    skipped.  The trainer's last-iteration log entry is printed afterwards.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(train_x, train_y):
        if not (features and labels):
            continue
        crf_trainer.append(features, labels)

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    crf_trainer.train(out_model)

    print(crf_trainer.logparser.last_iteration)
def _train(self, data):
    """Train the baseline CRF on ``data`` treated as one single sequence.

    ``data`` is read as pairs: element 0 of each pair goes to the feature
    sequence and element 1 to the label sequence.  The model is written to
    ``ColingBaselineClassifier.crfModelName``.
    """
    options = {
        'c1': 3.0,  # L1 penalty coefficient
        'c2': 1e-20,  # L2 penalty coefficient
        # 'max_iterations' is intentionally not set here
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    crf_trainer.set_params(options)

    pick_features = itemgetter(0)
    pick_label = itemgetter(1)
    feature_seq = map(pick_features, data)
    label_seq = map(pick_label, data)
    crf_trainer.append(feature_seq, label_seq)

    crf_trainer.train(ColingBaselineClassifier.crfModelName)
def get_trainer(features):
    """Build a configured (but not yet trained) CRF trainer.

    ``features[0]`` holds the feature sequences and ``features[1]`` the
    matching label sequences.  Returns the trainer with all pairs appended
    and its parameters set.
    """
    feature_seqs = features[0]
    label_seqs = features[1]

    crf_trainer = suite.Trainer(verbose=False)
    for item_seq, label_seq in zip(feature_seqs, label_seqs):
        crf_trainer.append(item_seq, label_seq)

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    return crf_trainer
def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
    """Train a CRF on every sentence of ``docs`` and load the result as a tagger.

    The feature extractor receives the token strings of a sentence together
    with that sentence's zero-based position inside its document.

    Args:
        docs: Documents whose ``sents`` are used as training sequences.
        algorithm: Training algorithm passed to ``pycrfsuite.Trainer``.
        params: Trainer parameters.
        path: Where the model is written; it is reopened from there into
            ``self.tagger``.
    """
    crf_trainer = pycrfsuite.Trainer(algorithm, verbose=False)
    crf_trainer.set_params(params)
    label_encoder = self.encoder()

    for doc in docs:
        # The sentence counter restarts at 0 for every document.
        for sent_index, sent in enumerate(doc.sents):
            tokens = list(sent)
            token_strings = [str(token) for token in tokens]
            item_seq = self.feature_extractor.extract(token_strings, sent_index)
            label_seq = label_encoder.encode(tokens)
            crf_trainer.append(item_seq, label_seq)

    crf_trainer.train(path)

    self.tagger = pycrfsuite.Tagger()
    self.tagger.open(path)
def search(self, X, y, verbose):
    """Run every configured parameter search, then optionally train a final model.

    Each entry of ``self.param_searches`` is either a single parameter grid
    or a dict whose values are parameter grids; every grid is handed to
    ``self.search_grid``.  When ``self.model`` is truthy, a trainer is then
    configured with ``self.best_algorithm``, ``self.graphical_model`` and
    ``self.best_param`` and trained on all of ``X``/``y``.

    Args:
        X: Feature sequences.
        y: Label sequences aligned with ``X``.
        verbose: Forwarded to ``search_grid`` and to the trainer constructor.
    """
    for param_search in self.param_searches:
        # dict views are not subscriptable on Python 3, so peek at the first
        # value through an iterator; an empty mapping is treated as a plain grid.
        first_value = next(iter(param_search.values()), None)
        if isinstance(first_value, dict):
            for param_gs in param_search.values():
                self.search_grid(X, y, param_gs, verbose)
        else:
            self.search_grid(X, y, param_search, verbose)

    if self.model:
        trainer = crf.Trainer(verbose)
        trainer.select(self.best_algorithm, self.graphical_model)
        trainer.set_params(self.best_param)
        for xseq, yseq in zip(X, y):
            trainer.append(xseq, yseq)
        # NOTE(review): the original passed a bare `model`, which is not
        # defined in this method; self.model (checked above) is presumably
        # the intended output path -- confirm.
        trainer.train(self.model)
def train(self, data_path):
    """
    Load training data, fit a CRF on it and save the model to ``self.model_path``.

    :param data_path: location handed to ``self.load_data_from_file`` (a file
        or a directory, depending on that method)
    :return: None
    """
    raw_sentences, raw_labels = self.load_data_from_file(data_path)
    feature_seqs, label_seqs = self.prepare_training_data(raw_sentences, raw_labels)

    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for item_seq, label_seq in zip(feature_seqs, label_seqs):
        crf_trainer.append(item_seq, label_seq)

    crf_trainer.set_params(self.crf_config)
    crf_trainer.train(self.model_path)
def _iter_csv_files(directory):
    """Yield ``(filename, absolute_path)`` for every ``.csv`` file under ``directory``."""
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".csv"):
                yield filename, os.path.abspath(os.path.join(root, filename))


def main():
    """Train the baseline CRF on one directory and tag the files of another.

    Command-line arguments:
        1. directory with training ``.csv`` files (searched recursively)
        2. directory with ``.csv`` files to tag (searched recursively)
        3. output file; overwritten with one ``Filename="..."`` header per
           tagged file, followed by one predicted label per line and a
           blank line.
    """
    inputdir = sys.argv[1]
    testdir = sys.argv[2]
    outputfile = sys.argv[3]

    x_list = []
    y_list = []
    for _filename, filepath in _iter_csv_files(inputdir):
        utterances = inputtool.get_utterances_from_filename(filepath)
        x_list.extend(sent2features(utterances))
        y_list.extend(sent2labels(utterances))

    trainer = pycrfsuite.Trainer(verbose=False)
    # All training files are appended as a single sequence, as before.
    trainer.append(x_list, y_list)
    trainer.set_params({
        'c1': 1,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 85,
        'feature.possible_states': True,
        'feature.possible_transitions': True,
    })
    trainer.train('baseline.crfsuite')

    tagger = pycrfsuite.Tagger()
    tagger.open('baseline.crfsuite')

    # "w" empties the file just like the former "a" + truncate(0), and the
    # context manager closes it even if tagging raises.
    with open(outputfile, "w") as out:
        for filename, filepath in _iter_csv_files(testdir):
            utterances = inputtool.get_utterances_from_filename(filepath)
            predicted = tagger.tag(sent2features(utterances))
            out.write('Filename="' + filename + '"\n')
            out.writelines(label + '\n' for label in predicted)
            out.write('\n')
def main(training_file, testing_file, model_file):
    """Train a CRF, evaluate it on a test set and report timing.

    Args:
        training_file: Input read with ``get_input`` for training.
        testing_file: Input read with ``get_input`` for evaluation.
        model_file: Where the trained model is written and reloaded from.
    """
    started_at = time.time()

    # Load both data sets.
    train_sentences = get_input(training_file)
    test_sentences = get_input(testing_file)

    # Per-word features and labels for the training set ...
    train_features = [get_features(sentence) for sentence in train_sentences]
    train_labels = [get_labels(sentence) for sentence in train_sentences]
    # ... and for the test set.
    test_features = [get_features(sentence) for sentence in test_sentences]
    test_labels = [get_labels(sentence) for sentence in test_sentences]

    # Feed the training sequences to the trainer.
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for item_seq, label_seq in zip(train_features, train_labels):
        crf_trainer.append(item_seq, label_seq)

    options = {
        'c1': 0.5,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 1000,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)

    # Train and persist the model.
    crf_trainer.train(model_file)
    last_log = crf_trainer.logparser.iterations[-1]
    print ("Log of last iteration={}".format(last_log))

    # Reload the saved model for prediction.
    tagger = pycrfsuite.Tagger()
    tagger.open(model_file)

    predictions = [tagger.tag(item_seq) for item_seq in test_features]

    # Precision, recall and F1 report.
    print(bio_classification_report(test_labels, predictions))

    finished_at = time.time()
    print('CRF model has been generated.')
    print('runtime:', finished_at - started_at)
def train(x_train, y_train):
    """Train a CRF on the paired sequences and save it as ``advanced_tagger.crfsuite``."""
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for item_seq, label_seq in zip(x_train, y_train):
        crf_trainer.append(item_seq, label_seq)

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    crf_trainer.train('advanced_tagger.crfsuite')
def train(self, X_sentences, Y_labels, model_filename = 'md.model'):
    """Train a CRF from raw sentences and label sources and save the model.

    Args:
        X_sentences: Iterable of sentences; each is converted with
            ``sent2features``.
        Y_labels: Iterable aligned with ``X_sentences``; each element is
            converted with ``sent2labels``.
        model_filename: Path the trained model is written to.
    """
    trainer = pycrfsuite.Trainer(verbose=False)

    X_train = [sent2features(x) for x in X_sentences]
    Y_train = [sent2labels(y) for y in Y_labels]
    for xseq, yseq in zip(X_train, Y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    })
    # Use the actual parameter name; the former `modelfilename` was undefined
    # and raised NameError after all the data had been appended.
    trainer.train(model_filename)
def train():
    """Train a CRF on the module-level ``train_sent`` data and save it as ``mytrain_model``."""
    feature_seqs = [sent2features(sentence) for sentence in train_sent]
    label_seqs = [sent2labels(sentence) for sentence in train_sent]

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 50,
        'feature.possible_transitions': True,
    }
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    crf_trainer.set_params(options)

    for item_seq, label_seq in zip(feature_seqs, label_seqs):
        crf_trainer.append(item_seq, label_seq)

    crf_trainer.train('mytrain_model')
def trainModel(training_data, model_file):
    """Train a CRF from labelled address components and save it to ``model_file``.

    ``training_data`` yields ``(address_text, components)`` pairs, where
    ``components`` is a sequence of ``(token, label)`` pairs.  Token features
    come from ``usaddress.addr2features``.
    """
    feature_seqs = []
    label_seqs = []
    for _address_text, components in training_data:
        tokens, labels = zip(*components)
        feature_seqs.append(usaddress.addr2features(tokens))
        label_seqs.append(labels)

    # Feed everything to the trainer and fit the model.
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for item_seq, label_seq in zip(feature_seqs, label_seqs):
        crf_trainer.append(item_seq, label_seq)

    crf_trainer.train(model_file)
def _fit_model(X_train: List[List[List[str]]], y_train: List[List[str]], output_path: str, crf_max_iteration: int) -> None:
    """Fit a CRF on the given sequences and write the model to ``output_path``.

    Args:
        X_train: Feature sequences.
        y_train: Label sequences aligned with ``X_train``.
        output_path: Destination of the trained model.
        crf_max_iteration: Value used for the ``max_iterations`` parameter.
    """
    logger.info("Fitting CRF model..")

    crf_trainer = pycrfsuite.Trainer(verbose=True)
    for item_seq, label_seq in zip(X_train, y_train):
        crf_trainer.append(item_seq, label_seq)

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'epsilon': 1e-4,
        'max_iterations': crf_max_iteration,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    crf_trainer.train(output_path)

    logger.info(f"Done! Model saved at {output_path}")
def train_crf(x_train, y_train):
    """Train a CRF on the paired sequences and save it to ``param.crf_path``."""
    print('Training...')

    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for item_seq, label_seq in zip(x_train, y_train):
        crf_trainer.append(item_seq, label_seq)

    options = {
        'c1': 1.0,  # L1 penalty coefficient
        'c2': 1e-3,  # L2 penalty coefficient
        'max_iterations': 500,  # cap on training iterations
        # also generate transitions that are possible but never observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(options)
    crf_trainer.train(param.crf_path)
def train(data, model_file, params=None):
    """
    Train a CRF on every sentence of ``data`` and hand ``model_file`` to the trainer.

    Each token contributes its ``features`` and the ``value`` of its first
    original label.

    :type data: nalaf.structures.data.Dataset
    :type model_file: str ~ filename (from local file system) to save the
        trained model to.
    :param params: optional trainer parameters; applied only when not None.

    NOTE(review): the earlier documentation stated that passing ``None`` as
    ``model_file`` means no model is saved -- confirm that the trainer
    accepts ``None`` before relying on that.
    """
    crf_trainer = pycrfsuite.Trainer()
    if params is not None:
        crf_trainer.set_params(params)

    for sentence in data.sentences():
        item_seq = pycrfsuite.ItemSequence([token.features for token in sentence])
        label_seq = [token.original_labels[0].value for token in sentence]
        crf_trainer.append(item_seq, label_seq)

    # CRFSuite itself takes care of writing the model file.
    crf_trainer.train(model_file)
def train(self, sentences, model):
    """Train the CRF tagger with CRFSuite and load the result.

    :params sentences: Annotated sentences, each a sequence of
        ``(token, label)`` pairs.
    :params model: Path the model is saved to; passed to ``self.load``
        afterwards.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=True)
    crf_trainer.set_params(self.params)

    for sentence in sentences:
        tokens, labels = zip(*sentence)
        item_seq = []
        for position, _ in enumerate(tokens):
            item_seq.append(self._get_features(tokens, position))
        crf_trainer.append(item_seq, labels)

    crf_trainer.train(model)
    self.load(model)
def train(self, train_sents): X_train = [self.sent2features(s) for s in train_sents] Y_train = [self.sent2labels(s) for s in train_sents] trainer = pycrfsuite.Trainer(verbose=False) for xseq, yseq in zip(X_train, Y_train): trainer.append(xseq, yseq) trainer.set_params({ 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 50, # stop earlier # include transitions that are possible, but not observed 'feature.possible_transitions': True }) trainer.train(self.modelfile) '''
def train(self, model_file_path):
    """Train a CRF on ``self.train_data`` and return the model path.

    A new trainer is created and kept on the instance as ``self.trainer``;
    every ``(feature_seq, label_seq)`` pair of ``self.train_data`` is
    appended before training.

    Returns:
        ``model_file_path``, unchanged.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=self.verbose)
    self.trainer = crf_trainer

    # Load the training sequences.
    for item_seq, tag_seq in self.train_data:
        self.trainer.append(item_seq, tag_seq)

    # Fit and write the model.
    self.trainer.train(model_file_path)

    return model_file_path
def train(self, docs, model_fname):
    """Train a CRF on ``docs`` restricted to the known feature vocabulary.

    If ``self.feature_vocabulary`` is empty it is first built with
    ``self._scan_features(docs)``.  Features not in the vocabulary are
    dropped from every position.  The model is written to ``model_fname``
    and then loaded through ``self.load_tagger``.
    """
    if not self.feature_vocabulary:
        self.feature_vocabulary = self._scan_features(docs)

    crf_trainer = pycrfsuite.Trainer(verbose=self.verbose)
    if self.verbose:
        print('begin appending data to trainer')

    for sent in docs:
        raw_items, labels = sent_to_xy(sent, self.to_feature)
        kept_items = []
        for position_features in raw_items:
            kept_items.append(
                [feat for feat in position_features if feat in self.feature_vocabulary]
            )
        crf_trainer.append(kept_items, labels)

    if self.verbose:
        print('all data are appended to trainer. begin training')

    crf_trainer.set_params(self.params)
    crf_trainer.train(model_fname)
    self.load_tagger(model_fname)
def train(train_dir, feature_ext_fn, c1, c2, total_iterations):
    """Train a dialogue-act CRF and save it as ``model.crfsuite``.

    Args:
        train_dir: Location passed to ``get_data`` to load the dialogues.
        feature_ext_fn: Callable mapping a dialogue to its feature sequence.
        c1: L1 penalty coefficient.
        c2: L2 penalty coefficient.
        total_iterations: Value used for the ``max_iterations`` parameter.
    """
    samples = get_data(train_dir)
    trainer = pycrfsuite.Trainer(verbose=True)

    # The former enumerate() index and the `i = 0` assignment were never read.
    for dialog in samples:
        features = feature_ext_fn(dialog)
        tags = [utt.act_tag for utt in dialog]
        trainer.append(features, tags)

    trainer.set_params({
        'c1': c1,
        'c2': c2,
        'max_iterations': total_iterations,
        'feature.possible_transitions': True,
    })
    trainer.train("model.crfsuite")