Code example #1
File: crf_helper.py Project: nishiba/tagging_helper
def train(texts):
    trainer = pycrfsuite.Trainer()
    for text in texts:
        xseq, yseq = text_to_feature_label(text)
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 100000000,  # effectively no iteration cap
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train(os.path.join(get_dir(), 'model.crfsuite'))
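The snippet above only trains. As a hedged companion sketch (not part of the original project), loading the saved model and tagging new text with pycrfsuite.Tagger could look like the following, assuming the same text_to_feature_label() and get_dir() helpers:

# Hedged companion sketch, not from the original project: load the model that
# train() saves and tag a new text with pycrfsuite.Tagger.
def tag(text):
    tagger = pycrfsuite.Tagger()
    tagger.open(os.path.join(get_dir(), 'model.crfsuite'))
    xseq, _ = text_to_feature_label(text)  # labels are ignored at inference time
    return tagger.tag(xseq)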
Code example #2
def generate_model_split(X, y, train_split, test_split):
    for i in range(len(train_split)):
        trainer = pycrfsuite.Trainer(verbose=True)
        X_train = [X[j] for j in train_split[i]]
        y_train = [y[j] for j in train_split[i]]
        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 0.1,  #L1 penalty
            'c2': 0.01,  #L2 penalty
            'max_iterations': 200,
            'feature.possible_transitions': True
        })
        trainer.train('crf_model/crf.model_' + str(i))
Code example #3
File: advanced_tagger.py Project: soniagodhwani/NLP
def train(inp_dir):
    train_data = get_data(inp_dir)
    trainer = pycrfsuite.Trainer(verbose=True)
    for dialog in train_data:
        features = create_features_for_dialogues(dialog)
        act_tags = [utt.act_tag for utt in dialog]
        trainer.append(features, act_tags)
    trainer.set_params({
        'c1': 1.0,
        'c2': 1e-3,
        'max_iterations': 50,
        'feature.possible_transitions': True
    })
    trainer.train("baseline_crf")
Code example #4
File: training.py Project: paudan/parserator
def trainModel(training_data, module, model_path, params_to_set):
    algorithm = 'lbfgs'  # default CRFsuite algorithm (L-BFGS)
    if 'algorithm' in params_to_set:
        algorithm = params_to_set.get('algorithm')
        del params_to_set['algorithm']
    trainer = pycrfsuite.Trainer(verbose=False,
                                 algorithm=algorithm,
                                 params=params_to_set)
    print("Trainer parameters:", trainer.get_params())
    for _, components in training_data:
        tokens, labels = list(zip(*components))
        features = module.tokens2features(tokens)
        if features is not None:
            trainer.append(features, labels)
    trainer.train(model_path)
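For readability, here is a hedged illustration of the params_to_set dict this trainModel() expects: the 'algorithm' key is popped off before the Trainer is created, and the remaining keys are standard CRFsuite training options. The values below are illustrative only.

# Illustrative only; any valid CRFsuite training option can appear here.
params_to_set = {
    'algorithm': 'lbfgs',              # popped off and passed to Trainer(algorithm=...)
    'c1': 0.1,                         # L1 regularization
    'c2': 0.01,                        # L2 regularization
    'max_iterations': 100,
    'feature.possible_transitions': True,
}
# trainModel(training_data, module, 'model.crfsuite', params_to_set)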
Code example #5
File: main.py Project: sdu2011/NER
def train(X, y, modelname='./model/train.model'):
    trainer = pycrfsuite.Trainer()

    for xseq, yseq in zip(X, y):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train(modelname)
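For clarity, a tiny hedged sketch of the data shape this kind of train(X, y) function expects: X is a list of sequences of per-token feature dicts (python-crfsuite also accepts lists of feature strings), and y is a list of matching label sequences. The tokens and labels below are made up.

# Made-up example input; shapes only, not real training data.
X = [
    [{'word': 'John', 'is_title': True}, {'word': 'runs', 'is_title': False}],
    [{'word': 'Paris', 'is_title': True}],
]
y = [
    ['B-PER', 'O'],
    ['B-LOC'],
]
# train(X, y, modelname='./model/train.model')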
Code example #6
    def train(self, train_data, model_file):
        """학습해서 CRF모델을 만들어 주는 함수입니다.

        Args:
            train_data(list(list(list(str,str)))): 학습데이터입니다.

        """
        X_train = [self._get_features(s) for s in train_data]
        Y_train = [self._sent2labels(s) for s in train_data]
        trainer = pycrfsuite.Trainer(verbose=self._verbose)
        trainer.set_params(self._training_options)
        for xseq, yseq in zip(X_train, Y_train):
            trainer.append(xseq, yseq)
        trainer.train(model_file)
        self.set_model_file(model_file)
Code example #7
File: crf.py Project: idleyui/sequence-labeling
    def train(self, X, Y, file):
        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq, yseq in zip(X, Y):
            trainer.append(xseq, yseq)

        trainer.set_params({
            'c1': 1,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 200,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
        trainer.train(file)
Code example #8
 def train(self, docs: Iterable[Doc], algorithm: str, params: dict,
           path: str) -> None:
     trainer = pycrfsuite.Trainer(algorithm, verbose=False)
     trainer.set_params(params)
     for doc in docs:
         #print(doc)
         for sent in doc.sents:
             tokens = list(sent)
             features = self.feature_extractor.extract(
                 [token.text for token in tokens])
             encoded_labels = self._encoder.encode(tokens)
             trainer.append(features, encoded_labels)
     trainer.train(path)
     self.tagger = pycrfsuite.Tagger()
     self.tagger.open(path)
Code example #9
 def __init__(self, model_path, model_name, save_path=None, start_iter=0):
     self.model_path = model_path
     self.model_name = model_name
     self.trainer = pycrfsuite.Trainer(verbose=False)
     self.tagger = pycrfsuite.Tagger()
     self.iter = start_iter
     self.save_path = save_path
     if not os.path.exists(self.model_path):
         os.makedirs(self.model_path)
     if self.save_path is not None:
         if not os.path.exists(self.save_path):
             os.makedirs(self.save_path)
     if st.DICTIONARY is True or st.SELF_ITER_N > 1:
         self.X_total = []  # added for dictionary
         self.y_total = []
Code example #10
    def train(self, train_data, model_file):
        trainer = pycrfsuite.Trainer(verbose=self._verbose)
        trainer.set_params(self._training_options)

        for sent in train_data:
            tokens, labels = zip(*sent)
            features = [
                self._feature_func(tokens, i) for i in range(len(tokens))
            ]
            trainer.append(features, labels)

        # Now train the model, the output should be model_file
        trainer.train(model_file)
        # Save the model file
        self.set_model_file(model_file)
Code example #11
File: crf_sent_tagger.py Project: yodanater/Jiayan
    def train(self, train_x, train_y, out_model):
        trainer = pycrfsuite.Trainer(verbose=False)
        for x, y in zip(train_x, train_y):
            if x and y:
                trainer.append(x, y)

        trainer.set_params({
            'c1': 1.0,                            # coefficient for L1 penalty
            'c2': 1e-3,                           # coefficient for L2 penalty
            'max_iterations': 50,                 # stop earlier
            'feature.possible_transitions': True  # include transitions that are possible, but not observed
        })

        trainer.train(out_model)
        print(trainer.logparser.last_iteration)
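As a side note on the final print above: python-crfsuite's Trainer keeps a log parser, so after any trainer.train(...) call the training progress can be inspected roughly like this (a hedged sketch; the exact keys of each iteration dict depend on the chosen algorithm):

# Hedged sketch: inspect the training log of an already-trained pycrfsuite.Trainer.
print('iterations run:', len(trainer.logparser.iterations))
print('last iteration:', trainer.logparser.last_iteration)  # dict of final-iteration statistics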
Code example #12
    def _train(self, data):
        trainer = pycrfsuite.Trainer(verbose=False)
        trainer.set_params({
            'c1': 3.0,  # coefficient for L1 penalty
            'c2': 1e-20,  # coefficient for L2 penalty
            #             'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

        # materialize the map objects so the whole dataset is appended as one sequence
        x_train = list(map(itemgetter(0), data))
        y_train = list(map(itemgetter(1), data))
        trainer.append(x_train, y_train)
        trainer.train(ColingBaselineClassifier.crfModelName)
Code example #13
def get_trainer(features):
    trainer = suite.Trainer(verbose=False)
    for xseq, yseq in zip(features[0], features[1]):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    return trainer
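Unlike the other snippets, get_trainer() only prepares and configures the trainer; training still has to be triggered by the caller. A hedged usage sketch (the model file name is arbitrary):

# Illustrative usage; `features` is expected to be a (X, y) pair of sequence lists.
trainer = get_trainer((X, y))
trainer.train('model.crfsuite')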
Code example #14
 def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
     trainer = pycrfsuite.Trainer(algorithm, verbose=False)
     trainer.set_params(params)
     encoder = self.encoder()
     for doc in docs:
         idx = 0
         for sent in doc.sents:
             tokens = list(sent)
             features = self.feature_extractor.extract([str(token) for token in tokens], idx)
             encoding = encoder.encode(tokens)
             trainer.append(features, encoding)
             idx += 1
     trainer.train(path)
     self.tagger = pycrfsuite.Tagger()
     self.tagger.open(path)
Code example #15
    def search(self, X, y, verbose):
        for param_search in self.param_searches:
            if isinstance(next(iter(param_search.values())), dict):
                for _, param_gs in param_search.items():
                    self.search_grid(X, y, param_gs, verbose)
            else:
                self.search_grid(X, y, param_search, verbose)

        if self.model:
            trainer = crf.Trainer(verbose=verbose)
            trainer.select(self.best_algorithm, self.graphical_model)
            trainer.set_params(self.best_param)
            for xseq, yseq in zip(X, y):
                trainer.append(xseq, yseq)
            trainer.train(self.model)
Code example #16
    def train(self, data_path):
        """
        Train on data loaded from file and save the model to self.model_path
        :param data_path: path to a data file or directory, depending on the self.load_data_from_file method
        :return: None
        """
        sentences, labels = self.load_data_from_file(data_path)
        X, y = self.prepare_training_data(sentences, labels)
        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq, yseq in zip(X, y):
            trainer.append(xseq, yseq)

        trainer.set_params(self.crf_config)
        trainer.train(self.model_path)
Code example #17
def main():
    inputdir = sys.argv[1]
    testdir = sys.argv[2]
    outputfile = sys.argv[3]
    x_list = []
    y_list = []
    for root, dirs, files in os.walk(inputdir):
        for filename in files:
            if filename.endswith(".csv"):
                filepath = os.path.abspath(os.path.join(root, filename))
                utterances = inputtool.get_utterances_from_filename(filepath)
                x_train = sent2features(utterances)
                y_train = sent2labels(utterances)
                for x in x_train:
                    x_list.append(x)
                for y in y_train:
                    y_list.append(y)

    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.append(x_list, y_list)
    trainer.set_params({
        'c1': 1,
        'c2': 1e-3,
        'max_iterations': 85,
        'feature.possible_states': True,
        'feature.possible_transitions': True
    })
    trainer.train('baseline.crfsuite')
    tagger = pycrfsuite.Tagger()
    tagger.open('baseline.crfsuite')
    f = open(outputfile, "a")
    f.truncate(0)
    for root, dirs, files in os.walk(testdir):
        for filename in files:
            if filename.endswith(".csv"):
                filepath = os.path.abspath(os.path.join(root, filename))
                utterances = inputtool.get_utterances_from_filename(filepath)
                x_tag = sent2features(utterances)
                outputlist = tagger.tag(x_tag)
                f.write('Filename="')
                f.write(filename)
                f.write('"')
                f.write('\n')
                for y in outputlist:
                    f.write(y)
                    f.write('\n')
                f.write('\n')
    f.close()
Code example #18
def main(training_file, testing_file, model_file):
    
    start = time.time()
    
    # Get training and testing set of data
    training_set = get_input(training_file)
    testing_set = get_input(testing_file)
    
    # Get features of each word on training set
    X_train = [get_features(s) for s in training_set]
    y_train = [get_labels(s) for s in training_set]
    
    # Get features of each word on testing set
    X_test = [get_features(s) for s in testing_set]
    y_test = [get_labels(s) for s in testing_set]

    # Create trainer model of CRF
    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 0.5,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  # stop earlier
    
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })    
    
    # Train the model and save the trained model into model_file
    trainer.train(model_file)
    print ("Log of last iteration={}".format(trainer.logparser.iterations[-1]))

    # Initial tagger for prediction task
    trained_model = pycrfsuite.Tagger()
    trained_model.open(model_file) # Load the trained model.
        
    # Get prediction tag results from trained model
    y_pred = [trained_model.tag(xseq) for xseq in X_test]
    
    # Print the Precision, Recall, and F-1 score
    print(bio_classification_report(y_test, y_pred))
    
    end = time.time()
    print('CRF model has been generated.')
    print('runtime:', end - start)
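bio_classification_report() is not shown in this snippet. A hedged approximation in the spirit of the python-crfsuite tutorial flattens the gold and predicted label sequences and defers to scikit-learn; the original helper may differ.

# Hedged sketch only; requires scikit-learn.
from itertools import chain
from sklearn.metrics import classification_report

def bio_classification_report(y_true, y_pred):
    y_true_flat = list(chain.from_iterable(y_true))
    y_pred_flat = list(chain.from_iterable(y_pred))
    labels = sorted(set(y_true_flat) - {'O'})  # report per BIO tag, excluding 'O'
    return classification_report(y_true_flat, y_pred_flat, labels=labels)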
Code example #19
def train(x_train, y_train):
    trainer = pycrfsuite.Trainer(verbose=False)

    for (x, y) in zip(x_train, y_train):
        trainer.append(x, y)

    trainer.set_params({
        'c1': 1.0, # coefficient for L1 penalty
        'c2': 1e-3, # coefficient for L2 penalty
        'max_iterations': 50, # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('advanced_tagger.crfsuite')
    return 
Code example #20
File: mention_detection.py Project: titsuki/jawikify
    def train(self, X_sentences, Y_labels, model_filename = 'md.model'):
        
        trainer = pycrfsuite.Trainer(verbose=False)
        X_train = [sent2features(x) for x in X_sentences]
        Y_train = [sent2labels(y) for y in Y_labels]
        for xseq, yseq in zip(X_train, Y_train):
            trainer.append(xseq, yseq)

        trainer.set_params({
            'c1': 1.0,   # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
        trainer.train(model_filename)
Code example #21
File: test2.py Project: joecp9key/NLP
def train():
	X_train = [ sent2features(s) for s in train_sent]
	Y_train = [ sent2labels(s) for s in train_sent]

	trainer = pycrfsuite.Trainer(verbose=False)
	trainer.set_params({
	    'c1': 1.0,
	    'c2': 1e-3,
	    'max_iterations': 50,
	    'feature.possible_transitions': True
	})

	for xseq, yseq in zip(X_train, Y_train):
	    trainer.append(xseq, yseq)
	    
	trainer.train('mytrain_model')
Code example #22
def trainModel(training_data, model_file):

    X = []
    Y = []

    for address_text, components in training_data:
        tokens, labels = zip(*components)
        X.append(usaddress.addr2features(tokens))
        Y.append(labels)

    #train model
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X, Y):
        trainer.append(xseq, yseq)

    trainer.train(model_file)
Code example #23
def _fit_model(X_train: List[List[List[str]]], y_train: List[List[str]],
               output_path: str, crf_max_iteration: int) -> None:
    logger.info("Fitting CRF model..")
    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'epsilon': 1e-4,
        'max_iterations': crf_max_iteration,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train(output_path)
    logger.info(f"Done! Model saved at {output_path}")
Code example #24
def train_crf(x_train, y_train):
    print('Training...')
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(x_train, y_train):
        trainer.append(xseq, yseq)
    
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train(param.crf_path)
Code example #25
File: crfsuite.py Project: zxsted/nalaf
    def train(data, model_file, params=None):
        """
        :type data: nalaf.structures.data.Dataset
        :type model_file: str ~ filename (from local file system) to save trained model to. If None, no model is saved.
        """

        trainer = pycrfsuite.Trainer()
        if params is not None:
            trainer.set_params(params)

        for sentence in data.sentences():
            trainer.append(pycrfsuite.ItemSequence([token.features for token in sentence]),
                           [token.original_labels[0].value for token in sentence])

        # The CRFSuite library handles the "pickling" of the file; saves the model here
        trainer.train(model_file)
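A short hedged note on pycrfsuite.ItemSequence used above: it wraps a sequence of per-token feature mappings (or lists of feature strings) into the native structure the trainer consumes, for example (features made up):

# Illustrative only; feature names and labels are invented.
seq = pycrfsuite.ItemSequence([
    {'word': 'aspirin', 'is_title': False},
    {'word': '100', 'word.isdigit': True},
])
trainer.append(seq, ['B-DRUG', 'O'])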
Code example #26
    def train(self, sentences, model):
        """Train the CRF tagger using CRFSuite.

        :param sentences: Annotated sentences.
        :param model: Path to save the trained model.
        """
        trainer = pycrfsuite.Trainer(verbose=True)
        trainer.set_params(self.params)
        for sentence in sentences:
            tokens, labels = zip(*sentence)
            features = [
                self._get_features(tokens, i) for i in range(len(tokens))
            ]
            trainer.append(features, labels)
        trainer.train(model)
        self.load(model)
Code example #27
    def train(self, train_sents):
        X_train = [self.sent2features(s) for s in train_sents]
        Y_train = [self.sent2labels(s) for s in train_sents]
        trainer = pycrfsuite.Trainer(verbose=False)
        for xseq, yseq in zip(X_train, Y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
        trainer.train(self.modelfile)
Code example #28
    def train(self, model_file_path):
        """
        TBW
        """
        # create a trainer object
        self.trainer = pycrfsuite.Trainer(verbose=self.verbose)

        # append training data
        for feature_seq, label_seq in self.train_data:
            self.trainer.append(feature_seq, label_seq)

        # do the actual training
        self.trainer.train(model_file_path)

        # return the path to the model file
        return model_file_path
Code example #29
 def train(self, docs, model_fname):
     if not self.feature_vocabulary:
         self.feature_vocabulary = self._scan_features(docs)
     trainer = pycrfsuite.Trainer(verbose=self.verbose)
     if self.verbose:
         print('begin appending data to trainer')
     for sent in docs:
         x, y = sent_to_xy(sent, self.to_feature)
         x = [[xij for xij in xi if xij in self.feature_vocabulary]
              for xi in x]
         trainer.append(x, y)
     if self.verbose:
         print('all data are appended to trainer. begin training')
     trainer.set_params(self.params)
     trainer.train(model_fname)
     self.load_tagger(model_fname)
Code example #30
def train(train_dir, feature_ext_fn, c1, c2, total_iterations):
    samples = get_data(train_dir)
    trainer = pycrfsuite.Trainer(verbose=True)

    for index, dialog in enumerate(samples):
        features = feature_ext_fn(dialog)
        tags = [utt.act_tag for utt in dialog]
        trainer.append(features, tags)
    i = 0
    trainer.set_params({
        'c1': c1,
        'c2': c2,
        'max_iterations': total_iterations,
        'feature.possible_transitions': True
    })
    trainer.train("model.crfsuite")