Example #1
def model_filename(tmpdir, xseq, yseq):
    from pycrfsuite import Trainer
    trainer = Trainer('lbfgs', verbose=False)
    trainer.append(xseq, yseq)
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)
    return model_filename
Example #2
def test_trainer(tmpdir, xseq, yseq):
    trainer = Trainer('lbfgs')
    trainer.append(xseq, yseq)

    model_filename = str(tmpdir.join('model.crfsuite'))
    assert not os.path.isfile(model_filename)
    trainer.train(model_filename)
    assert os.path.isfile(model_filename)
Example #3
def test_append_strstr_dicts(tmpdir):
    trainer = Trainer()
    trainer.append(
        [{'foo': 'bar'}, {'baz': False}, {'foo': 'bar', 'baz': True}, {'baz': 0.2}],
        ['spam', 'egg', 'spam', 'spam']
    )
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set(['foo:bar', 'baz'])
        assert info.state_features[('foo:bar', 'spam')] > 0
Example #4
 def train(self,
           training_data,
           classifier_path="classifier/cache/label_crf_classifier",
           c1=0,
           c2=10,
           period=300,
           minfreq=5):
     self.preprocess(training_data)
     train = Trainer()
     for i1, i in enumerate(self.x):
         train.append(ItemSequence(i), self.y[i1])
     params = {
         "c1": c1,
         "c2": c2,
         "period": period,
         "feature.minfreq": minfreq,
         "max_iterations": 1000
         # "calibration.eta": 0.05,
         # "calibration_samples": 400,
     }
     # train.select(algorithm = "l2sgd")
     train.set_params(params)
     train.train(classifier_path)
     self.tagger = Tagger()
     self.tagger.open(classifier_path)
Example #5
    def _train_and_save(self, X_train, y_train):
        trainer = Trainer(verbose=False)
        for i, (xseq, yseq) in enumerate(zip(X_train, y_train)):
            # Check how much memory left, stop adding more data if too little
            if i % 2500 == 0:
                if (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory:
                    print('EntityExtractorWorker:_train_and_save - Less than {} Mb of memory remaining, stopping adding more training data.'.format(self.min_mb_available_memory))
                    self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(i)

                    log_dict = {
                        'task': 'EntityExtractorWorker:_train_and_save',
                        'event': 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, i),
                        'data': {'task_id': self.task_id}
                    }
                    self.info_logger.info("Memory", extra=log_dict)
                    break
            trainer.append(xseq, yseq)

        trainer.set_params({
            'c1': 0.5,  # coefficient for L1 penalty
            'c2': 1e-4,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # transitions that are possible, but not observed
            'feature.possible_transitions': True})

        output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type)
        # Train and save
        trainer.train(output_model_path)
        return trainer
Example #6
    def _train_and_save(self, X_train, y_train):
        trainer = Trainer(verbose=False)
        for i, (xseq, yseq) in enumerate(zip(X_train, y_train)):
            # Check how much memory left, stop adding more data if too little
            if i % 2500 == 0:
                if (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory:
                    print('EntityExtractorWorker:_train_and_save - Less than {} Mb of memory remaining, stopping adding more training data.'.format(self.min_mb_available_memory))
                    self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(i)
                    logging.getLogger(INFO_LOGGER).info(json.dumps({
                        'process': 'EntityExtractorWorker:_train_and_save',
                        'event':   'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, i),
                        'data':    {'task_id': self.task_id}
                    }))
                    break
            trainer.append(xseq, yseq)

        trainer.set_params({
            'c1': 0.5,   # coefficient for L1 penalty
            'c2': 1e-4,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # transitions that are possible, but not observed
            'feature.possible_transitions': True})

        output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type)
        # Train and save
        trainer.train(output_model_path)
        return trainer
Example #7
def test_help_invalid_parameter():
    trainer = Trainer()
    trainer.select('l2sgd')

    # This segfaults without a workaround;
    # see https://github.com/chokkan/crfsuite/pull/21
    with pytest.raises(ValueError):
        trainer.help('foo')

    with pytest.raises(ValueError):
        trainer.help('c1')
Example #8
def test_append_nested_dicts(tmpdir):
    trainer = Trainer()
    trainer.append(
        [
            {
                "foo": {
                    "bar": "baz",
                    "spam": 0.5,
                    "egg": ["x", "y"],
                    "ham": {"x": -0.5, "y": -0.1}
                },
            },
            {
                "foo": {
                    "bar": "ham",
                    "spam": -0.5,
                    "ham": set(["x", "y"])
                },
            },
        ],
        ['first', 'second']
    )
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set([
            'foo:bar:baz',
            'foo:spam',
            'foo:egg:x',
            'foo:egg:y',
            'foo:ham:x',
            'foo:ham:y',
            'foo:bar:ham',
        ])

        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0

        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
Example #9
def main(argv) :

    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]


    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))


    random.shuffle(trainData)


    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)

    trainer = Trainer()
    for dialogue in trainFeatures :
        trainer.append(dialogue[0],dialogue[1])

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('./model.pkl')

    outputFile = open(outputFPath,'w')
    tagger = Tagger()
    tagger.open('./model.pkl')


    totalUtter=correctUtter=0
    for dialogue in testFeatures :
        preds = tagger.tag(dialogue[0])
        labels = dialogue[1]
        for i,pred in enumerate(preds) :
            outputFile.write(pred+'\n')
            if len(labels)>0 :
                totalUtter += 1
                if labels[i]==pred :
                    correctUtter += 1
        outputFile.write('\n')

    if totalUtter > 0 :
        accuracy = correctUtter/totalUtter
        print('Accuracy: '+str(accuracy))
    outputFile.close()
Example #10
def train(features: pd.Series, labels: pd.Series) -> None:
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()

    for idx in range(len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
Example #11
def train(features, labels):
    print("Training..")
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()

    for idx in range(0, len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
Example #12
def train(X_train, X_test, y_train, y_test, **kwargs):
    '''
    >>> corpus = CorpusReader('annot.opcorpora.xml')
    >>> X_train, X_test, y_train, y_test = get_train_data(corpus, test_size=0.33, random_state=42)
    >>> crf = train(X_train, X_test, y_train, y_test)
    '''
    crf = Trainer()
    crf.set_params({
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 200,
    })

    for xseq, yseq in zip(X_train, y_train):
        crf.append(xseq, yseq)
    crf.train(model_name)
    return crf
Example #13
def test_tag_formats(tmpdir, xseq, yseq):
    # make all coefficients 1 and check that results are the same
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [dict((key, 1) for key in x) for x in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq

    # strings
    with Tagger().open(model_filename) as tagger:
        data = [x.keys() for x in xseq]
        assert tagger.tag(data) == yseq
Example #14
def train(X_train, y_train, **kwargs):
    '''
    >>> corpus = CorpusReader('annot.opcorpora.xml')
    >>> X_train, X_test, y_train, y_test = get_train_data(corpus, test_size=0.33, random_state=42)
    >>> crf = train(X_train, y_train)
    '''
    crf = Trainer()
    crf.set_params({
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 200,
        'feature.possible_transitions': True,
    })

    for xseq, yseq in zip(X_train, y_train):
        crf.append(xseq, yseq)
    crf.train(PART_OF_SPEECH_MODEL_PATH)
    return crf
Example #15
 def train(self, docs: Iterable[Doc], algorithm: str, params: dict,
           path: str) -> None:
     trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
     for doc in docs:
         for sentence in doc.sents:
             tokens = list(sentence)
             features = self.feature_extractor.extract(
                 [str(token) for token in tokens])
             labels = self.encoder.encode(tokens)
             trainer.append(features, labels)
     trainer.train(path)
     self.tagger.close()
     self.tagger.open(path)
Example #16
 def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
   self.trained_model_name = trained_model_name
   self.fp = FeatureProcessing()
   self.do_train = do_train
   self.algorithm = algorithm
   if algorithm == "crf":
     if do_train:
       self.trainer = Trainer()
     else:
       self.tagger = Tagger()
   else:
     if do_train:
       model = ChainCRF()
       self.trainer = FrankWolfeSSVM(model=model)
       self.feat_index = {}
       self.label_index = {}
     else:
       self.tagger = pickle.load(open(self.trained_model_name, "rb"))
       self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
       label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
       self.rev_label_index = {i: x for x, i in label_index.items()}
Example #17
def train_model(train_samples, model_name):
    """"
    训练模型---全部数据拿来训练
    :param train_samples:  [DataSample1, DataSample2, ...] 训练数据
    :param model_name:  保存模型的模型
    :return: None
    """
    train = Trainer()

    # append training samples into trainer
    for sample in train_samples:
        xseq = build_model_features(sample, 17, True)
        # yseq = sample.label
        yseq = sample.char_label
        train.append(xseq, yseq)

    train.train(model_name)
Example #18
def test_append_nested_dicts(tmpdir):
    trainer = Trainer()
    trainer.append([
        {
            "foo": {
                "bar": "baz",
                "spam": 0.5,
                "egg": ["x", "y"],
                "ham": {
                    "x": -0.5,
                    "y": -0.1
                }
            },
        },
        {
            "foo": {
                "bar": "ham",
                "spam": -0.5,
                "ham": set(["x", "y"])
            },
        },
    ], ['first', 'second'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set([
            'foo:bar:baz',
            'foo:spam',
            'foo:egg:x',
            'foo:egg:y',
            'foo:ham:x',
            'foo:ham:y',
            'foo:bar:ham',
        ])

        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0

        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
Example #19
def test_append_strstr_dicts(tmpdir):
    trainer = Trainer()
    trainer.append([{
        'foo': 'bar'
    }, {
        'baz': False
    }, {
        'foo': 'bar',
        'baz': True
    }, {
        'baz': 0.2
    }], ['spam', 'egg', 'spam', 'spam'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set(['foo:bar', 'baz'])
        assert info.state_features[('foo:bar', 'spam')] > 0
Example #20
def test_set_parameters_in_constructor():
    trainer = Trainer(params={'c2': 100})
    assert abs(trainer.get('c2') - 100) < 1e-6
Example #21
def test_params_and_help():
    trainer = Trainer()

    trainer.select('lbfgs')
    assert 'c1' in trainer.params()
    assert 'c2' in trainer.params()
    assert 'num_memories' in trainer.params()
    assert 'L1' in trainer.help('c1')

    trainer.select('l2sgd')
    assert 'c2' in trainer.params()
    assert 'c1' not in trainer.params()
    assert 'L2' in trainer.help('c2')
Example #22
def test_get_parameter():
    trainer = Trainer()
    trainer.select('l2sgd')
    assert abs(trainer.get('c2') - 0.1) > 1e-6
    trainer.set('c2', 0.1)
    assert abs(trainer.get('c2') - 0.1) < 1e-6
Example #23
def test_trainer_select_raises_error():
    trainer = Trainer()
    with pytest.raises(ValueError):
        trainer.select('foo')
Example #24
def test_algorithm_parameters(algo):
    trainer = Trainer(algo)
    params = trainer.get_params()
    assert params

    # set the same values
    trainer.set_params(params)
    params2 = trainer.get_params()
    assert params2 == params

    # change a value
    trainer.set('feature.possible_states', True)
    assert trainer.get_params()['feature.possible_states'] == True

    trainer.set('feature.possible_states', False)
    assert trainer.get_params()['feature.possible_states'] == False

    # invalid parameter
    params['foo'] = 5
    with pytest.raises(ValueError):
        trainer.set_params(params)
Example #25
def test_trainer_noselect_noappend(tmpdir):
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    trainer = Trainer()
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)
Example #26
    def train(self,
              data=None,
              form_col=None,
              lbl_col=None,
              ilbl_col=None,
              data_cols=None,
              data_sep=None,
              dump=True):
        """Trains a model based on provided data and features. The default
        behaviour is to load training parameters from the global configuration,
        unless they are passed to this method.

        IMPORTANT: there are two ways to pass data directly through the `data`
        parameter:

        -- np.recarray  `data` needs to be a recarray with column names that
                        match what the feature extractor expects.
        -- csv str      `data` needs to contain a TSV/CSV formatted string.
                        Column names and separator should be provided in the
                        `data_cols` and `data_sep` parameters. They should still
                        match what is expected by the feature extractor.

        The observation, label, and inference column names can be set through
        the global configuration using the following parameter names:
        `form_col`, `label_col`, `guess_label_col`. The default observation
        column name is `fc`, and the inference column name is `guesstag`.
        All three names can be passed to this method to override global
        configuration. Any other column names need to match their respective
        feature extractor functions, e.g. part-of-speech tags need to be placed
        in `postag` column. See `ftex.FeatureTemplate` for others.

        RECOMMENDED: use `utils.parse_tsv` to parse input data to avoid column
        configuration errors.

        NOTE: Due to the way `pycrfsuite` works, the crfsuite model needs to be
        dumped to disk; the CRFSuiteTagger model itself, however, does not need
        to be dumped. That process is controlled through the `dump` parameter.

        :param data: training data
        :type data: np.recarray or str
        :param form_col: fc column name
        :type form_col: str
        :param lbl_col: label column name
        :type lbl_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param data_cols: list of columns in the data
        :type data_cols: str
        :param data_sep: data tab separator
        :type data_sep: str
        :param dump: dumps the model at specified location if True
        :type dump: bool
        """

        # overriding parameters
        fc = form_col if form_col else self.form_col
        c = data_cols if data_cols else self.cols
        sep = data_sep if data_sep else self.ts
        lc = lbl_col if lbl_col else self.lbl_col
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        if type(data) in [np.core.records.recarray, np.ndarray]:
            d = data
        elif type(data) == str:
            d = parse_tsv(s=data, cols=c, ts=sep, inference_col=ilc)
        elif data is None:
            d = self.train_data
        else:
            raise ValueError('Invalid input type.')

        # extract features
        X = self._extract_features(d, fc)

        # extract labels
        y = gsequences(d, [lc])

        trainer = Trainer(verbose=self.verbose)

        # setting CRFSuite parameters
        trainer.set_params(self.cfg_crf)

        for x_seq, y_seq in zip(X, y):
            trainer.append(x_seq, [l[0] for l in y_seq])

        crfs_mp = '%s.crfs' % self.model_path
        try:
            makedirs(dirname(crfs_mp))
        except OSError:
            pass
        trainer.train(crfs_mp)

        self.tagger = Tagger()
        self.tagger.open(crfs_mp)

        # dumps the model
        if dump:
            self.dump_model(self.model_path)
            pickle.dump(self.cfg, open('%s.cfg.pcl' % self.model_path, 'wb'))
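The docstring above describes two ways of passing training data to this method: a NumPy recarray, or a TSV/CSV-formatted string whose column names and separator are given separately. The sketch below illustrates the string variant. It is only a minimal, hypothetical usage sketch: the class name CRFSuiteTagger, its constructor arguments, and the column names are assumptions made for illustration and are not taken from the excerpt.

# Hypothetical usage of the train() method above (string-data variant).
# CRFSuiteTagger, its constructor argument, and the column names are
# illustrative assumptions, not part of the excerpt.
tsv_data = (
    "John\tNNP\tB-PER\n"
    "smiled\tVBD\tO\n"
)

tagger = CRFSuiteTagger(model_path='models/example')  # assumed constructor
tagger.train(
    data=tsv_data,                   # TSV/CSV string variant of the `data` parameter
    data_cols='form,postag,label',   # column spec; exact format follows utils.parse_tsv
    data_sep='\t',                   # separator used in tsv_data
    form_col='form',                 # observation column name
    lbl_col='label',                 # label column name
    dump=True,                       # also dump the CRFSuiteTagger model to disk
)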