Example #1
def load_model(model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    tagger = Tagger()
    tagger.open(model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return tagger
Example #2
def test_open_inmemory_invalid():
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open_inmemory(b'')

    with pytest.raises(ValueError):
        tagger.open_inmemory(b'lCRFabc')
Example #3
 def train(self,
           training_data,
           classifier_path="classifier/cache/label_crf_classifier",
           c1=0,
           c2=10,
           period=300,
           minfreq=5):
     self.preprocess(training_data)
     train = Trainer()
     for i1, i in enumerate(self.x):
         train.append(ItemSequence(i), self.y[i1])
     params = {
         "c1": c1,
         "c2": c2,
         "period": period,
         "feature.minfreq": minfreq,
         "max_iterations": 1000
         # "calibration.eta": 0.05,
         # "calibration_samples": 400,
     }
     # train.select(algorithm = "l2sgd")
     train.set_params(params)
     train.train(classifier_path)
     self.tagger = Tagger()
     self.tagger.open(classifier_path)
Example #4
    def tag(self,
            data,
            form_col=None,
            ilbl_col=None,
            tagger=None,
            cols=None,
            ts=None):
        """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

        See documentation for `train` for more details on requirements for the
        data passed to this method.

        :param data: data
        :type data: str or recarray
        :param form_col: form column name
        :type form_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param tagger: CRFS tagger
        :type tagger: Tagger
        :param cols: TSV column names
        :type cols: str or list of str
        :param ts: tab separator for TSV
        :type ts: str
        :return: tagged data
        :rtype: recarray
        """

        fc = form_col if form_col else self.form_col
        c = cols if cols else self.cols
        sep = ts if ts else self.ts
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        if type(data) in [np.core.records.recarray, np.ndarray]:
            d = data
        elif type(data) == str:
            d = parse_tsv(s=data, cols=c, ts=sep)
        else:
            raise ValueError('Invalid input type.')

        tgr = tagger

        if tgr is None and self.tagger:
            tgr = self.tagger
        elif tgr is None:
            tgr = Tagger()
            tgr.open('%s.crfs' % self.model_path)

        # extracting features
        X = self._extract_features(d, form_col=fc)

        # tagging sentences
        idx = 0
        for fts in X:
            for l in tgr.tag(fts):
                d[idx][ilc] = l
                idx += 1

        return d
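A minimal usage sketch of the same per-sentence tagging pattern, written against pycrfsuite directly rather than the wrapper method above; the feature dicts and model path are illustrative assumptions.

from pycrfsuite import Tagger

# One feature dict per token, one list per sentence (illustrative data).
sent_features = [
    [{'w': 'John'}, {'w': 'smiled'}],
    [{'w': 'It'}, {'w': 'rained'}],
]

tagger = Tagger()
tagger.open('models/ner.crfs')  # assumed path to a trained model
predictions = [tagger.tag(fts) for fts in sent_features]
tagger.close()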
Example #5
def test_tag_probability(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        res = tagger.tag(xseq)
        prob = tagger.probability(res)
        prob2 = tagger.probability([yseq[0]] * len(yseq))
        assert prob > prob2
        assert 0 < prob < 1
        assert 0 < prob2 < 1
Example #6
def test_tag_formats(tmpdir, xseq, yseq):
    # make all coefficients 1 and check that results are the same
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [dict((key, 1) for key in x) for x in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq

    # strings
    with Tagger().open(model_filename) as tagger:
        data = [x.keys() for x in xseq]
        assert tagger.tag(data) == yseq
Example #7
def test_tag_bools(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        # Some values are bools:
        # True <=> 1.0; False <=> 0.0
        data = [
            dict((k, bool(v) if v == 0 or v == 1 else v)
                 for (k, v) in x.items()) for x in xseq
        ]
        assert tagger.tag(data) == yseq
Example #8
def main(argv):

    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]

    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))

    random.shuffle(trainData)

    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)

    trainer = Trainer()
    for dialogue in trainFeatures:
        trainer.append(dialogue[0], dialogue[1])

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('./model.pkl')

    outputFile = open(outputFPath, 'w')
    tagger = Tagger()
    tagger.open('./model.pkl')

    totalUtter = correctUtter = 0
    for dialogue in testFeatures:
        preds = tagger.tag(dialogue[0])
        labels = dialogue[1]
        for i, pred in enumerate(preds):
            outputFile.write(pred + '\n')
            if len(labels) > 0:
                totalUtter += 1
                if labels[i] == pred:
                    correctUtter += 1
        outputFile.write('\n')

    if totalUtter > 0:
        accuracy = correctUtter / totalUtter
        print('Accuracy: ' + str(accuracy))
    outputFile.close()
Example #9
def test_dump(tmpdir, model_filename):
    with Tagger().open(model_filename) as tagger:
        dump_filename = str(tmpdir.join("dump.txt"))
        tagger.dump(dump_filename)

        with open(dump_filename, 'rb') as f:
            res = f.read().decode('utf8')
            assert 'LABELS = {' in res
            assert u'солнце:не светит --> rainy:' in res

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.dump(dump_filename)
Example #10
def predict_crf(reader, model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    tagger = Tagger()
    tagger.open(model_path)

    _log.info('Extracting features from test corpus')
    itemseq = ItemSequence(
        [fs for sent in reader.sents() for fs in extract_crf_features(sent)])

    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
Example #11
def test_open_close_labels(model_filename, yseq):
    tagger = Tagger()

    with pytest.raises(ValueError):
        # tagger should be closed, so labels() method should fail here
        labels = tagger.labels()

    with tagger.open(model_filename):
        labels = tagger.labels()
    assert set(labels) == set(yseq)

    with pytest.raises(ValueError):
        # tagger should be closed, so labels() method should fail here
        labels = tagger.labels()
Example #12
    def _load_tagger(self):
        # In pycrfsuite, you have to save the model first, then load it as a tagger
        self.model_name = 'model_{}'.format(self.task_obj.unique_id)
        file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
        try:
            tagger = Tagger()
            tagger.open(file_path)
        except Exception as e:
            print(e)
            self.error_logger.error('Failed to load crf model from the filesystem.', exc_info=True, extra={
                'model_name': self.model_name,
                'file_path': file_path})

        self.tagger = tagger
        return self.tagger
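As the comment in the example above says, a pycrfsuite model has to be written to disk by the Trainer before a Tagger can load it. A minimal sketch of that round trip, with an illustrative file name and toy feature dicts:

from pycrfsuite import Trainer, Tagger

trainer = Trainer()
trainer.append([{'w': 'hello'}, {'w': 'world'}], ['A', 'B'])
trainer.train('model.crfsuite')   # save the model first...

tagger = Tagger()
tagger.open('model.crfsuite')     # ...then load it as a tagger
print(tagger.tag([{'w': 'hello'}, {'w': 'world'}]))
tagger.close()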
Example #13
def test_info(model_filename):
    with Tagger().open(model_filename) as tagger:
        res = tagger.info()

        assert res.transitions[('sunny', 'sunny')] > res.transitions[('sunny', 'rainy')]
        assert res.state_features[('walk', 'sunny')] > res.state_features[('walk', 'rainy')]
        assert (u'солнце:не светит', u'rainy') in res.state_features
        assert res.header['num_labels'] == '2'
        assert set(res.labels.keys()) == set(['sunny', 'rainy'])
        assert set(res.attributes.keys()) == set(
            ['shop', 'walk', 'clean', u'солнце:не светит'])

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.info()
Example #14
def test_append_nested_dicts(tmpdir):
    trainer = Trainer()
    trainer.append([
        {
            "foo": {
                "bar": "baz",
                "spam": 0.5,
                "egg": ["x", "y"],
                "ham": {
                    "x": -0.5,
                    "y": -0.1
                }
            },
        },
        {
            "foo": {
                "bar": "ham",
                "spam": -0.5,
                "ham": set(["x", "y"])
            },
        },
    ], ['first', 'second'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set([
            'foo:bar:baz',
            'foo:spam',
            'foo:egg:x',
            'foo:egg:y',
            'foo:ham:x',
            'foo:ham:y',
            'foo:bar:ham',
        ])

        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0

        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
Example #15
def cal_confidence_score(sample, model_name):
    '''
    Tag an unlabelled sample, compute its confidence score, and return the result.
    :param sample: the sample object to tag
    :param model_name: path to the trained CRF model file
    :return: predicted function-point names and the tagging confidence
    '''
    model = Tagger()
    model.open(model_name)
    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    # words
    # words = sample.sen_words
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # get predicted_fps
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label == 'B' or label == 'I' or label == 'E':
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''

    # calculate the probability of tagging
    crf_confidence = model.probability(predicted_labels)

    lan_confidence = 0
    filtered_fp_list = []
    for fp_name in fp_list:
        filtered_fp_list.append(fp_name)

    if len(filtered_fp_list) == 0:
        predicted_fps = 'null'
    else:
        predicted_fps = ' '.join(filtered_fp_list)

    # print(str(sample.story_id) +' '+ sample.sentence +' '+ fp +' '+ str(confidence))
    # Return the sample info alongside the result so that, when multiprocessing
    # runs out of order, each result can still be matched to its sample.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
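The confidence value above comes from the set()/tag()/probability() pattern: the tagger scores how probable its own best labelling is for the currently-set item sequence. A minimal sketch of just that pattern, with an illustrative model path and feature sequence:

from pycrfsuite import Tagger

tagger = Tagger()
tagger.open('fp_model.crfsuite')        # assumed model file
tagger.set([{'c': 'x'}, {'c': 'y'}])    # set the item sequence once
best = tagger.tag()                     # tag the currently-set sequence
confidence = tagger.probability(best)   # probability of that labelling
tagger.close()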
Example #16
def test_append_strstr_dicts(tmpdir):
    trainer = Trainer()
    trainer.append([{
        'foo': 'bar'
    }, {
        'baz': False
    }, {
        'foo': 'bar',
        'baz': True
    }, {
        'baz': 0.2
    }], ['spam', 'egg', 'spam', 'spam'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set(['foo:bar', 'baz'])
        assert info.state_features[('foo:bar', 'spam')] > 0
Example #17
 def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
   self.trained_model_name = trained_model_name
   self.fp = FeatureProcessing()
   self.do_train = do_train
   self.algorithm = algorithm
   if algorithm == "crf":
     if do_train:
       self.trainer = Trainer()
     else:
       self.tagger = Tagger()
   else:
     if do_train:
       model = ChainCRF()
       self.trainer = FrankWolfeSSVM(model=model)
       self.feat_index = {}
       self.label_index = {}
     else:
       self.tagger = pickle.load(open(self.trained_model_name, "rb"))
       self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
       label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
       self.rev_label_index = {i: x for x, i in label_index.items()}
Example #18
def test_open_non_existing():
    tagger = Tagger()
    with pytest.raises(IOError):
        tagger.open('foo')
Example #19
 def __init__(self, feature_extractor: WindowedTokenFeatureExtractor,
              encoder: EntityEncoder) -> None:
     self.feature_extractor = feature_extractor
     self._encoder = encoder
     self.tagger = Tagger()
Example #20
def test_tag_string_lists(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        # Working with lists is supported,
        # but if we discard weights the results become different
        data = [x.keys() for x in xseq]
        assert tagger.tag(data) != yseq
Example #21
def test_tag_item_sequence(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(ItemSequence(xseq)) == yseq
Example #22
def test_tag(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq
Example #23
def test_tag_not_opened(xseq):
    tagger = Tagger()
    with pytest.raises(Exception):
        tagger.tag(xseq)
Example #24
 def model(self) -> Tagger:
     tagger = Tagger()
     tagger.open(self._model_path)
     return tagger
Example #25
def test_open_inmemory(model_bytes, xseq, yseq):
    with Tagger().open_inmemory(model_bytes) as tagger:
        assert tagger.tag(xseq) == yseq
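open_inmemory takes the raw bytes of a trained model instead of a file path. A minimal sketch of how such bytes might be obtained, assuming a trained model file named model.crfsuite already exists:

from pycrfsuite import Tagger

with open('model.crfsuite', 'rb') as f:
    model_bytes = f.read()

with Tagger().open_inmemory(model_bytes) as tagger:
    labels = tagger.labels()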
Example #26
def test_open_invalid_with_correct_signature(tmpdir):
    tmp = tmpdir.join('tmp.txt')
    tmp.write(b"lCRFfoo" * 100)
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(tmp))
Example #27
def test_open_invalid_small(tmpdir):
    tmp = tmpdir.join('tmp.txt')
    tmp.write(b'foo')
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(tmp))
Example #28
    # YOUR CODE HERE .....

    pass


# -- Load data and crfsuite model and convert them-------------------------

RECREATE = True  # set to True to recreate flexcrf data with new model

CRFSUITE_MODEL_FILE = '../conll2002/conll2002-esp.crfsuite'
CRFSUITE_TEST_DATA_FILE = '../conll2002/conll2002-esp_crfsuite-test-data.dump'
FLEXCRF_TEST_DATA_FILE = '../conll2002/conll2002-esp_flexcrf-test-data.dump'

# crfsuite model
tagger = Tagger()
tagger.open(CRFSUITE_MODEL_FILE)
model = tagger.info()

data = pickle.load(open(CRFSUITE_TEST_DATA_FILE, 'rb'))
print("test data loaded.")

if RECREATE:
    dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3)
    pickle.dump({
        'dataset': dataset,
        'thetas': thetas
    }, open(FLEXCRF_TEST_DATA_FILE, 'wb'))
else:
    dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE, 'rb'))
    dataset = dd['dataset']
Example #29
def test(features: pd.Series) -> list:
    tagger = Tagger()
    tagger.open('crf.model')
    y_pred = [tagger.tag(xseq) for xseq in features]
    return y_pred
Example #30
def test_open_invalid():
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(__file__)