def load_model(model_path, _log, _run):
    """Open the CRF model stored at ``model_path`` and return the tagger.

    When sacred file observation is enabled, the model file is also
    registered as a resource on the current run.
    """
    _log.info('Loading model from %s', model_path)
    crf_tagger = Tagger()
    crf_tagger.open(model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return crf_tagger
def test_open_inmemory_invalid():
    """open_inmemory must reject blobs that are empty or truncated."""
    tagger = Tagger()
    for bad_blob in (b'', b'lCRFabc'):
        with pytest.raises(ValueError):
            tagger.open_inmemory(bad_blob)
def train(self, training_data, classifier_path="classifier/cache/label_crf_classifier", c1=0, c2=10, period=300, minfreq=5):
    """Train a CRFSuite model on ``training_data`` and load it as ``self.tagger``.

    Preprocessing populates ``self.x`` (feature sequences) and ``self.y``
    (label sequences); the fitted model is written to ``classifier_path``.
    """
    self.preprocess(training_data)
    trainer = Trainer()
    for seq_idx, feature_seq in enumerate(self.x):
        trainer.append(ItemSequence(feature_seq), self.y[seq_idx])
    # CRFSuite hyper-parameters.
    hyper_params = {
        "c1": c1,
        "c2": c2,
        "period": period,
        "feature.minfreq": minfreq,
        "max_iterations": 1000
        # "calibration.eta": 0.05,
        # "calibration_samples": 400,
    }
    # trainer.select(algorithm="l2sgd")
    trainer.set_params(hyper_params)
    trainer.train(classifier_path)
    self.tagger = Tagger()
    self.tagger.open(classifier_path)
def tag(self, data, form_col=None, ilbl_col=None, tagger=None, cols=None, ts=None):
    """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

    See documentation for `train` for more details on requirements for
    the data passed to this method.

    :param data: data
    :type data: str or recarray
    :param form_col: form column name
    :type form_col: str
    :param ilbl_col: inference label column name
    :type ilbl_col: str
    :param tagger: CRFS tagger
    :type tagger: Tagger
    :param cols: TSV column names
    :type cols: str or list of str
    :param ts: tab separator for TSV
    :type ts: str
    :return: tagged data
    :rtype: recarray
    """
    # Fall back to instance-level defaults for any argument not given.
    form_column = form_col if form_col else self.form_col
    column_names = cols if cols else self.cols
    separator = ts if ts else self.ts
    label_column = ilbl_col if ilbl_col else self.ilbl_col

    # Accept either an already-parsed record array or a raw TSV string.
    if type(data) in [np.core.records.recarray, np.ndarray]:
        records = data
    elif type(data) == str:
        records = parse_tsv(s=data, cols=column_names, ts=separator)
    else:
        raise ValueError('Invalid input type.')

    # Resolve the tagger: explicit argument, then the instance's tagger,
    # then a fresh one opened from the configured model path.
    active_tagger = tagger
    if active_tagger is None and self.tagger:
        active_tagger = self.tagger
    elif active_tagger is None:
        active_tagger = Tagger()
        active_tagger.open('%s.crfs' % self.model_path)

    # extracting features
    feature_seqs = self._extract_features(records, form_col=form_column)

    # Tag each sentence and write predictions back into the label column,
    # advancing a flat row index across sentence boundaries.
    row = 0
    for features in feature_seqs:
        for predicted_label in active_tagger.tag(features):
            records[row][label_column] = predicted_label
            row += 1
    return records
def test_tag_probability(model_filename, xseq, yseq):
    """The model's own prediction should outscore a constant-label sequence."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(xseq)
        p_predicted = tagger.probability(predicted)
        p_constant = tagger.probability([yseq[0]] * len(yseq))
        assert p_predicted > p_constant
        assert 0 < p_predicted < 1
        assert 0 < p_constant < 1
def test_tag_formats(tmpdir, xseq, yseq):
    # make all coefficients 1 and check that results are the same
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [{key: 1 for key in x} for x in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq

    # strings
    with Tagger().open(model_filename) as tagger:
        key_lists = [x.keys() for x in xseq]
        assert tagger.tag(key_lists) == yseq
def test_tag_bools(model_filename, xseq, yseq):
    """Boolean feature values map onto weights: True <=> 1.0, False <=> 0.0."""
    with Tagger().open(model_filename) as tagger:
        data = [
            {k: (bool(v) if v == 0 or v == 1 else v) for k, v in x.items()}
            for x in xseq
        ]
        assert tagger.tag(data) == yseq
def main(argv):
    """Train a CRF on dialogues from ``argv[0]``, tag dialogues from
    ``argv[1]``, write one prediction per line to ``argv[2]``, and print
    per-utterance accuracy.

    :param argv: [train_dir, test_dir, output_file_path]
    """
    input_dir = argv[0]
    test_dir = argv[1]
    output_path = argv[2]

    train_data = list(get_data(input_dir))
    test_data = list(get_data(test_dir))
    random.shuffle(train_data)

    # create features: each element is (feature_sequence, label_sequence)
    train_features = create_features(train_data)
    test_features = create_features(test_data)

    trainer = Trainer()
    for dialogue in train_features:
        trainer.append(dialogue[0], dialogue[1])
    trainer.set_params({
        'c1': 1.0,              # coefficient for L1 penalty
        'c2': 1e-3,             # coefficient for L2 penalty
        'max_iterations': 50,   # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('./model.pkl')

    tagger = Tagger()
    tagger.open('./model.pkl')

    total_utter = correct_utter = 0
    # `with` guarantees the output file is flushed/closed even on error
    # (the original leaked the handle on any exception).
    with open(output_path, 'w') as output_file:
        for dialogue in test_features:
            preds = tagger.tag(dialogue[0])
            labels = dialogue[1]
            for i, pred in enumerate(preds):
                output_file.write(pred + '\n')
                # BUGFIX: guard on i < len(labels) (was len(labels) > 0),
                # which raised IndexError whenever predictions outnumbered
                # gold labels for a dialogue.
                if i < len(labels):
                    total_utter += 1
                    if labels[i] == pred:
                        correct_utter += 1
            output_file.write('\n')

    if total_utter > 0:
        accuracy = correct_utter / total_utter
        print('Accuracy: ' + str(accuracy))
def test_dump(tmpdir, model_filename):
    """dump() writes a readable model description; fails on a closed tagger."""
    dump_filename = str(tmpdir.join("dump.txt"))
    with Tagger().open(model_filename) as tagger:
        tagger.dump(dump_filename)
        with open(dump_filename, 'rb') as f:
            dumped = f.read().decode('utf8')
        assert 'LABELS = {' in dumped
        assert u'солнце:не светит --> rainy:' in dumped

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.dump(dump_filename)
def predict_crf(reader, model_path, _log, _run):
    """Load a CRF model and tag every sentence yielded by ``reader``.

    Returns the flat list of predicted labels for the whole corpus.
    """
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    crf_tagger = Tagger()
    crf_tagger.open(model_path)

    _log.info('Extracting features from test corpus')
    all_features = [fs
                    for sent in reader.sents()
                    for fs in extract_crf_features(sent)]
    itemseq = ItemSequence(all_features)

    _log.info('Making predictions with the model')
    return crf_tagger.tag(itemseq)
def test_open_close_labels(model_filename, yseq):
    """labels() works only while a model is open; fails before and after."""
    tagger = Tagger()

    # tagger should be closed, so labels() method should fail here
    with pytest.raises(ValueError):
        tagger.labels()

    with tagger.open(model_filename):
        assert set(tagger.labels()) == set(yseq)

    # tagger should be closed, so labels() method should fail here
    with pytest.raises(ValueError):
        tagger.labels()
def _load_tagger(self):
    """Open this task's persisted CRF model from disk as a pycrfsuite Tagger.

    Stores the tagger on ``self.tagger`` and returns it. On failure the
    error is printed and logged; the (unopened) tagger is still assigned,
    matching the original best-effort behavior.
    """
    # In pycrfsuite, you have to save the model first, then load it as a tagger
    self.model_name = 'model_{}'.format(self.task_obj.unique_id)
    file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
    # BUGFIX: construct the Tagger outside the try block. Previously it was
    # created inside, so a constructor failure left `tagger` unbound and the
    # assignment below raised NameError, masking the logged error.
    tagger = Tagger()
    try:
        tagger.open(file_path)
    except Exception as e:
        print(e)
        self.error_logger.error('Failed to load crf model from the filesystem.',
                                exc_info=True, extra={
                                    'model_name': self.model_name,
                                    'file_path': file_path})
    self.tagger = tagger
    return self.tagger
def test_info(model_filename):
    """info() exposes transitions, state features, labels and attributes."""
    with Tagger().open(model_filename) as tagger:
        info = tagger.info()

        assert info.transitions[('sunny', 'sunny')] > info.transitions[('sunny', 'rainy')]
        assert info.state_features[('walk', 'sunny')] > info.state_features[('walk', 'rainy')]
        assert (u'солнце:не светит', u'rainy') in info.state_features
        assert info.header['num_labels'] == '2'
        assert set(info.labels.keys()) == {'sunny', 'rainy'}
        assert set(info.attributes.keys()) == {'shop', 'walk', 'clean', u'солнце:не светит'}

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.info()
def test_append_nested_dicts(tmpdir):
    """Nested feature dicts are flattened into colon-joined attribute names."""
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": set(["x", "y"]),
        },
    }
    trainer = Trainer()
    trainer.append([first_item, second_item], ['first', 'second'])

    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        expected_attrs = {
            'foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y',
            'foo:ham:x', 'foo:ham:y', 'foo:bar:ham',
        }
        assert set(info.attributes.keys()) == expected_attrs

        first_feats = ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']
        second_feats = ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']
        for feat in first_feats:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0
        for feat in second_feats:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
def cal_confidence_score(sample, model_name):
    """Tag an unlabelled sample with a CRF model and score the tagging.

    :param sample: sample object exposing ``story_id`` and ``sentence``
    :param model_name: path of the trained CRFSuite model file
    :return: (story_id, sentence, predicted feature points, confidence)
    """
    model = Tagger()
    model.open(model_name)

    # Unlabelled-sample features (window size 17, not in training mode —
    # semantics of build_model_features assumed; verify against its definition).
    feature_sequence = build_model_features(sample, 17, False)
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # Collect characters labelled B/I/E into feature-point strings; an 'N'
    # label closes the current span.
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label == 'B' or label == 'I' or label == 'E':
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''
    # BUGFIX: flush the trailing span. Previously a feature point ending at
    # the last character (no closing 'N') was silently dropped.
    if len(fp) > 0:
        fp_list.append(fp)

    # Confidence of the whole predicted label sequence under the CRF model.
    crf_confidence = model.probability(predicted_labels)

    # (Removed a no-op copy loop and the unused `lan_confidence` local.)
    predicted_fps = ' '.join(fp_list) if fp_list else 'null'

    # Sample info is returned alongside the prediction so that results from
    # out-of-order multiprocessing can be matched back to their samples.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
def test_append_strstr_dicts(tmpdir):
    """String-valued features become 'key:value' attributes; scalars keep the key."""
    xseq = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    yseq = ['spam', 'egg', 'spam', 'spam']

    trainer = Trainer()
    trainer.append(xseq, yseq)
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == {'foo:bar', 'baz'}
        assert info.state_features[('foo:bar', 'spam')] > 0
def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Set up either a CRFSuite or an SSVM (pystruct) sequence labeller.

    :param do_train: when True, prepare a trainer; otherwise load/prepare
        a tagger for inference
    :param trained_model_name: path of the persisted model (SSVM pickle,
        or the CRF model name used elsewhere)
    :param algorithm: "crf" for pycrfsuite, anything else for ChainCRF+SSVM
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
        if do_train:
            self.trainer = Trainer()
        else:
            self.tagger = Tagger()
    else:
        if do_train:
            model = ChainCRF()
            self.trainer = FrankWolfeSSVM(model=model)
            self.feat_index = {}
            self.label_index = {}
        else:
            # BUGFIX: use context managers so the pickle files are closed
            # deterministically (the inline open() calls leaked handles).
            # NOTE(review): pickle.load on these files assumes they are
            # trusted local artifacts — never load untrusted pickles.
            with open(self.trained_model_name, "rb") as model_file:
                self.tagger = pickle.load(model_file)
            with open("ssvm_feat_index.pkl", "rb") as feat_file:
                self.feat_index = pickle.load(feat_file)
            with open("ssvm_label_index.pkl", "rb") as label_file:
                label_index = pickle.load(label_file)
            self.rev_label_index = {i: x for x, i in label_index.items()}
def test_open_non_existing():
    """Opening a path that does not exist raises IOError."""
    missing_path = 'foo'
    tagger = Tagger()
    with pytest.raises(IOError):
        tagger.open(missing_path)
def __init__(self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder) -> None:
    """Keep the feature extractor and entity encoder; start with a fresh tagger."""
    self.tagger = Tagger()
    self.feature_extractor = feature_extractor
    self._encoder = encoder
def test_tag_string_lists(model_filename, xseq, yseq):
    # Working with lists is supported,
    # but if we discard weights the results become different
    keys_only = [x.keys() for x in xseq]
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(keys_only) != yseq
def test_tag_item_sequence(model_filename, xseq, yseq):
    """An ItemSequence input tags identically to the raw sequence."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(ItemSequence(xseq))
        assert predicted == yseq
def test_tag(model_filename, xseq, yseq):
    """The trained model reproduces the gold labels on the training sequence."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(xseq)
        assert predicted == yseq
def test_tag_not_opened(xseq):
    """Tagging before any model is opened must raise."""
    unopened = Tagger()
    with pytest.raises(Exception):
        unopened.tag(xseq)
def model(self) -> Tagger:
    """Open and return a Tagger for the model stored at ``self._model_path``."""
    crf = Tagger()
    crf.open(self._model_path)
    return crf
def test_open_inmemory(model_bytes, xseq, yseq):
    """A model loaded from bytes tags exactly like one loaded from disk."""
    with Tagger().open_inmemory(model_bytes) as tagger:
        predicted = tagger.tag(xseq)
        assert predicted == yseq
def test_open_invalid_with_correct_signature(tmpdir):
    """A file with the 'lCRF' magic but garbage content must be rejected."""
    bogus = tmpdir.join('tmp.txt')
    bogus.write(b"lCRFfoo" * 100)
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(bogus))
def test_open_invalid_small(tmpdir):
    """A file too small to hold a model header must be rejected."""
    tiny = tmpdir.join('tmp.txt')
    tiny.write(b'foo')
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(tiny))
# YOUR CODE HERE ..... pass # -- Load data and crfsuite model and convert them------------------------- RECREATE = True # set to True to recreate flexcrf data with new model CRFSUITE_MODEL_FILE = '../conll2002/conll2002-esp.crfsuite' CRFSUITE_TEST_DATA_FILE = '../conll2002/conll2002-esp_crfsuite-test-data.dump' FLEXCRF_TEST_DATA_FILE = '../conll2002/conll2002-esp_flexcrf-test-data.dump' # crfsuite model tagger = Tagger() tagger.open(CRFSUITE_MODEL_FILE) model = tagger.info() data = pickle.load(open(CRFSUITE_TEST_DATA_FILE)) print "test data loaded." if RECREATE: dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3) pickle.dump({ 'dataset': dataset, 'thetas': thetas }, open(FLEXCRF_TEST_DATA_FILE, 'wb')) else: dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE)) dataset = dd['dataset']
def test(features: pd.Series) -> list:
    """Tag every feature sequence in ``features`` with the saved CRF model."""
    tagger = Tagger()
    tagger.open('crf.model')
    predictions = []
    for xseq in features:
        predictions.append(tagger.tag(xseq))
    return predictions
def test_open_invalid():
    """Opening a file that is not a CRF model (this source file) must fail."""
    not_a_model = __file__
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(not_a_model)