def load_model(model_path, _log, _run):
    """Load a CRF tagger from *model_path* and register it as a Sacred resource."""
    _log.info('Loading model from %s', model_path)
    crf_tagger = Tagger()
    crf_tagger.open(model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return crf_tagger
def train(self, training_data, classifier_path="classifier/cache/label_crf_classifier", c1=0, c2=10, period=300, minfreq=5):
    """Train a CRF labeler on *training_data*, save it to *classifier_path*,
    and open the trained model for tagging.

    :param training_data: raw data handed to ``self.preprocess``
    :param classifier_path: where the trained model file is written
    :param c1: L1 regularization coefficient
    :param c2: L2 regularization coefficient
    :param period: ``period`` training parameter
    :param minfreq: minimum feature frequency (``feature.minfreq``)
    """
    self.preprocess(training_data)
    trainer = Trainer()
    # self.x / self.y are filled by preprocess(): parallel feature/label seqs
    for seq_idx, feature_seq in enumerate(self.x):
        trainer.append(ItemSequence(feature_seq), self.y[seq_idx])
    trainer.set_params({
        "c1": c1,
        "c2": c2,
        "period": period,
        "feature.minfreq": minfreq,
        "max_iterations": 1000,
        # "calibration.eta": 0.05,
        # "calibration_samples": 400,
    })
    # trainer.select(algorithm="l2sgd")
    trainer.train(classifier_path)
    self.tagger = Tagger()
    self.tagger.open(classifier_path)
def test_open_inmemory_invalid():
    """open_inmemory must reject both empty and truncated model blobs."""
    tagger = Tagger()
    for bad_blob in (b'', b'lCRFabc'):
        with pytest.raises(ValueError):
            tagger.open_inmemory(bad_blob)
def tag(self, data, form_col=None, ilbl_col=None, tagger=None, cols=None, ts=None):
    """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

    See documentation for `train` for more details on requirements
    for the data passed to this method.

    :param data: data
    :type data: str or recarray
    :param form_col: form column name
    :type form_col: str
    :param ilbl_col: inference label column name
    :type ilbl_col: str
    :param tagger: CRFS tagger
    :type tagger: Tagger
    :param cols: TSV column names
    :type cols: str or list of str
    :param ts: tab separator for TSV
    :type ts: str
    :return: tagged data
    :rtype: recarray
    """
    # fall back to instance-level configuration for unset arguments
    fc = form_col if form_col else self.form_col
    c = cols if cols else self.cols
    sep = ts if ts else self.ts
    ilc = ilbl_col if ilbl_col else self.ilbl_col
    # isinstance replaces the old `type(data) in [np.core.records.recarray,
    # np.ndarray]` check: np.recarray subclasses np.ndarray (and
    # np.core.records is a deprecated access path), so one isinstance call
    # covers both, plus other ndarray subclasses.
    if isinstance(data, np.ndarray):
        d = data
    elif isinstance(data, str):
        d = parse_tsv(s=data, cols=c, ts=sep)
    else:
        raise ValueError('Invalid input type.')
    tgr = tagger
    if tgr is None and self.tagger:
        tgr = self.tagger
    elif tgr is None:
        # no tagger supplied or cached: open the model from disk
        tgr = Tagger()
        tgr.open('%s.crfs' % self.model_path)
    # extracting features
    X = self._extract_features(d, form_col=fc)
    # tagging sentences; `idx` is a running row index across all sentences
    idx = 0
    for fts in X:
        for label in tgr.tag(fts):
            d[idx][ilc] = label
            idx += 1
    return d
def load_tagger(model_path):
    """Return a Tagger with the CRFSUITE binary model at *model_path* loaded.

    :param str model_path: path to the binary model file.
    """
    loaded = Tagger()
    loaded.open(model_path)
    return loaded
class SentimentTagger:
    """Thin wrapper around a pycrfsuite Tagger for labeling tweet sequences."""

    def __init__(self):
        self.tagger = Tagger()

    def load_model(self, path):
        """Open a trained CRF model file at *path*."""
        self.tagger.open(path)

    def tag_tweets(self, tweet_features_list):
        """Return one predicted label per item in *tweet_features_list*."""
        return self.tagger.tag(ItemSequence(tweet_features_list))
def main(argv):
    """Train a CRF tagger on dialogues, tag the test set, and report accuracy.

    :param argv: ``[trainDir, testDir, outputFilePath]``
    """
    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]
    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))
    random.shuffle(trainData)
    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)
    trainer = Trainer()
    for dialogue in trainFeatures:
        trainer.append(dialogue[0], dialogue[1])
    trainer.set_params({
        'c1': 1.0,              # coefficient for L1 penalty
        'c2': 1e-3,             # coefficient for L2 penalty
        'max_iterations': 50,   # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    })
    trainer.train('./model.pkl')
    tagger = Tagger()
    tagger.open('./model.pkl')
    totalUtter = correctUtter = 0
    # `with` guarantees the output file is closed even if tagging fails
    # (the original relied on a manual close() at the end)
    with open(outputFPath, 'w') as outputFile:
        for dialogue in testFeatures:
            preds = tagger.tag(dialogue[0])
            labels = dialogue[1]
            for i, pred in enumerate(preds):
                outputFile.write(pred + '\n')
                # bound-check the gold labels: the original indexed
                # labels[i] after only checking len(labels) > 0, which
                # raised IndexError when preds outnumbered labels
                if i < len(labels):
                    totalUtter += 1
                    if labels[i] == pred:
                        correctUtter += 1
            outputFile.write('\n')
    if totalUtter > 0:
        accuracy = correctUtter / totalUtter
        print('Accuracy: ' + str(accuracy))
def predict_crf(reader, model_path, _log, _run):
    """Load a CRF model, extract features from *reader*, and tag them."""
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    crf = Tagger()
    crf.open(model_path)
    _log.info('Extracting features from test corpus')
    features = ItemSequence(
        [fs for sent in reader.sents() for fs in extract_crf_features(sent)]
    )
    _log.info('Making predictions with the model')
    return crf.tag(features)
def test_open_close_labels(model_filename, yseq):
    """labels() works only while a model is open."""
    tagger = Tagger()
    # not opened yet -> labels() must fail
    with pytest.raises(ValueError):
        tagger.labels()
    with tagger.open(model_filename):
        assert set(tagger.labels()) == set(yseq)
    # leaving the context closed the tagger -> labels() must fail again
    with pytest.raises(ValueError):
        tagger.labels()
class CRFchunk:
    """CRF-based chunker; loads the model for the requested corpus on init."""

    def __init__(self, corpus: str = "orchidpp"):
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        """Open the chunking model for *corpus* ("orchidpp" is the only known one)."""
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
            self.tagger.open(self.path)

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        """Return one chunk tag per (token, POS) pair."""
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
def _load_tagger(self):
    """Load the trained CRF model from disk and cache it on the instance.

    In pycrfsuite, you have to save the model first, then load it as a tagger.
    On failure the error is logged and the (unopened) tagger is still cached,
    matching the original best-effort behavior.
    """
    self.model_name = 'model_{}'.format(self.task_obj.unique_id)
    file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
    # Construct the Tagger outside the try: in the original, a failure in
    # Tagger() itself would leave `tagger` unbound and raise NameError at
    # the assignment below instead of being logged.
    tagger = Tagger()
    try:
        tagger.open(file_path)
    except Exception as e:
        print(e)
        self.error_logger.error(
            'Failed to load crf model from the filesystem.',
            exc_info=True,
            extra={'model_name': self.model_name, 'file_path': file_path})
    self.tagger = tagger
    return self.tagger
def _load_tagger(self):
    """Load the trained CRF model from disk and cache it on the instance.

    In pycrfsuite, you have to save the model first, then load it as a tagger.
    On failure the error is logged and the (unopened) tagger is still cached,
    matching the original best-effort behavior.
    """
    self.model_name = 'model_{}'.format(self.task_obj.unique_id)
    file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
    # Construct the Tagger outside the try: in the original, a failure in
    # Tagger() itself would leave `tagger` unbound and raise NameError at
    # the assignment below instead of being logged.
    tagger = Tagger()
    try:
        tagger.open(file_path)
    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(
            'Failed to load crf model from the filesystem.',
            exc_info=True,
            extra={'model_name': self.model_name, 'file_path': file_path})
    self.tagger = tagger
    return self.tagger
def test_tag_probability(model_filename, xseq, yseq):
    """The best tag sequence must be strictly more probable than a constant one."""
    with Tagger().open(model_filename) as tagger:
        best = tagger.tag(xseq)
        p_best = tagger.probability(best)
        p_const = tagger.probability([yseq[0]] * len(yseq))
        # both are proper probabilities, and the decoded path wins
        assert 0 < p_const < p_best < 1
def test_tag_formats(tmpdir, xseq, yseq):
    """Weight-1 dicts and plain key collections must tag identically."""
    model_filename = str(tmpdir.join('model.crfsuite'))
    # make all coefficients 1 and check that results are the same
    xseq = [{key: 1 for key in x} for x in xseq]
    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq
    # strings
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag([x.keys() for x in xseq]) == yseq
def test_tag_bools(model_filename, xseq, yseq):
    """Boolean feature values (True<=>1.0, False<=>0.0) tag the same as numbers."""
    with Tagger().open(model_filename) as tagger:
        converted = [
            {k: (bool(v) if v in (0, 1) else v) for k, v in x.items()}
            for x in xseq
        ]
        assert tagger.tag(converted) == yseq
class CRFsuiteEntityRecognizer:
    """Named-entity recognizer backed by a CRFsuite sequence model."""

    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        self.feature_extractor = feature_extractor
        self._encoder = encoder
        self.tagger = Tagger()

    @property
    def encoder(self) -> EntityEncoder:
        """The encoder that turns tokens into label sequences."""
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        """Train on *docs*, write the model to *path*, and load it for tagging."""
        trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for doc in docs:
            for sentence in doc.sents:
                sent_tokens = list(sentence)
                token_texts = [str(tok) for tok in sent_tokens]
                trainer.append(
                    self.feature_extractor.extract(token_texts),
                    self.encoder.encode(sent_tokens),
                )
        trainer.train(path)
        # drop any previously opened model before loading the new one
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        """Annotate *doc* in place with predicted entities and return it."""
        predicted_entities = []
        for sentence in doc.sents:
            sent_tokens = list(sentence)
            labels = self.predict_labels([str(tok) for tok in sent_tokens])
            predicted_entities.extend(decode_bilou(labels, sent_tokens, doc))
        doc.ents = predicted_entities
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        """Return one predicted label per token string."""
        return self.tagger.tag(self.feature_extractor.extract(tokens))
def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Set up either a pycrfsuite ("crf") or pystruct SSVM passage tagger.

    :param do_train: when True prepare a trainer; otherwise load a tagger.
    :param trained_model_name: model file (CRF) or trainer pickle (SSVM).
    :param algorithm: "crf" selects pycrfsuite; anything else ChainCRF+SSVM.
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
        if do_train:
            self.trainer = Trainer()
        else:
            self.tagger = Tagger()
    else:
        if do_train:
            model = ChainCRF()
            self.trainer = FrankWolfeSSVM(model=model)
            self.feat_index = {}
            self.label_index = {}
        else:
            # use context managers: the original `pickle.load(open(...))`
            # calls leaked the file handles
            with open(self.trained_model_name, "rb") as model_file:
                self.tagger = pickle.load(model_file)
            with open("ssvm_feat_index.pkl", "rb") as feat_file:
                self.feat_index = pickle.load(feat_file)
            with open("ssvm_label_index.pkl", "rb") as label_file:
                label_index = pickle.load(label_file)
            self.rev_label_index = {i: x for x, i in label_index.items()}
def crf_predict(
        tagger: pycrfsuite.Tagger,
        gp_data: list,
        mode: str = 'raw',
        exclude_labels: list = None
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file.

    3 modes for return:
            * Return raw predictions (raw)
            * Return predictions with only valid tags (exclude_ool)
            * Return predictions (valid tags) and probabilities for each class (rt_proba)

    :param tagger: an opened pycrfsuite Tagger
    :param gp_data: feature sequences, one per file
    :param mode: one of 'raw' | 'exclude_ool' | 'rt_proba'
    :param exclude_labels: labels dropped in the non-raw modes
        (defaults to ['NOL', 'NAT', 'NEE'])

    Predictions are returned unflattened

    https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html
    """
    # None sentinel instead of a mutable list default argument
    if exclude_labels is None:
        exclude_labels = ['NOL', 'NAT', 'NEE']
    if mode not in ['raw', 'exclude_ool', 'rt_proba']:
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")
    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]
    labels = tagger.labels()
    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        tagger.set(xseq)
        # per-token marginal probability for every label
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in range(len(xseq))]
            for label in labels
        })
        # prediction = most probable label among the non-excluded ones
        y_pred.append(file_proba[[
            col for col in file_proba.columns if col not in exclude_labels
        ]].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)
    if mode == 'rt_proba':
        return y_pred, pd.concat(res, axis=0)
    return y_pred  # else
def test_dump(tmpdir, model_filename):
    """dump() writes a readable model description and fails once closed."""
    dump_filename = str(tmpdir.join("dump.txt"))
    with Tagger().open(model_filename) as tagger:
        tagger.dump(dump_filename)
        with open(dump_filename, 'rb') as f:
            dumped = f.read().decode('utf8')
        assert 'LABELS = {' in dumped
        assert u'солнце:не светит --> rainy:' in dumped
    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.dump(dump_filename)
def gen(corpus=test, model='m.model', indir=INDIR, outdir=''):
    # Tag every document in `corpus` with the CRF model in `model` and write
    # the predictions out as XML task files mirroring the input layout.
    # NOTE(review): Python 2 code (`print>>f` statements); it will not run
    # under Python 3.
    tagger = Tagger()
    tagger.open(model)
    for doc in corpus.documents:
        # map the input path into `outdir`; skip documents that cannot be
        # re-homed there
        path = setup_newdir(doc.filepath, olddir=indir, newdir=outdir,
                            suffix='--', renew=True)
        if not path:
            continue
        mkparentdirs(path)
        # XML skeleton: a <task> root holding <tags> and <tokens> sections
        task = etree.Element(TASK_ROOT)
        tags = etree.Element(TAGS_ROOT)
        tokens = etree.Element(TOKENS_ROOT)
        task.append(tags)
        task.append(tokens)
        sents = doc.sentences
        seqs = doc.sequence_list()
        tagged_seqs = [tagger.tag(seq) for seq in seqs]
        # running per-label counter used to mint unique element ids
        freq_dict = defaultdict(int)
        for (sent, seq, tagged_seq) in zip(sents, seqs, tagged_seqs):
            s = etree.Element('s')
            for (lex, feat, label) in zip(sent.getchildren(), seq, tagged_seq):
                lex_tag = etree.Element(lex.tag, lex.attrib)
                lex_tag.text = lex.text
                s.append(lex_tag)
                if label != 'None':
                    # emit an ISO tag element for each non-None prediction,
                    # copying any label-specific default attributes
                    iso_tag = etree.Element(label)
                    if label in attribs:
                        for key in attribs[label]:
                            iso_tag.attrib[key] = attribs[label][key]
                    iso_tag.attrib['text'] = lex.text
                    iso_tag.attrib['id'] = ids[label] + str(freq_dict[label])
                    lex_tag.attrib['id'] = iso_tag.attrib['id']
                    freq_dict[label] += 1
                    tags.append(iso_tag)
            tokens.append(s)
        s = etree.tostring(task, pretty_print=True)
        with open(path, 'w') as f:
            print>>f, HEADER
            print>>f, s
def __init__(self, version: str = "1.5") -> None:
    """Thai named-entity recognizer.

    :param str version: Thai NER version; "1.4" and "1.5" are supported
        (default: "1.5").
    """
    if version == "1.4":
        model_path = get_corpus_path("thainer-1.4", version="1.4")
        tag_name = "orchid_ud"
    else:
        model_path = get_corpus_path(_CORPUS_NAME, version="1.5")
        tag_name = "lst20"
    self.crf = CRFTagger()
    self.crf.open(model_path)
    self.pos_tag_name = tag_name
def test_info(model_filename):
    """info() exposes transitions, state features, labels and attributes."""
    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        # staying sunny is more likely than switching to rainy
        assert info.transitions[('sunny', 'sunny')] > info.transitions[('sunny', 'rainy')]
        assert info.state_features[('walk', 'sunny')] > info.state_features[('walk', 'rainy')]
        assert (u'солнце:не светит', u'rainy') in info.state_features
        assert info.header['num_labels'] == '2'
        assert set(info.labels.keys()) == {'sunny', 'rainy'}
        assert set(info.attributes.keys()) == {'shop', 'walk', 'clean', u'солнце:не светит'}
    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.info()
def cal_confidence_score(sample, model_name):
    """Tag an unlabeled sample and compute a confidence score for the tagging.

    (Docstring translated from Chinese.)

    :param sample: sample object with `sentence` and `story_id` attributes
    :param model_name: path to the trained CRF model file
    :return: (story_id, sentence, predicted function points, CRF confidence)
    """
    model = Tagger()
    model.open(model_name)
    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    # words
    # words = sample.sen_words
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()
    # collect predicted function-point spans: B/I/E chars accumulate,
    # an N label terminates the current span
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label in ('B', 'I', 'E'):
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''
    # probability of the predicted tagging = CRF confidence
    crf_confidence = model.probability(predicted_labels)
    # (the original copied fp_list element-by-element into filtered_fp_list
    # without filtering anything, and computed an unused lan_confidence)
    predicted_fps = ' '.join(fp_list) if fp_list else 'null'
    # The sample info is returned alongside the result so that out-of-order
    # multiprocessing execution cannot misalign results and samples.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
def test_append_nested_dicts(tmpdir):
    """Nested feature dicts are flattened into ':'-joined attribute names."""
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": set(["x", "y"]),
        },
    }
    trainer = Trainer()
    trainer.append([first_item, second_item], ['first', 'second'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)
    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == {
            'foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y',
            'foo:ham:x', 'foo:ham:y', 'foo:bar:ham',
        }
        # features from the first item support 'first' and not 'second'
        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0
        # and vice versa for the second item
        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
def test_append_strstr_dicts(tmpdir):
    """str->str feature dicts become 'key:value' attributes."""
    xseq = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    trainer = Trainer()
    trainer.append(xseq, ['spam', 'egg', 'spam', 'spam'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)
    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == {'foo:bar', 'baz'}
        assert info.state_features[('foo:bar', 'spam')] > 0
class ThaiNameTagger:
    """Thai named-entity recognizer backed by a CRF model."""

    def __init__(self):
        """
        Thai named-entity recognizer.
        """
        self.crf = CRFTagger()
        self.crf.open(get_corpus_path(_CORPUS_NAME))

    def get_ner(
        self, text: str, pos: bool = True, tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        This function tags named-entity from text in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: To include POS tags in the results (`True`) or
                         exclude (`False`). The default value is `True`
        :param bool tag: output like html tag.
        :return: a list of tuple associated with tokenized word, NER tag,
                 POS tag (if the parameter `pos` is specified as `True`),
                 or output like html tag (if the parameter `tag` is
                 specified as `True`). Otherwise, return a list of tuple
                 associated with tokenized word and NER tag
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

        :Note:
            * For the POS tags to be included in the results, this function
              uses :func:`pythainlp.tag.pos_tag` with engine `perceptron`
              and corpus `orchid_ud`.

        :Example:
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'),
            (' ', 'I-TIME'), ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        #tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE, keep_whitespace=False)
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)
        # pair each token with its predicted NER label
        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]
        if tag:
            # render <TYPE>...</TYPE> spans from the IOB labels;
            # `temp` holds the currently open entity type ('' if none)
            temp = ""
            sent = ""
            for idx, (word, ner) in enumerate(sent_ner):
                if ner.startswith("B-") and temp != "":
                    # a new entity begins while another is open: close it first
                    sent += "</" + temp + ">"
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner.startswith("B-"):
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner == "O" and temp != "":
                    sent += "</" + temp + ">"
                    temp = ""
                sent += word
                if idx == len(sent_ner) - 1 and temp != "":
                    # close a still-open entity at the end of the text
                    sent += "</" + temp + ">"
            return sent
        if pos:
            return [
                (pos_tags[i][0], pos_tags[i][1], data)
                for i, data in enumerate(y)
            ]
        return sent_ner

    @staticmethod
    def __extract_features(doc):
        # one feature dict per token position
        return [_doc2features(doc, i) for i in range(len(doc))]
def __init__(self):
    """Thai named-entity recognizer: load the default NER CRF model."""
    model_path = get_corpus_path(_CORPUS_NAME)
    self.crf = CRFTagger()
    self.crf.open(model_path)
# YOUR CODE HERE ..... pass # -- Load data and crfsuite model and convert them------------------------- RECREATE = True # set to True to recreate flexcrf data with new model CRFSUITE_MODEL_FILE = '../conll2002/conll2002-esp.crfsuite' CRFSUITE_TEST_DATA_FILE = '../conll2002/conll2002-esp_crfsuite-test-data.dump' FLEXCRF_TEST_DATA_FILE = '../conll2002/conll2002-esp_flexcrf-test-data.dump' # crfsuite model tagger = Tagger() tagger.open(CRFSUITE_MODEL_FILE) model = tagger.info() data = pickle.load(open(CRFSUITE_TEST_DATA_FILE)) print "test data loaded." if RECREATE: dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3) pickle.dump({ 'dataset': dataset, 'thetas': thetas }, open(FLEXCRF_TEST_DATA_FILE, 'wb')) else: dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE)) dataset = dd['dataset']
def test(features: pd.Series) -> list:
    """Tag every feature sequence in *features* using the saved CRF model."""
    crf = Tagger()
    crf.open('crf.model')
    predictions = [crf.tag(seq) for seq in features]
    return predictions
def test_tag_item_sequence(model_filename, xseq, yseq):
    """Wrapping features in ItemSequence must not change the predictions."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(ItemSequence(xseq))
    assert predicted == yseq
def test_tag(model_filename, xseq, yseq):
    """An overfit model must reproduce the training labels exactly."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(xseq)
    assert predicted == yseq
def test_tag_not_opened(xseq):
    """Tagging before any model is opened must raise."""
    fresh_tagger = Tagger()
    with pytest.raises(Exception):
        fresh_tagger.tag(xseq)
class PassageTagger(object):
    """Sequence tagger for passages, backed by either pycrfsuite ("crf") or
    a pystruct FrankWolfeSSVM with a ChainCRF model.

    NOTE(review): Python 2 code (`print >>sys.stderr` statements); it will
    not run unmodified under Python 3.
    """

    def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
        # `trained_model_name` is a model file path for CRF, or a pickled
        # trainer for the SSVM backend.
        self.trained_model_name = trained_model_name
        self.fp = FeatureProcessing()
        self.do_train = do_train
        self.algorithm = algorithm
        if algorithm == "crf":
            if do_train:
                self.trainer = Trainer()
            else:
                self.tagger = Tagger()
        else:
            if do_train:
                model = ChainCRF()
                self.trainer = FrankWolfeSSVM(model=model)
                self.feat_index = {}
                self.label_index = {}
            else:
                # NOTE(review): these open() handles are never closed
                self.tagger = pickle.load(open(self.trained_model_name, "rb"))
                self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
                label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
                self.rev_label_index = {i: x for x, i in label_index.items()}

    def read_input(self, filename):
        # Parse a UTF-8 file of clauses, one per line; blank lines separate
        # sequences.  In training mode each line is "clause<TAB>label".
        # Returns per-sequence (clause strings, feature count dicts, labels).
        str_seqs = []
        str_seq = []
        feat_seqs = []
        feat_seq = []
        label_seqs = []
        label_seq = []
        for line in codecs.open(filename, "r", "utf-8"):
            lnstrp = line.strip()
            if lnstrp == "":
                if len(str_seq) != 0:
                    str_seqs.append(str_seq)
                    str_seq = []
                    feat_seqs.append(feat_seq)
                    feat_seq = []
                    label_seqs.append(label_seq)
                    label_seq = []
            else:
                if self.do_train:
                    clause, label = lnstrp.split("\t")
                    label_seq.append(label)
                else:
                    clause = lnstrp
                str_seq.append(clause)
                feats = self.fp.get_features(clause)
                # count feature occurrences within the clause
                feat_dict = {}
                for f in feats:
                    if f in feat_dict:
                        feat_dict[f] += 1
                    else:
                        feat_dict[f] = 1
                #feat_dict = {i: v for i, v in enumerate(feats)}
                feat_seq.append(feat_dict)
        if len(str_seq) != 0:
            # flush the final sequence when the file does not end blank
            str_seqs.append(str_seq)
            str_seq = []
            feat_seqs.append(feat_seq)
            feat_seq = []
            label_seqs.append(label_seq)
            label_seq = []
        return str_seqs, feat_seqs, label_seqs

    def predict(self, feat_seqs):
        # Predict one label sequence per feature-dict sequence.
        print >>sys.stderr, "Tagging %d sequences"%len(feat_seqs)
        if self.algorithm == "crf":
            self.tagger.open(self.trained_model_name)
            preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
        else:
            # densify feature dicts into vectors via feat_index; unseen
            # features are silently dropped
            Xs = []
            for fs in feat_seqs:
                X = []
                for feat_dict in fs:
                    x = [0] * len(self.feat_index)
                    for f in feat_dict:
                        if f in self.feat_index:
                            x[self.feat_index[f]] = feat_dict[f]
                    X.append(x)
                Xs.append(numpy.asarray(X))
            pred_ind_seqs = self.tagger.predict(Xs)
            # map predicted label indices back to label strings
            preds = []
            for ps in pred_ind_seqs:
                pred = []
                for pred_ind in ps:
                    pred.append(self.rev_label_index[pred_ind])
                preds.append(pred)
        return preds

    def train(self, feat_seqs, label_seqs):
        # Train on parallel feature/label sequences and persist the model.
        print >>sys.stderr, "Training on %d sequences"%len(feat_seqs)
        if self.algorithm == "crf":
            for feat_seq, label_seq in zip(feat_seqs, label_seqs):
                self.trainer.append(ItemSequence(feat_seq), label_seq)
            self.trainer.train(self.trained_model_name)
        else:
            # build the feature index over everything seen in training
            for fs in feat_seqs:
                for feat_dict in fs:
                    for f in feat_dict:
                        if f not in self.feat_index:
                            self.feat_index[f] = len(self.feat_index)
            Xs = []
            for fs in feat_seqs:
                X = []
                for feat_dict in fs:
                    x = [0] * len(self.feat_index)
                    for f in feat_dict:
                        x[self.feat_index[f]] = feat_dict[f]
                    X.append(x)
                Xs.append(numpy.asarray(X))
            # build the label index and integer label sequences
            for ls in label_seqs:
                for label in ls:
                    if label not in self.label_index:
                        self.label_index[label] = len(self.label_index)
            Ys = []
            for ls in label_seqs:
                Y = []
                for label in ls:
                    Y.append(self.label_index[label])
                Ys.append(numpy.asarray(Y))
            self.trainer.fit(Xs, Ys)
            # persist model and indices for later prediction runs
            pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
            pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
            pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
def test_tag_string_lists(model_filename, xseq, yseq):
    """Plain feature-name lists are accepted, but discarding the weights
    changes the predictions."""
    with Tagger().open(model_filename) as tagger:
        unweighted = [x.keys() for x in xseq]
        assert tagger.tag(unweighted) != yseq
def test_open_non_existing():
    """Opening a missing model file must raise IOError."""
    tagger = Tagger()
    with pytest.raises(IOError):
        tagger.open('foo')
def test_open_invalid_with_correct_signature(tmpdir):
    """A file carrying the lCRF magic but a bogus payload must be rejected."""
    bogus = tmpdir.join('tmp.txt')
    bogus.write(b"lCRFfoo"*100)
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(bogus))
def test_open_invalid_small(tmpdir):
    """A file far too small to be a model must be rejected."""
    bogus = tmpdir.join('tmp.txt')
    bogus.write(b'foo')
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(str(bogus))
def test_open_invalid():
    """Opening a non-model file (this source file) must raise ValueError."""
    invalid_tagger = Tagger()
    with pytest.raises(ValueError):
        invalid_tagger.open(__file__)