def tag(self, data, form_col=None, ilbl_col=None, tagger=None, cols=None, ts=None):
    """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

    See documentation for `train` for more details on requirements for the
    data passed to this method.

    :param data: data to tag
    :type data: str or recarray
    :param form_col: form column name
    :type form_col: str
    :param ilbl_col: inference label column name
    :type ilbl_col: str
    :param tagger: CRFS tagger
    :type tagger: Tagger
    :param cols: TSV column names
    :type cols: str or list of str
    :param ts: tab separator for TSV
    :type ts: str
    :return: tagged data
    :rtype: recarray
    :raises ValueError: if *data* is neither a string nor a numpy array
    """
    # Fall back to instance-level defaults for any omitted per-call option.
    fc = form_col if form_col else self.form_col
    c = cols if cols else self.cols
    sep = ts if ts else self.ts
    ilc = ilbl_col if ilbl_col else self.ilbl_col

    # isinstance is preferred over exact type comparison; np.recarray is a
    # subclass of np.ndarray, so a single check covers both accepted types.
    if isinstance(data, np.ndarray):
        d = data
    elif isinstance(data, str):
        d = parse_tsv(s=data, cols=c, ts=sep)
    else:
        raise ValueError('Invalid input type.')

    # Prefer an explicitly supplied tagger, then the cached one, then open a
    # fresh tagger on the stored model path.
    tgr = tagger
    if tgr is None and self.tagger:
        tgr = self.tagger
    elif tgr is None:
        tgr = Tagger()
        tgr.open('%s.crfs' % self.model_path)

    # extracting features
    X = self._extract_features(d, form_col=fc)

    # tagging sentences: write each predicted label back into the inference
    # label column, advancing one flat row index across all sequences
    idx = 0
    for fts in X:
        for l in tgr.tag(fts):
            d[idx][ilc] = l
            idx += 1

    return d
class SentimentTagger:
    """Thin wrapper around a pycrfsuite Tagger for labelling tweet features."""

    def __init__(self):
        # Underlying CRFsuite tagger; a model must be loaded before tagging.
        self.tagger = Tagger()

    def load_model(self, path):
        """Open a trained CRFsuite model file located at *path*."""
        self.tagger.open(path)

    def tag_tweets(self, tweet_features_list):
        """Return the predicted label sequence for one tweet's features."""
        item_seq = ItemSequence(tweet_features_list)
        return self.tagger.tag(item_seq)
def main(argv):
    """Train a CRF on dialogues from argv[0], evaluate on dialogues from
    argv[1], write per-utterance predictions to argv[2], and print accuracy.

    :param argv: [train_dir, test_dir, output_file_path]
    """
    input_dir = argv[0]
    test_dir = argv[1]
    output_path = argv[2]

    train_data = list(get_data(input_dir))
    test_data = list(get_data(test_dir))
    random.shuffle(train_data)

    # create features: each element is a (feature_sequence, label_sequence)
    train_features = create_features(train_data)
    test_features = create_features(test_data)

    trainer = Trainer()
    for dialogue in train_features:
        trainer.append(dialogue[0], dialogue[1])
    trainer.set_params({
        'c1': 1.0,                # coefficient for L1 penalty
        'c2': 1e-3,               # coefficient for L2 penalty
        'max_iterations': 50,     # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('./model.pkl')

    tagger = Tagger()
    tagger.open('./model.pkl')

    total_utter = correct_utter = 0
    # Context manager guarantees the output file is closed even if an
    # exception is raised mid-loop (the original leaked the handle).
    with open(output_path, 'w') as output_file:
        for dialogue in test_features:
            preds = tagger.tag(dialogue[0])
            labels = dialogue[1]
            for i, pred in enumerate(preds):
                output_file.write(pred + '\n')
                if len(labels) > 0:
                    total_utter += 1
                    if labels[i] == pred:
                        correct_utter += 1
            # blank line separates dialogues in the output file
            output_file.write('\n')

    if total_utter > 0:
        accuracy = correct_utter / total_utter
        print('Accuracy: ' + str(accuracy))
def predict_crf(reader, model_path, _log, _run):
    """Load a trained CRF model and tag the whole test corpus at once."""
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    tagger = Tagger()
    tagger.open(model_path)

    _log.info('Extracting features from test corpus')
    # Flatten per-sentence features into one corpus-wide item sequence.
    all_features = []
    for sent in reader.sents():
        all_features.extend(extract_crf_features(sent))
    itemseq = ItemSequence(all_features)

    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
class CRFchunk:
    """CRF-based chunker backed by a pretrained corpus-specific model."""

    def __init__(self, corpus: str = "orchidpp"):
        # Remember which corpus the model belongs to and load it eagerly.
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        """Create the tagger and open the model file for *corpus*."""
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
            self.tagger.open(self.path)

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        """Return one chunk tag per (token, POS) pair in *token_pos*."""
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
def cal_confidence_score(sample, model_name):
    '''
    Tag an unlabeled sample with the CRF model and compute a confidence
    score for the predicted tagging.

    :param sample: sample object exposing `sentence` and `story_id`
    :param model_name: path to the trained CRFsuite model file
    :return: (story_id, sentence, predicted function points joined by
             spaces or 'null', CRF probability of the label sequence)
    '''
    model = Tagger()
    model.open(model_name)

    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # Collect predicted function points: characters labelled B/I/E extend
    # the current span; an 'N' label flushes a non-empty span.
    # NOTE(review): a span that runs to the end of the sentence (no trailing
    # 'N') is silently dropped — confirm this is intended.
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label in ('B', 'I', 'E'):
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''

    # calculate the probability of tagging
    crf_confidence = model.probability(predicted_labels)

    # Removed dead code: `lan_confidence = 0` was never used, and the
    # "filtered" list was an exact copy of fp_list.
    predicted_fps = ' '.join(fp_list) if fp_list else 'null'

    # 为防止多进程乱序执行导致结果跟sample不对应,因此同时返回sample信息
    # (return sample identity so out-of-order multiprocessing results can
    # still be matched back to their sample)
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
class CRFsuiteEntityRecognizer:
    """Entity recognizer backed by CRFsuite.

    Trains on encoded documents and decodes BILOU label sequences back into
    entity spans attached to each document.
    """

    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        self.feature_extractor = feature_extractor
        self._encoder = encoder
        self.tagger = Tagger()

    @property
    def encoder(self) -> EntityEncoder:
        """The entity encoder used to produce training labels."""
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        """Fit a CRF on *docs*, write it to *path*, and load it for tagging."""
        trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for doc in docs:
            for sentence in doc.sents:
                tokens = list(sentence)
                token_texts = [str(token) for token in tokens]
                feats = self.feature_extractor.extract(token_texts)
                trainer.append(feats, self.encoder.encode(tokens))
        trainer.train(path)
        # Re-open the tagger on the freshly written model file.
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        """Predict labels sentence by sentence and attach entities to *doc*."""
        found = []
        for sentence in doc.sents:
            tokens = list(sentence)
            labels = self.predict_labels([str(token) for token in tokens])
            found.extend(decode_bilou(labels, tokens, doc))
        doc.ents = found
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        """Return one CRF label per token string."""
        return self.tagger.tag(self.feature_extractor.extract(tokens))
def crf_predict(
    tagger: pycrfsuite.Tagger,
    gp_data: list,
    mode: str = 'raw',
    exclude_labels: tuple = ('NOL', 'NAT', 'NEE')
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file.

    3 modes for return:
        * Return raw predictions (raw)
        * Return predictions with only valid tags (exclude_ool)
        * Return predictions (valid tags) and probabilities for each class
          (rt_proba)

    Predictions are returned unflattened.

    https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html

    :param tagger: an opened pycrfsuite tagger
    :param gp_data: list of feature sequences, one per file
    :param mode: one of 'raw' | 'exclude_ool' | 'rt_proba'
    :param exclude_labels: labels never selected as a prediction
        (was a mutable list default — now an immutable tuple; callers may
        still pass a list)
    :raises ValueError: if *mode* is not a recognized value
    """
    if mode not in ('raw', 'exclude_ool', 'rt_proba'):
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")
    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]

    labels = tagger.labels()
    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        tagger.set(xseq)
        # Marginal probability of every label at every position in the file.
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in range(len(xseq))]
            for label in labels
        })
        # Prediction = most probable label among the non-excluded columns.
        valid_cols = [
            col for col in file_proba.columns if col not in exclude_labels
        ]
        y_pred.append(file_proba[valid_cols].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)

    if mode == 'rt_proba':
        return y_pred, pd.concat(res, axis=0)
    return y_pred  # mode == 'exclude_ool'
def gen(corpus=test, model='m.model', indir=INDIR, outdir=''):
    # Tag every document in *corpus* with the CRF model and write one XML
    # task file (TAGS + TOKENS sections) per document under *outdir*.
    # NOTE: Python 2 print-chevron syntax (`print>>f`) below — this function
    # targets Python 2.
    tagger = Tagger()
    tagger.open(model)
    for doc in corpus.documents:
        # Map the document's path from *indir* into *outdir*; skip documents
        # that produce no target path.
        path = setup_newdir(doc.filepath, olddir=indir, newdir=outdir,
                            suffix='--', renew=True)
        if not path:
            continue
        mkparentdirs(path)
        # Skeleton of the output XML: <TASK> containing <TAGS> and <TOKENS>.
        task = etree.Element(TASK_ROOT)
        tags = etree.Element(TAGS_ROOT)
        tokens = etree.Element(TOKENS_ROOT)
        task.append(tags)
        task.append(tokens)
        sents = doc.sentences
        seqs = doc.sequence_list()
        # One predicted label sequence per feature sequence.
        tagged_seqs = [tagger.tag(seq) for seq in seqs]
        # Per-label counters used to build unique ids like "<prefix>0", "<prefix>1", ...
        freq_dict = defaultdict(int)
        for (sent, seq, tagged_seq) in zip(sents, seqs, tagged_seqs):
            s = etree.Element('s')
            for (lex, feat, label) in zip(sent.getchildren(), seq, tagged_seq):
                # Copy the lexical token element verbatim into <TOKENS>.
                lex_tag = etree.Element(lex.tag, lex.attrib)
                lex_tag.text = lex.text
                s.append(lex_tag)
                if label != 'None':
                    # Emit an ISO tag element for each non-"None" prediction,
                    # carrying any default attributes configured for the label.
                    iso_tag = etree.Element(label)
                    if label in attribs:
                        for key in attribs[label]:
                            iso_tag.attrib[key] = attribs[label][key]
                    iso_tag.attrib['text'] = lex.text
                    iso_tag.attrib['id'] = ids[label] + str(freq_dict[label])
                    # Cross-link the token to its tag via the shared id.
                    lex_tag.attrib['id'] = iso_tag.attrib['id']
                    freq_dict[label] += 1
                    tags.append(iso_tag)
            tokens.append(s)
        s = etree.tostring(task, pretty_print=True)
        with open(path, 'w') as f:
            print>>f, HEADER
            print>>f, s
def test_tag_not_opened(xseq):
    """A Tagger with no model loaded must refuse to tag."""
    unopened = Tagger()
    with pytest.raises(Exception):
        unopened.tag(xseq)
def test(features: pd.Series) -> list:
    """Tag every feature sequence in *features* with the saved CRF model."""
    crf_tagger = Tagger()
    crf_tagger.open('crf.model')
    return [crf_tagger.tag(sequence) for sequence in features]
def evaluate_model_by_story(model_name, test_samples):
    '''
    Evaluate a trained CRF model with precision/recall/F1 aggregated per
    story: all samples belonging to a story are pooled into sets of actual
    and predicted function points before scoring.

    :param model_name: path to the trained CRFsuite model file
    :param test_samples: samples exposing `sentence`, `story_id` and `fps`
    :return: (precision, recall, f1) over all stories
    '''
    model = Tagger()
    model.open(model_name)

    # story_id -> [set of actual fps, set of predicted fps]
    story_fps = dict()
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()
        chars = list(sample.sentence)
        predicted_fps = []
        fp = ''
        # BIES decoding: B/I extend the current span, E/S close it.
        for index, word in enumerate(predicted_labels):
            if word == 'E' or word == 'S':
                fp += chars[index]
                predicted_fps.append(fp)
                fp = ''
            if word == 'B' or word == 'I':
                fp += chars[index]
        # Keep only ground-truth fps that actually occur in the sentence.
        actual_fps = [fp for fp in sample.fps
                      if fp != '' and fp != 'null' and fp in sample.sentence]
        filtered_predicted_fps = predicted_fps
        if sample.story_id not in story_fps:
            story_fps[sample.story_id] = [set(actual_fps),
                                          set(filtered_predicted_fps)]
        else:
            story_fps[sample.story_id][0].update(actual_fps)
            story_fps[sample.story_id][1].update(filtered_predicted_fps)

    global sim_t
    sim_threshold = sim_t
    TP_precision = 0
    TP_recall = 0
    all_actual_fps = 0
    all_predicted_fps = 0
    for story_id, (actual_fps, predicted_fps) in story_fps.items():
        story_precision = 0.0
        story_recall = 0.0
        all_actual_fps += len(actual_fps)
        all_predicted_fps += len(predicted_fps)
        story = samples_dao.read_story_by_story_id(int(story_id))
        data = [story_id,
                story[0] if story is not None else '',
                story[1] if story is not None else '',
                story[2] if story is not None else '',
                story[3] if story is not None else '',
                story[4] if story is not None else '',
                actual_fps, predicted_fps]
        with open('../Archive/date_performance/resultsIterRes_by_story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
        # A predicted fp is a precision hit when it is similar enough to at
        # least one actual fp (normalized Levenshtein similarity).
        for predicted_fp in predicted_fps:
            sim = []
            for actual_fp in actual_fps:
                similarity = 1 - distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                sim.append(similarity)
            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_precision += 1
                story_precision += 1
        # Symmetrically, an actual fp is a recall hit when some predicted fp
        # is similar enough to it.
        for actual_fp in actual_fps:
            sim = []
            for predicted_fp in predicted_fps:
                similarity = 1 - distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                sim.append(similarity)
            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_recall += 1
                story_recall += 1
        # 每个故事的详情 (per-story details)
        # BUG FIX: the precision denominator previously used
        # `filtered_predicted_fps`, which at this point still holds the LAST
        # sample's predictions from the first loop, not this story's pooled
        # set — it must be the current story's `predicted_fps`.
        story_precision = 0 if len(predicted_fps) == 0 else story_precision / len(predicted_fps)
        story_recall = 0 if len(actual_fps) == 0 else story_recall / len(actual_fps)
        data = ["STORY " + story_id, story_precision, story_recall]
        with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
    with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["THE END!!!"])
    # 整体的详情 (corpus-level metrics)
    precision = TP_precision / all_predicted_fps
    recall = TP_recall / all_actual_fps
    f1 = 2 * precision * recall / (precision + recall)
    print("By Story: Iteration: %s\n\tPrecision: %f\n\tRecall: %f\n\tF1: %f\n\n\n" % (
        model_name.split('_')[2], precision, recall, f1))
    data = ["BY STORY: Iteration " + model_name.split('_')[2], precision, recall, f1]
    with open('../Archive/date_performance/results/IterRes_by_story.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)
    return precision, recall, f1
def evaluate_model(model_name, test_samples):
    '''
    Evaluate the final trained model: write per-sentence test details and
    aggregate metrics to CSV files, and print the averaged scores.

    :param test_samples: samples exposing `sentence` and `char_label`
    :param model_name: path to the trained CRFsuite model file
    :return: (accuracy, recall, f1) averaged over test_samples
    '''
    model = Tagger()
    model.open(model_name)
    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    # Removed a dead `iteration_test_details = []` initialization here — the
    # list is rebuilt from scratch inside the loop for every sample.
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()
        true_labels = sample.char_label

        # Binarize the label sequences: 'N' (outside) -> 0, anything else -> 1.
        predicted_label_index = [0 if lbl == 'N' else 1 for lbl in predicted_labels]
        true_label_index = [0 if lbl == 'N' else 1 for lbl in true_labels]

        # Reconstruct predicted/actual fp character strings for the CSV log.
        iteration_test_details = []
        chars = list(sample.sentence)
        iteration_test_details.append(sample.sentence)
        predicted_fps = ''
        actual_fps = ''
        for index, word in enumerate(predicted_labels):
            if word != 'N':
                predicted_fps += chars[index]
        if len(predicted_fps) == 0:
            predicted_fps = '-----'
        for index, word in enumerate(true_labels):
            if word != 'N':
                actual_fps += chars[index]
        iteration_test_details.append(actual_fps)
        iteration_test_details.append(predicted_fps)
        with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(iteration_test_details)

        accuracy += metrics.accuracy_score(true_label_index, predicted_label_index)
        recall += metrics.recall_score(true_label_index, predicted_label_index,
                                       average='binary', pos_label=1)
        # NOTE(review): f1 is accumulated from the *running sums* of accuracy
        # and recall, which is mathematically unusual — preserved as-is.
        # BUG FIX: guard the division; it raised ZeroDivisionError when both
        # running sums were still 0.
        if accuracy + recall > 0:
            f1 += 2 * accuracy * recall / (accuracy + recall)

    print("Iteration: %s\n\tAccuracy: %f\n\tRecall: %f\n\tF1: %f\n\n\n" % (
        model_name.split('_')[2], accuracy / len(test_samples),
        recall / len(test_samples), f1 / len(test_samples)))
    data = ["Iteration " + model_name.split('_')[2],
            accuracy / len(test_samples),
            recall / len(test_samples),
            f1 / len(test_samples)]
    with open('../Archive/date_performance/results/IterRes.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)
    with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)
    return accuracy / len(test_samples), recall / len(test_samples), f1 / len(test_samples)
class PassageTagger(object):
    # Sequence tagger for passages with two interchangeable backends:
    #   * "crf"  -> CRFsuite (Trainer/Tagger)
    #   * other  -> pystruct ChainCRF trained with Frank-Wolfe SSVM, with
    #               pickled model and feature/label vocabularies.
    # NOTE: uses Python 2 print-chevron syntax (`print >>sys.stderr`), so
    # this class targets Python 2.
    def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
        # do_train selects trainer vs. tagger mode; algorithm selects backend.
        self.trained_model_name = trained_model_name
        self.fp = FeatureProcessing()
        self.do_train = do_train
        self.algorithm = algorithm
        if algorithm == "crf":
            if do_train:
                self.trainer = Trainer()
            else:
                self.tagger = Tagger()
        else:
            if do_train:
                model = ChainCRF()
                self.trainer = FrankWolfeSSVM(model=model)
                self.feat_index = {}
                self.label_index = {}
            else:
                # SSVM models and vocabularies were pickled by train();
                # rev_label_index maps label index -> label string.
                self.tagger = pickle.load(open(self.trained_model_name, "rb"))
                self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
                label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
                self.rev_label_index = {i: x for x, i in label_index.items()}

    def read_input(self, filename):
        # Read a UTF-8 file of clause-per-line sequences separated by blank
        # lines; in training mode each line is "clause<TAB>label".
        # Returns (str_seqs, feat_seqs, label_seqs); label sequences stay
        # empty when not training.
        str_seqs = []
        str_seq = []
        feat_seqs = []
        feat_seq = []
        label_seqs = []
        label_seq = []
        for line in codecs.open(filename, "r", "utf-8"):
            lnstrp = line.strip()
            if lnstrp == "":
                # Blank line terminates the current sequence.
                if len(str_seq) != 0:
                    str_seqs.append(str_seq)
                    str_seq = []
                    feat_seqs.append(feat_seq)
                    feat_seq = []
                    label_seqs.append(label_seq)
                    label_seq = []
            else:
                if self.do_train:
                    clause, label = lnstrp.split("\t")
                    label_seq.append(label)
                else:
                    clause = lnstrp
                str_seq.append(clause)
                feats = self.fp.get_features(clause)
                # Bag-of-features counts for this clause.
                feat_dict = {}
                for f in feats:
                    if f in feat_dict:
                        feat_dict[f] += 1
                    else:
                        feat_dict[f] = 1
                #feat_dict = {i: v for i, v in enumerate(feats)}
                feat_seq.append(feat_dict)
        # Flush the final sequence if the file doesn't end with a blank line.
        if len(str_seq) != 0:
            str_seqs.append(str_seq)
            str_seq = []
            feat_seqs.append(feat_seq)
            feat_seq = []
            label_seqs.append(label_seq)
            label_seq = []
        return str_seqs, feat_seqs, label_seqs

    def predict(self, feat_seqs):
        # Predict one label sequence per feature sequence with the active
        # backend.
        print >>sys.stderr, "Tagging %d sequences"%len(feat_seqs)
        if self.algorithm == "crf":
            self.tagger.open(self.trained_model_name)
            preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
        else:
            # Vectorize each clause into a dense count row ordered by
            # feat_index; features unseen at training time are dropped.
            Xs = []
            for fs in feat_seqs:
                X = []
                for feat_dict in fs:
                    x = [0] * len(self.feat_index)
                    for f in feat_dict:
                        if f in self.feat_index:
                            x[self.feat_index[f]] = feat_dict[f]
                    X.append(x)
                Xs.append(numpy.asarray(X))
            pred_ind_seqs = self.tagger.predict(Xs)
            # Map predicted label indices back to label strings.
            preds = []
            for ps in pred_ind_seqs:
                pred = []
                for pred_ind in ps:
                    pred.append(self.rev_label_index[pred_ind])
                preds.append(pred)
        return preds

    def train(self, feat_seqs, label_seqs):
        # Train the active backend and persist the model to disk.
        print >>sys.stderr, "Training on %d sequences"%len(feat_seqs)
        if self.algorithm == "crf":
            for feat_seq, label_seq in zip(feat_seqs, label_seqs):
                self.trainer.append(ItemSequence(feat_seq), label_seq)
            self.trainer.train(self.trained_model_name)
        else:
            # First pass: build the feature vocabulary from training data.
            for fs in feat_seqs:
                for feat_dict in fs:
                    for f in feat_dict:
                        if f not in self.feat_index:
                            self.feat_index[f] = len(self.feat_index)
            # Second pass: vectorize with the now-complete vocabulary.
            Xs = []
            for fs in feat_seqs:
                X = []
                for feat_dict in fs:
                    x = [0] * len(self.feat_index)
                    for f in feat_dict:
                        x[self.feat_index[f]] = feat_dict[f]
                    X.append(x)
                Xs.append(numpy.asarray(X))
            # Build the label vocabulary and integer-encode label sequences.
            for ls in label_seqs:
                for label in ls:
                    if label not in self.label_index:
                        self.label_index[label] = len(self.label_index)
            Ys = []
            for ls in label_seqs:
                Y = []
                for label in ls:
                    Y.append(self.label_index[label])
                Ys.append(numpy.asarray(Y))
            self.trainer.fit(Xs, Ys)
            # Persist the SSVM model and vocabularies for later prediction.
            pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
            pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
            pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
# Load (or recreate) flexcrf-format test data, then compare crfsuite and
# flexcrf classification sequence by sequence.
# NOTE: Python 2 code (print statements below).
if RECREATE:
    dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3)
    pickle.dump({
        'dataset': dataset,
        'thetas': thetas
    }, open(FLEXCRF_TEST_DATA_FILE, 'wb'))
else:
    # Cached copy: reuse previously converted data and parameters.
    dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE))
    dataset = dd['dataset']
    thetas = dd['thetas']

# -- Start classification ------------------------------------------------

for seq in range(len(dataset)):

    # -- with crfsuite: tag the raw feature sequence and map string labels
    #    to the model's integer label ids.
    s_ = tagger.tag(data['X'][seq])
    y_ = np.array([int(model.labels[s]) for s in s_])
    prob_ = tagger.probability(s_)

    print "\n-- With crfsuite:"
    print "labels:\n", s_, "\n", y_
    print "probability:\t %f" % prob_

    # -- with flexcrf: compute clique potentials and Viterbi-decode.
    f_xy, y = dataset[seq]
    theta = thetas[seq]

    m_xy, f_m_xy = _compute_all_potentials(f_xy, theta)
    y_pred = viterbi_decoder(m_xy)
class label_crf_classifier(object):
    """Conditional Random Field model for labelling resume sections.

    NOTE: Python 2 code (`.decode` is called on the stop-word file's str
    contents).
    """

    def __init__(self, stopword_path="data/stop_words.txt"):
        # Stop words: one per line, UTF-8; undecodable bytes are ignored.
        self.stoplist = set(
            open(stopword_path).read().decode("utf8", "ignore").split("\n"))

    def preprocess(self, training_data):
        # Drop rows with an empty section label, remove stop words from the
        # CRF features, then group features and labels per resume_id.
        self.training_df = training_data[training_data["section_label"] != ""]
        self.training_df['crf_feature'] = self.training_df['crf_feature'].map(
            lambda x: [y for y in x if y not in self.stoplist])
        self.x = self.training_df.groupby("resume_id")["crf_feature"].apply(
            list)
        self.y = self.training_df.groupby("resume_id")["section_label"].apply(
            list)

    def train(self,
              training_data,
              classifier_path="classifier/cache/label_crf_classifier",
              c1=0,
              c2=10,
              period=300,
              minfreq=5):
        # Train a CRF on the preprocessed data, write the model to
        # classifier_path, and open it for tagging.
        self.preprocess(training_data)
        train = Trainer()
        for i1, i in enumerate(self.x):
            train.append(ItemSequence(i), self.y[i1])
        params = {
            "c1": c1,  # L1 penalty coefficient
            "c2": c2,  # L2 penalty coefficient
            "period": period,
            "feature.minfreq": minfreq,
            "max_iterations": 1000
            # "calibration.eta": 0.05,
            # "calibration_samples": 400,
        }
        # train.select(algorithm = "l2sgd")
        train.set_params(params)
        train.train(classifier_path)
        self.tagger = Tagger()
        self.tagger.open(classifier_path)

    def save_classifier(self,
                        classifier_path="classifier/cache/label_crf_classifier"
                        ):
        # Intentionally a no-op: Trainer.train already persists the model.
        pass

    def load_classifier(self,
                        classifier_path="classifier/cache/label_crf_classifier"
                        ):
        # Open a previously trained model from classifier_path.
        self.tagger = Tagger()
        self.tagger.open(classifier_path)

    def predict(self, test_data):
        """Input: x should be a list of strings"""
        # Tags test_data["crf_feature"] and writes the predicted labels into
        # test_data["section_label"]; returns the mutated test_data.
        result = self.tagger.tag(ItemSequence(test_data["crf_feature"]))
        test_data["section_label"] = result
        return test_data

    # def score(self, training_data, classifier_path="classifier/cache/label_crf_classifier", portion=0.8, c1=0, c2=10, period=300, minfreq=10):
    #     # split resume_id
    #     resume_ids = np.unique([resume['resume_id'] for resume in training_data])
    #     length = len(resume_ids)
    #     shuffle(resume_ids)
    #     train_ids = resume_ids[:int(length*portion)]
    #     test_ids = resume_ids[int(length*portion):]
    #     train_df = [resume for resume in training_data if resume['resume_id'] in train_ids]
    #     test_df = [resume for resume in training_data if resume['resume_id'] in test_ids]
    #     # train model on train_ids
    #     self.train(train_df, classifier_path=classifier_path, c1=c1, c2=c2, period=period, minfreq=minfreq)
    #     test_pred = self.predict_all(test_df)
    #     train_pred = self.predict_all(train_df)
    #     # print out result
    #     return train_pred, test_pred


# if __name__ == "__main__":
#     data = MongoRetriveData()
#     resumes = data.get_data_mongo()
#     # pickle.dump(resumes, open('./resume_data.pkl', 'wb'))
#     # resumes = pickle.load(open('./resume_data.pkl', 'rb'))
#     stopword_path = './stopword.txt'
#     model_path = './model.txt'
#     resume_data = resumes
#     clf = Crf(stopword_path, model_path, resume_data)
#     clf.CleanData()
#     clf.Fit()
#     clf.Score()
#     # result = clf.Predict(clf.data)
#     # print result
class ThaiNameTagger:
    def __init__(self):
        """
        Thai named-entity recognizer.
        """
        # CRF model for Thai NER, loaded from the bundled corpus path.
        self.crf = CRFTagger()
        self.crf.open(get_corpus_path(_CORPUS_NAME))

    def get_ner(
        self, text: str, pos: bool = True, tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        This function tags named-entitiy from text in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: To include POS tags in the results (`True`) or
                            exclude (`False`). The defualt value is `True`
        :param bool tag: output like html tag.
        :return: a list of tuple associated with tokenized word, NER tag,
                 POS tag (if the parameter `pos` is specified as `True`),
                 and output like html tag (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuple associated with tokenized
                 word and NER tag
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

        :Note:
            * For the POS tags to be included in the results, this function
              uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
              and corpus as orchid_ud`.

        :Example:

            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>>
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
            ('น.', 'NOUN', 'I-TIME')]
            >>>
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            pos=False)
            [('วันที่', 'O'), (' ', 'O'),
            ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
            ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
            ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'),
            ('49', 'I-TIME'), (' ', 'I-TIME'),
            ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        #tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE, keep_whitespace=False)
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)

        # Pair each surface token with its predicted IOB tag.
        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

        if tag:
            # Render IOB labels as inline tags, e.g. <DATE>...</DATE>.
            # `temp` tracks the currently open entity type ("" = none).
            temp = ""
            sent = ""
            for idx, (word, ner) in enumerate(sent_ner):
                if ner.startswith("B-") and temp != "":
                    # New entity begins while another is open: close it first.
                    sent += "</" + temp + ">"
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner.startswith("B-"):
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner == "O" and temp != "":
                    # Entity ended at this outside token.
                    sent += "</" + temp + ">"
                    temp = ""
                sent += word

                if idx == len(sent_ner) - 1 and temp != "":
                    # Close an entity that runs to the very end of the text.
                    sent += "</" + temp + ">"

            return sent

        if pos:
            # (token, POS, NER) triples.
            return [
                (pos_tags[i][0], pos_tags[i][1], data)
                for i, data in enumerate(y)
            ]

        # (token, NER) pairs.
        return sent_ner

    @staticmethod
    def __extract_features(doc):
        # One feature representation per token position in the document.
        return [_doc2features(doc, i) for i in range(len(doc))]
def test(features):
    """Tag each feature sequence with the trained model saved in crf.model."""
    print("Testing..")
    crf_tagger = Tagger()
    crf_tagger.open('crf.model')
    predictions = [crf_tagger.tag(sequence) for sequence in features]
    return predictions