Example #1
    def tag(self,
            data,
            form_col=None,
            ilbl_col=None,
            tagger=None,
            cols=None,
            ts=None):
        """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

        See documentation for `train` for more details on requirements for the
        data passed to this method.

        :param data: input data, either a TSV/CSV string or a recarray
        :type data: str or recarray
        :param form_col: name of the form (token) column
        :type form_col: str
        :param ilbl_col: name of the inference label column
        :type ilbl_col: str
        :param tagger: CRFSuite tagger
        :type tagger: Tagger
        :param cols: TSV column names
        :type cols: str or list of str
        :param ts: column separator for the TSV input
        :type ts: str
        :return: tagged data
        :rtype: recarray
        """

        fc = form_col if form_col else self.form_col
        c = cols if cols else self.cols
        sep = ts if ts else self.ts
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        if isinstance(data, (np.core.records.recarray, np.ndarray)):
            d = data
        elif isinstance(data, str):
            d = parse_tsv(s=data, cols=c, ts=sep)
        else:
            raise ValueError('Invalid input type.')

        tgr = tagger

        if tgr is None and self.tagger:
            tgr = self.tagger
        elif tgr is None:
            tgr = Tagger()
            tgr.open('%s.crfs' % self.model_path)

        # extracting features
        X = self._extract_features(d, form_col=fc)

        # tagging sentences
        idx = 0
        for fts in X:
            for l in tgr.tag(fts):
                d[idx][ilc] = l
                idx += 1

        return d
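For orientation, a minimal sketch of the bare pycrfsuite calls this method wraps; the model path and feature dicts below are illustrative, not from the source:

import pycrfsuite

tagger = pycrfsuite.Tagger()
tagger.open('model.crfs')                  # a trained CRFSuite model file
xseq = [{'form': 'The'}, {'form': 'cat'}]  # one feature dict per token
labels = tagger.tag(xseq)                  # one predicted label per token
tagger.close()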
Example #2
class SentimentTagger:
    def __init__(self):
        self.tagger = Tagger()

    def load_model(self, path):
        self.tagger.open(path)

    def tag_tweets(self, tweet_features_list):
        features = ItemSequence(tweet_features_list)
        labels = self.tagger.tag(features)
        return labels
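A minimal usage sketch, assuming a model already trained on per-token feature dicts; the path, features, and labels here are illustrative:

tagger = SentimentTagger()
tagger.load_model('sentiment.crfsuite')
features = [{'word.lower': 'great'}, {'word.lower': 'movie'}]
print(tagger.tag_tweets(features))  # e.g. ['POS', 'POS']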
Example #3
def main(argv):

    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]


    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))


    random.shuffle(trainData)


    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)

    trainer = Trainer()
    for dialogue in trainFeatures:
        trainer.append(dialogue[0], dialogue[1])

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('./model.pkl')

    outputFile = open(outputFPath, 'w')
    tagger = Tagger()
    tagger.open('./model.pkl')


    totalUtter = correctUtter = 0
    for dialogue in testFeatures:
        preds = tagger.tag(dialogue[0])
        labels = dialogue[1]
        for i, pred in enumerate(preds):
            outputFile.write(pred + '\n')
            if len(labels) > 0:
                totalUtter += 1
                if labels[i] == pred:
                    correctUtter += 1
        outputFile.write('\n')

    if totalUtter > 0:
        accuracy = correctUtter / totalUtter
        print('Accuracy: ' + str(accuracy))
    outputFile.close()
Example #4
def predict_crf(reader, model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    tagger = Tagger()
    tagger.open(model_path)

    _log.info('Extracting features from test corpus')
    itemseq = ItemSequence(
        [fs for sent in reader.sents() for fs in extract_crf_features(sent)])

    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
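For reference, ItemSequence simply wraps a flat list of per-token feature dicts; a minimal sketch with illustrative feature names:

from pycrfsuite import ItemSequence

# Numeric values act as feature weights; strings and booleans act as flags.
itemseq = ItemSequence([{'w': 'hello', 'len': 5.0}, {'w': 'world', 'len': 5.0}])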
Example #5
class CRFchunk:
    def __init__(self, corpus: str = "orchidpp"):
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
        self.tagger.open(self.path)

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
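A hypothetical usage sketch; the POS tags below are illustrative, and the orchidpp model must be available through pythainlp's corpus loader:

chunker = CRFchunk()
chunks = chunker.parse([('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')])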
Example #6
def cal_confidence_score(sample, model_name):
    '''
    Tag an unlabeled sample, compute a confidence score, and return the result.
    :param sample: the sample object to tag
    :param model_name: path to the trained CRF model
    :return: predicted function point names and the tagging confidence
    '''
    model = Tagger()
    model.open(model_name)
    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    # words
    # words = sample.sen_words
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # get predicted_fps
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label == 'B' or label == 'I' or label == 'E':
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''

    # calculate the probability of tagging
    crf_confidence = model.probability(predicted_labels)

    lan_confidence = 0
    filtered_fp_list = []
    for fp_name in fp_list:
        filtered_fp_list.append(fp_name)

    if len(filtered_fp_list) == 0:
        predicted_fps = 'null'
    else:
        predicted_fps = ' '.join(filtered_fp_list)

    # print(str(sample.story_id) +' '+ sample.sentence +' '+ fp +' '+ str(confidence))
    # Return the sample info alongside the prediction so results can be matched
    # back to the sample when multiprocessing executes out of order
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
Example #7
class CRFsuiteEntityRecognizer:
    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        self.feature_extractor = feature_extractor
        self._encoder = encoder
        self.tagger = Tagger()

    @property
    def encoder(self) -> EntityEncoder:
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for doc in docs:
            for sentence in doc.sents:
                tokens = list(sentence)
                features = self.feature_extractor.extract(
                    [str(token) for token in tokens]
                )
                labels = self.encoder.encode(tokens)
                trainer.append(features, labels)
        trainer.train(path)
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        doc_ent = []
        for sentence in doc.sents:
            tokens = list(sentence)
            labels = self.predict_labels([str(token) for token in tokens])
            entities = decode_bilou(labels, tokens, doc)
            # print("tokens:%s\nfeatures:%s\nlabels:%s\nentities:%s\n"%(str(tokens), str(features), str(labels), str(entities)))
            for entity in entities:
                doc_ent.append(entity)
        doc.ents = doc_ent
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        features = self.feature_extractor.extract(tokens)
        return self.tagger.tag(features)
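A sketch of how this recognizer might be driven; the extractor and encoder classes below are hypothetical placeholders, while 'lbfgs' and the c1/c2 parameters are standard pycrfsuite training options:

recognizer = CRFsuiteEntityRecognizer(MyFeatureExtractor(), MyEntityEncoder())  # hypothetical
recognizer.train(train_docs, algorithm='lbfgs', params={'c1': 0.1, 'c2': 0.1}, path='ner.crfsuite')
labels = recognizer.predict_labels(['Barack', 'Obama', 'visited', 'Paris'])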
Example #8
def crf_predict(
    tagger: pycrfsuite.Tagger,
    gp_data: list,
    mode: str = 'raw',
    exclude_labels: list = ['NOL', 'NAT', 'NEE']
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file. 3 modes for return:
		* Return raw predictions (raw)
		* Return predictions with only valid tags (exclude_ool)
		* Return predictions (valid tags) and probabilities for each class (rt_proba)

	Predictions are returned unflattened
	
	https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html
	"""
    if mode not in ['raw', 'exclude_ool', 'rt_proba']:
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")
    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]
    labels = tagger.labels()

    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        tagger.set(xseq)
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in range(len(xseq))]
            for label in labels
        })
        y_pred.append(file_proba[[
            col for col in file_proba.columns if col not in exclude_labels
        ]].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)

    if mode == 'rt_proba':
        return y_pred, pd.concat(res, axis=0)
    return y_pred  # else
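An illustrative call, assuming tagger has been opened on a trained model and grouped holds one feature sequence per file (both names are hypothetical):

y_pred, proba = crf_predict(tagger, grouped, mode='rt_proba')
raw = crf_predict(tagger, grouped)  # mode='raw' returns unfiltered predictions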
Example #9
def gen(corpus=test, model='m.model', indir=INDIR, outdir=''):
    tagger = Tagger()
    tagger.open(model)
    for doc in corpus.documents:
        path = setup_newdir(doc.filepath, olddir=indir, newdir=outdir,
                            suffix='--', renew=True)
        if not path:
            continue
        mkparentdirs(path)
        task = etree.Element(TASK_ROOT)
        tags = etree.Element(TAGS_ROOT)
        tokens = etree.Element(TOKENS_ROOT)
        task.append(tags)
        task.append(tokens)
        sents = doc.sentences
        seqs = doc.sequence_list()
        tagged_seqs = [tagger.tag(seq) for seq in seqs]
        freq_dict = defaultdict(int)
        for (sent, seq, tagged_seq) in zip(sents, seqs, tagged_seqs):
            s = etree.Element('s')
            for (lex, feat, label) in zip(sent.getchildren(), seq, tagged_seq):
                lex_tag = etree.Element(lex.tag, lex.attrib)
                lex_tag.text = lex.text
                s.append(lex_tag)
                if label != 'None':
                    iso_tag = etree.Element(label)
                    if label in attribs:
                        for key in attribs[label]:
                            iso_tag.attrib[key] = attribs[label][key]
                    iso_tag.attrib['text'] = lex.text
                    iso_tag.attrib['id'] = ids[label] + str(freq_dict[label])
                    lex_tag.attrib['id'] = iso_tag.attrib['id']
                    freq_dict[label] += 1
                    tags.append(iso_tag)
            tokens.append(s)
        s = etree.tostring(task, pretty_print=True).decode('utf-8')
        with open(path, 'w') as f:
            print(HEADER, file=f)
            print(s, file=f)
Example #10
def test_tag_not_opened(xseq):
    tagger = Tagger()
    with pytest.raises(Exception):
        tagger.tag(xseq)
Example #12
def test(features: pd.Series) -> list:
    tagger = Tagger()
    tagger.open('crf.model')
    y_pred = [tagger.tag(xseq) for xseq in features]
    return y_pred
Example #13
def evaluate_model_by_story(model_name, test_samples):
    model = Tagger()
    model.open(model_name)

    story_fps = dict()
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()

        chars = list(sample.sentence)
        predicted_fps = []
        fp = ''
        for index, word in enumerate(predicted_labels):
            if word == 'E' or word == 'S':
                fp += chars[index]
                predicted_fps.append(fp)
                fp = ''
            if word == 'B' or word == 'I':
                fp += chars[index]

        actual_fps = [fp for fp in sample.fps if fp != '' and fp != 'null' and fp in sample.sentence]

        filtered_predicted_fps = predicted_fps
        # for predicted_fp in predicted_fps:
        #     lan_confidence_temp = lmmodel.score(predicted_fp, bos=True, eos=True) / len(predicted_fp)
        #     if len(re.findall('[a-zA-Z0-9+]+', predicted_fp)) > 0:
        #         lan_confidence_temp += 5
        #     if lan_confidence_temp > -2.4:
        #         filtered_predicted_fps.append(predicted_fp)

        if sample.story_id not in story_fps:
            story_fps[sample.story_id] = [set(actual_fps), set(filtered_predicted_fps)]
        else:
            story_fps[sample.story_id][0].update(actual_fps)
            story_fps[sample.story_id][1].update(filtered_predicted_fps)

    # print(len(story_fps))
    global sim_t
    sim_threshold = sim_t

    TP_precision = 0
    TP_recall = 0
    all_actual_fps = 0
    all_predicted_fps = 0
    for story_id, (actual_fps, predicted_fps) in story_fps.items():
        story_precision = 0.0
        story_recall = 0.0

        all_actual_fps += len(actual_fps)

        all_predicted_fps += len(predicted_fps)
        # for actual_fp in actual_fps:

        story = samples_dao.read_story_by_story_id(int(story_id))
        data = [story_id,
                story[0] if story is not None else '',
                story[1] if story is not None else '',
                story[2] if story is not None else '',
                story[3] if story is not None else '',
                story[4] if story is not None else '',
                actual_fps,
                predicted_fps]
        with open('../Archive/date_performance/results/IterRes_by_story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
        for predicted_fp in predicted_fps:
            sim = []
            for actual_fp in actual_fps:
                similarity = 1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                # if actual_fp in predicted_fp:
                #     similarity = 1
                sim.append(similarity)
            # print(sim)

            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_precision += 1
                story_precision += 1

        for actual_fp in actual_fps:
            sim = []
            for predicted_fp in predicted_fps:
                similarity = 1-distance.nlevenshtein(actual_fp, predicted_fp, method=1)
                sim.append(similarity)
            # print(sim)
            if len(sim) == 0:
                sim = [0]
            if max(sim) >= sim_threshold:
                TP_recall += 1
                story_recall += 1

        # Per-story details (the denominator is this story's predicted set;
        # the original used the per-sample filtered list, which leaks across stories)
        story_precision = 0 if len(predicted_fps) == 0 else story_precision / len(predicted_fps)
        story_recall = 0 if len(actual_fps) == 0 else story_recall / len(actual_fps)
        data = ["STORY " + story_id, story_precision, story_recall]
        with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(data)
    with open('../Archive/date_performance/results/story_details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["THE END!!!"])

    # Overall results
    precision = TP_precision/all_predicted_fps
    recall = TP_recall/all_actual_fps
    f1 = 2 * precision * recall / (precision + recall)

    print("By Story: Iteration: %s\n\tPrecision: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (model_name.split('_')[2], precision, recall, f1))

    data = ["BY STORY: Iteration " + model_name.split('_')[2], precision, recall, f1]

    with open('../Archive/date_performance/results/IterRes_by_story.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return precision, recall, f1
Example #14
def evaluate_model(model_name, test_samples):
    '''
    Evaluate the model after the final training iteration and write out the test results.
    :param test_samples: samples to evaluate on
    :param model_name: path to the trained model
    :return: mean accuracy, recall, and F1 over the test samples
    '''
    model = Tagger()
    model.open(model_name)

    accuracy = 0.0
    recall = 0.0
    f1 = 0.0
    # sample_accuracy = 0.0
    iteration_test_details = []
    for sample in test_samples:
        model.set(build_model_features(sample, 17, True))
        predicted_labels = model.tag()
        true_labels = sample.char_label

        predicted_label_index = []
        for predicted_label in predicted_labels:
            if predicted_label == 'N':
                predicted_label_index.append(0)
            else:
                predicted_label_index.append(1)

        true_label_index = []
        for true_label in true_labels:
            if true_label == 'N':
                true_label_index.append(0)
            else:
                true_label_index.append(1)

        iteration_test_details = []
        chars = list(sample.sentence)
        # sen_words = sample.sen_words
        iteration_test_details.append(sample.sentence)
        predicted_fps = ''
        actual_fps = ''
        for index, word in enumerate(predicted_labels):
            if word != 'N':
                predicted_fps += chars[index]
        if len(predicted_fps) == 0:
            predicted_fps = '-----'

        for index, word in enumerate(true_labels):
            if word != 'N':
                actual_fps += chars[index]

        iteration_test_details.append(actual_fps)
        iteration_test_details.append(predicted_fps)

        with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(iteration_test_details)

        # print(sample.sen_words)
        # print(predicted_labels)
        # print(true_labels)

        acc = metrics.accuracy_score(true_label_index, predicted_label_index)
        rec = metrics.recall_score(true_label_index, predicted_label_index, average='binary', pos_label=1)
        accuracy += acc
        recall += rec
        # per-sample F1 must use this sample's scores, not the running totals
        if acc + rec > 0:
            f1 += 2 * acc * rec / (acc + rec)
        # sample_accuracy += metrics.sequence_accuracy_score(true_labels, predicted_labels)

    print("Iteration: %s\n\tAccuracy: %f\n\tRecall: %f\n\tF1: %f\n\n\n"
          % (
              model_name.split('_')[2], accuracy / len(test_samples), recall / len(test_samples),
              f1 / len(test_samples)))

    data = ["Iteration " + model_name.split('_')[2], accuracy / len(test_samples), recall / len(test_samples),
            f1 / len(test_samples)]

    with open('../Archive/date_performance/results/IterRes.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    with open('../Archive/date_performance/results/Iteration_Test_Details.csv', 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(data)

    return accuracy / len(test_samples), recall / len(test_samples), f1 / len(test_samples)
Example #15
class PassageTagger(object):
  def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
      if do_train:
        self.trainer = Trainer()
      else:
        self.tagger = Tagger()
    else:
      if do_train:
        model = ChainCRF()
        self.trainer = FrankWolfeSSVM(model=model)
        self.feat_index = {}
        self.label_index = {}
      else:
        self.tagger = pickle.load(open(self.trained_model_name, "rb"))
        self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
        label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
        self.rev_label_index = {i: x for x, i in label_index.items()}

  def read_input(self, filename):
    str_seqs = []
    str_seq = []
    feat_seqs = []
    feat_seq = []
    label_seqs = []
    label_seq = []
    for line in codecs.open(filename, "r", "utf-8"):
      lnstrp = line.strip()
      if lnstrp == "":
        if len(str_seq) != 0:
          str_seqs.append(str_seq)
          str_seq = []
          feat_seqs.append(feat_seq)
          feat_seq = []
          label_seqs.append(label_seq)
          label_seq = []
      else:
        if self.do_train:
          clause, label = lnstrp.split("\t")
          label_seq.append(label)
        else:
          clause = lnstrp
        str_seq.append(clause)
        feats = self.fp.get_features(clause)
        feat_dict = {}
        for f in feats:
          if f in feat_dict:
            feat_dict[f] += 1
          else:
            feat_dict[f] = 1
        #feat_dict = {i: v for i, v in enumerate(feats)}
        feat_seq.append(feat_dict)
    if len(str_seq) != 0:
      str_seqs.append(str_seq)
      str_seq = []
      feat_seqs.append(feat_seq)
      feat_seq = []
      label_seqs.append(label_seq)
      label_seq = []
    return str_seqs, feat_seqs, label_seqs

  def predict(self, feat_seqs):
    print >>sys.stderr, "Tagging %d sequences"%len(feat_seqs)
    if self.algorithm == "crf":
      self.tagger.open(self.trained_model_name)
      preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    else:
      Xs = []
      for fs in feat_seqs:
        X = []
        for feat_dict in fs:
          x = [0] * len(self.feat_index)
          for f in feat_dict:
            if f in self.feat_index:
              x[self.feat_index[f]] = feat_dict[f]
          X.append(x)
        Xs.append(numpy.asarray(X))
      pred_ind_seqs = self.tagger.predict(Xs)
      preds = []
      for ps in pred_ind_seqs:
        pred = []
        for pred_ind in ps:
          pred.append(self.rev_label_index[pred_ind])
        preds.append(pred)
    return preds

  def train(self, feat_seqs, label_seqs):
    print >>sys.stderr, "Training on %d sequences"%len(feat_seqs)
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
    else:
      for fs in feat_seqs:
        for feat_dict in fs:
          for f in feat_dict:
            if f not in self.feat_index:
              self.feat_index[f] = len(self.feat_index)
      Xs = []
      for fs in feat_seqs:
        X = []
        for feat_dict in fs:
          x = [0] * len(self.feat_index)
          for f in feat_dict:
            x[self.feat_index[f]] = feat_dict[f]
          X.append(x)
        Xs.append(numpy.asarray(X))

      for ls in label_seqs:
        for label in ls:
          if label not in self.label_index:
            self.label_index[label] = len(self.label_index)

      Ys = []
      for ls in label_seqs:
        Y = []
        for label in ls:
          Y.append(self.label_index[label])
        Ys.append(numpy.asarray(Y))

      self.trainer.fit(Xs, Ys)
      pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
      pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
      pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
Example #16
if RECREATE:
    dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3)
    pickle.dump({
        'dataset': dataset,
        'thetas': thetas
    }, open(FLEXCRF_TEST_DATA_FILE, 'wb'))
else:
    dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE, 'rb'))
    dataset = dd['dataset']
    thetas = dd['thetas']

# -- Start classification ------------------------------------------------

for seq in range(len(dataset)):
    # -- with crfsuite
    s_ = tagger.tag(data['X'][seq])
    y_ = np.array([int(model.labels[s]) for s in s_])
    prob_ = tagger.probability(s_)

    print "\n-- With crfsuite:"
    print "labels:\n", s_, "\n", y_
    print "probability:\t %f" % prob_

    # -- with flexcrf
    f_xy, y = dataset[seq]
    theta = thetas[seq]

    m_xy, f_m_xy = _compute_all_potentials(f_xy, theta)

    y_pred = viterbi_decoder(m_xy)
Example #17
class label_crf_classifier(object):
    """Conditional Random Field model"""
    def __init__(self, stopword_path="data/stop_words.txt"):
        self.stoplist = set(
            open(stopword_path, encoding="utf8", errors="ignore").read().split("\n"))

    def preprocess(self, training_data):
        self.training_df = training_data[training_data["section_label"] != ""]
        self.training_df['crf_feature'] = self.training_df['crf_feature'].map(
            lambda x: [y for y in x if y not in self.stoplist])
        self.x = self.training_df.groupby("resume_id")["crf_feature"].apply(
            list)
        self.y = self.training_df.groupby("resume_id")["section_label"].apply(
            list)

    def train(self,
              training_data,
              classifier_path="classifier/cache/label_crf_classifier",
              c1=0,
              c2=10,
              period=300,
              minfreq=5):
        self.preprocess(training_data)
        train = Trainer()
        for i1, i in enumerate(self.x):
            train.append(ItemSequence(i), self.y.iloc[i1])
        params = {
            "c1": c1,
            "c2": c2,
            "period": period,
            "feature.minfreq": minfreq,
            "max_iterations": 1000
            # "calibration.eta": 0.05,
            # "calibration_samples": 400,
        }
        # train.select(algorithm = "l2sgd")
        train.set_params(params)
        train.train(classifier_path)
        self.tagger = Tagger()
        self.tagger.open(classifier_path)

    def save_classifier(self,
                        classifier_path="classifier/cache/label_crf_classifier"
                        ):
        pass

    def load_classifier(self,
                        classifier_path="classifier/cache/label_crf_classifier"
                        ):
        self.tagger = Tagger()
        self.tagger.open(classifier_path)

    def predict(self, test_data):
        """Input: x should be a list of strings"""
        result = self.tagger.tag(ItemSequence(test_data["crf_feature"]))
        test_data["section_label"] = result
        return test_data

    # def score(self, training_data, classifier_path="classifier/cache/label_crf_classifier", portion=0.8, c1=0, c2=10, period=300, minfreq=10):
    #     # split resume_id
    #     resume_ids = np.unique([resume['resume_id'] for resume in training_data])
    #     length = len(resume_ids)
    #     shuffle(resume_ids)
    #     train_ids = resume_ids[:int(length*portion)]
    #     test_ids = resume_ids[int(length*portion):]

    #     train_df = [resume for resume in training_data if resume['resume_id'] in train_ids]
    #     test_df = [resume for resume in training_data if resume['resume_id'] in test_ids]

    #     # train model on train_ids
    #     self.train(train_df, classifier_path=classifier_path, c1=c1, c2=c2, period=period, minfreq=minfreq)
    #     test_pred = self.predict_all(test_df)
    #     train_pred = self.predict_all(train_df)

    #     # print out result
    #     return train_pred, test_pred


# if __name__ == "__main__":
#     data = MongoRetriveData()
#     resumes = data.get_data_mongo()
#     # pickle.dump(resumes, open('./resume_data.pkl', 'wb'))
#     # resumes = pickle.load(open('./resume_data.pkl', 'rb'))
#     stopword_path = './stopword.txt'
#     model_path = './model.txt'
#     resume_data = resumes
#     clf = Crf(stopword_path, model_path, resume_data)
#     clf.CleanData()
#     clf.Fit()
#     clf.Score()
#     # result = clf.Predict(clf.data)
#     # print result
Example #18
class ThaiNameTagger:
    def __init__(self):
        """
        Thai named-entity recognizer.
        """
        self.crf = CRFTagger()
        self.crf.open(get_corpus_path(_CORPUS_NAME))

    def get_ner(
        self,
        text: str,
        pos: bool = True,
        tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        This function tags named entities in the text, in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: include POS tags in the results (`True`) or
                            exclude them (`False`). The default value is `True`
        :param bool tag: return the output as an HTML-like tag string.
        :return: a list of tuples of tokenized word and NER tag, plus the
                 POS tag if the parameter `pos` is `True`; or an HTML-like
                 tag string if the parameter `tag` is `True`
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

        :Note:
            * For the POS tags to be included in the results, this function
              uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
              and corpus as `orchid_ud`.

        :Example:

            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>>
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
            ('น.', 'NOUN', 'I-TIME')]
            >>>
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            pos=False)
            [('วันที่', 'O'), (' ', 'O'),
            ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
            ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
            ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'),
            ('49', 'I-TIME'), (' ', 'I-TIME'),
            ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        #tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE, keep_whitespace=False)
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)

        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

        if tag:
            temp = ""
            sent = ""
            for idx, (word, ner) in enumerate(sent_ner):
                if ner.startswith("B-") and temp != "":
                    sent += "</" + temp + ">"
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner.startswith("B-"):
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner == "O" and temp != "":
                    sent += "</" + temp + ">"
                    temp = ""
                sent += word

                if idx == len(sent_ner) - 1 and temp != "":
                    sent += "</" + temp + ">"

            return sent

        if pos:
            return [(pos_tags[i][0], pos_tags[i][1], data)
                    for i, data in enumerate(y)]

        return sent_ner

    @staticmethod
    def __extract_features(doc):
        return [_doc2features(doc, i) for i in range(len(doc))]
Example #19
def test(features):
    print("Testing..")
    tagger = Tagger()
    tagger.open('crf.model')
    y_pred = [tagger.tag(xseq) for xseq in features]
    return y_pred