Example #1
0
def load_model(model_path, _log, _run):
    """Open a CRF tagger from *model_path*, optionally registering the
    file as a Sacred run resource."""
    _log.info('Loading model from %s', model_path)
    crf_tagger = Tagger()
    crf_tagger.open(model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return crf_tagger
 def train(self,
           training_data,
           classifier_path="classifier/cache/label_crf_classifier",
           c1=0,
           c2=10,
           period=300,
           minfreq=5):
     """Fit a CRF model on *training_data* and load the resulting tagger.

     Preprocesses the data, appends every feature/label sequence to a
     CRFsuite trainer, writes the model to *classifier_path*, then opens
     it with a Tagger bound to ``self.tagger``.
     """
     self.preprocess(training_data)
     crf_trainer = Trainer()
     for seq_idx, xseq in enumerate(self.x):
         crf_trainer.append(ItemSequence(xseq), self.y[seq_idx])
     crf_trainer.set_params({
         "c1": c1,
         "c2": c2,
         "period": period,
         "feature.minfreq": minfreq,
         "max_iterations": 1000,
     })
     crf_trainer.train(classifier_path)
     self.tagger = Tagger()
     self.tagger.open(classifier_path)
Example #3
0
def test_open_inmemory_invalid():
    """Opening from empty or truncated model bytes must raise ValueError."""
    tagger = Tagger()
    for bad_blob in (b'', b'lCRFabc'):
        with pytest.raises(ValueError):
            tagger.open_inmemory(bad_blob)
Example #4
0
    def tag(self,
            data,
            form_col=None,
            ilbl_col=None,
            tagger=None,
            cols=None,
            ts=None):
        """Tags TSV/CSV or np.recarray data using the loaded CRFSuite model.

        See documentation for `train` for more details on requirements for the
        data passed to this method.

        :param data: data
        :type data: str or recarray
        :param form_col: form column name
        :type form_col: str
        :param ilbl_col: inference label column name
        :type ilbl_col: str
        :param tagger: CRFS tagger
        :type tagger: Tagger
        :param cols: TSV column names
        :type cols: str or list of str
        :param ts: tab separator for TSV
        :type ts: str
        :return: tagged data
        :rtype: recarray
        :raises ValueError: if *data* is neither a string nor an ndarray
        """
        # Fall back to instance-level defaults for any option not passed in.
        fc = form_col if form_col else self.form_col
        c = cols if cols else self.cols
        sep = ts if ts else self.ts
        ilc = ilbl_col if ilbl_col else self.ilbl_col

        # isinstance instead of exact type comparison: recarray is an
        # ndarray subclass, so this covers the original `type(...) in
        # [recarray, ndarray]` check and also accepts other subclasses.
        if isinstance(data, np.ndarray):
            d = data
        elif isinstance(data, str):
            d = parse_tsv(s=data, cols=c, ts=sep)
        else:
            raise ValueError('Invalid input type.')

        # Prefer an explicit tagger, then a previously loaded one, then
        # lazily open the model from disk.
        tgr = tagger
        if tgr is None and self.tagger:
            tgr = self.tagger
        elif tgr is None:
            tgr = Tagger()
            tgr.open('%s.crfs' % self.model_path)

        # extracting features
        X = self._extract_features(d, form_col=fc)

        # tagging sentences: write the predicted labels back into the
        # inference-label column, advancing one row per token across all
        # sentences
        idx = 0
        for fts in X:
            for label in tgr.tag(fts):
                d[idx][ilc] = label
                idx += 1

        return d
def load_tagger(model_path):
    """Loads tagger from a CRFSUITE binary model file.

    :param str model_path: path to the binary model file.
    :return: an opened Tagger instance
    """
    crf = Tagger()
    crf.open(model_path)
    return crf
Example #6
0
class SentimentTagger:
    """Thin wrapper around a CRFsuite Tagger for labelling tweet features."""

    def __init__(self):
        self.tagger = Tagger()

    def load_model(self, path):
        """Open a trained CRFsuite model file at *path*."""
        self.tagger.open(path)

    def tag_tweets(self, tweet_features_list):
        """Return the predicted label sequence for one tweet's features."""
        return self.tagger.tag(ItemSequence(tweet_features_list))
Example #7
0
def main(argv):
    """Train a CRF dialogue tagger and evaluate it on a test set.

    :param argv: [train_dir, test_dir, output_file_path]
    """
    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]

    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))

    random.shuffle(trainData)

    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)

    trainer = Trainer()
    for dialogue in trainFeatures:
        trainer.append(dialogue[0], dialogue[1])

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('./model.pkl')

    tagger = Tagger()
    tagger.open('./model.pkl')

    totalUtter = correctUtter = 0
    # context manager guarantees the output file is closed even on error
    # (the original left it open if an exception occurred mid-loop)
    with open(outputFPath, 'w') as outputFile:
        for dialogue in testFeatures:
            preds = tagger.tag(dialogue[0])
            labels = dialogue[1]
            for i, pred in enumerate(preds):
                outputFile.write(pred + '\n')
                # guard i < len(labels): the original indexed labels[i]
                # after only checking the list was non-empty, which could
                # raise IndexError when preds is longer than labels
                if i < len(labels):
                    totalUtter += 1
                    if labels[i] == pred:
                        correctUtter += 1
            outputFile.write('\n')

    if totalUtter > 0:
        accuracy = correctUtter / totalUtter
        print('Accuracy: ' + str(accuracy))
Example #8
0
def predict_crf(reader, model_path, _log, _run):
    """Load a CRF model and tag the entire test corpus as one sequence."""
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    crf = Tagger()
    crf.open(model_path)

    _log.info('Extracting features from test corpus')
    features = []
    for sent in reader.sents():
        features.extend(extract_crf_features(sent))
    itemseq = ItemSequence(features)

    _log.info('Making predictions with the model')
    return crf.tag(itemseq)
Example #9
0
def test_open_close_labels(model_filename, yseq):
    """labels() works only while a model is open."""
    tagger = Tagger()

    # not opened yet: labels() must fail
    with pytest.raises(ValueError):
        tagger.labels()

    with tagger.open(model_filename):
        opened_labels = tagger.labels()
    assert set(opened_labels) == set(yseq)

    # closed again after the context exits
    with pytest.raises(ValueError):
        tagger.labels()
Example #10
0
class CRFchunk:
    """CRF-based chunk parser."""

    def __init__(self, corpus: str = "orchidpp"):
        """:param str corpus: name of the trained chunking corpus."""
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        """Load the CRF chunking model for *corpus*.

        :raises ValueError: for an unknown corpus name. (The original
            silently left ``self.path`` unset and crashed later with an
            opaque AttributeError on ``self.tagger.open``.)
        """
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
        else:
            raise ValueError("Unknown corpus for CRFchunk: %r" % (corpus,))
        self.tagger.open(self.path)

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        """Return chunk labels for a list of (token, POS) pairs."""
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
Example #11
0
    def _load_tagger(self):
        """Load the trained CRF model from disk and bind it as self.tagger.

        Failures are logged (best-effort, matching the original); on
        failure the tagger may be None or unopened.
        """
        # In pycrfsuite, you have to save the model first, then load it as a tagger
        self.model_name = 'model_{}'.format(self.task_obj.unique_id)
        file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
        # bind the name up front: the original raised NameError at the
        # final assignment if Tagger() itself threw
        tagger = None
        try:
            tagger = Tagger()
            tagger.open(file_path)
        except Exception as e:
            print(e)
            self.error_logger.error('Failed to load crf model from the filesystem.', exc_info=True, extra={
                'model_name': self.model_name,
                'file_path': file_path})

        self.tagger = tagger
        return self.tagger
Example #12
0
    def _load_tagger(self):
        """Load the trained CRF model from disk and bind it as self.tagger.

        Failures are logged (best-effort, matching the original); on
        failure the tagger may be None or unopened.
        """
        # In pycrfsuite, you have to save the model first, then load it as a tagger
        self.model_name = 'model_{}'.format(self.task_obj.unique_id)
        file_path = os.path.join(MODELS_DIR, self.task_type, self.model_name)
        # bind the name up front: the original raised NameError at the
        # final assignment if Tagger() itself threw
        tagger = None
        try:
            tagger = Tagger()
            tagger.open(file_path)
        except Exception as e:
            print(e)
            logging.getLogger(ERROR_LOGGER).error('Failed to load crf model from the filesystem.', exc_info=True, extra={
                'model_name': self.model_name,
                'file_path':  file_path})

        self.tagger = tagger
        return self.tagger
Example #13
0
def test_tag_probability(model_filename, xseq, yseq):
    """The best path must beat a degenerate constant-label path."""
    with Tagger().open(model_filename) as tagger:
        best_path = tagger.tag(xseq)
        p_best = tagger.probability(best_path)
        p_const = tagger.probability([yseq[0]] * len(yseq))
        assert p_best > p_const
        assert 0 < p_best < 1
        assert 0 < p_const < 1
Example #14
0
def test_tag_formats(tmpdir, xseq, yseq):
    """With all feature weights equal to 1, dict features and plain key
    lists must produce identical predictions."""
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [{key: 1 for key in x} for x in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    # dict features
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq

    # strings
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag([x.keys() for x in xseq]) == yseq
Example #15
0
def test_tag_bools(model_filename, xseq, yseq):
    """Boolean feature values behave like 1.0 (True) / 0.0 (False)."""
    with Tagger().open(model_filename) as tagger:
        converted = []
        for x in xseq:
            item = {}
            for key, value in x.items():
                item[key] = bool(value) if value in (0, 1) else value
            converted.append(item)
        assert tagger.tag(converted) == yseq
class CRFsuiteEntityRecognizer:
    """Named-entity recognizer backed by a CRFsuite sequence model."""

    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        self.feature_extractor = feature_extractor
        self._encoder = encoder
        self.tagger = Tagger()

    @property
    def encoder(self) -> EntityEncoder:
        """The encoder that turns token sequences into label sequences."""
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        """Train on *docs*, write the model to *path*, reload the tagger."""
        trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for doc in docs:
            for sentence in doc.sents:
                tokens = list(sentence)
                texts = [str(token) for token in tokens]
                trainer.append(self.feature_extractor.extract(texts),
                               self.encoder.encode(tokens))
        trainer.train(path)
        # reopen on the freshly written model file
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        """Predict labels sentence-by-sentence and attach decoded
        entities to *doc*."""
        found = []
        for sentence in doc.sents:
            tokens = list(sentence)
            labels = self.predict_labels([str(token) for token in tokens])
            found.extend(decode_bilou(labels, tokens, doc))
        doc.ents = found
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        """Return the CRF label sequence for one tokenized sentence."""
        return self.tagger.tag(self.feature_extractor.extract(tokens))
Example #17
0
 def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
   """Set up either a trainer (do_train=True) or a tagger.

   algorithm == "crf" uses pycrfsuite; anything else loads a pickled
   pystruct ChainCRF/FrankWolfeSSVM model plus its feature/label index
   maps from disk.
   """
   self.trained_model_name = trained_model_name
   self.fp = FeatureProcessing()
   self.do_train = do_train
   self.algorithm = algorithm
   if algorithm == "crf":
     if do_train:
       self.trainer = Trainer()
     else:
       self.tagger = Tagger()
   else:
     if do_train:
       model = ChainCRF()
       self.trainer = FrankWolfeSSVM(model=model)
       self.feat_index = {}
       self.label_index = {}
     else:
       # close files deterministically (the original leaked open handles
       # via pickle.load(open(...)))
       with open(self.trained_model_name, "rb") as model_file:
         self.tagger = pickle.load(model_file)
       with open("ssvm_feat_index.pkl", "rb") as feat_file:
         self.feat_index = pickle.load(feat_file)
       with open("ssvm_label_index.pkl", "rb") as label_file:
         label_index = pickle.load(label_file)
       self.rev_label_index = {i: x for x, i in label_index.items()}
Example #18
0
def test_open_inmemory_invalid():
    """open_inmemory rejects empty and truncated model blobs."""
    tagger = Tagger()

    # no data at all
    with pytest.raises(ValueError):
        tagger.open_inmemory(b'')

    # magic prefix only, body truncated
    with pytest.raises(ValueError):
        tagger.open_inmemory(b'lCRFabc')
Example #19
0
def crf_predict(
    tagger: pycrfsuite.Tagger,
    gp_data: list,
    mode: str = 'raw',
    exclude_labels: list = None
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file.

    Modes:
        * raw         -- raw predictions straight from ``Tagger.tag``
        * exclude_ool -- predictions restricted to valid tags
        * rt_proba    -- valid-tag predictions plus per-class probabilities

    Predictions are returned unflattened.

    https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html

    :param tagger: an opened pycrfsuite tagger
    :param gp_data: feature sequences, one per file
    :param mode: one of raw / exclude_ool / rt_proba
    :param exclude_labels: labels to ignore when picking the best class;
        defaults to ['NOL', 'NAT', 'NEE']
    :raises ValueError: on an unknown *mode*
    """
    # None default avoids the shared-mutable-default-argument pitfall
    if exclude_labels is None:
        exclude_labels = ['NOL', 'NAT', 'NEE']
    if mode not in ['raw', 'exclude_ool', 'rt_proba']:
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")
    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]
    labels = tagger.labels()

    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        tagger.set(xseq)
        # per-position marginal probability for every label
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in range(len(xseq))]
            for label in labels
        })
        # best valid label per position
        valid_cols = [col for col in file_proba.columns
                      if col not in exclude_labels]
        y_pred.append(file_proba[valid_cols].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)

    if mode == 'rt_proba':
        return y_pred, pd.concat(res, axis=0)
    return y_pred  # else
Example #20
0
def test_dump(tmpdir, model_filename):
    """dump() writes a readable model description; calling it on a
    closed tagger raises instead of crashing."""
    dump_filename = str(tmpdir.join("dump.txt"))
    with Tagger().open(model_filename) as tagger:
        tagger.dump(dump_filename)
        with open(dump_filename, 'rb') as f:
            text = f.read().decode('utf8')
        assert 'LABELS = {' in text
        assert u'солнце:не светит --> rainy:' in text

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.dump(dump_filename)
Example #21
0
def gen(corpus=test, model='m.model', indir=INDIR, outdir=''):
    """Tag every document in *corpus* with a CRF model and write XML task files.

    For each document, a <tokens> section mirrors the original lexical
    elements; every non-'None' predicted label additionally produces an
    ISO tag element with a generated, per-label-counted id.

    NOTE(review): Python 2 syntax (`print>>f`) -- this snippet cannot run
    under Python 3 as written.
    """
    tagger = Tagger()
    tagger.open(model)
    for doc in corpus.documents:
        # map the source path into the output directory; skip on failure
        path = setup_newdir(doc.filepath, olddir=indir, newdir=outdir,
                            suffix='--', renew=True)
        if not path:
            continue
        mkparentdirs(path)
        # XML skeleton: <task> containing <tags> and <tokens>
        task = etree.Element(TASK_ROOT)
        tags = etree.Element(TAGS_ROOT)
        tokens = etree.Element(TOKENS_ROOT)
        task.append(tags)
        task.append(tokens)
        sents = doc.sentences
        seqs = doc.sequence_list()
        # one predicted label sequence per feature sequence
        tagged_seqs = [tagger.tag(seq) for seq in seqs]
        # per-label counters used to build unique ids
        freq_dict = defaultdict(int)
        for (sent, seq, tagged_seq) in zip(sents, seqs, tagged_seqs):
            s = etree.Element('s')
            for (lex, feat, label) in zip(sent.getchildren(), seq, tagged_seq):
                    # copy the lexical element verbatim
                    lex_tag = etree.Element(lex.tag, lex.attrib)
                    lex_tag.text = lex.text
                    s.append(lex_tag)
                    if label != 'None':
                        # emit an ISO tag carrying any configured attributes
                        iso_tag = etree.Element(label)
                        if label in attribs:
                            for key in attribs[label]:
                                iso_tag.attrib[key] = attribs[label][key]
                        iso_tag.attrib['text'] = lex.text
                        # id = label prefix + running per-label counter
                        iso_tag.attrib['id'] = ids[label] + str(freq_dict[label])
                        lex_tag.attrib['id'] = iso_tag.attrib['id']
                        freq_dict[label] += 1
                        tags.append(iso_tag)
            tokens.append(s)
        s = etree.tostring(task, pretty_print=True)
        with open(path, 'w') as f:
            print>>f, HEADER
            print>>f, s
    def __init__(self, version: str = "1.5") -> None:
        """
        Thai named-entity recognizer.

        :param str version: Thai NER version; "1.4" and "1.5" are
                            supported, the default is "1.5".
        """
        if version == "1.4":
            corpus_path = get_corpus_path("thainer-1.4", version="1.4")
            self.pos_tag_name = "orchid_ud"
        else:
            corpus_path = get_corpus_path(_CORPUS_NAME, version="1.5")
            self.pos_tag_name = "lst20"

        self.crf = CRFTagger()
        self.crf.open(corpus_path)
Example #23
0
def test_info(model_filename):
    """info() exposes transitions, state features, labels and attributes."""
    with Tagger().open(model_filename) as tagger:
        info = tagger.info()

        trans = info.transitions
        assert trans[('sunny', 'sunny')] > trans[('sunny', 'rainy')]

        feats = info.state_features
        assert feats[('walk', 'sunny')] > feats[('walk', 'rainy')]
        assert (u'солнце:не светит', u'rainy') in feats

        assert info.header['num_labels'] == '2'
        assert set(info.labels.keys()) == {'sunny', 'rainy'}
        assert set(info.attributes.keys()) == {
            'shop', 'walk', 'clean', u'солнце:не светит'}

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.info()
Example #24
0
def cal_confidence_score(sample, model_name):
    """Tag an unlabeled sample and compute a confidence score.

    :param sample: sample object (provides .sentence and .story_id)
    :param model_name: path of the trained CRF model file
    :return: (story_id, sentence, predicted function-point names joined
        by spaces or 'null', CRF confidence of the predicted tagging)
    """
    model = Tagger()
    model.open(model_name)
    # unlabeled sample features
    feature_sequence = build_model_features(sample, 17, False)
    chars = list(sample.sentence)
    model.set(feature_sequence)
    predicted_labels = model.tag()

    # Collect predicted function points: characters labelled B/I/E extend
    # the current span; an 'N' label closes a non-empty span.
    # NOTE(review): a span still open at the end of the sentence is
    # dropped, matching the original behaviour -- confirm intended.
    fp_list = []
    fp = ''
    for index, label in enumerate(predicted_labels):
        if label in ('B', 'I', 'E'):
            fp += chars[index]
        if label == 'N' and len(fp) > 0:
            fp_list.append(fp)
            fp = ''

    # probability of the predicted tagging serves as the CRF confidence
    crf_confidence = model.probability(predicted_labels)

    # (removed an unused `lan_confidence` variable and a loop that only
    # copied fp_list element-by-element -- behaviour is unchanged)
    predicted_fps = ' '.join(fp_list) if fp_list else 'null'

    # The sample identity is returned alongside the result so that
    # out-of-order multiprocessing results can still be matched back to
    # their samples.
    return sample.story_id, sample.sentence, predicted_fps, crf_confidence
Example #25
0
def test_append_nested_dicts(tmpdir):
    """Nested dict features are flattened into colon-joined attributes."""
    xseq = [
        {
            "foo": {
                "bar": "baz",
                "spam": 0.5,
                "egg": ["x", "y"],
                "ham": {"x": -0.5, "y": -0.1},
            },
        },
        {
            "foo": {
                "bar": "ham",
                "spam": -0.5,
                "ham": set(["x", "y"]),
            },
        },
    ]
    trainer = Trainer()
    trainer.append(xseq, ['first', 'second'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == {
            'foo:bar:baz',
            'foo:spam',
            'foo:egg:x',
            'foo:egg:y',
            'foo:ham:x',
            'foo:ham:y',
            'foo:bar:ham',
        }

        # features seen only in the 'first' item
        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0

        # features seen only in the 'second' item
        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0
Example #26
0
def test_append_strstr_dicts(tmpdir):
    """String, bool and float feature values are accepted in dicts."""
    xseq = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    trainer = Trainer()
    trainer.append(xseq, ['spam', 'egg', 'spam', 'spam'])
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == {'foo:bar', 'baz'}
        assert info.state_features[('foo:bar', 'spam')] > 0
Example #27
0
def test_open_close_labels(model_filename, yseq):
    """labels() raises ValueError unless a model is currently open."""
    tagger = Tagger()

    with pytest.raises(ValueError):
        tagger.labels()  # never opened

    with tagger.open(model_filename):
        labels_while_open = tagger.labels()
    assert set(labels_while_open) == set(yseq)

    with pytest.raises(ValueError):
        tagger.labels()  # closed when the context exited
class ThaiNameTagger:
    """Thai named-entity recognizer backed by a CRF sequence model."""

    def __init__(self):
        """
        Thai named-entity recognizer.
        """
        self.crf = CRFTagger()
        self.crf.open(get_corpus_path(_CORPUS_NAME))

    def get_ner(
        self,
        text: str,
        pos: bool = True,
        tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        This function tags named-entity from text in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: To include POS tags in the results (`True`) or
                            exclude (`False`). The default value is `True`
        :param bool tag: output like html tag.
        :return: a list of tuple associated with tokenized word, NER tag,
                 POS tag (if the parameter `pos` is specified as `True`),
                 and output like html tag (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuple associated with tokenized
                 word and NER tag
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

        :Note:
            * For the POS tags to be included in the results, this function
              uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
              and corpus as `orchid_ud`.

        :Example:

            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>>
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
            ('น.', 'NOUN', 'I-TIME')]
            >>>
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            pos=False)
            [('วันที่', 'O'), (' ', 'O'),
            ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
            ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
            ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'),
            ('49', 'I-TIME'), (' ', 'I-TIME'),
            ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        #tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE, keep_whitespace=False)
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)

        # pair each token with its predicted IOB tag
        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

        if tag:
            # Render inline markup: each B-X opens <X>; a following O or a
            # new B-Y closes the currently open tag. `temp` holds the name
            # of the tag that is currently open ('' if none).
            temp = ""
            sent = ""
            for idx, (word, ner) in enumerate(sent_ner):
                if ner.startswith("B-") and temp != "":
                    # a new entity begins while another is open: close it,
                    # then open the new one
                    sent += "</" + temp + ">"
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner.startswith("B-"):
                    temp = ner[2:]
                    sent += "<" + temp + ">"
                elif ner == "O" and temp != "":
                    sent += "</" + temp + ">"
                    temp = ""
                sent += word

                # close a tag left open at the end of the sentence
                if idx == len(sent_ner) - 1 and temp != "":
                    sent += "</" + temp + ">"

            return sent

        if pos:
            # (word, POS, NER) triples
            return [(pos_tags[i][0], pos_tags[i][1], data)
                    for i, data in enumerate(y)]

        return sent_ner

    @staticmethod
    def __extract_features(doc):
        # one feature dict per token position
        return [_doc2features(doc, i) for i in range(len(doc))]
 def __init__(self):
     """
     Thai named-entity recognizer: load the trained CRF model.
     """
     recognizer = CRFTagger()
     recognizer.open(get_corpus_path(_CORPUS_NAME))
     self.crf = recognizer
Example #30
0
    # YOUR CODE HERE .....

    pass


# -- Load data and crfsuite model and convert them-------------------------
# NOTE(review): Python 2 snippet (`print` statement, text-mode pickle
# reads); the inline open() handles are never closed -- left as-is here.

RECREATE = True  # set to True to recreate flexcrf data with new model

CRFSUITE_MODEL_FILE = '../conll2002/conll2002-esp.crfsuite'
CRFSUITE_TEST_DATA_FILE = '../conll2002/conll2002-esp_crfsuite-test-data.dump'
FLEXCRF_TEST_DATA_FILE = '../conll2002/conll2002-esp_flexcrf-test-data.dump'

# crfsuite model
tagger = Tagger()
tagger.open(CRFSUITE_MODEL_FILE)
model = tagger.info()

data = pickle.load(open(CRFSUITE_TEST_DATA_FILE))
print "test data loaded."

if RECREATE:
    # rebuild the flexcrf dataset from the crfsuite model (3 sequences)
    dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3)
    pickle.dump({
        'dataset': dataset,
        'thetas': thetas
    }, open(FLEXCRF_TEST_DATA_FILE, 'wb'))
else:
    # reuse a previously converted dump
    dd = pickle.load(open(FLEXCRF_TEST_DATA_FILE))
    dataset = dd['dataset']
Example #31
0
def test(features: pd.Series) -> list:
    """Tag every feature sequence in *features* with the saved model."""
    crf = Tagger()
    crf.open('crf.model')
    return [crf.tag(seq) for seq in features]
Example #32
0
def test_tag_item_sequence(model_filename, xseq, yseq):
    """Tagging an ItemSequence matches the expected labels."""
    with Tagger().open(model_filename) as tagger:
        result = tagger.tag(ItemSequence(xseq))
    assert result == yseq
Example #33
0
def test_tag(model_filename, xseq, yseq):
    """A trained model reproduces the expected label sequence."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(xseq)
    assert predicted == yseq
Example #34
0
def test_tag_not_opened(xseq):
    """Tagging before a model is opened must raise."""
    unopened = Tagger()
    with pytest.raises(Exception):
        unopened.tag(xseq)
Example #35
0
class PassageTagger(object):
  """Clause-sequence tagger with two interchangeable backends:

  * algorithm == "crf": pycrfsuite Trainer/Tagger over feature dicts
  * otherwise: pystruct ChainCRF + FrankWolfeSSVM over dense count
    vectors, persisted via pickle together with feature/label index maps

  NOTE(review): Python 2 code (`print >>sys.stderr`).
  """

  def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Set up a trainer (do_train=True) or a tagger for *algorithm*."""
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
      if do_train:
        self.trainer = Trainer()
      else:
        self.tagger = Tagger()
    else:
      if do_train:
        model = ChainCRF()
        self.trainer = FrankWolfeSSVM(model=model)
        self.feat_index = {}
        self.label_index = {}
      else:
        # unpickle the SSVM model and its index maps from disk
        self.tagger = pickle.load(open(self.trained_model_name, "rb"))
        self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
        label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
        self.rev_label_index = {i: x for x, i in label_index.items()}

  def read_input(self, filename):
    """Read a blank-line-separated clause file.

    In training mode each line is "clause<TAB>label"; otherwise each
    line is just a clause. Returns parallel lists of clause sequences,
    feature-dict sequences, and label sequences (empty when not
    training).
    """
    str_seqs = []
    str_seq = []
    feat_seqs = []
    feat_seq = []
    label_seqs = []
    label_seq = []
    for line in codecs.open(filename, "r", "utf-8"):
      lnstrp = line.strip()
      if lnstrp == "":
        # a blank line terminates the current sequence
        if len(str_seq) != 0:
          str_seqs.append(str_seq)
          str_seq = []
          feat_seqs.append(feat_seq)
          feat_seq = []
          label_seqs.append(label_seq)
          label_seq = []
      else:
        if self.do_train:
          clause, label = lnstrp.split("\t")
          label_seq.append(label)
        else:
          clause = lnstrp
        str_seq.append(clause)
        feats = self.fp.get_features(clause)
        # feature -> occurrence count within the clause
        feat_dict = {}
        for f in feats:
          if f in feat_dict:
            feat_dict[f] += 1
          else:
            feat_dict[f] = 1
        #feat_dict = {i: v for i, v in enumerate(feats)}
        feat_seq.append(feat_dict)
    # flush a trailing sequence not followed by a blank line
    if len(str_seq) != 0:
      str_seqs.append(str_seq)
      str_seq = []
      feat_seqs.append(feat_seq)
      feat_seq = []
      label_seqs.append(label_seq)
      label_seq = []
    return str_seqs, feat_seqs, label_seqs

  def predict(self, feat_seqs):
    """Predict one label sequence per feature sequence."""
    print >>sys.stderr, "Tagging %d sequences"%len(feat_seqs)
    if self.algorithm == "crf":
      self.tagger.open(self.trained_model_name)
      preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    else:
      # vectorize: one dense count vector per clause, positions given by
      # feat_index; unseen features are dropped
      Xs = []
      for fs in feat_seqs:
        X = []
        for feat_dict in fs:
          x = [0] * len(self.feat_index)
          for f in feat_dict:
            if f in self.feat_index:
              x[self.feat_index[f]] = feat_dict[f]
          X.append(x)
        Xs.append(numpy.asarray(X))
      pred_ind_seqs = self.tagger.predict(Xs)
      # map predicted label indices back to label strings
      preds = []
      for ps in pred_ind_seqs:
        pred = []
        for pred_ind in ps:
          pred.append(self.rev_label_index[pred_ind])
        preds.append(pred)
    return preds

  def train(self, feat_seqs, label_seqs):
    """Train the selected backend and persist the model to disk."""
    print >>sys.stderr, "Training on %d sequences"%len(feat_seqs)
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
    else:
      # build the feature index over every observed feature
      for fs in feat_seqs:
        for feat_dict in fs:
          for f in feat_dict:
            if f not in self.feat_index:
              self.feat_index[f] = len(self.feat_index)
      Xs = []
      for fs in feat_seqs:
        X = []
        for feat_dict in fs:
          x = [0] * len(self.feat_index)
          for f in feat_dict:
            x[self.feat_index[f]] = feat_dict[f]
          X.append(x)
        Xs.append(numpy.asarray(X))

      # build the label index
      for ls in label_seqs:
        for label in ls:
          if label not in self.label_index:
            self.label_index[label] = len(self.label_index)

      Ys = []
      for ls in label_seqs:
        Y = []
        for label in ls:
          Y.append(self.label_index[label])
        Ys.append(numpy.asarray(Y))

      self.trainer.fit(Xs, Ys)
      # persist the SSVM model plus both index maps
      pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
      pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
      pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
Example #36
0
def test_tag_string_lists(model_filename, xseq, yseq):
    """Lists of feature names are accepted, but discarding the weights
    changes the predictions."""
    with Tagger().open(model_filename) as tagger:
        keys_only = [x.keys() for x in xseq]
        assert tagger.tag(keys_only) != yseq
Example #37
0
def test_open_non_existing():
    """Opening a missing path raises IOError."""
    with pytest.raises(IOError):
        Tagger().open('foo')
Example #38
0
def test_open_invalid_with_correct_signature(tmpdir):
    """A file with the right magic prefix but garbage content is rejected."""
    tmp = tmpdir.join('tmp.txt')
    tmp.write(b"lCRFfoo"*100)
    with pytest.raises(ValueError):
        Tagger().open(str(tmp))
Example #39
0
def test_open_invalid_small(tmpdir):
    """A file shorter than the model header is rejected."""
    tmp = tmpdir.join('tmp.txt')
    tmp.write(b'foo')
    with pytest.raises(ValueError):
        Tagger().open(str(tmp))
Example #40
0
def test_open_invalid():
    """Opening a non-model file (this source file) raises ValueError."""
    with pytest.raises(ValueError):
        Tagger().open(__file__)