def test_bad():
    """ItemSequence construction must raise ValueError for malformed input."""
    # First a bare string, then a list whose single item is itself a list
    # wrapping a dict.
    malformed_inputs = ('foo', [[{'foo': 'bar'}]])
    for raw in malformed_inputs:
        with pytest.raises(ValueError):
            seq = pycrfsuite.ItemSequence(raw)
            print(seq.items())
Esempio n. 2
0
 def _add_point_to_model(self, srcid, trainer):
     """Append the training sequence(s) stored for ``srcid`` to ``trainer``.

     When ``self.concatenate_sentences`` is set, all sentences and labels of
     the srcid are merged into a single sequence; otherwise one sequence is
     appended per metadata type.
     """
     if not self.concatenate_sentences:
         # One (features, labels) pair per metadata type.
         for metadata_type, sentence in self.sentence_dict[srcid].items():
             labels = self.label_dict[srcid][metadata_type]
             item_seq = pycrfsuite.ItemSequence(
                 self._calc_features(sentence, None))
             trainer.append(item_seq, labels)
         return

     # Single merged sequence for the whole srcid.
     sentence = self.merge_sentences(self.sentence_dict[srcid])
     labels = self.merge_labels(self.label_dict[srcid])
     assert len(sentence) == len(labels)
     item_seq = pycrfsuite.ItemSequence(self._calc_features(sentence, None))
     trainer.append(item_seq, labels)
Esempio n. 3
0
    def update_model(self, srcids):
        """Retrain the CRF on all learning srcids and store the new model.

        ``srcids`` is appended to ``self.learning_srcids``; a trainer is then
        fed one sequence per (srcid, metadata type) pair, plus the Brick
        sequences when ``self.use_brick_flag`` is set. The trained model is
        written to a temporary file, read back as bytes, handed to
        ``store_model`` and its uuid recorded in ``self.model_uuid``.

        The temporary model file is removed even when reading or storing the
        model fails, so failed runs do not leave files behind in ``temp/``.

        :param srcids: srcids to add to the learning set.
        """
        assert (len(self.source_buildings) == len(self.source_sample_num_list))
        self.learning_srcids += srcids

        # algorithm: {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'}
        algo = 'ap'
        trainer = pycrfsuite.Trainer(verbose=False, algorithm=algo)
        if algo == 'ap':
            trainer.set('max_iterations', 125)
        trainer.set_params({
            'feature.possible_states': True,
            'feature.possible_transitions': True
        })

        for srcid in self.learning_srcids:
            for metadata_type, sentence in self.sentence_dict[srcid].items():
                labels = self.label_dict[srcid][metadata_type]
                trainer.append(
                    pycrfsuite.ItemSequence(self._calc_features(
                        sentence, None)), labels)
        if self.use_brick_flag:
            for srcid in self.brick_srcids:
                sentence = self.brick_sentence_dict[srcid]
                labels = self.brick_label_dict[srcid]
                trainer.append(
                    pycrfsuite.ItemSequence(self._calc_features(
                        sentence, None)), labels)

        model_uuid = gen_uuid()
        crf_model_file = 'temp/{0}.{1}.model'.format(model_uuid, 'crfsuite')
        try:
            t0 = arrow.get()
            trainer.train(crf_model_file)
            t1 = arrow.get()
            print('training crf took: {0}'.format(t1 - t0))
            with open(crf_model_file, 'rb') as fp:
                model_bin = fp.read()
            model = {
                'gen_time': arrow.get().datetime,
                'use_cluster_flag': self.use_cluster_flag,
                'use_brick_flag': self.use_brick_flag,
                'model_binary': BsonBinary(model_bin),
                'source_building_count': len(self.source_buildings),
                # Stored de-duplicated and sorted, although training above
                # iterates the raw (possibly repeated) list.
                'learning_srcids': sorted(set(self.learning_srcids)),
                'uuid': model_uuid,
                'crftype': 'crfsuite'
            }
            store_model(model)
        finally:
            # The file may not exist if training itself failed; guard so the
            # cleanup never masks the original exception.
            if os.path.exists(crf_model_file):
                os.remove(crf_model_file)
        self.model_uuid = model_uuid
def func_advanced(dialog):
    """Build one feature dict per utterance and wrap them in an ItemSequence.

    Features set per utterance:

    * ``FirstUtt`` for the first utterance of the dialog.
    * ``Speaker_Changed`` when the speaker differs from the previous
      utterance's speaker.
    * When ``utt.pos`` is non-empty: ``Token``, ``PartOfSpeech``, ``Length``,
      ``START_WITH``, ``BiGram``, ``TriGram`` and ``Statement`` (``Question``
      if the last token is ``'?'``, otherwise ``Answer``).
    * Otherwise ``Other``: the utterance text with leading/trailing
      ``<>.,`` characters stripped.

    :param dialog: indexable sequence of utterances exposing ``speaker``,
        ``pos`` (items with ``token`` and ``pos``) and ``text``.
    :return: ``pycrfsuite.ItemSequence`` built from the feature dicts.
    """
    features = []
    for index, utt in enumerate(dialog):
        feature = {}
        if index == 0:
            feature["FirstUtt"] = 1
        elif utt.speaker != dialog[index - 1].speaker:
            feature["Speaker_Changed"] = 1

        if utt.pos:
            tokens = [word.token for word in utt.pos]
            feature['Token'] = tokens
            feature['PartOfSpeech'] = [word.pos for word in utt.pos]
            feature['Length'] = len(utt.pos)
            feature['START_WITH'] = tokens[0]
            feature['BiGram'] = [
                left + "_" + right for left, right in zip(tokens, tokens[1:])
            ]
            if tokens[-1] == '?':
                feature['Statement'] = 'Question'
            else:
                feature['Statement'] = 'Answer'
            # Three consecutive tokens; pairing tokens[:-2] with tokens[2:]
            # alone would drop the middle token and yield skip-bigrams.
            feature['TriGram'] = [
                "_".join(tri)
                for tri in zip(tokens, tokens[1:], tokens[2:])
            ]
        else:
            feature['Other'] = utt.text.strip("<>.,")
        features.append(feature)
    return pycrfsuite.ItemSequence(features)
Esempio n. 5
0
    def tag(data, model_file, class_id):
        """
        Deprecated: label every token of ``data`` using the model in ``model_file``.

        Each token receives a single predicted ``Label`` carrying the tagger's
        marginal probability for that label at the token's position, after
        which ``data.form_predicted_annotations(class_id)`` is called. The
        tagger is closed even if tagging fails.

        :type data: nalaf.structures.data.Dataset
        :type model_file: str
        :param class_id: forwarded to ``data.form_predicted_annotations``
        """
        # The docstring must be the first statement of the body; placed after
        # this call it would be a no-op string expression.
        warnings.warn('Use non-static `annotate` instead', DeprecationWarning)

        tagger = pycrfsuite.Tagger()

        try:
            tagger.open(model_file)

            for sentence in data.sentences():
                labels = tagger.tag(
                    pycrfsuite.ItemSequence(token.features
                                            for token in sentence))

                # The tagger yields one label per item of the sequence.
                for token_index, label in enumerate(labels):
                    sentence[token_index].predicted_labels = [
                        Label(label, tagger.marginal(label, token_index))
                    ]

            data.form_predicted_annotations(class_id)

        finally:
            tagger.close()
def test_floatlists():
    """FF float-list features expand to one ``name:index`` entry per element."""
    vectors = ([1., 2., 3.], [-1., 5, 4.])
    seq = pycrfsuite.ItemSequence([{"w2v": FF(vec)} for vec in vectors])

    expected = [
        {"w2v:0": 1., "w2v:1": 2., "w2v:2": 3.},
        {"w2v:0": -1., "w2v:1": 5., "w2v:2": 4.},
    ]
    assert len(seq) == 2
    assert seq.items() == expected

    # Feeding the flattened items back in must give the same items again.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
def test_dicts():
    """Nested dicts are flattened with ':' and booleans turn into 1.0 / 0.0."""
    inner = {'foo': -1, 'baz': False}
    item = {'foo': True, 'bar': inner}
    seq = pycrfsuite.ItemSequence([item])

    expected = [{'foo': 1.0, 'bar:foo': -1, 'bar:baz': 0.0}]
    assert len(seq) == 1
    assert seq.items() == expected
def test_unicode():
    """Non-ASCII keys and string values survive the round trip through ItemSequence."""
    item = {
        'foo': u'привет',
        u'ключ': 1.0,
        u'привет': u'мир',
    }
    expected = [{
        u'foo:привет': 1.0,
        u'ключ': 1.0,
        u'привет:мир': 1.0,
    }]
    seq = pycrfsuite.ItemSequence([item])
    assert seq.items() == expected
Esempio n. 9
0
    def annotate(self, corpus, class_id):
        """
        Tag every sentence of ``corpus`` with ``self.tagger`` and store, on each
        token, one predicted ``Label`` holding the label and its marginal
        probability; then form the predicted annotations for ``class_id``.

        :type corpus: nalaf.structures.data.Dataset
        :type class_id: str ~ to annotate with
        """

        for sentence in corpus.sentences():
            item_sequence = pycrfsuite.ItemSequence(token.features for token in sentence)
            labels = self.tagger.tag(item_sequence)

            for position in range(len(sentence)):
                predicted = labels[position]
                confidence = self.tagger.marginal(predicted, position)
                sentence[position].predicted_labels = [Label(predicted, confidence)]

        corpus.form_predicted_annotations(class_id)
def test_nested():
    """Nested dicts, lists and sets are flattened into ':'-joined feature names."""
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": {"x", "y"},
        },
    }
    seq = pycrfsuite.ItemSequence([first_item, second_item])

    expected_first = {
        'foo:bar:baz': 1.0,
        'foo:spam': 0.5,
        'foo:egg:x': 1.0,
        'foo:egg:y': 1.0,
        'foo:ham:x': -0.5,
        'foo:ham:y': -0.1,
    }
    expected_second = {
        'foo:bar:ham': 1.0,
        'foo:spam': -0.5,
        'foo:ham:x': 1.0,
        'foo:ham:y': 1.0,
    }
    assert len(seq) == 2
    assert seq.items() == [expected_first, expected_second]

    # Already-flattened items must pass through unchanged.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
Esempio n. 11
0
    def train(data, model_file, params=None):
        """
        Train a CRF on the sentences of ``data``, one training sequence per
        sentence, using each token's ``features`` and the value of its first
        original label.

        :type data: nalaf.structures.data.Dataset
        :type model_file: str ~ filename (from local file system) to save trained model to. If None, no model is saved.
        :param params: optional trainer parameters, applied only when not None
        """

        trainer = pycrfsuite.Trainer()
        if params is not None:
            trainer.set_params(params)

        for sentence in data.sentences():
            xseq = pycrfsuite.ItemSequence([token.features for token in sentence])
            yseq = [token.original_labels[0].value for token in sentence]
            trainer.append(xseq, yseq)

        # CRFSuite itself serialises the trained model to ``model_file``.
        trainer.train(model_file)
Esempio n. 12
0
    def annotate(self, corpus, class_id):
        """
        Tag every sentence of ``corpus`` with ``self.tagger`` and attach one
        predicted ``Label`` (label plus marginal probability) to each token.
        Any failure while computing or assigning a token's label is re-raised
        as a generic ``Exception`` that carries the original error.

        :type corpus: nalaf.structures.data.Dataset
        :type class_id: str ~ to annotate with
        """

        for sentence in corpus.sentences():
            item_sequence = pycrfsuite.ItemSequence(token.features for token in sentence)
            labels = self.tagger.tag(item_sequence)

            for position in range(len(sentence)):
                predicted = labels[position]
                try:
                    scored = Label(predicted, self.tagger.marginal(predicted, position))
                    sentence[position].predicted_labels = [scored]
                except Exception as e:
                    raise Exception("Exception when assining the predicted labels; likely a Multi-Thread problem", e)

        corpus.form_predicted_annotations(class_id)
Esempio n. 13
0
    def asSequence(self, fl):
        """Convert this collection into parallel lists of ItemSequences and labels.

        Every element of ``self`` must be a list of pairs; the first member of
        each pair is turned into features via ``getIndices2(fl)`` and the
        second is taken as its label.

        :return: ``(X, Y)`` where ``X`` holds one ``pycrfsuite.ItemSequence``
            per element and ``Y`` the matching label lists, or ``None`` as
            soon as an element that is not a list is encountered.
        """
        item_sequences = []
        label_sequences = []
        for entry in self:
            if not isinstance(entry, list):
                return None

            # Build (features, label) pairs in one pass, then split them.
            pairs = [(member[0].getIndices2(fl), member[1]) for member in entry]
            step_features = [pair[0] for pair in pairs]
            step_labels = [pair[1] for pair in pairs]

            item_sequences.append(pycrfsuite.ItemSequence(step_features))
            label_sequences.append(step_labels)
        return item_sequences, label_sequences
Esempio n. 14
0
    def feature(self, l_items):
        """Apply every template in ``self.template`` to each position of ``l_items``.

        A template is ``(name, rules, weight)`` where each rule is a
        ``(field, offset)`` pair. ``"bos"`` / ``"eos"`` rules only match at the
        first / last position; any other field is read with ``self._get_item``
        from the item at ``position + offset``. A template is dropped for a
        position as soon as one of its rules cannot be satisfied. Otherwise
        the feature ``"<name>=<v1>|<v2>|..."`` is emitted with the template
        weight, multiplied by ``self.ig_val`` when all collected values are
        identical and that value occurs in ``self.ig_key``.

        :return: ``pycrfsuite.ItemSequence`` with one feature dict per item.
        """
        valid_positions = range(len(l_items))
        last_position = len(l_items) - 1
        rows = []
        for wid, _ in enumerate(l_items):
            row = {}
            for name, l_rule, weight in self.template:
                parts = []
                for field, offset in l_rule:
                    if field == "bos":
                        if wid != 0:
                            break
                        parts.append("__BOS__")
                    elif field == "eos":
                        if wid != last_position:
                            break
                        parts.append("__EOS__")
                    else:
                        target = wid + offset
                        if target not in valid_positions:
                            break
                        parts.append(self._get_item(l_items[target], field))
                else:
                    # Reached only when no rule bailed out via ``break``.
                    distinct = set(parts)
                    if len(distinct) == 1 and len(distinct & set(self.ig_key)) == 1:
                        weight = weight * self.ig_val
                    fval = "|".join(parts)
                    row["=".join((name, fval))] = weight
            rows.append(row)

        return pycrfsuite.ItemSequence(rows)
Esempio n. 15
0
            "Train and test files are in 1 word per line format, w2v and c2v are pickles mapping a word or a "
            "character to its google embedding vector or char embedding.")
        exit()
    train, test = sys.argv[1], sys.argv[2]
    use_embeddings = len(sys.argv) == 5
    if use_embeddings:
        w2v_vocab, w2v_weights = w2v_matrix_vocab_generator(sys.argv[3])
        c2v_vocab, c2v_weights = w2v_matrix_vocab_generator(sys.argv[4])

    train = get_data(train)
    test = get_data(test)

    trainer = pycrfsuite.Trainer(verbose=True)
    # TRAINING
    X_train = ([
        pycrfsuite.ItemSequence(sent2features(s, use_embeddings))
        for s in train
    ])

    y_train = [sent2labels(s) for s in train]

    X_test = ([
        pycrfsuite.ItemSequence(sent2features(s, use_embeddings)) for s in test
    ])
    y_test = [sent2labels(s) for s in test]

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 0.0,  # coefficient for L1 penalty
Esempio n. 16
0
        sys.exit(1)

    # Check the testing file exists or not?
    if not os.path.exists(path):
        print "The testing file \'%s\' does not exists. Try again!" % path
        sys.exit(1)
    elif not os.path.isfile(path):
        print "The testing file \'%s\' is not a file. Try again!" % path
        sys.exit(1)

    # Loading testing file
    print "Loading the testing file ..."
    testset = np.load(path)

    # Convert testing set into CRF Feature Format
    featset = pycrfsuite.ItemSequence(testset[:, 0])
    ref = [str(label) for label in testset[:, 1]]

    # Loading the model
    print "Loading the CRF model..."
    tagger = pycrfsuite.Tagger()
    tagger.open(model)

    # Testing progress
    #sys.stdout.write("Testing: ")
    #sys.stdout.flush()
    #pred = []
    #idx = 0
    #for i in featset.items():
    #    idx += 1
    #    if idx % 1000 == 0:
def test_basic():
    """An ItemSequence built from an empty list has no length and no items."""
    empty_seq = pycrfsuite.ItemSequence([])
    size = len(empty_seq)
    assert size == 0
    contents = empty_seq.items()
    assert contents == []
Esempio n. 18
0

# Load the training and test sentences and derive per-sentence features and
# label sequences from them.
train_sents = LoadFile.load_crf_data(train_data_path)
test_sents = LoadFile.load_crf_data(test_data_path)
X_train = [sent2features(s) for s in train_sents]

y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

trainer = pycrfsuite.Trainer(verbose=False)

# Feed every training sentence to the trainer as an ItemSequence plus labels.
for xseq, yseq in zip(X_train, y_train):

    itemseq = pycrfsuite.ItemSequence(xseq)

    trainer.append(itemseq, yseq)
trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

# Train and write the model to a file in the working directory.
trainer.train('conll2002-esp.crfsuite')
# NOTE(review): Python 2 print statement; reports the number of logged
# iterations and the last iteration's log entry.
print len(trainer.logparser.iterations), trainer.logparser.iterations[-1]

tagger = pycrfsuite.Tagger()
Esempio n. 19
0
    idx = 0
    X = []
    Y = []
    for sen in txt:
        for p in sen.getPredicates():

            if p.pos.startswith("V"):
                sq_dt = []
                sq_lbl = []
                for arg in sen:
                    if arg in p.arguments.keys():
                        sq_dt.append(ds[idx][0].getIndices2(fg))
                        sq_lbl.append(ds[idx][1])
                        idx += 1

                iq = pycrfsuite.ItemSequence(sq_dt)

                X.append(iq)
                Y.append(sq_lbl)
    print("start training...")
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X, Y):
        trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
Esempio n. 20
0
    labels = list(crf.classes_)
    #labels.remove('O')
    y_pred = crf.predict(X_test)
    #print(cross_val_score(crf,X_train,y_train,cv=5,scoring='f1_micro'))
    print(metrics.flat_f1_score(y_test, y_pred, average='weighted'))
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    clf = svm.SVC(kernel='linear')
    temp = numpy.array(DictVectorizer(sparse=False).fit_transform(X_train[0]))
    X = list()
    to_vector = pycrfsuite.ItemSequence(X_train[0]).items()
    for t in X_train:
        wordtemp = pycrfsuite.ItemSequence(t).items()
        for z in wordtemp:
            to_vector.append(z)
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(to_vector)
    Y1 = list()
    for t in y_train[0]:
        Y1.append(t)
    for t in y_train:
        #wordtemp=pycrfsuite.ItemSequence(t).items()
        for z in t:
            Y1.append(z)
    #Y=FeatureHasher().fit_transform(Y1)
    clf.fit(X, Y1)
def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW',
         verbose=False):
    """
    A simple test to check that the calling mechanism from Python gives the same
    results as if CRFsuite were called directly from the command line. Requires
    a compiled version of the CRFsuite.

    The sequence probability and the per-token marginal probabilities are
    asserted against hard-coded values, so the checks only hold for the
    default ``raw_string``. The tagged sequence is then written to
    ``training/test.txt`` and the ``crfsuite`` command line tool is run on it.

    :param raw_string: input string to test
    :type raw_string: str
    :param verbose: additional debugging output
    :type verbose: bool

    :return: None
    """
    print('Input string:', raw_string)
    print('Python Results:', tag(raw_string))

    tokens = tok.tokenize(raw_string)
    features = tok.tokens2features(tokens)

    if verbose:
        print('features:', features)

    tags = TAGGER.tag(features)
    print('Inferred tags:', tags)

    print('Probability of the sequence:', round(TAGGER.probability(tags), 6))
    assert round(TAGGER.probability(tags),
                 6) == 0.992256, 'Sequence probability not correct'

    results = [
        0.999999, 0.999999, 0.999846, 0.993642, 0.999728, 1., 1., 0.998874, 1.,
        1.
    ]
    for i, tg in enumerate(tags):
        prob = round(TAGGER.marginal(tg, i), 6)
        print('Marginal probability of', tg, 'in position', i, 'is', prob)
        assert prob == results[
            i], 'Marginal Probability of a Label not correct'

    if verbose:
        print(TAGGER.info().transitions)
        print(TAGGER.info().state_features)
        print(TAGGER.info().attributes)

    # Flatten the features once; items() builds a new list on every call.
    item_dicts = pycrfsuite.ItemSequence(features).items()

    # Write one line per token: the tag, then tab-terminated
    # "name:value" fields with ':' inside names escaped as '\:'.
    # The context manager closes the file even if a write fails.
    with open('training/test.txt', 'w') as fh:
        for tg, items in zip(tags, item_dicts):
            fields = [
                str(item).replace(':', '\\:') + ':' + str(items[item]) + '\t'
                for item in sorted(items)
            ]
            fh.write(tg + '\t' + ''.join(fields) + '\n')

    # command line call to the C code to test the output
    print('\nCRFsuite call results:')
    os.system(
        'crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')
Esempio n. 22
0
# Repeated random-sample experiment: each round trains a CRF on a random
# subset of the labelled sequences and reopens the resulting model.
iter_num = 15  # number of rounds
sample_num = 300  # sequences drawn per round
precision_list = list()
for c in range(0, iter_num):
    print(c)
    #%%time
    trainer = pycrfsuite.Trainer(verbose=False)
    #for srcid, setence in sentenceDict.items():

    # Draw sample_num distinct positions into labelListDict's iteration order.
    randomIdxList = random.sample(range(0, len(labelListDict)), sample_num)
    for i, (srcid, labels) in enumerate(labelListDict.items()):
        # Skip entries whose position was not sampled for this round.
        if i not in randomIdxList:
            continue
        sentence = sentenceDict[srcid]
        #trainer.append(pycrfsuite.ItemSequence(calc_features(sentence, labels)), labels)
        trainer.append(pycrfsuite.ItemSequence(calc_features(sentence)),
                       labels)

    # In[6]:

    #%%time
    # The same model file name is reused, so each round overwrites the last.
    trainer.train('random.crfsuite')

    # In[7]:

    tagger = pycrfsuite.Tagger()
    tagger.open('random.crfsuite')

    # In[8]:

    #%%time
Esempio n. 23
0
def sent2features(sent, features):
    """Return an ItemSequence of ``word2features(sent, i, features)`` for each index of ``sent``."""
    per_position = []
    for position in range(len(sent)):
        per_position.append(word2features(sent, position, features))
    return pycrfsuite.ItemSequence(per_position)
Esempio n. 24
0
 def itemsequence(self):
     """Return ``self.features`` wrapped in a new ``pycrfsuite.ItemSequence``."""
     wrapped = pycrfsuite.ItemSequence(self.features)
     return wrapped
def to_item_sequence(x, feat):
    """Return an ItemSequence holding ``feat(x, i)`` for every index ``i`` of ``x``."""
    extracted = []
    for position in range(len(x)):
        extracted.append(feat(x, position))
    return pycrf.ItemSequence(extracted)
def test_lists():
    """Lists of strings become dicts mapping each string to 1.0."""
    raw_items = [['foo', 'bar'], ['bar', 'baz']]
    expected = [{'foo': 1.0, 'bar': 1.0}, {'bar': 1.0, 'baz': 1.0}]

    seq = pycrfsuite.ItemSequence(raw_items)
    assert len(seq) == 2
    assert seq.items() == expected

    # Converted items must be accepted as input and come back unchanged.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
Esempio n. 27
0
def sent2features(d, sent, h_input):
    """Return an ItemSequence of ``word2features(d, sent, h_input, i)`` for each index of ``sent``."""
    positions = range(len(sent))
    per_word = list(
        map(lambda i: word2features(d, sent, h_input, i), positions))
    return pycrfsuite.ItemSequence(per_word)
Esempio n. 28
0
        sys.exit(2)

    # Check the training file exists or not?
    if not os.path.exists(path):
        print "The training file \'%s\' does not exists. Try again!" % path
        sys.exit(1)
    elif not os.path.isfile(path):
        print "The training file \'%s\' is not a file. Try again!" % path
        sys.exit(1)

    # Loading training data
    print "Loading the training data..."
    trainset = np.load(path)

    # Convert training data to CRF Feature Format
    featset = pycrfsuite.ItemSequence(trainset[:, 0])
    labelset = trainset[:, 1]

    # Create a trainer
    trainer = pycrfsuite.Trainer()

    # Feeding training data to Trainer
    trainer.append(featset, labelset)

    # Set up some parameters of Trainer
    trainer.set_params({'c1': 1.0,\
                        'c2': 1e-3,\
                        'max_iterations': 50,\
                        'feature.possible_transitions': True})

    # Show parameters of Trainer