コード例 #1
0
ファイル: profile.py プロジェクト: REVLWorld/pyhacrf
def test_derivate_large():
    """Check the gradient returned by ``_Model.forward_backward`` numerically.

    Builds a model over a random (20, 3, 10) input with random parameters,
    approximates the gradient of the first value returned by
    ``forward_backward`` with forward finite differences, and asserts that
    the gradient returned by ``forward_backward`` agrees to 4 decimals.
    """
    classes = ['a', 'b', 'c']
    y = 'b'
    x = random.randn(20, 3, 10) * 5 + 3
    state_machine, states_to_classes = Hacrf._default_state_machine(classes)
    # Only the shape of the initial parameters is used; the values are
    # replaced with random ones on the next line.
    parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
    parameters = random.randn(*parameters.shape) * 10 - 2

    test_model = _Model(state_machine, states_to_classes, x, y)
    expected_dll = np.zeros(parameters.shape)

    # Finite difference gradient approximation
    delta = 10.0**-7
    # The baseline value does not depend on (s, d), so evaluate it once.
    # A finite difference already presumes forward_backward is deterministic
    # for fixed parameters.
    y0, _ = test_model.forward_backward(parameters)
    S, D = expected_dll.shape
    for s in range(S):
        for d in range(D):
            dg = np.zeros(parameters.shape)
            dg[s, d] = delta
            y1, _ = test_model.forward_backward(parameters + dg)
            expected_dll[s, d] = (y1 - y0) / delta

    _, actual_dll = test_model.forward_backward(parameters)

    # Parenthesised so the sum is what gets printed under both the Python 2
    # print statement and the Python 3 print function.
    print((abs(actual_dll) - abs(expected_dll)).sum())
    assert_array_almost_equal(actual_dll, expected_dll, decimal=4)
コード例 #2
0
ファイル: profile.py プロジェクト: tirkarthi/pyhacrf
def test_derivate_large():
    """Compare the analytic gradient of ``_Model`` with a numerical one.

    A model is built over a random (20, 3, 10) input with random parameters.
    Each partial derivative of the first value returned by
    ``forward_backward`` is approximated by a forward finite difference and
    the result is compared, to 4 decimals, with the gradient that
    ``forward_backward`` itself returns.
    """
    classes = ['a', 'b', 'c']
    y = 'b'
    x = random.randn(20, 3, 10) * 5 + 3
    state_machine, states_to_classes = Hacrf._default_state_machine(classes)
    # _initialize_parameters is called only to obtain the parameter shape.
    parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
    parameters = random.randn(*parameters.shape) * 10 - 2

    test_model = _Model(state_machine, states_to_classes, x, y)
    expected_dll = np.zeros(parameters.shape)

    # Finite difference gradient approximation
    delta = 10.0**-7
    # Unperturbed value: identical for every (s, d), so it is computed once
    # instead of once per parameter.
    y0, _ = test_model.forward_backward(parameters)
    S, D = expected_dll.shape
    for s in range(S):
        for d in range(D):
            dg = np.zeros(parameters.shape)
            dg[s, d] = delta
            y1, _ = test_model.forward_backward(parameters + dg)
            expected_dll[s, d] = (y1 - y0) / delta

    _, actual_dll = test_model.forward_backward(parameters)

    # The original `print(...).sum()` calls .sum() on print's return value
    # (None) under Python 3; the whole expression must be the argument.
    print((abs(actual_dll) - abs(expected_dll)).sum())
    assert_array_almost_equal(actual_dll, expected_dll, decimal=4)
    def train(self):
        """Fit the correction model, or load a previously pickled one.

        ``self.fe`` is always (re)created.  When ``self.needTraining`` is
        true the method reads ``self.infile``, builds positive pairs from the
        second tab-separated field of each line (its first ``' | '``-separated
        entry paired with each of the remaining entries), holds out 200 pairs
        for testing, creates negative pairs by shuffling the first elements
        of the training pairs, fits a ``Hacrf`` model and pickles it to
        ``Corrector.pkl``.  Otherwise the model is loaded from
        ``Corrector.pkl``.

        Sets ``self.fe`` and ``self.m``; when training also sets
        ``self.ppairs_train``, ``self.ppairs_test``, ``self.x_train`` and
        ``self.y_train``.
        """
        # Training
        self.fe = StringPairFeatureExtractor(match=True,
                                             numeric=True,
                                             transition=True)
        if self.needTraining:
            # Close the input file deterministically instead of relying on
            # garbage collection.
            with open(self.infile, 'r') as source:
                lines = source.readlines()

            # Generate Positive Correction Pair
            groups = [
                line.split('\t')[1].strip().split(' | ') for line in lines
            ]
            ppairs = [(group[0], group[i]) for group in groups
                      for i in range(1, len(group))]

            # Generate Positive Training Correction Pairs and Testing Correction Pairs
            ppairs_train, ppairs_test = train_test_split(ppairs,
                                                         test_size=200,
                                                         random_state=1)
            self.ppairs_train = [
                tuple(ppair_train) for ppair_train in ppairs_train
            ]
            self.ppairs_test = [
                tuple(ppair_test) for ppair_test in ppairs_test
            ]

            # Generate Negative Training Correction Pairs: shuffling one side
            # breaks the pairing.  Explicit lists are used because
            # zip(...)[0] only works where zip returns a list (Python 2).
            incorrect = [pair[0] for pair in ppairs_train]
            shuffle(incorrect)
            correct = [pair[1] for pair in ppairs_train]
            npairs_train = list(zip(incorrect, correct))

            # Raw training set
            x_raw = ppairs_train + npairs_train
            # Label of the training set: 0 for real pairs, 1 for shuffled ones
            self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)

            # Extract Features from the raw training set
            self.x_train = self.fe.fit_transform(x_raw)
            self.m = Hacrf(l2_regularization=10.0,
                           optimizer=fmin_l_bfgs_b,
                           optimizer_kwargs={'maxfun': 45},
                           state_machine=None)
            self.m.fit(self.x_train, self.y_train, verbosity=20)
            with open('Corrector.pkl', 'wb') as model_file:
                cPickle.dump(self.m, model_file)
        else:
            # NOTE(review): these messages are emitted around *loading* the
            # pickled model, not around training; wording kept as-is.
            print("start training")
            # NOTE(review): unpickling can execute arbitrary code; only load
            # a Corrector.pkl that this program wrote itself.
            with open('Corrector.pkl', 'rb') as model_file:
                self.m = cPickle.load(model_file)
            print("finish training")
コード例 #4
0
ファイル: __init__.py プロジェクト: fagan2888/highered
    def __init__(self):
        """Create a two-class Hacrf model preloaded with fixed parameters.

        Sets ``self.model`` (with ``parameters`` and ``classes`` assigned),
        ``self.parameters`` (the transpose of the model parameters) and
        ``self.feature_extractor``.
        """
        class_names = ['match', 'non-match']
        hacrf = self.model = Hacrf(
            l2_regularization=100.0,
            state_machine=DefaultStateMachine(class_names))

        # Hard-coded 8x2 parameter matrix, stored column-major.
        hacrf.parameters = np.array(
            [
                [-0.22937526, 0.51326066],
                [0.01038001, -0.13348901],
                [-0.03062821, 0.13769178],
                [0.02024813, -0.01835538],
                [0.09208272, 0.15466022],
                [-0.08170265, -0.02484392],
                [-0.01762858, 0.17504624],
                [0.02800866, -0.04442708],
            ],
            order='F')
        self.parameters = self.model.parameters.T
        hacrf.classes = ['match', 'non-match']

        self.feature_extractor = StringPairFeatureExtractor(
            match=True, numeric=False)
コード例 #5
0
ファイル: __init__.py プロジェクト: datamade/highered
class CRFEditDistance(object) :
    """String-pair scorer backed by a two-class Hacrf model.

    The model is created with classes ``'match'`` / ``'non-match'`` and a
    hard-coded parameter matrix, so instances can be called without
    training; ``train`` refits the model on labelled string pairs.
    """

    def __init__(self) :
        """Build the model, install the fixed parameters and the extractor."""
        classes = ['match', 'non-match']
        self.model = Hacrf(l2_regularization=100.0,
                           state_machine=DefaultStateMachine(classes))
        self.model.parameters = np.array(
            [[-0.22937526,  0.51326066],
             [ 0.01038001, -0.13348901],
             [-0.03062821,  0.13769178],
             [ 0.02024813, -0.01835538],
             [ 0.09208272,  0.15466022],
             [-0.08170265, -0.02484392],
             [-0.01762858,  0.17504624],
             [ 0.02800866, -0.04442708]],
            order='F')
        # Transposed parameters, read by fast_pair.
        self.parameters = self.model.parameters.T
        self.model.classes = ['match', 'non-match']

        self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                            numeric=False)

    def fast_pair(self, x):
        """Return ``forward_predict`` applied to ``x @ self.parameters``.

        *x* is the feature array of one string pair, as produced by the
        feature extractor in ``__call__``.
        """
        x_dot_parameters = np.matmul(x, self.parameters)

        # NOTE(review): the literal 2 is presumably the number of classes.
        probs = forward_predict(x_dot_parameters, 2)

        return probs

    def train(self, examples, labels) :
        """Refit the model on *examples* (pairs of strings) and *labels*.

        Every pair is reordered so the shorter string comes first, matching
        the ordering ``__call__`` applies before scoring.
        """
        examples = [(string_2, string_1)
                    if len(string_1) > len(string_2)
                    else (string_1, string_2)
                    for string_1, string_2
                    in examples]
        print(examples)
        extracted_examples = self.feature_extractor.fit_transform(examples)
        self.model.fit(extracted_examples, labels, verbosity=1)
        # Scoring reads self.parameters, not self.model.parameters; refresh
        # it so a refit is actually used by __call__ (harmless if fit()
        # happened to update the old array in place).
        self.parameters = self.model.parameters.T

    def __call__(self, string_1, string_2) :
        """Score a pair of strings; return NaN if either one is empty.

        Returns element ``[1]`` of ``fast_pair``'s result for the pair
        (presumably the entry for the second class, ``'non-match'``).
        """
        # No alignment exists for an empty string; report "no score".
        if not string_1 or not string_2 :
            return np.nan
        if len(string_1) > len(string_2) :
            string_1, string_2 = string_2, string_1
        # Column vector of characters against a row vector of characters.
        array1 = np.array(tuple(string_1)).reshape(-1, 1)
        array2 = np.array(tuple(string_2)).reshape(1, -1)
        features = self.feature_extractor._extract_features(array1, array2)
        return self.fast_pair(features)[1]
コード例 #6
0
    def test_derivate_large(self):
        """Check ``_AdjacentModel``'s gradient against finite differences.

        Uses a random (8, 3, 10) input and random parameters, approximates
        each partial derivative of the first value returned by
        ``forward_backward`` numerically, and asserts that the returned
        gradient is NaN-free and matches to ``TEST_PRECISION`` decimals.
        """
        classes = ['a', 'b', 'c']
        y = 'b'
        x = random.randn(8, 3, 10) * 5 + 3
        state_machine = DefaultStateMachine(classes)
        # Only the shape of the initial parameters is used.
        parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
        parameters = random.randn(*parameters.shape) * 10 - 2

        test_model = _AdjacentModel(state_machine, x, y)

        expected_dll = np.zeros(parameters.shape)

        # Finite difference gradient approximation
        delta = 10.0**-7
        # The unperturbed value is the same for every (s, d): compute it once
        # (a finite difference already presumes forward_backward is
        # deterministic for fixed parameters).
        y0, _ = test_model.forward_backward(parameters)
        S, D = expected_dll.shape
        for s in range(S):
            for d in range(D):
                dg = np.zeros(parameters.shape)
                dg[s, d] = delta
                y1, _ = test_model.forward_backward(parameters + dg)
                print(s, d, y0, y1)
                expected_dll[s, d] = (y1 - y0) / delta

        _, actual_dll = test_model.forward_backward(parameters)

        print(expected_dll)
        print(actual_dll)
        self.assertEqual((np.isnan(actual_dll)).any(), False)
        assert_array_almost_equal(actual_dll,
                                  expected_dll,
                                  decimal=TEST_PRECISION)
コード例 #7
0
ファイル: test_model.py プロジェクト: pombredanne/pyhacrf
    def test_derivate_large(self):
        """Check ``_Model``'s gradient against a finite-difference estimate.

        Uses a random (8, 3, 10) input and random parameters.  Each partial
        derivative of the first value returned by ``forward_backward`` is
        approximated numerically; the returned gradient must contain no NaN
        and match the estimate to ``TEST_PRECISION`` decimals.
        """
        classes = ['a', 'b', 'c']
        y = 'b'
        x = random.randn(8, 3, 10) * 5 + 3
        state_machine = DefaultStateMachine(classes)
        # _initialize_parameters is called only to obtain the shape.
        parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
        parameters = random.randn(*parameters.shape) * 10 - 2

        test_model = _Model(state_machine, x, y)
        print(test_model._lattice)

        expected_dll = np.zeros(parameters.shape)

        # Finite difference gradient approximation
        delta = 10.0**-7
        # Baseline hoisted out of the loops: it does not depend on (s, d).
        y0, _ = test_model.forward_backward(parameters)
        S, D = expected_dll.shape
        for s in range(S):
            for d in range(D):
                dg = np.zeros(parameters.shape)
                dg[s, d] = delta
                y1, _ = test_model.forward_backward(parameters + dg)
                print(s, d, y0, y1)
                expected_dll[s, d] = (y1 - y0) / delta

        _, actual_dll = test_model.forward_backward(parameters)

        print(expected_dll)
        print(actual_dll)
        self.assertEqual((np.isnan(actual_dll)).any(), False)
        assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION)
コード例 #8
0
ファイル: test_model.py プロジェクト: pombredanne/pyhacrf
    def test_fit_predict_regularized(self):
        """Fit an L2-regularised Hacrf on ten string pairs and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf(l2_regularization=10.0)
        hacrf.fit(features, labels)
        print(hacrf.parameters)

        # Reference parameters.
        reference_parameters = np.array([
            [-0.0569188, 0.07413339, 0.],
            [0.00187709, -0.06377866, 0.],
            [-0.01908823, 0.00586189, 0.],
            [0.01721114, -0.00636556, 0.],
            [0.01578279, 0.0078614, 0.],
            [-0.0139057, -0.00862948, 0.],
            [-0.00623241, 0.02937325, 0.],
            [0.00810951, -0.01774676, 0.],
        ])
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [0.5227226, 0.4772774],
            [0.52568993, 0.47431007],
            [0.4547091, 0.5452909],
            [0.51179222, 0.48820778],
            [0.46347576, 0.53652424],
            [0.45710098, 0.54289902],
            [0.46159657, 0.53840343],
            [0.42997978, 0.57002022],
            [0.47419724, 0.52580276],
            [0.50797852, 0.49202148],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #9
0
ファイル: test_model.py プロジェクト: pombredanne/pyhacrf
    def test_fit_predict(self):
        """Fit a default Hacrf on ten string pairs and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf()
        hacrf.fit(features, labels)

        # Reference parameters.
        reference_parameters = np.array([
            [-10.76945326, 144.03414923, 0.],
            [31.84369748, -106.41885651, 0.],
            [-52.08919467, 4.56943665, 0.],
            [31.01495044, -13.0593297, 0.],
            [49.77302218, -6.42566204, 0.],
            [-28.69877796, 24.47127009, 0.],
            [-85.34524911, 21.87370646, 0.],
            [106.41949333, 6.18587125, 0.],
        ])
        print(hacrf.parameters)
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [1.00000000e+000, 3.51235685e-039],
            [1.00000000e+000, 4.79716208e-039],
            [1.00000000e+000, 2.82744641e-139],
            [1.00000000e+000, 6.49580729e-012],
            [9.99933798e-001, 6.62022561e-005],
            [8.78935957e-005, 9.99912106e-001],
            [4.84538335e-009, 9.99999995e-001],
            [1.25170233e-250, 1.00000000e+000],
            [2.46673086e-010, 1.00000000e+000],
            [1.03521293e-033, 1.00000000e+000],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #10
0
ファイル: test_model.py プロジェクト: pombredanne/pyhacrf
    def test_fit_predict_regularized_viterbi(self):
        """Fit a regularised Hacrf with ``viterbi=True`` and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf(l2_regularization=10.0, viterbi=True)
        hacrf.fit(features, labels)
        print(hacrf.parameters)

        # Reference parameters.
        reference_parameters = np.array([
            [-0.0569188, 0.07413339, 0.],
            [0.00187709, -0.06377866, 0.],
            [-0.01908823, 0.00586189, 0.],
            [0.01721114, -0.00636556, 0.],
            [0.01578279, 0.0078614, 0.],
            [-0.0139057, -0.00862948, 0.],
            [-0.00623241, 0.02937325, 0.],
            [0.00810951, -0.01774676, 0.],
        ])
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [0.56394611, 0.43605389],
            [0.52977205, 0.47022795],
            [0.4751729, 0.5248271],
            [0.51183761, 0.48816239],
            [0.48608081, 0.51391919],
            [0.4986367, 0.5013633],
            [0.46947222, 0.53052778],
            [0.43233544, 0.56766456],
            [0.47463002, 0.52536998],
            [0.51265109, 0.48734891],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #11
0
ファイル: __init__.py プロジェクト: fagan2888/highered
class CRFEditDistance(object):
    """String-pair scorer backed by a two-class Hacrf model.

    Instances start with a hard-coded parameter matrix for the classes
    ``'match'`` / ``'non-match'`` and can be refit with ``train``.
    """

    def __init__(self):
        """Build the model, install the fixed parameters and the extractor."""
        classes = ['match', 'non-match']
        self.model = Hacrf(l2_regularization=100.0,
                           state_machine=DefaultStateMachine(classes))
        self.model.parameters = np.array(
            [[-0.22937526, 0.51326066], [0.01038001, -0.13348901],
             [-0.03062821, 0.13769178], [0.02024813, -0.01835538],
             [0.09208272, 0.15466022], [-0.08170265, -0.02484392],
             [-0.01762858, 0.17504624], [0.02800866, -0.04442708]],
            order='F')
        # Transposed parameters, read by fast_pair.
        self.parameters = self.model.parameters.T
        self.model.classes = ['match', 'non-match']

        self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                            numeric=False)

    def fast_pair(self, x):
        """Return ``forward_predict`` applied to ``x @ self.parameters``."""
        x_dot_parameters = np.matmul(x, self.parameters)

        # NOTE(review): the literal 2 is presumably the number of classes.
        probs = forward_predict(x_dot_parameters, 2)

        return probs

    def train(self, examples, labels):
        """Refit the model on *examples* (pairs of strings) and *labels*.

        Each pair is reordered so the shorter string comes first, the same
        ordering ``__call__`` applies before scoring.
        """
        examples = [(string_2, string_1) if len(string_1) > len(string_2) else
                    (string_1, string_2) for string_1, string_2 in examples]
        print(examples)
        extracted_examples = self.feature_extractor.fit_transform(examples)
        self.model.fit(extracted_examples, labels, verbosity=1)
        # fast_pair reads self.parameters rather than self.model.parameters;
        # refresh it so the refit model is the one used for scoring.
        self.parameters = self.model.parameters.T

    def __call__(self, string_1, string_2):
        """Score a pair of strings; return NaN if either one is empty.

        Returns element ``[1]`` of ``fast_pair``'s result for the pair
        (presumably the entry for the second class, ``'non-match'``).
        """
        # An empty string cannot be aligned; report "no score".
        if not string_1 or not string_2:
            return np.nan
        if len(string_1) > len(string_2):
            string_1, string_2 = string_2, string_1
        # Column vector of characters against a row vector of characters.
        array1 = np.array(tuple(string_1)).reshape(-1, 1)
        array2 = np.array(tuple(string_2)).reshape(1, -1)
        features = self.feature_extractor._extract_features(array1, array2)
        return self.fast_pair(features)[1]
コード例 #12
0
ファイル: test_model.py プロジェクト: pombredanne/pyhacrf
    def test_initialize_parameters(self):
        """``Hacrf._initialize_parameters`` returns a (5, 3) array here.

        The state machine has one start state and three transitions and is
        initialised for three features.
        """
        machine = GeneralStateMachine(
            start_states=[0],
            transitions=[(0, 0, (1, 1)),
                         (0, 1, (0, 1)),
                         (0, 0, (1, 0))],
            states_to_classes={0: 'a'})

        feature_count = 3
        initial = Hacrf._initialize_parameters(machine, feature_count)

        # NOTE(review): 5 rows presumably = 2 states + 3 transitions.
        self.assertEqual(initial.shape, (5, 3))
コード例 #13
0
    def test_initialize_parameters(self):
        """Parameters for a three-transition machine have shape (5, 3).

        Builds a ``GeneralStateMachine`` with a single start state and asks
        ``Hacrf._initialize_parameters`` for a three-feature parameter array.
        """
        edges = [(0, 0, (1, 1)), (0, 1, (0, 1)), (0, 0, (1, 0))]
        machine = GeneralStateMachine(start_states=[0],
                                      transitions=edges,
                                      states_to_classes={0: 'a'})

        feature_count = 3
        wanted_shape = (5, 3)

        produced = Hacrf._initialize_parameters(machine, feature_count)
        self.assertEqual(produced.shape, wanted_shape)
コード例 #14
0
ファイル: __init__.py プロジェクト: pombredanne/highered
class CRFEditDistance(object) :
    """String-pair scorer backed by a Hacrf model with fixed parameters.

    The model uses a ``WiderStateMachine`` over the classes ``'match'`` and
    ``'non-match'``; ``train`` refits it on labelled string pairs.
    """

    def __init__(self) :
        """Create the model, preload its parameters, and build the extractor."""
        hacrf = self.model = Hacrf(l2_regularization=1.0)
        # Hard-coded 8x3 parameter matrix.
        hacrf.parameters = np.array([
            [-1.14087105,  2.41450373, -0.42000576],
            [-0.0619002,   0.79430259,  0.33864121],
            [-0.25353303,  1.69376742,  0.71731646],
            [ 0.31544095,  1.47012227, -0.39960507],
            [ 0.51356569, -0.67293917, -0.56861512],
            [-0.57547361,  0.57599782,  0.3115221 ],
            [ 0.55744877,  0.16423292, -0.64028285],
            [-0.61935669, -0.02237494,  0.49829992],
        ])
        hacrf.classes = ['match', 'non-match']

        hacrf._state_machine = WiderStateMachine(hacrf.classes)

        self.feature_extractor = StringPairFeatureExtractor(
            match=True, numeric=True)

    def train(self, examples, labels) :
        """Refit the model on *examples* (pairs of strings) and *labels*.

        Each pair is reordered so that the shorter string comes first.
        """
        ordered = []
        for first, second in examples :
            if len(first) > len(second) :
                first, second = second, first
            ordered.append((first, second))
        print(ordered)
        extracted = self.feature_extractor.fit_transform(ordered)
        self.model.fit(extracted, labels, verbosity=1)

    def __call__(self, string_1, string_2) :
        """Return entry ``[0, 1]`` of ``predict_proba`` for the pair.

        Returns NaN when either string is empty.  The shorter string is
        passed first.
        """
        if not (string_1 and string_2) :
            return np.nan
        if len(string_1) > len(string_2) :
            pair = (string_2, string_1)
        else :
            pair = (string_1, string_2)
        features = self.feature_extractor.fit_transform((pair,))
        probabilities = self.model.predict_proba(features)
        return probabilities[0,1]
コード例 #15
0
ファイル: __init__.py プロジェクト: pombredanne/highered
    def __init__(self) :
        """Create a Hacrf model preloaded with fixed parameters.

        Sets ``self.model`` (with ``parameters``, ``classes`` and a
        ``WiderStateMachine`` assigned) and ``self.feature_extractor``.
        """
        hacrf = self.model = Hacrf(l2_regularization=1.0)
        # Hard-coded 8x3 parameter matrix.
        hacrf.parameters = np.array([
            [-1.14087105,  2.41450373, -0.42000576],
            [-0.0619002,   0.79430259,  0.33864121],
            [-0.25353303,  1.69376742,  0.71731646],
            [ 0.31544095,  1.47012227, -0.39960507],
            [ 0.51356569, -0.67293917, -0.56861512],
            [-0.57547361,  0.57599782,  0.3115221 ],
            [ 0.55744877,  0.16423292, -0.64028285],
            [-0.61935669, -0.02237494,  0.49829992],
        ])
        hacrf.classes = ['match', 'non-match']

        hacrf._state_machine = WiderStateMachine(hacrf.classes)

        self.feature_extractor = StringPairFeatureExtractor(
            match=True, numeric=True)
コード例 #16
0
    def test_fit_predict(self):
        """Fit a default Hacrf on ten string pairs and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                      'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                    'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf()
        hacrf.fit(features, labels)

        # Reference parameters.
        reference_parameters = np.array([
            [-10.76945326, 144.03414923, 0.],
            [31.84369748, -106.41885651, 0.],
            [-52.08919467, 4.56943665, 0.],
            [31.01495044, -13.0593297, 0.],
            [49.77302218, -6.42566204, 0.],
            [-28.69877796, 24.47127009, 0.],
            [-85.34524911, 21.87370646, 0.],
            [106.41949333, 6.18587125, 0.],
        ])
        print(hacrf.parameters)
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [1.00000000e+000, 3.51235685e-039],
            [1.00000000e+000, 4.79716208e-039],
            [1.00000000e+000, 2.82744641e-139],
            [1.00000000e+000, 6.49580729e-012],
            [9.99933798e-001, 6.62022561e-005],
            [8.78935957e-005, 9.99912106e-001],
            [4.84538335e-009, 9.99999995e-001],
            [1.25170233e-250, 1.00000000e+000],
            [2.46673086e-010, 1.00000000e+000],
            [1.03521293e-033, 1.00000000e+000],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #17
0
    def test_fit_predict_regularized(self):
        """Fit an L2-regularised Hacrf on ten string pairs and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                      'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                    'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf(l2_regularization=10.0)
        hacrf.fit(features, labels)
        print(hacrf.parameters)

        # Reference parameters.
        reference_parameters = np.array([
            [-0.0569188, 0.07413339, 0.],
            [0.00187709, -0.06377866, 0.],
            [-0.01908823, 0.00586189, 0.],
            [0.01721114, -0.00636556, 0.],
            [0.01578279, 0.0078614, 0.],
            [-0.0139057, -0.00862948, 0.],
            [-0.00623241, 0.02937325, 0.],
            [0.00810951, -0.01774676, 0.],
        ])
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [0.5227226, 0.4772774],
            [0.52568993, 0.47431007],
            [0.4547091, 0.5452909],
            [0.51179222, 0.48820778],
            [0.46347576, 0.53652424],
            [0.45710098, 0.54289902],
            [0.46159657, 0.53840343],
            [0.42997978, 0.57002022],
            [0.47419724, 0.52580276],
            [0.50797852, 0.49202148],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #18
0
    def test_fit_predict_regularized_viterbi(self):
        """Fit a regularised Hacrf with ``viterbi=True`` and pin its output.

        The first five pairs are labelled 0 and the last five 1.  Fitted
        parameters, predicted probabilities and predicted labels are compared
        with stored reference values to ``TEST_PRECISION`` decimals.
        """
        misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                      'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                    'relationship', 'husband', 'crazy', 'might', 'topic']
        labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

        extractor = StringPairFeatureExtractor(match=True, numeric=True)
        features = extractor.fit_transform(zip(misspelled, intended))

        hacrf = Hacrf(l2_regularization=10.0, viterbi=True)
        hacrf.fit(features, labels)
        print(hacrf.parameters)

        # Reference parameters.
        reference_parameters = np.array([
            [-0.0569188, 0.07413339, 0.],
            [0.00187709, -0.06377866, 0.],
            [-0.01908823, 0.00586189, 0.],
            [0.01721114, -0.00636556, 0.],
            [0.01578279, 0.0078614, 0.],
            [-0.0139057, -0.00862948, 0.],
            [-0.00623241, 0.02937325, 0.],
            [0.00810951, -0.01774676, 0.],
        ])
        assert_array_almost_equal(hacrf.parameters, reference_parameters,
                                  decimal=TEST_PRECISION)

        # Reference class probabilities, one row per training pair.
        reference_probas = np.array([
            [0.56394611, 0.43605389],
            [0.52977205, 0.47022795],
            [0.4751729, 0.5248271],
            [0.51183761, 0.48816239],
            [0.48608081, 0.51391919],
            [0.4986367, 0.5013633],
            [0.46947222, 0.53052778],
            [0.43233544, 0.56766456],
            [0.47463002, 0.52536998],
            [0.51265109, 0.48734891],
        ])
        probas = hacrf.predict_proba(features)
        print(probas)
        assert_array_almost_equal(probas, reference_probas,
                                  decimal=TEST_PRECISION)

        reference_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        predictions = hacrf.predict(features)
        assert_array_almost_equal(predictions, reference_predictions,
                                  decimal=TEST_PRECISION)
コード例 #19
0
ファイル: __init__.py プロジェクト: datamade/highered
    def __init__(self) :
        """Create a two-class Hacrf model preloaded with fixed parameters.

        Sets ``self.model`` (with ``parameters`` and ``classes`` assigned),
        ``self.parameters`` (the transpose of the model parameters) and
        ``self.feature_extractor``.
        """
        class_names = ['match', 'non-match']
        hacrf = self.model = Hacrf(
            l2_regularization=100.0,
            state_machine=DefaultStateMachine(class_names))

        # Hard-coded 8x2 parameter matrix, stored column-major.
        hacrf.parameters = np.array([
            [-0.22937526,  0.51326066],
            [ 0.01038001, -0.13348901],
            [-0.03062821,  0.13769178],
            [ 0.02024813, -0.01835538],
            [ 0.09208272,  0.15466022],
            [-0.08170265, -0.02484392],
            [-0.01762858,  0.17504624],
            [ 0.02800866, -0.04442708],
        ], order='F')
        self.parameters = self.model.parameters.T
        hacrf.classes = ['match', 'non-match']

        self.feature_extractor = StringPairFeatureExtractor(
            match=True, numeric=False)
コード例 #20
0
 def train(self):
     """Create a Hacrf model in ``self.m`` and fit it on the training data.

     Reads ``self.x_train`` and ``self.y_train``.
     """
     # L-BFGS-B optimiser with maxfun=45.
     settings = {
         'l2_regularization': 10.0,
         'optimizer': fmin_l_bfgs_b,
         'optimizer_kwargs': {'maxfun': 45},
         'state_machine': None,
     }
     self.m = Hacrf(**settings)
     self.m.fit(self.x_train, self.y_train, verbosity=20)
コード例 #21
0
class MisspellingCorrection:
    """Spelling corrector built on an Hacrf string-pair classifier.

    Constructing an instance reads the training file, builds positive and
    negative (misspelling, correction) pairs, extracts features and fits the
    model immediately.

    Attributes set by ``__init__``:
        dictionary: list with the correction word of every positive pair
            (one entry per pair, so duplicates are possible).
        ppairs_train, ppairs_test: positive pairs as tuples, split with
            ``train_test_split`` (200 pairs held out for testing).
        y_train: labels, 0 for positive pairs followed by 1 for negatives.
        fe: the ``StringPairFeatureExtractor`` used for all pairs.
        x_train: extracted features of the raw training pairs.
        m: the ``Hacrf`` model, assigned by ``train``.
    """

    def __init__(self, infile):
        """Load correction pairs from ``infile`` and train the model.

        Each line is split on tabs; the second field is split on ``' | '``.
        Its first item is the misspelling and every following item is an
        accepted correction. A line without a tab raises ``IndexError``.

        Args:
            infile: path of the tab-separated training file.
        """
        # Close the handle deterministically instead of leaking it.
        with open(infile, 'r') as handle:
            lines = handle.readlines()

        # Generate positive correction pairs: one pair per listed correction.
        groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
        ppairs = [(group[0], fix) for group in groups for fix in group[1:]]
        self.dictionary = [pair[1] for pair in ppairs]

        # Split the positive pairs into training and testing sets.
        ppairs_train, ppairs_test = train_test_split(ppairs, test_size=200, random_state=1)
        self.ppairs_train = [tuple(ppair_train) for ppair_train in ppairs_train]
        self.ppairs_test = [tuple(ppair_test) for ppair_test in ppairs_test]

        # Generate negative training pairs by shuffling the misspellings
        # against the corrections.
        # NOTE(review): a shuffled misspelling can land on its own correction
        # again, producing a true pair labelled as negative.
        incorrect = [pair[0] for pair in ppairs_train]
        shuffle(incorrect)
        correct = [pair[1] for pair in ppairs_train]
        # list(...) keeps the concatenation below valid where zip is lazy.
        npairs_train = list(zip(incorrect, correct))

        # Raw training set and its labels (0 = positive, 1 = negative).
        x_raw = ppairs_train + npairs_train
        self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)

        # Extract features from the raw training set, then fit.
        self.fe = StringPairFeatureExtractor(match=True, numeric=True, transition=True)
        self.x_train = self.fe.fit_transform(x_raw)
        self.train()

    def train(self):
        """Create a new Hacrf model and fit it on the stored training data."""
        self.m = Hacrf(l2_regularization=10.0,
                       optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45},
                       state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)

    def _best_candidate(self, incorrect, limit=100):
        """Return the dictionary word the model scores highest for ``incorrect``.

        Only the ``limit`` dictionary entries with the smallest Levenshtein
        distance to ``incorrect`` are scored. Raises ``ValueError`` (from
        ``max``) when the dictionary is empty.
        """
        candidates = heapq.nsmallest(
            limit,
            self.dictionary,
            key=lambda word: levenshtein.levenshtein(incorrect, word))
        test_pairs = [(incorrect, candidate) for candidate in candidates]
        gx_test = self.fe.transform(test_pairs)
        # One row of probabilities per pair in test_pairs.
        pr = self.m.predict_proba(gx_test)
        # Column 0 is used as the score; presumably the probability of
        # label 0 (the positive label in y_train) - confirm against Hacrf.
        best = max(zip(pr, test_pairs), key=lambda scored: scored[0][0])
        return best[1][1]

    def test(self):
        """Print the misses and the accuracy over the held-out positive pairs.

        For every miss the ``(incorrect, correct)`` tuple and the wrong guess
        are printed on one line; a blank line follows every test pair. The
        final line is the fraction of pairs corrected exactly.
        """
        count = 0
        for incorrect, correct in self.ppairs_test:
            guess = self._best_candidate(incorrect)
            if guess == correct:
                count += 1
            else:
                # Single pre-formatted argument so the output is the same
                # whether print is a statement or a function.
                print("%s %s" % ((incorrect, correct), guess))
            print("")
        print(count / float(len(self.ppairs_test)))

    def correct(self, incorrect):
        """Return the best-scoring dictionary word for ``incorrect``."""
        return self._best_candidate(incorrect)
class MisspellingCorrector:
    """Spelling corrector that trains an Hacrf model or loads a pickled one.

    With ``needTraining=True`` the model is fitted from ``infile`` and dumped
    to ``MODEL_PATH``; otherwise a previously dumped model is loaded from
    that path.

    Attributes:
        dictionary: sorted candidate words unpickled from ``dict_file``.
        fe: the ``StringPairFeatureExtractor`` used for all pairs.
        m: the ``Hacrf`` model (trained or unpickled).
        ppairs_train, ppairs_test, y_train, x_train: only assigned when the
            model is trained in this process.
    """

    # Where the fitted model is dumped to / loaded from (relative path).
    MODEL_PATH = 'Corrector.pkl'
    # Returned by correct() when the best guess is too far from the input.
    FALLBACK_WORD = 'gopdebate'
    # Largest Levenshtein distance correct() accepts for its best guess.
    MAX_EDIT_DISTANCE = 2

    def __init__(self, infile, dict_file, needTraining=False):
        """Load the dictionary, then train or load the model.

        Args:
            infile: path of the tab-separated training file (only read when
                ``needTraining`` is true).
            dict_file: path of a pickled collection of candidate words.
            needTraining: fit a new model instead of loading the pickled one.
        """
        print("**************************************")
        self.needTraining = needTraining
        # NOTE(security): unpickling runs arbitrary code embedded in the
        # file; only load dictionaries from a trusted source.
        with open(dict_file, 'rb') as handle:
            self.dictionary = sorted(cPickle.load(handle))
        self.infile = infile
        self.train()

    def train(self):
        """Fit a model from ``self.infile`` or load it from ``MODEL_PATH``.

        Each training line is split on tabs; the second field is split on
        ``' | '`` into a misspelling followed by its accepted corrections.
        """
        self.fe = StringPairFeatureExtractor(match=True,
                                             numeric=True,
                                             transition=True)
        if not self.needTraining:
            # These two messages bracket loading, not training; the text is
            # left unchanged because it is runtime output.
            # NOTE(security): same pickle caveat as for the dictionary.
            # NOTE(review): self.fe is never fitted on this path - confirm
            # the extractor does not need fitting before transform().
            print("start training")
            with open(self.MODEL_PATH, 'rb') as handle:
                self.m = cPickle.load(handle)
            print("finish training")
            return

        with open(self.infile, 'r') as handle:
            lines = handle.readlines()

        # Generate positive correction pairs: one pair per listed correction.
        groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
        ppairs = [(group[0], fix) for group in groups for fix in group[1:]]

        # Split the positive pairs into training and testing sets.
        ppairs_train, ppairs_test = train_test_split(ppairs,
                                                     test_size=200,
                                                     random_state=1)
        self.ppairs_train = [
            tuple(ppair_train) for ppair_train in ppairs_train
        ]
        self.ppairs_test = [
            tuple(ppair_test) for ppair_test in ppairs_test
        ]

        # Generate negative training pairs by shuffling the misspellings
        # against the corrections.
        incorrect = [pair[0] for pair in ppairs_train]
        shuffle(incorrect)
        correct = [pair[1] for pair in ppairs_train]
        # list(...) keeps the concatenation below valid where zip is lazy.
        npairs_train = list(zip(incorrect, correct))

        # Raw training set and its labels (0 = positive, 1 = negative).
        x_raw = ppairs_train + npairs_train
        self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)

        # Extract features from the raw training set, fit, then persist.
        self.x_train = self.fe.fit_transform(x_raw)
        self.m = Hacrf(l2_regularization=10.0,
                       optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45},
                       state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)
        # The with-block guarantees the dump is flushed and closed.
        with open(self.MODEL_PATH, 'wb') as handle:
            cPickle.dump(self.m, handle)

    def _score_candidates(self, incorrect, limit):
        """Score the ``limit`` dictionary words closest to ``incorrect``.

        Returns:
            ``(pr, cr)``: the model's probability rows and a list pairing
            each row with its ``(incorrect, candidate)`` tuple.
        """
        candidates = heapq.nsmallest(
            limit,
            self.dictionary,
            key=lambda word: levenshtein.levenshtein(incorrect, word))
        test_pairs = [(incorrect, candidate) for candidate in candidates]
        gx_test = self.fe.transform(test_pairs)
        # One row of probabilities per pair in test_pairs.
        pr = self.m.predict_proba(gx_test)
        return pr, list(zip(pr, test_pairs))

    @staticmethod
    def _top_word(cr):
        """Return the candidate whose probability row has the largest column 0.

        Column 0 is presumably the probability of label 0 (the positive
        label in y_train) - confirm against Hacrf.predict_proba.
        """
        return max(cr, key=lambda scored: scored[0][0])[1][1]

    def test(self):
        """Print the misses and the accuracy over the held-out positive pairs.

        Scores the 100 closest dictionary words per test pair. Raises
        ``AttributeError`` when the model was loaded rather than trained,
        because ``ppairs_test`` only exists after training.
        """
        count = 0
        for incorrect, correct in self.ppairs_test:
            _, cr = self._score_candidates(incorrect, 100)
            guess = self._top_word(cr)
            if guess == correct:
                count += 1
            else:
                # Single pre-formatted argument so the output is the same
                # whether print is a statement or a function.
                print("%s %s" % ((incorrect, correct), guess))
            print("")
        print(count / float(len(self.ppairs_test)))

    def correct(self, incorrect):
        """Return the best correction for ``incorrect`` or the fallback word.

        Scores the 10 closest dictionary words and prints the probability
        rows and the scored pairs. If the winner is more than
        ``MAX_EDIT_DISTANCE`` edits away from ``incorrect``,
        ``FALLBACK_WORD`` is returned instead.
        """
        pr, cr = self._score_candidates(incorrect, 10)
        print(pr)
        print(cr)
        guess = self._top_word(cr)
        if levenshtein.levenshtein(incorrect, guess) > self.MAX_EDIT_DISTANCE:
            return self.FALLBACK_WORD
        return guess