def test_fit_predict_regularized(self):
    """Fit an L2-regularised Hacrf on ten word pairs and check its outputs.

    The fitted parameters, the predicted class probabilities and the
    predicted labels are each compared with stored reference values to
    ``TEST_PRECISION`` decimal places.
    """
    misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                  'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
    intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                'relationship', 'husband', 'crazy', 'might', 'topic']
    labels = [0] * 5 + [1] * 5

    extractor = StringPairFeatureExtractor(match=True, numeric=True)
    features = extractor.fit_transform(zip(misspelled, intended))

    model = Hacrf(l2_regularization=10.0)
    model.fit(features, labels)

    # --- fitted parameters ---
    print(model.parameters)
    reference_parameters = np.array([
        [-0.0569188, 0.07413339, 0.],
        [0.00187709, -0.06377866, 0.],
        [-0.01908823, 0.00586189, 0.],
        [0.01721114, -0.00636556, 0.],
        [0.01578279, 0.0078614, 0.],
        [-0.0139057, -0.00862948, 0.],
        [-0.00623241, 0.02937325, 0.],
        [0.00810951, -0.01774676, 0.],
    ])
    assert_array_almost_equal(model.parameters, reference_parameters,
                              decimal=TEST_PRECISION)

    # --- class probabilities ---
    reference_probas = np.array([
        [0.5227226, 0.4772774],
        [0.52568993, 0.47431007],
        [0.4547091, 0.5452909],
        [0.51179222, 0.48820778],
        [0.46347576, 0.53652424],
        [0.45710098, 0.54289902],
        [0.46159657, 0.53840343],
        [0.42997978, 0.57002022],
        [0.47419724, 0.52580276],
        [0.50797852, 0.49202148],
    ])
    probas = model.predict_proba(features)
    print(probas)
    assert_array_almost_equal(probas, reference_probas,
                              decimal=TEST_PRECISION)

    # --- hard predictions ---
    reference_labels = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
    assert_array_almost_equal(model.predict(features), reference_labels,
                              decimal=TEST_PRECISION)
def test_fit_predict(self):
    """Fit a default-configured Hacrf on ten word pairs and check its outputs.

    The fitted parameters, the predicted class probabilities and the
    predicted labels are each compared with stored reference values to
    ``TEST_PRECISION`` decimal places.
    """
    misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                  'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
    intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                'relationship', 'husband', 'crazy', 'might', 'topic']
    labels = [0] * 5 + [1] * 5

    extractor = StringPairFeatureExtractor(match=True, numeric=True)
    features = extractor.fit_transform(zip(misspelled, intended))

    model = Hacrf()
    model.fit(features, labels)

    # --- fitted parameters ---
    reference_parameters = np.array([
        [-10.76945326, 144.03414923, 0.],
        [31.84369748, -106.41885651, 0.],
        [-52.08919467, 4.56943665, 0.],
        [31.01495044, -13.0593297, 0.],
        [49.77302218, -6.42566204, 0.],
        [-28.69877796, 24.47127009, 0.],
        [-85.34524911, 21.87370646, 0.],
        [106.41949333, 6.18587125, 0.],
    ])
    print(model.parameters)
    assert_array_almost_equal(model.parameters, reference_parameters,
                              decimal=TEST_PRECISION)

    # --- class probabilities ---
    reference_probas = np.array([
        [1.00000000e+000, 3.51235685e-039],
        [1.00000000e+000, 4.79716208e-039],
        [1.00000000e+000, 2.82744641e-139],
        [1.00000000e+000, 6.49580729e-012],
        [9.99933798e-001, 6.62022561e-005],
        [8.78935957e-005, 9.99912106e-001],
        [4.84538335e-009, 9.99999995e-001],
        [1.25170233e-250, 1.00000000e+000],
        [2.46673086e-010, 1.00000000e+000],
        [1.03521293e-033, 1.00000000e+000],
    ])
    probas = model.predict_proba(features)
    print(probas)
    assert_array_almost_equal(probas, reference_probas,
                              decimal=TEST_PRECISION)

    # --- hard predictions ---
    reference_labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    assert_array_almost_equal(model.predict(features), reference_labels,
                              decimal=TEST_PRECISION)
def test_fit_predict_regularized_viterbi(self):
    """Fit an L2-regularised Hacrf with ``viterbi=True`` and check its outputs.

    The fitted parameters, the predicted class probabilities and the
    predicted labels are each compared with stored reference values to
    ``TEST_PRECISION`` decimal places.
    """
    misspelled = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
                  'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
    intended = ['hello', 'fresh', 'facebook', 'home', 'wondering',
                'relationship', 'husband', 'crazy', 'might', 'topic']
    labels = [0] * 5 + [1] * 5

    extractor = StringPairFeatureExtractor(match=True, numeric=True)
    features = extractor.fit_transform(zip(misspelled, intended))

    model = Hacrf(l2_regularization=10.0, viterbi=True)
    model.fit(features, labels)

    # --- fitted parameters ---
    print(model.parameters)
    reference_parameters = np.array([
        [-0.0569188, 0.07413339, 0.],
        [0.00187709, -0.06377866, 0.],
        [-0.01908823, 0.00586189, 0.],
        [0.01721114, -0.00636556, 0.],
        [0.01578279, 0.0078614, 0.],
        [-0.0139057, -0.00862948, 0.],
        [-0.00623241, 0.02937325, 0.],
        [0.00810951, -0.01774676, 0.],
    ])
    assert_array_almost_equal(model.parameters, reference_parameters,
                              decimal=TEST_PRECISION)

    # --- class probabilities ---
    reference_probas = np.array([
        [0.56394611, 0.43605389],
        [0.52977205, 0.47022795],
        [0.4751729, 0.5248271],
        [0.51183761, 0.48816239],
        [0.48608081, 0.51391919],
        [0.4986367, 0.5013633],
        [0.46947222, 0.53052778],
        [0.43233544, 0.56766456],
        [0.47463002, 0.52536998],
        [0.51265109, 0.48734891],
    ])
    probas = model.predict_proba(features)
    print(probas)
    assert_array_almost_equal(probas, reference_probas,
                              decimal=TEST_PRECISION)

    # --- hard predictions ---
    reference_labels = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
    assert_array_almost_equal(model.predict(features), reference_labels,
                              decimal=TEST_PRECISION)
def train(self):
    """Train a new Hacrf corrector, or load a previously pickled one.

    Always (re)creates ``self.fe``.  When ``self.needTraining`` is true the
    corpus at ``self.infile`` is read, split into train/test pairs, a model
    is fitted and then pickled to ``Corrector.pkl``.  Otherwise the model is
    loaded from ``Corrector.pkl`` and none of the training attributes
    (``ppairs_train``, ``ppairs_test``, ``x_train``, ``y_train``) are set.
    """
    self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                         transition=True)

    if not self.needTraining:
        # The messages say "training" although this branch only loads a
        # stored model; they are kept unchanged for existing log consumers.
        print("start training")
        # NOTE(security): unpickling executes arbitrary code; only load a
        # Corrector.pkl that this program wrote itself.
        with open('Corrector.pkl', 'rb') as model_file:
            self.m = cPickle.load(model_file)
        print("finish training")
        return

    with open(self.infile, 'r') as corpus:
        lines = corpus.readlines()

    # Each line: <ignored field> TAB <word> | <alternative> | <alternative> ...
    groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
    # Positive pairs: the first word of a group paired with each alternative.
    ppairs = [(group[0], other) for group in groups for other in group[1:]]

    # Hold out 200 positive pairs for testing.
    ppairs_train, ppairs_test = train_test_split(ppairs, test_size=200,
                                                 random_state=1)
    self.ppairs_train = [tuple(pair) for pair in ppairs_train]
    self.ppairs_test = [tuple(pair) for pair in ppairs_test]

    # Negative pairs: shuffle the first elements against the second ones.
    # NOTE(review): a shuffle can leave some positions unchanged, so a few
    # "negative" pairs may coincide with positive ones - confirm acceptable.
    incorrect = [pair[0] for pair in ppairs_train]
    shuffle(incorrect)
    correct = [pair[1] for pair in ppairs_train]
    npairs_train = list(zip(incorrect, correct))

    # Label 0 marks a positive pair, label 1 a shuffled (negative) pair.
    x_raw = ppairs_train + npairs_train
    self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)
    self.x_train = self.fe.fit_transform(x_raw)

    self.m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b,
                   optimizer_kwargs={'maxfun': 45}, state_machine=None)
    self.m.fit(self.x_train, self.y_train, verbosity=20)

    # Close the file explicitly so the pickle is fully flushed to disk.
    with open('Corrector.pkl', 'wb') as model_file:
        cPickle.dump(self.m, model_file)
def __init__(self):
    """Set up a Hacrf model with fixed parameters and a feature extractor.

    The model is built on a ``DefaultStateMachine`` over the classes
    ``'match'`` and ``'non-match'``; its parameters are overwritten with a
    hard-coded 8 x 2 array and the transpose of that array is kept on the
    instance as ``self.parameters``.
    """
    state_machine = DefaultStateMachine(['match', 'non-match'])
    self.model = Hacrf(l2_regularization=100.0, state_machine=state_machine)

    # Hard-coded weights, stored in Fortran (column-major) order.
    weights = [
        [-0.22937526, 0.51326066],
        [0.01038001, -0.13348901],
        [-0.03062821, 0.13769178],
        [0.02024813, -0.01835538],
        [0.09208272, 0.15466022],
        [-0.08170265, -0.02484392],
        [-0.01762858, 0.17504624],
        [0.02800866, -0.04442708],
    ]
    self.model.parameters = np.array(weights, order='F')
    self.parameters = self.model.parameters.T

    self.model.classes = ['match', 'non-match']
    self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                        numeric=False)
class CRFEditDistance(object):
    """Callable string-pair scorer backed by a Hacrf model.

    The instance starts with hard-coded model parameters and can be refitted
    with :meth:`train`.  Calling the instance with two strings returns
    element ``[1]`` of the ``forward_predict`` output for that pair
    (presumably the 'non-match' probability - confirm against
    ``forward_predict``).
    """

    def __init__(self):
        """Create the model, load the hard-coded parameters and the extractor."""
        classes = ['match', 'non-match']
        self.model = Hacrf(l2_regularization=100.0,
                           state_machine=DefaultStateMachine(classes))
        self.model.parameters = np.array(
            [[-0.22937526, 0.51326066],
             [0.01038001, -0.13348901],
             [-0.03062821, 0.13769178],
             [0.02024813, -0.01835538],
             [0.09208272, 0.15466022],
             [-0.08170265, -0.02484392],
             [-0.01762858, 0.17504624],
             [0.02800866, -0.04442708]],
            order='F')
        # Transposed parameters, multiplied with the features in fast_pair().
        self.parameters = self.model.parameters.T
        self.model.classes = ['match', 'non-match']
        self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                            numeric=False)

    def fast_pair(self, x):
        """Return ``forward_predict(x . parameters, 2)`` for a feature array."""
        x_dot_parameters = np.matmul(x, self.parameters)
        return forward_predict(x_dot_parameters, 2)

    def train(self, examples, labels):
        """Refit the model on ``examples`` (string pairs) and ``labels``.

        Each pair is reordered so that the shorter string comes first, the
        same normalisation ``__call__`` applies before scoring.
        """
        examples = [(string_2, string_1)
                    if len(string_1) > len(string_2)
                    else (string_1, string_2)
                    for string_1, string_2 in examples]
        print(examples)
        extracted_examples = self.feature_extractor.fit_transform(examples)
        self.model.fit(extracted_examples, labels, verbosity=1)
        # fast_pair() reads self.parameters, not self.model.parameters, so it
        # must be refreshed here or scoring keeps using the pre-fit weights.
        self.parameters = self.model.parameters.T

    def __call__(self, string_1, string_2):
        """Score a pair of strings; the shorter string is always placed first."""
        if len(string_1) > len(string_2):
            string_1, string_2 = string_2, string_1
        # Column vector of characters vs. row vector of characters.
        array1 = np.array(tuple(string_1)).reshape(-1, 1)
        array2 = np.array(tuple(string_2)).reshape(1, -1)
        features = self.feature_extractor._extract_features(array1, array2)
        return self.fast_pair(features)[1]
def __init__(self, infile):
    """Read the correction corpus, build the training set and train.

    ``infile`` is read line by line; the second tab-separated field of each
    line is split on ``' | '`` and its first word is paired with every
    following word.  200 of those pairs are held out in ``self.ppairs_test``
    and the rest, together with shuffled negative pairs, are turned into
    ``self.x_train`` / ``self.y_train`` before ``self.train()`` is called.
    """
    with open(infile, 'r') as corpus:
        lines = corpus.readlines()

    groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
    # Positive pairs: the first word of a group paired with each alternative.
    ppairs = [(group[0], other) for group in groups for other in group[1:]]
    # Candidate vocabulary: the second element of every positive pair.
    self.dictionary = [pair[1] for pair in ppairs]

    # Hold out 200 positive pairs for testing.
    ppairs_train, ppairs_test = train_test_split(ppairs, test_size=200,
                                                 random_state=1)
    self.ppairs_train = [tuple(pair) for pair in ppairs_train]
    self.ppairs_test = [tuple(pair) for pair in ppairs_test]

    # Negative pairs: shuffle the first elements against the second ones.
    # NOTE(review): a shuffle can leave some positions unchanged, so a few
    # "negative" pairs may coincide with positive ones - confirm acceptable.
    incorrect = [pair[0] for pair in ppairs_train]
    shuffle(incorrect)
    correct = [pair[1] for pair in ppairs_train]
    npairs_train = list(zip(incorrect, correct))

    # Label 0 marks a positive pair, label 1 a shuffled (negative) pair.
    x_raw = ppairs_train + npairs_train
    self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)

    self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                         transition=True)
    self.x_train = self.fe.fit_transform(x_raw)
    self.train()
def test_fit_predict(self):
    """Train an unregularised Hacrf on ten word pairs and verify the results.

    Checks, to ``TEST_PRECISION`` decimals, the learned parameters, the
    output of ``predict_proba`` and the output of ``predict`` against stored
    reference arrays.
    """
    word_pairs = zip(
        ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
         'relaionship', 'hubby', 'krazii', 'mite', 'tropic'],
        ['hello', 'fresh', 'facebook', 'home', 'wondering',
         'relationship', 'husband', 'crazy', 'might', 'topic'],
    )
    targets = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    xf = StringPairFeatureExtractor(match=True, numeric=True).fit_transform(word_pairs)
    hacrf = Hacrf()
    hacrf.fit(xf, targets)

    wanted_parameters = np.array([
        [-10.76945326, 144.03414923, 0.],
        [31.84369748, -106.41885651, 0.],
        [-52.08919467, 4.56943665, 0.],
        [31.01495044, -13.0593297, 0.],
        [49.77302218, -6.42566204, 0.],
        [-28.69877796, 24.47127009, 0.],
        [-85.34524911, 21.87370646, 0.],
        [106.41949333, 6.18587125, 0.],
    ])
    print(hacrf.parameters)
    assert_array_almost_equal(hacrf.parameters, wanted_parameters,
                              decimal=TEST_PRECISION)

    wanted_probas = np.array([
        [1.00000000e+000, 3.51235685e-039],
        [1.00000000e+000, 4.79716208e-039],
        [1.00000000e+000, 2.82744641e-139],
        [1.00000000e+000, 6.49580729e-012],
        [9.99933798e-001, 6.62022561e-005],
        [8.78935957e-005, 9.99912106e-001],
        [4.84538335e-009, 9.99999995e-001],
        [1.25170233e-250, 1.00000000e+000],
        [2.46673086e-010, 1.00000000e+000],
        [1.03521293e-033, 1.00000000e+000],
    ])
    got_probas = hacrf.predict_proba(xf)
    print(got_probas)
    assert_array_almost_equal(got_probas, wanted_probas,
                              decimal=TEST_PRECISION)

    wanted_predictions = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    got_predictions = hacrf.predict(xf)
    assert_array_almost_equal(got_predictions, wanted_predictions,
                              decimal=TEST_PRECISION)
def test_fit_predict_regularized(self):
    """Train a Hacrf with ``l2_regularization=10.0`` and verify the results.

    Checks, to ``TEST_PRECISION`` decimals, the learned parameters, the
    output of ``predict_proba`` and the output of ``predict`` against stored
    reference arrays.
    """
    word_pairs = zip(
        ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
         'relaionship', 'hubby', 'krazii', 'mite', 'tropic'],
        ['hello', 'fresh', 'facebook', 'home', 'wondering',
         'relationship', 'husband', 'crazy', 'might', 'topic'],
    )
    targets = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    xf = StringPairFeatureExtractor(match=True, numeric=True).fit_transform(word_pairs)
    hacrf = Hacrf(l2_regularization=10.0)
    hacrf.fit(xf, targets)
    print(hacrf.parameters)

    wanted_parameters = np.array([
        [-0.0569188, 0.07413339, 0.],
        [0.00187709, -0.06377866, 0.],
        [-0.01908823, 0.00586189, 0.],
        [0.01721114, -0.00636556, 0.],
        [0.01578279, 0.0078614, 0.],
        [-0.0139057, -0.00862948, 0.],
        [-0.00623241, 0.02937325, 0.],
        [0.00810951, -0.01774676, 0.],
    ])
    assert_array_almost_equal(hacrf.parameters, wanted_parameters,
                              decimal=TEST_PRECISION)

    wanted_probas = np.array([
        [0.5227226, 0.4772774],
        [0.52568993, 0.47431007],
        [0.4547091, 0.5452909],
        [0.51179222, 0.48820778],
        [0.46347576, 0.53652424],
        [0.45710098, 0.54289902],
        [0.46159657, 0.53840343],
        [0.42997978, 0.57002022],
        [0.47419724, 0.52580276],
        [0.50797852, 0.49202148],
    ])
    got_probas = hacrf.predict_proba(xf)
    print(got_probas)
    assert_array_almost_equal(got_probas, wanted_probas,
                              decimal=TEST_PRECISION)

    wanted_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
    got_predictions = hacrf.predict(xf)
    assert_array_almost_equal(got_predictions, wanted_predictions,
                              decimal=TEST_PRECISION)
def test_fit_predict_regularized_viterbi(self):
    """Train a regularised Hacrf with ``viterbi=True`` and verify the results.

    Checks, to ``TEST_PRECISION`` decimals, the learned parameters, the
    output of ``predict_proba`` and the output of ``predict`` against stored
    reference arrays.
    """
    word_pairs = zip(
        ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin',
         'relaionship', 'hubby', 'krazii', 'mite', 'tropic'],
        ['hello', 'fresh', 'facebook', 'home', 'wondering',
         'relationship', 'husband', 'crazy', 'might', 'topic'],
    )
    targets = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    xf = StringPairFeatureExtractor(match=True, numeric=True).fit_transform(word_pairs)
    hacrf = Hacrf(l2_regularization=10.0, viterbi=True)
    hacrf.fit(xf, targets)
    print(hacrf.parameters)

    wanted_parameters = np.array([
        [-0.0569188, 0.07413339, 0.],
        [0.00187709, -0.06377866, 0.],
        [-0.01908823, 0.00586189, 0.],
        [0.01721114, -0.00636556, 0.],
        [0.01578279, 0.0078614, 0.],
        [-0.0139057, -0.00862948, 0.],
        [-0.00623241, 0.02937325, 0.],
        [0.00810951, -0.01774676, 0.],
    ])
    assert_array_almost_equal(hacrf.parameters, wanted_parameters,
                              decimal=TEST_PRECISION)

    wanted_probas = np.array([
        [0.56394611, 0.43605389],
        [0.52977205, 0.47022795],
        [0.4751729, 0.5248271],
        [0.51183761, 0.48816239],
        [0.48608081, 0.51391919],
        [0.4986367, 0.5013633],
        [0.46947222, 0.53052778],
        [0.43233544, 0.56766456],
        [0.47463002, 0.52536998],
        [0.51265109, 0.48734891],
    ])
    got_probas = hacrf.predict_proba(xf)
    print(got_probas)
    assert_array_almost_equal(got_probas, wanted_probas,
                              decimal=TEST_PRECISION)

    wanted_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
    got_predictions = hacrf.predict(xf)
    assert_array_almost_equal(got_predictions, wanted_predictions,
                              decimal=TEST_PRECISION)
def test_transform_transition(self):
    """Check the features extracted with ``transition=True`` for ("ba", "ca").

    The expected array has shape ``(2, 2, len(CHARACTERS)**2 + 1)``: channel
    0 is 1.0 everywhere and each cell has exactly one further channel set.
    """
    s1 = "ba"
    s2 = "ca"
    # a . .
    # b . .
    #   c a
    nchars = len(StringPairFeatureExtractor.CHARACTERS)
    print(nchars)

    expected_x = np.zeros((2, 2, nchars ** 2 + 1))
    expected_x[:, :, 0] = 1.0

    # (row, column, multiplier of nchars, added offset), taken from the
    # literal indices; presumably a, b, c sit at positions 0, 1, 2 of
    # CHARACTERS - confirm against the extractor.
    transitions = [
        (0, 0, 1, 2),  # b->c
        (0, 1, 1, 0),  # b->a
        (1, 0, 0, 2),  # a->c
        (1, 1, 0, 0),  # a->a
    ]
    for row, col, source, target in transitions:
        expected_x[row, col, target + nchars * source + 1] = 1.0

    extractor = StringPairFeatureExtractor(transition=True)
    extracted = extractor.fit_transform([(s1, s2)])

    assert_array_almost_equal(expected_x, extracted[0])
def test_transform_binary(self):
    """Check the features extracted for ("kat1", "cat2").

    The extractor is built with ``bias=2.0, start=True, match=True,
    numeric=True`` and its output for the single pair is compared with a
    hand-built 4 x 4 x 4 array.
    """
    s1 = "kat1"
    s2 = "cat2"
    # 1 . . . n
    # t . . m .
    # a . m . .
    # k . . . .
    #   c a t 2
    expected_x = np.zeros((4, 4, 4))

    # Channel 0 is filled with 2.0, the bias value passed to the extractor.
    expected_x[:, :, 0] = 2.0

    # Channel 1 is set along the first column and the first row.
    expected_x[:, 0, 1] = 1.0
    expected_x[0, :, 1] = 1.0

    # Channel 2 marks the diagonal cells tagged 'm' in the diagram.
    for position in (1, 2):
        expected_x[position, position, 2] = 1.0

    # Channel 3 marks the cell tagged 'n' in the diagram.
    expected_x[3, 3, 3] = 1.0

    extractor = StringPairFeatureExtractor(bias=2.0, start=True,
                                           match=True, numeric=True)
    extracted = extractor.fit_transform([(s1, s2)])

    assert_array_almost_equal(expected_x, extracted[0])
class CRFEditDistance(object):
    """Callable string-pair scorer backed by a Hacrf model.

    The instance starts with hard-coded model parameters and can be refitted
    with :meth:`train`.  Calling the instance with two strings returns
    element ``[1]`` of the ``forward_predict`` output for that pair
    (presumably the 'non-match' probability - confirm against
    ``forward_predict``).
    """

    def __init__(self):
        """Create the model, load the hard-coded parameters and the extractor."""
        classes = ['match', 'non-match']
        self.model = Hacrf(l2_regularization=100.0,
                           state_machine=DefaultStateMachine(classes))
        self.model.parameters = np.array(
            [[-0.22937526, 0.51326066],
             [0.01038001, -0.13348901],
             [-0.03062821, 0.13769178],
             [0.02024813, -0.01835538],
             [0.09208272, 0.15466022],
             [-0.08170265, -0.02484392],
             [-0.01762858, 0.17504624],
             [0.02800866, -0.04442708]],
            order='F')
        # Transposed parameters, multiplied with the features in fast_pair().
        self.parameters = self.model.parameters.T
        self.model.classes = ['match', 'non-match']
        self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                            numeric=False)

    def fast_pair(self, x):
        """Return ``forward_predict(x . parameters, 2)`` for a feature array."""
        x_dot_parameters = np.matmul(x, self.parameters)
        return forward_predict(x_dot_parameters, 2)

    def train(self, examples, labels):
        """Refit the model on ``examples`` (string pairs) and ``labels``.

        Each pair is reordered so that the shorter string comes first, the
        same normalisation ``__call__`` applies before scoring.
        """
        examples = [(string_2, string_1)
                    if len(string_1) > len(string_2)
                    else (string_1, string_2)
                    for string_1, string_2 in examples]
        print(examples)
        extracted_examples = self.feature_extractor.fit_transform(examples)
        self.model.fit(extracted_examples, labels, verbosity=1)
        # fast_pair() reads self.parameters, not self.model.parameters, so it
        # must be refreshed here or scoring keeps using the pre-fit weights.
        self.parameters = self.model.parameters.T

    def __call__(self, string_1, string_2):
        """Score a pair of strings; the shorter string is always placed first."""
        if len(string_1) > len(string_2):
            string_1, string_2 = string_2, string_1
        # Column vector of characters vs. row vector of characters.
        array1 = np.array(tuple(string_1)).reshape(-1, 1)
        array2 = np.array(tuple(string_2)).reshape(1, -1)
        features = self.feature_extractor._extract_features(array1, array2)
        return self.fast_pair(features)[1]
class CRFEditDistance(object):
    """Callable string-pair scorer built on a Hacrf with a WiderStateMachine.

    Calling the instance with two strings returns entry ``[0, 1]`` of
    ``predict_proba`` for that pair, or ``nan`` when either string is empty.
    """

    def __init__(self):
        """Create the model with hard-coded parameters and the extractor."""
        model = Hacrf(l2_regularization=1.0)
        model.parameters = np.array([
            [-1.14087105, 2.41450373, -0.42000576],
            [-0.0619002, 0.79430259, 0.33864121],
            [-0.25353303, 1.69376742, 0.71731646],
            [0.31544095, 1.47012227, -0.39960507],
            [0.51356569, -0.67293917, -0.56861512],
            [-0.57547361, 0.57599782, 0.3115221],
            [0.55744877, 0.16423292, -0.64028285],
            [-0.61935669, -0.02237494, 0.49829992],
        ])
        model.classes = ['match', 'non-match']
        # The state machine is installed after construction, on the private
        # attribute, using the classes just assigned.
        model._state_machine = WiderStateMachine(model.classes)

        self.model = model
        self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                            numeric=True)

    def train(self, examples, labels):
        """Refit the model on ``examples`` (string pairs) and ``labels``.

        Each pair is reordered so that the shorter string comes first.
        """
        # sorted() is stable, so pairs of equal length keep their order.
        examples = [tuple(sorted((first, second), key=len))
                    for first, second in examples]
        print(examples)
        features = self.feature_extractor.fit_transform(examples)
        self.model.fit(features, labels, verbosity=1)

    def __call__(self, string_1, string_2):
        """Score one pair of strings; returns ``nan`` if either is empty."""
        if not (string_1 and string_2):
            return np.nan

        # Shorter string first; stable sort keeps the order on ties.
        shorter, longer = sorted((string_1, string_2), key=len)
        features = self.feature_extractor.fit_transform(((shorter, longer),))
        probabilities = self.model.predict_proba(features)
        return probabilities[0, 1]
def __init__(self):
    """Create a Hacrf with hard-coded parameters and a WiderStateMachine.

    Also creates the ``StringPairFeatureExtractor`` (``match=True``,
    ``numeric=True``) stored as ``self.feature_extractor``.
    """
    model = Hacrf(l2_regularization=1.0)
    model.parameters = np.array([
        [-1.14087105, 2.41450373, -0.42000576],
        [-0.0619002, 0.79430259, 0.33864121],
        [-0.25353303, 1.69376742, 0.71731646],
        [0.31544095, 1.47012227, -0.39960507],
        [0.51356569, -0.67293917, -0.56861512],
        [-0.57547361, 0.57599782, 0.3115221],
        [0.55744877, 0.16423292, -0.64028285],
        [-0.61935669, -0.02237494, 0.49829992],
    ])
    model.classes = ['match', 'non-match']
    # The state machine is installed after construction, on the private
    # attribute, using the classes just assigned.
    model._state_machine = WiderStateMachine(model.classes)

    self.model = model
    self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                        numeric=True)
def __init__(self):
    """Set up a Hacrf model with fixed parameters and a feature extractor.

    The model is built on a ``DefaultStateMachine`` over the classes
    ``'match'`` and ``'non-match'``; its parameters are overwritten with a
    hard-coded 8 x 2 array and the transpose of that array is kept on the
    instance as ``self.parameters``.
    """
    state_machine = DefaultStateMachine(['match', 'non-match'])
    self.model = Hacrf(l2_regularization=100.0, state_machine=state_machine)

    # Hard-coded weights, stored in Fortran (column-major) order.
    weights = [
        [-0.22937526, 0.51326066],
        [0.01038001, -0.13348901],
        [-0.03062821, 0.13769178],
        [0.02024813, -0.01835538],
        [0.09208272, 0.15466022],
        [-0.08170265, -0.02484392],
        [-0.01762858, 0.17504624],
        [0.02800866, -0.04442708],
    ]
    self.model.parameters = np.array(weights, order='F')
    self.parameters = self.model.parameters.T

    self.model.classes = ['match', 'non-match']
    self.feature_extractor = StringPairFeatureExtractor(match=True,
                                                        numeric=False)
import numpy
import pyhacrf
import nltk
from pyhacrf import StringPairFeatureExtractor, Hacrf

# Read both word lists; the context managers close the files promptly.
with open("iv.txt", "r") as handle:
    iv = handle.read()
with open("ovv.txt", "r") as handle:
    ovv = handle.read()

# Split once, outside the pairing loop, and drop empty entries: a trailing
# newline would otherwise add an empty string that gets paired with every
# word from the other list.
iv_entries = [entry for entry in iv.split("\n") if entry]
ovv_entries = [entry for entry in ovv.split("\n") if entry]

# Every entry of iv.txt paired with every entry of ovv.txt.
training_X = [(p, q) for p in iv_entries for q in ovv_entries]

# NOTE(review): five hard-coded labels; this only lines up with training_X
# if the cross product above has exactly five pairs - confirm before
# enabling the training step below.
training_y = ['match', 'match', 'match', 'non-match', 'non-match']

# Extract features
feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)
training_X_extracted = feature_extractor.fit_transform(training_X)

# Remaining pipeline, currently disabled:
#
# # Train model
# model = Hacrf(l2_regularization=1.0)
# model.fit(training_X_extracted, training_y)
#
# # Evaluate
# from sklearn.metrics import confusion_matrix
# predictions = model.predict(training_X_extracted)
#
# print(confusion_matrix(training_y, predictions))
class MisspellingCorrection:
    """Hacrf-based spelling corrector trained from a tab-separated corpus.

    Construction reads the corpus, builds positive and shuffled negative
    word pairs, extracts features and fits the model.  ``correct`` then
    returns the best-scoring dictionary word for a given input word.
    """

    def __init__(self, infile):
        """Read ``infile``, build the training set and train the model.

        The second tab-separated field of each line is split on ``' | '``
        and its first word is paired with every following word.  200 pairs
        are held out in ``self.ppairs_test``.
        """
        with open(infile, 'r') as corpus:
            lines = corpus.readlines()

        groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
        # Positive pairs: first word of a group paired with each alternative.
        ppairs = [(group[0], other) for group in groups for other in group[1:]]
        # Candidate vocabulary: the second element of every positive pair.
        self.dictionary = [pair[1] for pair in ppairs]

        # Hold out 200 positive pairs for testing.
        ppairs_train, ppairs_test = train_test_split(ppairs, test_size=200,
                                                     random_state=1)
        self.ppairs_train = [tuple(pair) for pair in ppairs_train]
        self.ppairs_test = [tuple(pair) for pair in ppairs_test]

        # Negative pairs: shuffle the first elements against the second ones.
        # NOTE(review): a shuffle can leave some positions unchanged, so a
        # few "negative" pairs may coincide with positive ones - confirm.
        incorrect = [pair[0] for pair in ppairs_train]
        shuffle(incorrect)
        correct = [pair[1] for pair in ppairs_train]
        npairs_train = list(zip(incorrect, correct))

        # Label 0 marks a positive pair, label 1 a shuffled (negative) pair.
        x_raw = ppairs_train + npairs_train
        self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)

        self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                             transition=True)
        self.x_train = self.fe.fit_transform(x_raw)
        self.train()

    def train(self):
        """Fit ``self.m`` on ``self.x_train`` / ``self.y_train``."""
        self.m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45}, state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)

    def _best_candidate(self, incorrect, limit=100):
        """Return the dictionary word the model scores highest for ``incorrect``.

        Only the ``limit`` dictionary words with the smallest Levenshtein
        distance to ``incorrect`` are scored.
        """
        candidates = heapq.nsmallest(
            limit, self.dictionary,
            key=lambda word: levenshtein.levenshtein(incorrect, word))
        test_pairs = [(incorrect, candidate) for candidate in candidates]
        # One probability row per candidate pair.
        pr = self.m.predict_proba(self.fe.transform(test_pairs))
        # Column 0 belongs to label 0, the label given to positive pairs.
        _, best_pair = max(zip(pr, test_pairs), key=lambda item: item[0][0])
        return best_pair[1]

    def test(self):
        """Print every miss on the held-out pairs, then the overall accuracy."""
        count = 0
        for incorrect, correct in self.ppairs_test:
            guess = self._best_candidate(incorrect)
            if guess == correct:
                count += 1
            else:
                # The pair, the guess, then a blank line.
                print("%s %s\n" % (str((incorrect, correct)), guess))
        print(count / float(len(self.ppairs_test)))

    def correct(self, incorrect):
        """Return the best-scoring dictionary word for ``incorrect``."""
        return self._best_candidate(incorrect)
class MisspellingCorrector:
    """Hacrf-based spelling corrector with an optional pickled model.

    With ``needTraining=True`` the model is fitted from ``infile`` and
    pickled to ``Corrector.pkl``; otherwise it is loaded from that file.
    """

    def __init__(self, infile, dict_file, needTraining=False):
        """Load the dictionary pickle and train or load the model.

        ``dict_file`` is unpickled and sorted into ``self.dictionary``;
        ``infile`` is only read when ``needTraining`` is true.
        """
        print("**************************************")
        self.needTraining = needTraining
        # NOTE(security): unpickling executes arbitrary code; only pass a
        # dict_file from a trusted source.
        with open(dict_file, 'rb') as dictionary_file:
            self.dictionary = sorted(cPickle.load(dictionary_file))
        self.infile = infile
        self.train()

    def train(self):
        """Train a new model, or load the one stored in ``Corrector.pkl``.

        Always (re)creates ``self.fe``.  The training attributes
        (``ppairs_train``, ``ppairs_test``, ``x_train``, ``y_train``) are
        only set when ``self.needTraining`` is true.
        """
        self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                             transition=True)

        if not self.needTraining:
            # The messages say "training" although this branch only loads a
            # stored model; they are kept unchanged for existing log users.
            print("start training")
            # NOTE(security): only load a Corrector.pkl this program wrote.
            with open('Corrector.pkl', 'rb') as model_file:
                self.m = cPickle.load(model_file)
            print("finish training")
            return

        with open(self.infile, 'r') as corpus:
            lines = corpus.readlines()

        groups = [line.split('\t')[1].strip().split(' | ') for line in lines]
        # Positive pairs: first word of a group paired with each alternative.
        ppairs = [(group[0], other) for group in groups for other in group[1:]]

        # Hold out 200 positive pairs for testing.
        ppairs_train, ppairs_test = train_test_split(ppairs, test_size=200,
                                                     random_state=1)
        self.ppairs_train = [tuple(pair) for pair in ppairs_train]
        self.ppairs_test = [tuple(pair) for pair in ppairs_test]

        # Negative pairs: shuffle the first elements against the second ones.
        # NOTE(review): a shuffle can leave some positions unchanged, so a
        # few "negative" pairs may coincide with positive ones - confirm.
        incorrect = [pair[0] for pair in ppairs_train]
        shuffle(incorrect)
        correct = [pair[1] for pair in ppairs_train]
        npairs_train = list(zip(incorrect, correct))

        # Label 0 marks a positive pair, label 1 a shuffled (negative) pair.
        x_raw = ppairs_train + npairs_train
        self.y_train = [0] * len(ppairs_train) + [1] * len(npairs_train)
        self.x_train = self.fe.fit_transform(x_raw)

        self.m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45}, state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)

        # Close the file explicitly so the pickle is fully flushed to disk.
        with open('Corrector.pkl', 'wb') as model_file:
            cPickle.dump(self.m, model_file)

    def _score_candidates(self, incorrect, limit):
        """Score the ``limit`` dictionary words closest to ``incorrect``.

        Returns ``(pr, test_pairs)``: the ``predict_proba`` output and the
        ``(incorrect, candidate)`` pairs it was computed for, in the same
        order.
        """
        candidates = heapq.nsmallest(
            limit, self.dictionary,
            key=lambda word: levenshtein.levenshtein(incorrect, word))
        test_pairs = [(incorrect, candidate) for candidate in candidates]
        pr = self.m.predict_proba(self.fe.transform(test_pairs))
        return pr, test_pairs

    def test(self):
        """Print every miss on the held-out pairs, then the overall accuracy.

        Reads ``self.ppairs_test``, which ``train`` only sets when the
        instance was created with ``needTraining=True``.
        """
        count = 0
        for incorrect, correct in self.ppairs_test:
            pr, test_pairs = self._score_candidates(incorrect, 100)
            # Column 0 belongs to label 0, the label given to positive pairs.
            cr = max(zip(pr, test_pairs), key=lambda item: item[0][0])
            if cr[1][1] == correct:
                count += 1
            else:
                # The pair, the guess, then a blank line.
                print("%s %s\n" % (str((incorrect, correct)), cr[1][1]))
        print(count / float(len(self.ppairs_test)))

    def correct(self, incorrect):
        """Return the best of the 10 closest dictionary words for ``incorrect``.

        Returns the literal ``'gopdebate'`` when the best candidate is more
        than 2 edits away from ``incorrect``.
        """
        pr, test_pairs = self._score_candidates(incorrect, 10)
        print(pr)
        cr = list(zip(pr, test_pairs))
        print(cr)
        # Column 0 belongs to label 0, the label given to positive pairs.
        cr = max(cr, key=lambda item: item[0][0])
        if levenshtein.levenshtein(incorrect, cr[1][1]) > 2:
            # NOTE(review): hard-coded fallback word - confirm it is intended.
            return 'gopdebate'
        return cr[1][1]