def runFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    """Fit an FM_FTRL model for one label and score two feature sets.

    Training rows whose target equals 1 get weight 1.0; every other row gets
    the per-label weight from the table below. Raw model predictions are
    passed through ``sigmoid`` before being returned.

    Args:
        train_X: Training feature matrix; ``train_X.shape[1]`` is passed to
            the model as ``D``.
        train_y: Iterable of training targets for ``label``.
        test_X: First feature set to score.
        test_y: Unused in this function.
        test_X2: Second feature set to score.
        label: Key into the per-label weight table.
        dev_index: Unused in this function.
        val_index: Unused in this function.

    Returns:
        Tuple ``(pred_test_X, pred_test_X2)`` of sigmoid-transformed
        predictions, in that order.
    """
    # Weight given to rows whose target is not 1, keyed by label.
    non_positive_weight = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2,
    }

    hyperparams = dict(
        alpha=0.02,
        beta=0.01,
        L1=0.00001,
        L2=30.0,
        D=train_X.shape[1],
        alpha_fm=0.1,
        L2_fm=0.5,
        init_fm=0.01,
        weight_fm=50.0,
        D_fm=200,
        e_noise=0.0,
        iters=3,
        inv_link="identity",
        e_clip=1.0,
        threads=4,
        use_avx=1,
        verbose=1,
    )
    fm = FM_FTRL(**hyperparams)

    # The label lookup stays inside the loop so that it only happens when a
    # non-positive target is actually encountered.
    row_weights = []
    for target in train_y:
        if target == 1:
            row_weights.append(1.0)
        else:
            row_weights.append(non_positive_weight[label])
    row_weights = np.array(row_weights)

    fm.fit(train_X, train_y, row_weights, reset=False)

    # inv_link is "identity", so the model output is squashed here instead.
    scores_first = sigmoid(fm.predict(test_X))
    scores_second = sigmoid(fm.predict(test_X2))
    return scores_first, scores_second
def testSigmoid(self):
    """Check ``preprocess.sigmoid`` on scalar inputs and on a small array."""
    # Scalar checks as (expected value, argument) pairs.
    scalar_cases = (
        (0.5, 0),
        (0.7310585786300049, 1),
        (0.2689414213699951, -1),
    )
    for wanted, argument in scalar_cases:
        self.assertAlmostEqual(wanted, preprocess.sigmoid(argument))

    # Array check: the norm of the element-wise difference should be ~0.
    inputs = numpy.array([-2, -1, 0, 1, 2])
    wanted_values = numpy.array([
        0.11920292202211755,
        0.2689414213699951,
        0.5,
        0.7310585786300049,
        0.8807970779778823,
    ])
    got_values = preprocess.sigmoid(inputs)
    residual = linalg.norm(wanted_values - got_values)
    self.assertAlmostEqual(0.0, residual)
def test(x):
    """Assert muSigma / normalize / sigmoid results for the fixture ``x``.

    ``self`` is not a parameter; it is taken from the enclosing scope
    (presumably a test-case method -- confirm against the caller).
    """
    # Standardized values expected at positions [0][0] and [1][2]; each is
    # checked once by hand and once against preprocess.normalize.
    z_00 = -0.99025024303433
    z_12 = -0.31731637092553

    means, deviations = preprocess.muSigma(x)

    # Sanity check on the fixture itself.
    self.assertAlmostEqual(1.23902738264240, x[1][2])

    # One statistic per column.
    self.assertEqual(5, len(means))
    self.assertEqual(5, len(deviations))

    # Column 0 statistics and the hand-computed standardized value.
    self.assertAlmostEqual(2.87969736221038, means[0])
    self.assertAlmostEqual(2.04868506865762, deviations[0])
    self.assertAlmostEqual(z_00, (x[0][0] - means[0]) / deviations[0])

    # Column 2 statistics and the hand-computed standardized value.
    self.assertAlmostEqual(1.97861578296198, means[2])
    self.assertAlmostEqual(2.33076030134340, deviations[2])
    self.assertAlmostEqual(z_12, (x[1][2] - means[2]) / deviations[2])

    # normalize() should keep the shape and reproduce the values above.
    normalized = preprocess.normalize(x, means, deviations)
    row_count, col_count = normalized.shape
    self.assertEqual(4, row_count)
    self.assertEqual(5, col_count)
    self.assertAlmostEqual(z_00, normalized[0][0])
    self.assertAlmostEqual(z_12, normalized[1][2])

    # sigmoid() applied to the normalized data.
    squashed = preprocess.sigmoid(normalized)
    self.assertAlmostEqual(0.27086265279957, squashed[0][0])
    self.assertAlmostEqual(0.42132990768430, squashed[1][2])
def normalize(self, data):
    """Normalize the first element of ``data`` and squash it with sigmoid.

    Args:
        data: Two-element iterable; the first element is transformed and
            the second is returned unchanged.

    Returns:
        Tuple of (float32 numpy array of
        ``preprocess.sigmoid(preprocess.normalize(first, self.mu, self.sigma))``,
        the untouched second element).
    """
    features, passthrough = data
    standardized = preprocess.normalize(features, self.mu, self.sigma)
    squashed = preprocess.sigmoid(standardized)
    as_float32 = numpy.array(squashed, dtype=numpy.float32)
    return as_float32, passthrough
def _binarize_fm_columns(frame, label):
    """Threshold the selected ``fm_`` columns of ``frame`` at 0.5 into a CSR matrix.

    A column is selected when its name contains ``'fm_'`` and is not equal to
    ``label``. Values below 0.5 become 0, everything else becomes 1.
    """
    # NOTE(review): the filter compares the full column name with `label`.
    # If the cached columns are named like 'fm_<label>', this never excludes
    # the current label's own column -- confirm the intended naming.
    thresholded = [
        frame[c].apply(lambda x: 0 if x < 0.5 else 1)
        for c in frame.columns
        if 'fm_' in c and c != label
    ]
    return csr_matrix(pd.concat(thresholded, axis=1).values)


def runChainedFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    """Fit an FM_FTRL model on features extended with binarized level-1 outputs.

    Loads the ``'lvl1_fm'`` cache, thresholds its ``fm_`` columns at 0.5,
    appends them to the three feature matrices, then trains and predicts.

    Args:
        train_X: Training feature matrix; receives the ``dev_index`` rows of
            the cached level-1 training data.
        train_y: Iterable of training targets for ``label``.
        test_X: First feature set to score; receives the ``val_index`` rows
            of the cached level-1 training data.
        test_y: Unused in this function.
        test_X2: Second feature set to score; receives the cached level-1
            test data.
        label: Key into the per-label weight table; also used in the
            level-1 column filter.
        dev_index: Row selector for the rows appended to ``train_X``.
        val_index: Row selector for the rows appended to ``test_X``.

    Returns:
        Tuple ``(pred_test_X, pred_test_X2)`` of sigmoid-transformed
        predictions, in that order.
    """
    print_step('Loading Lvl1')
    lvl1_train, lvl1_test = load_cache('lvl1_fm')
    # The original binarized the training columns one extra time and threw
    # the result away; each frame is now processed exactly once.
    lvl1_train = _binarize_fm_columns(lvl1_train, label)
    lvl1_test = _binarize_fm_columns(lvl1_test, label)

    print_step('Merging 1/3')
    # Take the validation rows before lvl1_train is narrowed to dev rows.
    lvl1_valid = lvl1_train[val_index]
    lvl1_train = lvl1_train[dev_index]
    train_X = csr_matrix(hstack([train_X, lvl1_train]))
    print_step('Merging 2/3')
    test_X = csr_matrix(hstack([test_X, lvl1_valid]))
    print_step('Merging 3/3')
    test_X2 = csr_matrix(hstack([test_X2, lvl1_test]))

    print_step('Modeling')
    # Weight given to rows whose target is not 1, keyed by label.
    class_weights = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2,
    }
    model = FM_FTRL(
        alpha=0.02,
        beta=0.01,
        L1=0.00001,
        L2=30.0,
        D=train_X.shape[1],  # width after the level-1 columns were appended
        alpha_fm=0.1,
        L2_fm=0.5,
        init_fm=0.01,
        weight_fm=50.0,
        D_fm=200,
        e_noise=0.0,
        iters=3,
        inv_link="identity",
        e_clip=1.0,
        threads=4,
        use_avx=1,
        verbose=1,
    )
    train_weight = np.array(
        [1.0 if x == 1 else class_weights[label] for x in train_y])
    model.fit(train_X, train_y, train_weight, reset=False)

    # inv_link is "identity", so the model output is squashed here instead.
    pred_test_y = sigmoid(model.predict(test_X))
    pred_test_y2 = sigmoid(model.predict(test_X2))
    return pred_test_y, pred_test_y2