def test_label_binarizer_multilabel_unlabeled():
    """Check that LabelBinarizer can handle an unlabeled sample"""
    lb = LabelBinarizer()
    y = [[1, 2], [1], []]
    Y = np.array([[1, 1],
                  [1, 0],
                  [0, 0]])
    assert_array_equal(lb.fit_transform(y), Y)
def test_label_binarizer_unseen_labels():
    lb = LabelBinarizer()

    expected = np.array([[1, 0, 0],
                         [0, 1, 0],
                         [0, 0, 1]])
    got = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, got)

    expected = np.array([[0, 0, 0],
                         [1, 0, 0],
                         [0, 0, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 0]])
    got = lb.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, got)
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_true(lb.multilabel_)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()
    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_true(lb.multilabel_)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
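# The multilabel path exercised above was deprecated in LabelBinarizer and
# later removed; MultiLabelBinarizer is the supported equivalent. A minimal
# standalone sketch (not part of the test suite above):
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
indicator = mlb.fit_transform([(2, 3), (1,), (1, 2)])
# indicator == [[0, 1, 1], [1, 0, 0], [1, 1, 0]]; mlb.classes_ == [1, 2, 3]
assert mlb.inverse_transform(indicator) == [(2, 3), (1,), (1, 2)]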
def test_label_binarizer_iris():
    lb = LabelBinarizer()
    Y = lb.fit_transform(iris.target)
    clfs = [SGDClassifier().fit(iris.data, Y[:, k])
            for k in range(len(lb.classes_))]
    Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T
    y_pred = lb.inverse_transform(Y_pred)
    accuracy = np.mean(iris.target == y_pred)
    y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data)
    accuracy2 = np.mean(iris.target == y_pred2)
    assert_almost_equal(accuracy, accuracy2)
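# What the iris test relies on: fit_transform yields one 0/1 column per class
# (so one binary classifier can be trained per column), and inverse_transform
# maps a matrix of real-valued scores back to labels via a per-row argmax.
# A minimal self-contained sketch:
import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
Y = lb.fit_transform([0, 1, 2, 1])          # shape (4, 3), one column per class
scores = np.array([[3.0, -1.0, 0.5],
                   [0.2, 2.0, -0.3],
                   [-1.0, 0.1, 4.0],
                   [0.0, 1.5, 0.2]])
print(lb.inverse_transform(scores))          # [0 1 2 1]: argmax per row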
def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2,
                        weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
def test_label_binarizer():
    lb = LabelBinarizer()

    # one-class case defaults to negative label
    inp = ["pos", "pos", "pos", "pos"]
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_"))
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_"))
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1],
                         [0, 0, 1, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_"))
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_column_y():
    # first for binary classification vs multi-label with 1 possible class
    # lists are multi-label, array is multi-class :-/
    inp_list = [[1], [2], [1]]
    inp_array = np.array(inp_list)

    multilabel_indicator = np.array([[1, 0], [0, 1], [1, 0]])
    binaryclass_array = np.array([[0], [1], [0]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)
    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, multilabel_indicator)
    assert_array_equal(out_2, binaryclass_array)

    # second for multiclass classification vs multi-label with multiple
    # classes
    inp_list = [[1], [2], [1], [3]]
    inp_array = np.array(inp_list)

    # the indicator matrix output is the same in this case
    indicator = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)
    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, out_2)
    assert_array_equal(out_2, indicator)
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError"""
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0,), (0, 2)]
    assert_raises(ValueError, lb.transform, multi_label)

    lb = LabelBinarizer()
    assert_raises(ValueError, lb.transform, [])
    assert_raises(ValueError, lb.inverse_transform, [])

    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1)
    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2)
    assert_raises(ValueError, LabelBinarizer, neg_label=1, pos_label=2,
                  sparse_output=True)

    # Fail on y_type
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2], threshold=0)

    # Fail on the number of classes
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2, 3], threshold=0)

    # Fail on the dimension of 'binary'
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
                  classes=[1, 2, 3], threshold=0)

    # Fail on multioutput data
    assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]]))
    assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]),
                  [1, 2, 3])
def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
    self.d = d
    self.yita = yita
    self.languages = {"ENGLISH": 1, "FRENCH": 3, "ITALIAN": 2}
    self.punctuations = [".", "'", ":", ",", "-", "...", "!", "_", "(", ")",
                         "?", '"', ";", "/", "\\", "{", "}", "[", "]", "|",
                         "<", ">", "+", "=", "@", "#", "$", "%", "^", "&", "*"]
    self.noPunctuation = False
    self.answerLables = LabelBinarizer()
    self.answerLables.fit([1, 2, 3])
    self.c = set()
    self.Initialize(trainFile, devFile, testFile)
    self.input = len(self.c) * 5 + 1
    self.setParameters(d, yita)
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError"""
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)
    assert_false(lb.multilabel_)

    multi_label = [(2, 3), (0,), (0, 2)]
    assert_raises(ValueError, lb.transform, multi_label)

    lb = LabelBinarizer()
    assert_raises(ValueError, lb.transform, [])
    assert_raises(ValueError, lb.inverse_transform, [])

    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1)
    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2)
def test_label_binarizer_multilabel_unlabeled():
    """Check that LabelBinarizer can handle an unlabeled sample"""
    lb = LabelBinarizer()
    y = [[1, 2], [1], []]
    Y = np.array([[1, 1],
                  [1, 0],
                  [0, 0]])
    assert_array_equal(assert_warns(DeprecationWarning, lb.fit_transform, y), Y)
def fit_binarizers(all_values):
    binarizers = {}
    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        # only categorical values need to be binarized, ints/floats are left as they are
        if type(cur_features[0]) == str or type(cur_features[0]) == unicode:
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif type(cur_features[0]) == list:
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(tuple(("__unk__",)))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
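# A hedged companion sketch for fit_binarizers: binarize_context below is
# hypothetical (not part of the original module) and only illustrates how the
# fitted binarizers might be applied to a single feature context.
def binarize_context(context, binarizers):
    transformed = []
    for f, value in enumerate(context):
        if f in binarizers:
            # both LabelBinarizer and MultiLabelBinarizer expect a sequence
            if isinstance(value, list):
                row = binarizers[f].transform([tuple(value)])[0]
            else:
                row = binarizers[f].transform([value])[0]
            transformed.extend(row)
        else:
            # ints/floats were not binarized, so pass them through unchanged
            transformed.append(value)
    return transformed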
def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
    self._hyperparams = {
        'neg_label': neg_label,
        'pos_label': pos_label,
        'sparse_output': sparse_output}
    self._wrapped_model = SKLModel(**self._hyperparams)
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
class CategoryBinarizer(TransformerMixin):

    def __init__(self):
        self.__encoder = LabelBinarizer(sparse_output=False)

    def fit(self, X, y=None):
        # X = X.astype(str)
        X = X.values
        self.__encoder.fit(X)
        return self

    def transform(self, X):
        X = X.values
        result = self.__encoder.transform(X)
        result = pd.DataFrame(result)
        result.columns = self.__encoder.classes_
        return result
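# A usage sketch for CategoryBinarizer, assuming this module already imports
# pandas as pd (fit/transform call .values, so a Series is expected, not a list):
enc = CategoryBinarizer()
enc.fit(pd.Series(["red", "green", "blue"]))
print(enc.transform(pd.Series(["green", "green", "red"])))
# columns come out in sorted class order: blue, green, red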
def test_label_binarizer():
    lb = LabelBinarizer()

    # one-class case defaults to negative label
    inp = ["pos", "pos", "pos", "pos"]
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1],
                         [0, 0, 1, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized, output_type=y_type, classes=classes,
                threshold=((neg_label + pos_label) / 2.))
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
class LabelBinarizerImpl():

    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        self._hyperparams = {
            'neg_label': neg_label,
            'pos_label': pos_label,
            'sparse_output': sparse_output}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
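# A short usage sketch for LabelBinarizerImpl; SKLModel is assumed to alias
# sklearn.preprocessing.LabelBinarizer, as the hyperparameter names suggest:
impl = LabelBinarizerImpl(neg_label=0, pos_label=1)
impl.fit(["yes", "no", "yes"])
print(impl.transform(["no", "yes"]))  # [[0], [1]] -- binary gives one column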
def getLoss(w, x, y, lam):
    m = x.shape[0]  # First we get the number of training examples
    # Next we convert the integer class coding into a one-hot representation
    lb = LabelBinarizer()
    y_mat = lb.fit_transform(y)
    # Then we compute raw class scores given our input and current weights
    scores = np.dot(x, w)
    # Next we perform a softmax on these scores to get their probabilities
    prob = softmax(scores)
    # We then find the loss of the probabilities
    loss = (-1 / m) * np.sum(y_mat * np.log(prob)) + (lam / 2) * np.sum(w * w)
    # And compute the gradient for that loss
    grad = (-1 / m) * np.dot(x.T, (y_mat - prob)) + lam * w
    return loss, grad
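# A minimal driver for getLoss on toy data; this assumes the module defines a
# row-wise softmax(scores), since getLoss calls it internally. Sketch only:
#
# x_demo = np.eye(10)                  # 10 one-hot samples, 10 features
# y_demo = np.arange(10)               # one class per sample
# w_demo = np.zeros((10, 10))
# for _ in range(200):
#     loss, grad = getLoss(w_demo, x_demo, y_demo, lam=1e-4)
#     w_demo -= 0.5 * grad             # plain gradient-descent step
# print(loss)                          # should decrease steadily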
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """A shorter and simpler version of log_loss, which supports sample_weight."""
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = numpy.append(1 - T, T, axis=1)

    # Clipping
    Y = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    T, Y = check_arrays(T, Y)

    # Renormalize
    Y /= Y.sum(axis=1)[:, numpy.newaxis]

    loss = -(T * numpy.log(Y) * sample_weight[:, numpy.newaxis]).sum() / numpy.sum(sample_weight)
    return loss
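# Sanity-check sketch for _log_loss against sklearn; with uniform weights the
# two should agree, since both divide by the (weighted) sample count.
# check_sample_weight/check_arrays are this module's helpers, so this is meant
# to run inside the same module:
#
# from sklearn.metrics import log_loss
# y_true = numpy.array([0, 1, 1, 0])
# y_pred = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4]])
# assert abs(_log_loss(y_true, y_pred) - log_loss(y_true, y_pred)) < 1e-6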
def test_label_binarize_with_multilabel_indicator():
    """Check that passing a binary indicator matrix is not noop"""
    classes = np.arange(3)
    neg_label = -1
    pos_label = 2

    y = np.array([[0, 1, 0], [1, 1, 1]])
    expected = np.array([[-1, 2, -1], [2, 2, 2]])

    # With label binarize
    output = label_binarize(y, classes, multilabel=True, neg_label=neg_label,
                            pos_label=pos_label)
    assert_array_equal(output, expected)

    # With the transformer
    lb = LabelBinarizer(pos_label=pos_label, neg_label=neg_label)
    output = lb.fit_transform(y)
    assert_array_equal(output, expected)

    output = lb.fit(y).transform(y)
    assert_array_equal(output, expected)
def test_label_binarizer_set_label_encoding():
    lb = LabelBinarizer(neg_label=-2, pos_label=2)

    # two-class case
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 2, 2, -2]]).T
    got = lb.fit_transform(inp)
    assert_false(lb.multilabel_)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array([[-2, -2, -2, +2],
                         [-2, -2, +2, -2],
                         [-2, +2, -2, -2],
                         [-2, -2, +2, -2],
                         [+2, -2, -2, -2]])
    got = lb.fit_transform(inp)
    assert_false(lb.multilabel_)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_set_label_encoding():
    lb = LabelBinarizer(neg_label=-2, pos_label=0)

    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    lb = LabelBinarizer(neg_label=-2, pos_label=2)

    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array([[-2, -2, -2, +2],
                         [-2, -2, +2, -2],
                         [-2, +2, -2, -2],
                         [-2, -2, +2, -2],
                         [+2, -2, -2, -2]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = assert_warns(DeprecationWarning, lb.fit_transform, inp)
    assert_true(lb.multilabel_)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()
    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = assert_warns(DeprecationWarning, lb.fit_transform, inp)
    assert_true(lb.multilabel_)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
def test_label_binarizer_set_label_encoding():
    lb = LabelBinarizer(neg_label=-2, pos_label=0)

    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_"))
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    lb = LabelBinarizer(neg_label=-2, pos_label=2)

    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array([[-2, -2, -2, +2],
                         [-2, -2, +2, -2],
                         [-2, +2, -2, -2],
                         [-2, -2, +2, -2],
                         [+2, -2, -2, -2]])
    got = lb.fit_transform(inp)
    assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_"))
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer():
    lb = LabelBinarizer()

    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_false(lb.multilabel_)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1],
                         [0, 0, 1, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(lb.multilabel_)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def __init__(self):
    self.__encoder = LabelBinarizer(sparse_output=False)
class GOAMultilayerPerceptron:

    def __init__(self, N, hidden_layer_sizes, max_iter, random_state, x_val, y_val,
                 activation="relu"):
        self.N = N
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.max_iter = max_iter
        self.random_state = check_random_state(random_state)
        self.xval = x_val
        self.yval = y_val

    def _forward_pass(self, activations, coefs, intercepts):
        hidden_activation = ACTIVATIONS[self.activation]
        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs[i])
            activations[i + 1] += intercepts[i]
            # For the hidden layers
            if (i + 1) != (self.n_layers_ - 1):
                activations[i + 1] = hidden_activation(activations[i + 1])
        # For the last layer
        activations[self.n_layers_ - 1] = logistic(activations[self.n_layers_ - 1])
        return activations

    def initialize(self, y, layer_units, coefs_, intercepts_):
        self.n_outputs_ = y.shape[1]
        self.n_layers_ = len(layer_units)
        self.out_activation_ = 'logistic'
        self.n_coefs = []
        self.n_intercepts = []
        self.bound = 0
        bound = 0
        self.coefs_ = coefs_
        self.intercepts_ = intercepts_
        grasshopper_vector = self.encode(coefs_, intercepts_)
        for x in grasshopper_vector:
            if abs(x) > bound:
                bound = abs(x)
        bound = math.ceil(bound)
        self.grasshopper_vector = grasshopper_vector
        self.dim = len(grasshopper_vector)
        self.ub = bound
        self.lb = -bound

    def fit(self, X, y):
        # seed the swarm with the weights of a conventionally trained MLP
        inicial_mlp = MLPClassifier(solver='sgd', alpha=1e-5,
                                    hidden_layer_sizes=self.hidden_layer_sizes,
                                    random_state=8997)
        inicial_mlp.fit(X, y)
        N = self.N
        max_iter = self.max_iter
        hidden_layer_sizes = list(self.hidden_layer_sizes)
        X, y = self.validate_input(X, y)
        n_samples, n_features = X.shape
        if y.ndim == 1:
            y = y.reshape((-1, 1))
        self.n_outputs_ = y.shape[1]
        layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_]
        self.initialize(y, layer_units, inicial_mlp.coefs_, inicial_mlp.intercepts_)
        y = self.label_binarizer.inverse_transform(y)

        bestauc = 0
        flag = 0
        dim = self.dim
        print("dim:", dim)
        ub = np.ones((dim, 1)) * self.ub
        lb = np.ones((dim, 1)) * self.lb
        # GOA works on an even-dimensional search space; pad one slot if needed
        if dim % 2 != 0:
            dim = dim + 1
            ub = np.append(ub, self.ub)
            lb = np.append(lb, self.lb)
            flag = 1
        if flag == 1:
            self.grasshopper_vector.append(0)

        grasshopper_positions = []
        for i in range(N):
            grasshopper_positions.append(self.grasshopper_vector)
        # grasshopper_positions = initialization(N, dim, self.lb, self.ub)
        grasshopper_positions = np.array(grasshopper_positions)

        grasshopper_fitness = []
        cmax = 1
        cmin = 0.00004
        for i in range(np.size(grasshopper_positions, 0)):
            if flag == 1:
                grasshopper_position = grasshopper_positions[i][0:-1]
            else:
                grasshopper_position = grasshopper_positions[i]
            coefs, intercepts = self.decode(grasshopper_position)
            y_pred = self._predict(X, coefs, intercepts).ravel()
            fpr, tpr, thresholds = roc_curve(y, y_pred)
            grasshopper_fitness.append(auc(fpr, tpr))
            # grasshopper_fitness.append(binary_log_loss(y, y_pred))

        # sort descending so the best position and best fitness stay aligned
        sorted_indexes = list(np.argsort(grasshopper_fitness)[::-1])
        grasshopper_fitness.sort(reverse=True)
        sorted_grasshopper = []
        for new_index in range(N):
            sorted_grasshopper.append(grasshopper_positions[sorted_indexes[new_index]])
        target_position = sorted_grasshopper[0]
        target_fitness = grasshopper_fitness[0]
        print("target_position:", target_position)
        print("target_fitness:", target_fitness)

        l = 2
        grasshopper_positions = np.array(grasshopper_positions)
        print(np.shape(grasshopper_positions))
        while l < max_iter + 1:
            print("iteration ", l)
            tp = np.array(target_position)
            cc = cmax - l * ((cmax - cmin) / max_iter)
            for i in range(np.size(grasshopper_positions, 0)):
                temp = np.transpose(grasshopper_positions)
                s_i = np.zeros((dim, 1))
                for j in range(N):
                    if i != j:
                        dist = distance(temp[:, j], temp[:, i])
                        r_ij_vec = (temp[:, j] - temp[:, i]) / (dist + eps(1))
                        xj_xi = 2 + dist % 2
                        s_ij = np.multiply((ub - lb) * cc / 2 * s_func(xj_xi), r_ij_vec)
                        s_i = s_i + np.transpose(s_ij)
                X_new = cc * np.transpose(s_i) + tp
                grasshopper_positions[i, :] = np.squeeze(np.transpose(X_new))
            for i in range(N):
                # Relocate grasshoppers that go outside the search space
                above = np.greater(grasshopper_positions[i, :], np.transpose(ub))
                below = np.less(grasshopper_positions[i, :], np.transpose(lb))
                grasshopper_positions[i, :] = (grasshopper_positions[i, :]
                                               * np.logical_not(above + below)
                                               + np.transpose(ub) * above
                                               + np.transpose(lb) * below)
                if flag == 1:
                    grasshopper_position = grasshopper_positions[i][0:-1]
                else:
                    grasshopper_position = grasshopper_positions[i]
                coefs, intercepts = self.decode(grasshopper_position)
                y_pred = self._predict(X, coefs, intercepts).ravel()
                fpr, tpr, thresholds = roc_curve(y, y_pred)
                grasshopper_fitness = auc(fpr, tpr)
                # grasshopper_fitness = binary_log_loss(y, y_pred)
                if grasshopper_fitness > target_fitness:
                    target_position = grasshopper_positions[i]
                    target_fitness = grasshopper_fitness
                    print("new_fitness:", target_fitness)
                    y_pred = self._predict(X, coefs, intercepts).ravel()
                    fpr, tpr, thresholds = roc_curve(y, y_pred)
                    print("training auc:", auc(fpr, tpr))
                    y_pred = self._predict(self.xval, coefs, intercepts).ravel()
                    fpr, tpr, thresholds = roc_curve(self.yval, y_pred)
                    auc1 = auc(fpr, tpr)
                    if auc1 > bestauc:
                        bestauc = auc1
                        print("best auc on validation set:", bestauc)
            l = l + 1

        if flag == 1:
            target_position = target_position[0:-1]
        coefss, interceptss = self.decode(target_position)
        self.coefs_ = coefss
        self.intercepts_ = interceptss

    def init_coef(self, fan_in, fan_out):
        # Use the initialization method recommended by Glorot et al.
        factor = 6.
        if self.activation == 'logistic':
            factor = 2.
        init_bound = np.sqrt(factor / (fan_in + fan_out))
        # Generate weights and bias:
        coef_init = self.random_state.uniform(-init_bound, init_bound,
                                              (fan_in, fan_out))
        intercept_init = self.random_state.uniform(-init_bound, init_bound,
                                                   fan_out)
        return coef_init, intercept_init, init_bound

    def encode(self, coefs, intercepts):
        # flatten all weight matrices and bias vectors into one position vector
        self.n_coefs = []
        self.n_intercepts = []
        grasshopper_position = []
        for array in coefs:
            self.n_coefs.append(np.shape(array))
            for line in array:
                grasshopper_position += list(line)
        for array in intercepts:
            self.n_intercepts.append(np.shape(array))
            grasshopper_position += list(array)
        return grasshopper_position

    def decode(self, grasshopper_position: list):
        # rebuild weight matrices and bias vectors from the flat position vector
        coefs = []
        intercepts = []
        pos = 0
        for shape in self.n_coefs:
            coef = []
            for j in range(shape[0]):
                coe = []
                for k in range(shape[1]):
                    coe.append(grasshopper_position[pos])
                    pos = pos + 1
                coef.append(coe)
            coefs.append(np.array(coef))
        for shape in self.n_intercepts:
            intercept = []
            for j in range(shape[0]):
                intercept.append(grasshopper_position[pos])
                pos = pos + 1
            intercepts.append(np.array(intercept))
        return coefs, intercepts

    def _predict(self, X, coefs, intercepts):
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)
        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]
        # Initialize layers
        activations = [X]
        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, coefs, intercepts)
        y_pred = activations[-1]
        return y_pred

    def predict(self, X):
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)
        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]
        # Initialize layers
        activations = [X]
        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, self.coefs_, self.intercepts_)
        y_pred = activations[-1]
        if self.n_outputs_ == 1:
            y_pred = y_pred.ravel()
        return self.label_binarizer.inverse_transform(y_pred)

    def validate_input(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        classes = unique_labels(y)
        self.label_binarizer = LabelBinarizer()
        self.label_binarizer.fit(classes)
        y = self.label_binarizer.transform(y)
        return X, y
def main(): print("Loading samples and labels") samples, labels, _ = load_files("data") print("Loaded {} samples".format(samples.shape[0])) sequence_dim = 100 print("Converting to sequences of length {}".format(sequence_dim)) samples, labels = make_sequences(samples, labels, sequence_dim) print("Number of samples from sequences: {}".format(samples.shape[0])) lb = LabelBinarizer() labels = lb.fit_transform(labels) # flattened samples for Decision Tree flatSamples = samples.reshape(samples.shape[0], -1) #tree! (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(flatSamples, labels, test_size=0.25, random_state=42) print("=" * 20) print("Building DecisionTree model") model = DecisionTreeClassifier() model.fit(trainSamples, trainLabels) treeResults = model.predict(testSamples) print( confusion_matrix(testLabels.argmax(axis=1), treeResults.argmax(axis=1))) print( classification_report(testLabels.argmax(axis=1), treeResults.argmax(axis=1))) treeAcc = accuracy_score(testLabels.argmax(axis=1), treeResults.argmax(axis=1)) print("Accuracy Tree: {:.2f}".format(treeAcc)) print("Cohen's Kappa {:.2f}".format( cohen_kappa_score(testLabels.argmax(axis=1), treeResults.argmax(axis=1)))) print("=" * 20) print("Building CNN model") (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42) inputShape = (samples.shape[1], samples.shape[2]) model = Sequential() model.add(Conv1D(32, 10, padding="same", input_shape=inputShape)) model.add(Activation("relu")) model.add(BatchNormalization()) model.add(Dropout(0.2)) model.add(Conv1D(64, 10, padding="same")) model.add(Activation("relu")) model.add(BatchNormalization()) model.add(Dropout(0.2)) model.add(Conv1D(128, 10, padding="same")) model.add(Activation("relu")) model.add(Dropout(0.2)) model.add(Flatten(input_shape=inputShape)) model.add(Dense(128, activation='sigmoid')) model.add(Dense(64, activation='sigmoid')) model.add(Dense(labels.shape[1], activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy']) EPOCHS = 10 BATCH = 128 model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS, validation_data=(testSamples, testLabels)) cnnResults = model.predict(testSamples) print( confusion_matrix(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))) print( classification_report(testLabels.argmax(axis=1), cnnResults.argmax(axis=1), target_names=lb.classes_)) print("CNN Accuracy: {:.2f}".format( accuracy_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))) print("Cohen's Kappa {:.2f}".format( cohen_kappa_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))) input("")
# find pairs for samplesIMG in samplesCSV
samples_paired = []
for i in range(samplesIMG.shape[0]):
    for j in range(samplesCSV.shape[0]):
        if namesCSV[j] == namesIMG[i]:
            samples_paired.append(samplesCSV[j])
samplesCSV = np.array(samples_paired)

samplesIMG = np.expand_dims(samplesIMG, axis=3)

print("Paired")
print("Samples IMG: {}".format(len(samplesIMG)))
print("Samples CSV: {}".format(len(samplesCSV)))

# one-hot encoding
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
numClasses = labels.shape[1]

inputShape = (108, 192, 1)  # samplesIMG.shape

# model for images
cnnmodel = Sequential()
cnnmodel.add(Conv2D(16, (3, 3), padding="same", input_shape=inputShape))
cnnmodel.add(Activation("relu"))
cnnmodel.add(MaxPooling2D(pool_size=(2, 2)))
cnnmodel.add(Conv2D(32, (3, 3), padding="same"))
cnnmodel.add(Activation("relu"))
cnnmodel.add(MaxPooling2D(pool_size=(2, 2)))
cnnmodel.add(Dropout(0.25))
cnnmodel.add(Flatten())
def dbpedia_smallcharconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                  for text in train_df[['title', 'abstract']].apply(
                      lambda cols: u'\n'.join(cols), axis=1).values]
    bin = LabelBinarizer()
    x_train = np.array(pad_sentences(train_docs, max_length=1014,
                                     padding_word=CHAR_MAP.index(' ')))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                 for text in test_df[['title', 'abstract']].apply(
                     lambda cols: u'\n'.join(cols), axis=1).values]
    x_test = np.array(pad_sentences(test_docs, max_length=1014, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    print(model.summary())

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5,
              validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()
    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)
    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5,
              validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
class languageIdentification(object):
    """
    Using characters as features, each encoded by sklearn OneHotEncoder
    Languages are encoded into vectors using sklearn LabelBinarizer
    """

    def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
        self.d = d
        self.yita = yita
        self.languages = {"ENGLISH": 1, "FRENCH": 3, "ITALIAN": 2}
        self.punctuations = [".", "'", ":", ",", "-", "...", "!", "_", "(", ")",
                             "?", '"', ";", "/", "\\", "{", "}", "[", "]", "|",
                             "<", ">", "+", "=", "@", "#", "$", "%", "^", "&", "*"]
        self.noPunctuation = False
        self.answerLables = LabelBinarizer()
        self.answerLables.fit([1, 2, 3])
        self.c = set()
        self.Initialize(trainFile, devFile, testFile)
        self.input = len(self.c) * 5 + 1
        self.setParameters(d, yita)

    def Initialize(self, trainFileName, devFileName, testFileName):
        trainList = []
        trainResult = []
        self.testFeatures = []
        self.devFeatures = []
        self.trainFeatures = []
        self.train = []
        # self.dev = []
        # self.test = []
        self.devResult = []
        self.rawResult = []
        print "train feature processing..."
        with open(trainFileName) as trainFile:
            for line in trainFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li, ans = self.lineProc(train, answer, True)
                trainList += li
                trainResult += ans
                self.trainFeatures.append(li)
                self.rawResult.append(self.languages[answer])
        with open(devFileName) as devFile:
            for line in devFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li = self.lineProc(train, answer, False)
                self.devFeatures.append(li)
                self.devResult.append(self.languages[answer])
        with open(testFileName) as testFile:
            for line in testFile:
                if not line:
                    continue
                line = line.decode('latin-1').strip()
                test = self.lineProc(line, "", False)
                self.testFeatures.append(test)
        trainList, trainResult = self.FisherYatesShuffle(trainList, trainResult)
        trainResult = np.array(trainResult)
        self.trainResult = self.answerLables.fit_transform(trainResult)
        self.trainLabels = preprocessing.LabelEncoder()
        featureList = list(self.c)
        self.trainLabels.fit(featureList)
        # print self.trainLabels.classes_
        length = len(self.c)
        print "feature length:", length
        self.v = preprocessing.OneHotEncoder(n_values=length)
        trainList = np.array(trainList)
        self.train = self.trainLabels.transform(trainList.ravel()).reshape(*trainList.shape)
        self.train = self.v.fit_transform(self.train).toarray()
        print "train shape", self.train.shape

    def directPredict(self, featureList, type):
        types = {"train": "self.rawResult",
                 "dev": "self.devResult",
                 "test": "self.testResult"}
        prediction = self.predictAll(featureList)
        accuracy = self.evaluate(prediction, eval(types[type]))
        return prediction, accuracy

    def devProcess(self, epoch, initial=True):
        trainAccuracy = []
        devAccuracy = []
        if initial:
            print "initial predictions..."
            initial_train = self.directPredict(self.trainFeatures, "train")[1]
            trainAccuracy.append(initial_train)
            print "initial train accuracy: ", initial_train
            initial_dev = self.directPredict(self.devFeatures, "dev")[1]
            print "initial dev accuracy: ", initial_dev
            devAccuracy.append(initial_dev)
        for i in xrange(epoch):
            print "************************************epoch:", i + 1, "************************************"
            self.trainNN(1)
            trainac = self.directPredict(self.trainFeatures, "train")[1]
            print "train accuracy:", trainac
            trainAccuracy.append(trainac)
            devac = self.directPredict(self.devFeatures, "dev")[1]
            print "dev accuracy:", devac
            devAccuracy.append(devac)
        if initial:
            x = [i for i in xrange(epoch + 1)]
            pl.plot(x, trainAccuracy, 'r--', x, devAccuracy, 'bs')
            pl.show()

    def getTestResult(self):
        test_results = open('languageIdentification.data/test_solutions', 'r')
        self.testResult = []
        for line in test_results.readlines():
            # use the instance's own language map (the original referenced a
            # module-level `solution` object here, which breaks encapsulation)
            self.testResult.append(self.languages[line.strip().split(" ")[1].upper()])

    def setParameters(self, d, yita):
        self.d = d
        self.yita = yita
        self.hidden = d
        self.output = 3
        self.ai = np.array([1.0] * self.input)
        self.ah = np.array([1.0] * (self.hidden + 1))
        self.ao = [1.0] * self.output
        self.wi = np.random.uniform(size=(self.input, self.hidden))
        self.wo = np.random.randn(self.hidden + 1, self.output)
        self.ci = np.zeros((self.input, self.hidden))
        self.co = np.zeros((self.hidden + 1, self.output))

    def resetParameters(self):
        self.ai = np.array([1.0] * self.input)
        self.ah = np.array([1.0] * (self.hidden + 1))
        self.ao = [1.0] * self.output
        self.ci = np.zeros((self.input, self.hidden))
        self.co = np.zeros((self.hidden + 1, self.output))

    def lineProc(self, line, answer, isTraining=True):
        text = []
        result = []
        for ch in line:
            self.c.add(ch)
        if len(line) < 5:
            line += " " * (5 - len(line))
        for i in xrange(len(line) - 4):
            text.append(list(line[i:i + 5]))
            if isTraining:
                result.append(self.languages[answer])
        if isTraining:
            return (text, result)
        else:
            return text

    def FisherYatesShuffle(self, train, result):
        l = len(train)
        for i in xrange(l - 1, 0, -1):
            j = randint(0, i)
            train[i], train[j] = train[j], train[i]
            result[i], result[j] = result[j], result[i]
        # print result
        return train[:], result[:]

    def feedForward(self, inputs):
        self.resetParameters()
        for i in range(self.input - 1):
            self.ai[i] = inputs[i]
        self.ah[:self.hidden] = np.dot(self.ai, self.wi)
        self.ah[-1] = 1
        self.ah = self.sigmoid(self.ah)
        self.ao = np.dot(self.ah, self.wo)
        self.ao = self.softMax(self.ao)
        return self.ao[:]

    def softMax(self, out):
        total = sum(np.exp(out))
        out = np.exp(out) * 1.0 / total
        return out

    def backPropagate(self, result):
        # p(L, y) = y - y_hat
        d4 = self.ao - np.array(result)
        # kronecker delta: P(L, y_hat) = P(L, y) * P(y, y_hat)
        # print "before tune:", self.ao, result
        d3 = np.array([0.0] * self.output)
        for j in xrange(self.output):
            for i in xrange(self.output):
                if i == j:
                    d3[j] += d4[i] * self.ao[i] * (1 - self.ao[j])
                else:
                    d3[j] += d4[i] * self.ao[i] * -self.ao[j]
        # p(L, ah) = P(L, y) * P(y, y_hat) * p(y_hat, ah)
        d2 = np.dot(self.wo, d3)
        # p(L, ah_hat) = p(L, y) * P(y, y_hat) * p(y_hat, ah) * P(ah, ah_hat)
        d1 = d2 * self.partialDerivativeSigmoid(self.ah)
        # p(L, W2) = p(L, y) * p(y, y_hat) * p(y_hat, W2)
        D2 = self.yita * np.outer(self.ah, d3)
        self.wo -= D2 + self.co
        self.co = D2
        # p(L, w1) = p(L, y) * P(y, y_hat) * p(y_hat, ah) * P(ah, ah_hat) * P(ah_hat, w1)
        D1 = self.yita * np.outer(self.ai, d1[1:])
        self.wi -= D1 + self.ci
        self.ci = D1
        error = 1.0 / 2 * np.dot(d4, d4)
        return error

    def trainNN(self, epoch=3):
        for i in xrange(epoch):
            error = 0.0
            for j in xrange(len(self.train)):
                entry = self.train[j]
                res = self.trainResult[j]
                self.feedForward(entry)
                error += self.backPropagate(res)
            print "error:", error
            self.resetParameters()

    def predict(self, test):
        result = Counter()
        for entry in test:
            r = self.feedForward(entry)
            # print r
            idx = np.argmax(r) + 1
            result[idx] += 1
        return result.most_common(1)[0][0]

    def partialDerivativeSigmoid(self, out):
        return out * 1.0 * (1.0 - out)

    def sigmoid(self, x):
        # x = np.clip(x, -500, 500)
        return 1.0 / (1 + np.exp(-x))

    def evaluate(self, predictions, golden):
        return accuracy_score(golden, predictions)

    def predictAll(self, features):
        predict_result = []
        for f in features:
            f = np.array(f)
            feature = self.trainLabels.transform(f.ravel()).reshape(*f.shape)
            feature = self.v.transform(feature).toarray()
            res = self.predict(feature)
            predict_result.append(res)
        return predict_result

    def testResultOutput(self, testFile, testPrediction):
        inverse = {1: "ENGLISH", 3: "FRENCH", 2: "ITALIAN"}
        # open the file passed in (the original opened a global `testFileName`)
        testFile = open(testFile, 'r')
        with open('./languageIdentification.output', 'w') as output:
            i = 0
            for line in testFile.readlines():
                output.write(line.strip() + " " + inverse[testPrediction[i]] + '\n')
                i += 1
def test_label_binarizer():
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert_true(issparse(got))
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)

    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1],
                         [0, 0, 1, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def main():
    samples, labels, _ = loader.load_img("radio_img")

    # add the trailing color-channel dimension: (N, H, W) -> (N, H, W, 1)
    samples = np.expand_dims(samples, axis=3)
    print("shape = {}".format(samples.shape))
    inputShape = (samples.shape[1], samples.shape[2], samples.shape[3])
    print("inputShape = {}".format(inputShape))

    # weights
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(labels), labels)
    d_class_weights = dict(enumerate(class_weights))
    print("weights {}".format(d_class_weights))

    # one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    # split to training and test
    (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(
        samples, labels, test_size=0.25, random_state=42)

    model = cnn_model(inputShape, classesNum)

    ## checkpoints
    # checkpt1 = ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', save_best_only=True)
    # checkpt2 = EarlyStopping(monitor='val_loss', patience=3)

    EPOCHS = 20
    BATCH = 50
    model.fit(trainSamples, trainLabels,
              batch_size=BATCH,
              epochs=EPOCHS,
              class_weight=d_class_weights,
              verbose=1,
              # callbacks=[checkpt1, checkpt2],
              validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)
    print(confusion_matrix(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))
    print(classification_report(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))
    cnnAcc = accuracy_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))
    print("Accuracy CNN: {:.2f}".format(cnnAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))))
    input("")
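# To report class names instead of argmax indices, the fitted binarizer can
# invert the network's probability rows directly; a short sketch that would
# run at the end of main() above:
#
# predicted_names = lb.inverse_transform(cnnResults)   # per-row argmax -> label
# true_names = lb.inverse_transform(testLabels)
# print(classification_report(true_names, predicted_names))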
def main():
    # load data
    file = "datasetA_3c.csv"
    dataframe = pandas.read_csv(file)
    dataset = dataframe.values
    samples = dataset[:, 1:]
    labels = dataset[:, 0]
    samples = np.array(samples)
    labels = np.array(labels)
    labels = labels.astype(str)

    print("Class distribution:")
    print(Counter(labels))

    ### choose k best attributes
    # from sklearn.feature_selection.univariate_selection import SelectKBest
    # newSamples = SelectKBest(k=100).fit_transform(samples, labels)
    # print(newSamples.shape)
    # samples = newSamples

    ### Calculate weights for unbalanced classes
    # from sklearn.utils import class_weight
    # d_class_weights = None
    # class_weights = class_weight.compute_class_weight('balanced', np.unique(labels), labels)
    # print("Class weights:")
    # print(class_weights)
    # d_class_weights = dict(enumerate(class_weights))

    ### Normalize samples
    # from sklearn.preprocessing.data import normalize
    # normalize(samples)

    ## convert to one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    trainSamples = samples
    trainLabels = labels
    testSamples = samples
    testLabels = labels

    ### Division into training and test samples
    # from sklearn.model_selection._split import train_test_split
    # (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42)

    model = Sequential()
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(classesNum, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 50
    BATCH = 50
    H = model.fit(trainSamples, trainLabels,
                  batch_size=BATCH,
                  epochs=EPOCHS
                  # ,class_weight=d_class_weights
                  # ,validation_data=(testSamples, testLabels)
                  # ,validation_split=0.1
                  )

    mlpResults = model.predict(testSamples)
    print(confusion_matrix(testLabels.argmax(axis=1), mlpResults.argmax(axis=1)))
    print(classification_report(testLabels.argmax(axis=1), mlpResults.argmax(axis=1),
                                target_names=lb.classes_))
    print("MLP Accuracy: {:.2f}".format(
        accuracy_score(testLabels.argmax(axis=1), mlpResults.argmax(axis=1))))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(testLabels.argmax(axis=1), mlpResults.argmax(axis=1))))

    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["acc"], label="train_acc")
    # plt.plot(N, H.history["val_loss"], label="val_loss")
    # plt.plot(N, H.history["val_acc"], label="val_acc")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.show()
def test_label_binarizer():
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)

    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1],
                         [0, 0, 1, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
def prepocess_label(y):
    return LabelBinarizer().fit_transform(y)
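# Shape caveat for prepocess_label above: with exactly two classes,
# LabelBinarizer emits a single 0/1 column; with three or more, one column
# per class. A quick demonstration:
print(prepocess_label(["a", "b", "a"]).shape)   # (3, 1)
print(prepocess_label(["a", "b", "c"]).shape)   # (3, 3)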
def test_label_binarizer_column_y():
    # first for binary classification vs multi-label with 1 possible class
    # lists are multi-label, array is multi-class :-/
    inp_list = [[1], [2], [1]]
    inp_array = np.array(inp_list)

    multilabel_indicator = np.array([[1, 0], [0, 1], [1, 0]])
    binaryclass_array = np.array([[0], [1], [0]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)
    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, multilabel_indicator)
    assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_"))
    assert_false(assert_warns(DeprecationWarning, getattr, lb_1,
                              "indicator_matrix_"))
    assert_array_equal(out_2, binaryclass_array)
    assert_false(assert_warns(DeprecationWarning, getattr, lb_2, "multilabel_"))

    # second for multiclass classification vs multi-label with multiple
    # classes
    inp_list = [[1], [2], [1], [3]]
    inp_array = np.array(inp_list)

    # the indicator matrix output is the same in this case
    indicator = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)
    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, out_2)
    assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_"))
    assert_array_equal(out_2, indicator)
    assert_false(assert_warns(DeprecationWarning, getattr, lb_2, "multilabel_"))