Exemple #1
0
def test_label_binarizer_errors():
    """Every invalid argument combination below must raise ``ValueError``."""
    # Multilabel tuples fed to a binarizer fitted on one single class
    single_class_targets = np.array([0, 0, 0, 0])
    fitted = LabelBinarizer().fit(single_class_targets)
    tuple_labels = [(2, 3), (0, ), (0, 2)]
    with pytest.raises(ValueError):
        fitted.transform(tuple_labels)

    # Empty input on a binarizer that was never fitted
    unfitted = LabelBinarizer()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Constructor arguments that are expected to be rejected
    rejected_constructor_kwargs = (
        {"neg_label": 2, "pos_label": 1},
        {"neg_label": 2, "pos_label": 2},
        {"neg_label": 1, "pos_label": 2, "sparse_output": True},
    )
    for ctor_kwargs in rejected_constructor_kwargs:
        with pytest.raises(ValueError):
            LabelBinarizer(**ctor_kwargs)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    two_output_targets = np.array([[1, 3], [2, 1]])
    with pytest.raises(ValueError):
        LabelBinarizer().fit(two_output_targets)
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3])
class KmeansTransformer(BaseEstimator, TransformerMixin):
    """Replace samples by their k-means cluster assignment.

    Depending on the flags, the cluster label is returned raw or binarized
    (sparse), optionally with the squared distance to the assigned centroid
    appended as an extra column. Extra keyword arguments are forwarded to
    ``KMeans``.
    """

    def __init__(self, binarize_labels=True, return_distances=False, **kwargs):
        self.kmeans_params = kwargs
        self.return_distances = return_distances
        self.binarize_labels = binarize_labels

    def fit(self, y):
        """Fit k-means on ``y`` and, when binarizing, a sparse LabelBinarizer
        on the resulting cluster labels. Returns ``self``."""
        self.kmeans = KMeans(**self.kmeans_params)
        self.kmeans.fit(y)
        if not self.binarize_labels:
            return self
        self.binarizer = LabelBinarizer(sparse_output=True)
        self.binarizer.fit(self.kmeans.labels_)
        return self

    def transform(self, y):
        """Return cluster labels for ``y`` (binarized if configured), with the
        squared centroid distance stacked on when ``return_distances`` is set."""
        cluster_ids = self.kmeans.predict(y)
        encoded = (self.binarizer.transform(cluster_ids)
                   if self.binarize_labels else cluster_ids)
        if not self.return_distances:
            return encoded

        assigned_centers = self.kmeans.cluster_centers_[cluster_ids]
        # noinspection PyTypeChecker
        sq_dist = np.sum((y - assigned_centers)**2, axis=1)
        if self.binarize_labels:
            # binarized labels are sparse, so the distance column is made sparse too
            sparse_dist = sp.csr_matrix(sq_dist[:, None])
            return sp.hstack((encoded, sparse_dist))
        label_column = np.expand_dims(encoded, axis=1)
        dist_column = np.expand_dims(sq_dist, axis=1)
        return np.hstack((label_column, dist_column))
def test_label_binarizer_unseen_labels():
    """Labels not seen during fit must be encoded as all-zero rows."""
    binarizer = LabelBinarizer()

    # Fitting on three labels yields the 3x3 identity pattern.
    fitted_output = binarizer.fit_transform(["b", "d", "e"])
    want_fitted = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    assert_array_equal(want_fitted, fitted_output)

    # "a", "c" and "f" were never fitted, so their rows are all zeros.
    mixed_output = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    want_mixed = np.array([
        [0, 0, 0],
        [1, 0, 0],
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0],
    ])
    assert_array_equal(want_mixed, mixed_output)
Exemple #4
0
def test_label_binarizer_unseen_labels():
    """Check that labels unseen at fit time map to rows of zeros."""
    encoder = LabelBinarizer()

    known_labels = ['b', 'd', 'e']
    identity_rows = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    encoded_known = encoder.fit_transform(known_labels)
    assert_array_equal(identity_rows, encoded_known)

    # 'a', 'c' and 'f' are unknown to the fitted encoder
    probe_labels = ['a', 'b', 'c', 'd', 'e', 'f']
    rows_with_gaps = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0],
                               [0, 1, 0], [0, 0, 1], [0, 0, 0]])
    encoded_probe = encoder.transform(probe_labels)
    assert_array_equal(rows_with_gaps, encoded_probe)
class MyLabelBinarizer(TransformerMixin):
    """Adapter exposing LabelBinarizer through ``fit(x, y)`` / ``transform(x, y)``.

    The extra ``y`` argument is accepted and ignored. Should be replaced with
    CategoricalEncoder in newer versions of sklearn.
    """

    def __init__(self, *args, **kwargs):
        # all constructor arguments go straight to the wrapped LabelBinarizer
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped binarizer on ``x``; ``y`` is ignored."""
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the binarized form of ``x``; ``y`` is ignored."""
        binarized = self.encoder.transform(x)
        return binarized
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a one-layer convolutional classifier on DBpedia and print its accuracy.

    Documents are turned into padded word-id sequences (vocabulary capped at
    5000 tokens, length 100), labels are binarized, and the embedding layer is
    initialised from ``load_w2v_weights`` and kept frozen.

    :param sample: forwarded to ``get_dbpedia_data`` as ``size``; when truthy
        the test size is scaled from the category counts.
    :param n_procs: defaults to ``cpu_count()``; not used further in this function.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    test_size = (int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
                 if sample else 5000 * 14)

    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    label_binarizer = LabelBinarizer()

    def to_padded_ids(docs):
        # ids are shifted by one so that 0 can serve as the padding id;
        # tokens missing from the vocabulary are dropped
        id_seqs = [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                   for s in docs]
        return np.array(pad_sentences(id_seqs, max_length=100, padding_word=0))

    x_train = to_padded_ids(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    embedding = Embedding(5001, 300, input_length=100, dropout=.2,
                          weights=[emb_weights], trainable=False)
    model.add(embedding)
    conv = Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                         activation='relu', subsample_length=1)
    model.add(conv)
    # pool over the whole remaining sequence length
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    # column index of the set entry in each one-hot row is the true class id
    true_classes = np.argwhere(y_test)[:, 1]
    predicted_classes = model.predict_classes(x_test)
    print(accuracy_score(true_classes, predicted_classes))
def test_label_binarizer_unseen_labels():
    """Unseen labels must produce all-zero rows when transformed."""
    lb = LabelBinarizer()

    fitted_labels = ['b', 'd', 'e']
    # one-hot row per fitted label, in fit order
    one_hot = {
        label: [int(col == row) for col in range(len(fitted_labels))]
        for row, label in enumerate(fitted_labels)
    }
    no_match = [0] * len(fitted_labels)

    expected = np.array([one_hot[label] for label in fitted_labels])
    got = lb.fit_transform(fitted_labels)
    assert_array_equal(expected, got)

    # labels without a fitted column fall back to the all-zero row
    probe_labels = ['a', 'b', 'c', 'd', 'e', 'f']
    expected = np.array([one_hot.get(label, no_match) for label in probe_labels])
    got = lb.transform(probe_labels)
    assert_array_equal(expected, got)
class CategoryBinarizer(TransformerMixin):
    """Binarize a categorical column into a DataFrame with one column per class.

    Inputs are read through ``.values``, so they are presumably pandas
    Series (NOTE(review): confirm against callers).
    """

    def __init__(self):
        self.__encoder = LabelBinarizer(sparse_output=False)

    def fit(self, X, y=None):
        """Learn the set of classes from ``X``; ``y`` is ignored. Returns ``self``."""
        self.__encoder.fit(X.values)
        return self

    def transform(self, X):
        """Return the binarized ``X`` as a DataFrame whose columns are class labels.

        The returned frame has a fresh default index (the index of ``X`` is
        not carried over).
        """
        encoded = self.__encoder.transform(X.values)
        column_labels = self.__encoder.classes_
        # With exactly two classes LabelBinarizer emits a single indicator
        # column for the second (positive) class; labelling it with both class
        # names would raise a length-mismatch error, so keep only that one.
        if encoded.shape[1] == 1 and len(column_labels) == 2:
            column_labels = column_labels[1:]
        return pd.DataFrame(encoded, columns=column_labels)
Exemple #9
0
class LabelBinarizerImpl():
    """Thin wrapper that stores the hyperparameters and delegates to ``SKLModel``."""

    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        self._hyperparams = dict(
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` along only when it is given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
class GOAMultilayerPerceptron:
    """Multilayer perceptron whose weights are refined by a population search.

    ``fit`` first trains a scikit-learn ``MLPClassifier`` with SGD, flattens
    its weights into one vector, replicates that vector ``N`` times and then
    iteratively moves the copies (the "grasshoppers"; GOA presumably stands
    for grasshopper optimisation algorithm, per the naming) while tracking the
    position with the highest training ROC AUC. The best position is decoded
    back into ``coefs_`` / ``intercepts_``.

    Helpers ``distance``, ``eps``, ``s_func``, ``ACTIVATIONS``, ``logistic``
    and ``safe_sparse_dot`` are defined outside this class.
    """
    def __init__(self, N, hidden_layer_sizes, max_iter, random_state, x_val, y_val, activation="relu"):
        """Store the search configuration.

        :param N: number of candidate positions kept in the population.
        :param hidden_layer_sizes: hidden layer widths (``fit`` calls ``list()``
            on it, so it must be iterable there).
        :param max_iter: upper bound for the iteration counter in ``fit``.
        :param random_state: passed through ``check_random_state``; only
            ``init_coef`` reads the result.
        :param x_val: validation inputs, used in ``fit`` only to report AUC.
        :param y_val: validation targets, used in ``fit`` only to report AUC.
        :param activation: key into ``ACTIVATIONS`` for the hidden layers.
        """
        self.N = N
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.max_iter = max_iter
        self.random_state = check_random_state(random_state)
        self.xval = x_val
        self.yval = y_val
    def _forward_pass(self, activations, coefs, intercepts):
        """Fill ``activations`` in place by propagating ``activations[0]`` forward.

        Hidden layers use ``ACTIVATIONS[self.activation]``; the last layer is
        always passed through ``logistic``. Returns the same ``activations`` list.
        """
        hidden_activation = ACTIVATIONS[self.activation]
        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs[i])
            activations[i + 1] += intercepts[i]
            # For the hidden layers
            if (i + 1) != (self.n_layers_ - 1):
                activations[i + 1] = hidden_activation(activations[i + 1])
        # For the last layer
        activations[self.n_layers_-1] = logistic(activations[self.n_layers_-1])
        return activations

    def initialize(self, y, layer_units, coefs_, intercepts_):
        """Set up layer bookkeeping and the search bounds from initial weights.

        The given weights are flattened with ``encode``; the symmetric search
        bounds ``lb``/``ub`` are minus/plus the ceiling of the largest absolute
        weight. ``layer_units`` is only used for its length.
        """
        self.n_outputs_ = y.shape[1]
        self.n_layers_ = len(layer_units)
        self.out_activation_ = 'logistic'
        self.n_coefs = []
        self.n_intercepts = []
        # self.bound stays 0 in the visible code; the local ``bound`` is what is used below
        self.bound = 0
        bound = 0
        self.coefs_ = coefs_
        self.intercepts_ = intercepts_
        grasshopper_vector = self.encode(coefs_, intercepts_)
        # largest absolute weight value
        for x in grasshopper_vector:
            if abs(x) > bound:
                bound = abs(x)
        bound = math.ceil(bound)
        self.grasshopper_vector = grasshopper_vector
        self.dim = len(grasshopper_vector)
        self.ub = bound
        self.lb = -bound

    def fit(self, X, y):
        """Train an SGD MLP, then search around its weights for a higher ROC AUC.

        Sets ``self.coefs_`` and ``self.intercepts_`` to the decoded best
        position. Progress is printed. Nothing is returned (implicit ``None``).
        """
        # Seed network; note the hard-coded random_state (self.random_state is not used here)
        inicial_mlp = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=self.hidden_layer_sizes, random_state=8997)
        inicial_mlp.fit(X, y)
        N = self.N
        max_iter = self.max_iter
        hidden_layer_sizes = self.hidden_layer_sizes
        hidden_layer_sizes = list(hidden_layer_sizes)
        # validate_input also fits self.label_binarizer and returns binarized y
        X, y = self.validate_input(X, y)
        n_samples, n_features = X.shape
        if y.ndim == 1:
            y = y.reshape((-1, 1))
        self.n_outputs_ = y.shape[1]
        layer_units = ([n_features] + hidden_layer_sizes +
                       [self.n_outputs_])
        self.initialize(y, layer_units, inicial_mlp.coefs_, inicial_mlp.intercepts_)
        # back to the original label values, which is what roc_curve receives below
        y = self.label_binarizer.inverse_transform(y)
        bestauc = 0
        flag = 0
        dim = self.dim
        print("dim:", dim)
        lb = self.lb
        ub = self.ub
        ub = np.ones((dim, 1)) * ub
        lb = np.ones((dim, 1)) * lb
        # An odd dimension is padded to even with one extra component (flag == 1);
        # the padding is stripped again before every decode.
        # NOTE(review): np.append flattens, so ub/lb are 1-D in the padded case
        # but (dim, 1) otherwise — confirm the update step handles both shapes.
        if dim % 2 != 0:
            dim = dim + 1
            ub = np.append(ub, self.ub)
            lb = np.append(lb, self.lb)
            flag = 1
        if flag == 1:
            self.grasshopper_vector.append(0)
        # Every member of the population starts as the same encoded weight vector
        grasshopper_positions = []
        for i in range(N):
            grasshopper_positions.append(self.grasshopper_vector)
        # grasshopper_positions = initialization(N, dim, self.lb, self.ub)
        grasshopper_positions = np.array(grasshopper_positions)
        grasshopper_fitness = []
        cmax = 1
        cmin = 0.00004
        # Initial fitness (training ROC AUC) of each position
        for i in range(np.size(grasshopper_positions, 0)):
            if flag == 1:
                grasshopper_position = grasshopper_positions[i][0:-1]
                coefs, intercepts = self.decode(grasshopper_position)
                y_pred = self._predict(X, coefs, intercepts)
                y_pred = y_pred.ravel()
                # NOTE(review): return value is discarded here and in the identical calls below
                self.label_binarizer.inverse_transform(y_pred)
                fpr, tpr, thresholds = roc_curve(y, y_pred)
                auc1 = auc(fpr, tpr)
                grasshopper_fitness.append(auc1)
                # grasshopper_fitness.append(binary_log_loss(y, y_pred))
            else:
                grasshopper_position = grasshopper_positions[i]
                coefs, intercepts = self.decode(grasshopper_position)
                y_pred = self._predict(X, coefs, intercepts)
                y_pred = y_pred.ravel()
                self.label_binarizer.inverse_transform(y_pred)
                fpr, tpr, thresholds = roc_curve(y, y_pred)
                auc1 = auc(fpr, tpr)
                grasshopper_fitness.append(auc1)
                # grasshopper_fitness.append(binary_log_loss(y, y_pred))
        # NOTE(review): argsort is ascending while the fitness list is sorted
        # descending, so index 0 pairs the lowest-fitness position with the
        # highest fitness value. It makes no difference here only because all
        # initial positions are identical copies.
        sorted_indexes = list(np.array(grasshopper_fitness).argsort())
        grasshopper_fitness.sort(reverse=True)
        sorted_grasshopper = []
        for new_index in range(N):
            sorted_grasshopper.append(grasshopper_positions[sorted_indexes[new_index]])
        target_position = sorted_grasshopper[0]
        target_fitness = grasshopper_fitness[0]
        print("target_position:",  target_position)
        print("target_fitness:", target_fitness)
        # The counter starts at 2, so the loop below runs max_iter - 1 times
        l = 2
        grasshopper_positions = np.array(grasshopper_positions)
        print(np.shape(grasshopper_positions))
        while l < max_iter + 1:
            print("iteration ", l)
            tp = np.array(target_position)
            # coefficient decreasing linearly with the iteration counter
            cc = cmax - l * ((cmax - cmin) / max_iter)
            # Move every position based on its interaction with all others plus the target
            for i in range(np.size(grasshopper_positions, 0)):
                temp = np.transpose(grasshopper_positions)
                s_i = np.zeros((dim, 1))
                for j in range(N):
                    if i != j:
                        dist = distance(temp[:, j], temp[:, i])
                        r_ij_vec = (temp[:, j] - temp[:, i]) / (dist + eps(1))
                        xj_xi = 2 + dist % 2
                        s_ij = np.multiply((ub - lb)*cc/2*s_func(xj_xi), r_ij_vec)
                        s_i = s_i + np.transpose(s_ij)
                X_new = cc * np.transpose(s_i) + tp
                grasshopper_positions[i, :] = np.squeeze(np.transpose(X_new))
            for i in range(N):
                # Relocate grasshoppers that go outside the search space
                # (tp is reused here as a boolean mask; it is rebuilt from
                # target_position at the top of the next iteration)
                tp = np.greater(grasshopper_positions[i, :], np.transpose(ub))
                tm = np.less(grasshopper_positions[i, :], np.transpose(lb))
                grasshopper_positions[i, :] = grasshopper_positions[i, :] * np.logical_not(tp + tm) + np.transpose(
                    ub) * tp + np.transpose(lb) * tm
                # From here on grasshopper_fitness is a scalar: the AUC of position i
                if flag == 1:
                    grasshopper_position = grasshopper_positions[i][0:-1]
                    coefs, intercepts = self.decode(grasshopper_position)
                    y_pred = self._predict(X, coefs, intercepts)
                    y_pred = y_pred.ravel()
                    self.label_binarizer.inverse_transform(y_pred)
                    fpr, tpr, thresholds = roc_curve(y, y_pred)
                    auc1 = auc(fpr, tpr)
                    grasshopper_fitness = auc1
                    # grasshopper_fitness = binary_log_loss(y, y_pred)
                else:
                    grasshopper_position = grasshopper_positions[i]
                    coefs, intercepts = self.decode(grasshopper_position)
                    y_pred = self._predict(X, coefs, intercepts)
                    y_pred = y_pred.ravel()
                    self.label_binarizer.inverse_transform(y_pred)
                    fpr, tpr, thresholds = roc_curve(y, y_pred)
                    auc1 = auc(fpr, tpr)
                    grasshopper_fitness = auc1
                    #grasshopper_fitness = binary_log_loss(y, y_pred)
                if grasshopper_fitness > target_fitness:
                    # NOTE(review): integer indexing yields a view of row i, so
                    # later in-place updates of that row also change
                    # target_position — confirm whether a copy was intended.
                    target_position = grasshopper_positions[i]
                    target_fitness = grasshopper_fitness
                    print("new_fitness:", target_fitness)
                    y_pred = self._predict(X, coefs, intercepts)
                    y_pred = y_pred.ravel()
                    self.label_binarizer.inverse_transform(y_pred)
                    fpr, tpr, thresholds = roc_curve(y, y_pred)
                    auc1 = auc(fpr, tpr)
                    print("training auc:", auc1)

                    # Validation AUC is only tracked for reporting; it does not steer the search
                    y_pred = self._predict(self.xval, coefs, intercepts)
                    y_pred = y_pred.ravel()
                    self.label_binarizer.inverse_transform(y_pred)
                    fpr, tpr, thresholds = roc_curve(self.yval, y_pred)
                    auc1 = auc(fpr, tpr)
                    if auc1>bestauc:
                        bestauc = auc1
                        print("best auc on validation set:", bestauc)
            l=l+1
        # Drop the padding component before decoding the final weights
        if flag == 1:
            target_position = target_position[0:-1]
        coefss, interceptss = self.decode(target_position)
        self.coefs_ = coefss
        self.intercepts_ = interceptss

    def init_coef(self, fan_in, fan_out):
        """Draw uniform initial weights and biases for one layer.

        Returns ``(coef_init, intercept_init, init_bound)``. Not called by any
        other method of this class in the visible code.
        """
        # Use the initialization method recommended by
        # Glorot et al.
        factor = 6.
        if self.activation == 'logistic':
            factor = 2.
        init_bound = np.sqrt(factor / (fan_in + fan_out))

        # Generate weights and bias:
        coef_init = self.random_state.uniform(-init_bound, init_bound, (fan_in, fan_out))
        intercept_init = self.random_state.uniform(-init_bound, init_bound, fan_out)
        return coef_init, intercept_init, init_bound
    def encode(self, coefs, intercepts):
        """Flatten weight matrices and bias vectors into one Python list.

        All coefficient matrices come first (row by row), followed by all
        intercept vectors. The shapes are recorded in ``self.n_coefs`` and
        ``self.n_intercepts`` so that ``decode`` can reverse the operation.
        """
        self.n_coefs = []
        self.n_intercepts = []
        grasshopper_position = []
        for array in coefs:
            self.n_coefs.append(np.shape(array))
            for line in array:
                grasshopper_position += list(line)
        for array in intercepts:
            self.n_intercepts.append(np.shape(array))
            grasshopper_position += list(array)
        return grasshopper_position
    def decode(self, grasshopper_position:list):
        """Rebuild ``(coefs, intercepts)`` arrays from a flat position vector.

        Uses the shapes recorded by the most recent ``encode`` call; values are
        consumed in the same order ``encode`` wrote them.
        """
        coefs = []
        intercepts = []
        # running read offset into the flat vector
        pos = 0
        for shape in self.n_coefs:
            coef = []
            for j in range(shape[0]):
                coe = []
                for k in range(shape[1]):
                    coe.append(grasshopper_position[pos])
                    pos = pos+1
                coef.append(coe)
            coefs.append(np.array(coef))
        for shape in self.n_intercepts:
            intercept = []
            for j in range(shape[0]):
                intercept.append(grasshopper_position[pos])
                pos = pos+1
            intercepts.append(np.array(intercept))
        return coefs, intercepts

    def _predict(self, X, coefs, intercepts):
        """Return the raw output-layer activations for ``X`` under the given weights."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)

        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]

        # Initialize layers
        activations = [X]

        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, coefs, intercepts)
        y_pred = activations[-1]
        return y_pred

    def predict(self, X):
        """Predict labels for ``X`` with the fitted ``coefs_`` / ``intercepts_``.

        The output activations are mapped back to label values through
        ``self.label_binarizer.inverse_transform``.
        """
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)

        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]

        # Initialize layers
        activations = [X]

        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, self.coefs_, self.intercepts_)
        y_pred = activations[-1]
        if self.n_outputs_ == 1:
            y_pred = y_pred.ravel()
        return self.label_binarizer.inverse_transform(y_pred)

    def validate_input(self, X, y):
        """Validate ``X``/``y`` and return ``X`` with binarized targets.

        Side effect: creates and fits ``self.label_binarizer`` on the unique
        labels found in ``y``.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        # a single-column 2-D target is flattened to 1-D (with a warning)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        classes = unique_labels(y)
        self.label_binarizer = LabelBinarizer()
        self.label_binarizer.fit(classes)
        y = self.label_binarizer.transform(y)
        return X, y
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train a character-level convolutional classifier on DBpedia and print its accuracy.

    Title and abstract are joined with a newline and mapped character by
    character to indices in ``CHAR_MAP`` (characters outside the map get the
    index ``len(CHAR_MAP)``). Sequences are padded to length 1014.

    Train and test sequences are padded with the same id, the index of the
    space character. Previously the test set was padded with id 0, so the
    model was evaluated on padding it had never seen during training.

    :param sample: forwarded to ``get_dbpedia_data`` as ``size``; when truthy
        the test size is scaled from the category counts.
    :param n_procs: accepted for interface compatibility; not used.
    """
    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    unknown_id = len(CHAR_MAP)
    pad_id = CHAR_MAP.index(' ')

    def encode_chars(frame):
        # one list of character ids per document
        texts = frame[['title', 'abstract']].apply(
            lambda cols: u'\n'.join(cols), axis=1).values
        return [[CHAR_MAP.index(c) if c in CHAR_MAP else unknown_id
                 for c in text] for text in texts]

    def pad(docs):
        return np.array(
            pad_sentences(docs, max_length=1014, padding_word=pad_id))

    label_binarizer = LabelBinarizer()

    x_train = pad(encode_chars(train_df))
    y_train = label_binarizer.fit_transform(train_df.category.values)

    x_test = pad(encode_chars(test_df))
    y_test = label_binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(
        Embedding(len(CHAR_MAP) + 1,
                  len(CHAR_MAP) + 1,
                  input_length=1014,
                  weights=[char_embedding()],
                  trainable=False))
    model.add(
        Convolution1D(nb_filter=256,
                      filter_length=7,
                      border_mode='valid',
                      activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(
        Convolution1D(nb_filter=256,
                      filter_length=7,
                      border_mode='valid',
                      activation='relu',
                      subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    # four identical width-3 convolutions before the last pooling step
    for _ in range(4):
        model.add(
            Convolution1D(nb_filter=256,
                          filter_length=3,
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())

    model.fit(x_train,
              y_train,
              batch_size=64,
              nb_epoch=5,
              validation_data=[x_test, y_test])

    # column index of the set entry in each one-hot row is the true class id
    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train a word-level convolutional classifier on DBpedia and print its accuracy.

    Documents become padded word-id sequences (vocabulary capped at 5000
    tokens, length 100); the embedding layer is learned from scratch.

    :param sample: forwarded to ``get_dbpedia_data`` as ``size``; when truthy
        the test size is scaled from the category counts.
    :param n_procs: defaults to ``cpu_count()``; not used further in this function.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    test_size = (int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
                 if sample else 5000 * 14)

    logging.info('creating train test split ...')
    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    label_binarizer = LabelBinarizer()

    def to_padded_ids(docs):
        # ids are shifted by one so that 0 can serve as the padding id;
        # tokens missing from the vocabulary are dropped
        id_seqs = [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                   for s in docs]
        return np.array(pad_sentences(id_seqs, max_length=100, padding_word=0))

    x_train = to_padded_ids(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    logging.info('building model ...')

    def conv(width):
        return Convolution1D(nb_filter=300, filter_length=width, border_mode='valid',
                             activation='relu', subsample_length=1)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    # two width-7 convolutions, each followed by pooling
    for width in (7, 7):
        model.add(conv(width))
        model.add(MaxPooling1D(pool_length=3, stride=1))
    # four width-3 convolutions sharing one pooling step
    for _ in range(4):
        model.add(conv(3))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5,
              validation_data=[x_test, y_test])

    # column index of the set entry in each one-hot row is the true class id
    true_classes = np.argwhere(y_test)[:, 1]
    predicted_classes = model.predict_classes(x_test)
    print(accuracy_score(true_classes, predicted_classes))
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a one-layer convolutional classifier on DBpedia and print its accuracy.

    Documents are turned into padded word-id sequences (vocabulary capped at
    5000 tokens, length 100), labels are binarized, and the embedding layer is
    initialised from ``load_w2v_weights`` and kept frozen.

    :param sample: forwarded to ``get_dbpedia_data`` as ``size``; when truthy
        the test size is scaled from the category counts.
    :param n_procs: defaults to ``cpu_count()``; not used further in this function.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    test_size = (int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
                 if sample else 5000 * 14)

    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    label_binarizer = LabelBinarizer()

    def to_padded_ids(docs):
        # ids are shifted by one so that 0 can serve as the padding id;
        # tokens missing from the vocabulary are dropped
        id_seqs = [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                   for s in docs]
        return np.array(pad_sentences(id_seqs, max_length=100, padding_word=0))

    x_train = to_padded_ids(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    embedding = Embedding(5001, 300, input_length=100, dropout=.2,
                          weights=[emb_weights], trainable=False)
    model.add(embedding)
    conv = Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                         activation='relu', subsample_length=1)
    model.add(conv)
    # pool over the whole remaining sequence length
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    # column index of the set entry in each one-hot row is the true class id
    true_classes = np.argwhere(y_test)[:, 1]
    predicted_classes = model.predict_classes(x_test)
    print(accuracy_score(true_classes, predicted_classes))
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train a character-level convolutional classifier on DBpedia and print its accuracy.

    Title and abstract are joined with a newline and mapped character by
    character to indices in ``CHAR_MAP`` (characters outside the map get the
    index ``len(CHAR_MAP)``). Sequences are padded to length 1014.

    Train and test sequences are padded with the same id, the index of the
    space character. Previously the test set was padded with id 0, so the
    model was evaluated on padding it had never seen during training.

    :param sample: forwarded to ``get_dbpedia_data`` as ``size``; when truthy
        the test size is scaled from the category counts.
    :param n_procs: accepted for interface compatibility; not used.
    """
    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    unknown_id = len(CHAR_MAP)
    pad_id = CHAR_MAP.index(' ')

    def encode_chars(frame):
        # one list of character ids per document
        texts = frame[['title', 'abstract']].apply(lambda cols: u'\n'.join(cols), axis=1).values
        return [[CHAR_MAP.index(c) if c in CHAR_MAP else unknown_id for c in text]
                for text in texts]

    def pad(docs):
        return np.array(pad_sentences(docs, max_length=1014, padding_word=pad_id))

    label_binarizer = LabelBinarizer()

    x_train = pad(encode_chars(train_df))
    y_train = label_binarizer.fit_transform(train_df.category.values)

    x_test = pad(encode_chars(test_df))
    y_test = label_binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    # four identical width-3 convolutions before the last pooling step
    for _ in range(4):
        model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                                activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5, validation_data=[x_test, y_test])

    # column index of the set entry in each one-hot row is the true class id
    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train and evaluate a small word-level convolutional net on DBpedia.

    The data returned by ``get_dbpedia_data`` is split into a stratified
    train/test partition on its ``category`` column.  A vocabulary is built
    from the training documents only, each document is turned into a
    fixed-length sequence of word ids, and a stack of 1-D convolutions
    followed by two dense layers is fitted.  The test accuracy is printed.

    Args:
        sample: Forwarded to ``get_dbpedia_data`` as ``size``.  When truthy,
            the test-set size is scaled from the per-category counts
            (5000 out of every 45000 rows); otherwise ``5000 * 14`` test
            rows are used.
        n_procs: Unused by this function; accepted only so the signature
            stays compatible with existing callers.

    Returns:
        None.  Training progress and the final accuracy are written to
        stdout / the logging module.
    """
    vocab_size = 5000   # number of tokens kept in the vocabulary
    max_length = 100    # every document is padded by pad_sentences to this length

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=vocab_size)
    binarizer = LabelBinarizer()

    def encode(docs):
        """Map each document to a padded row of vocabulary ids.

        Ids are shifted by one so that 0 is free to act as the padding
        value; tokens missing from the vocabulary are dropped.
        """
        token2id = vocab.token2id
        return np.array(pad_sentences([[token2id[tok] + 1 for tok in sent if tok in token2id]
                                       for sent in docs],
                                      max_length=max_length, padding_word=0))

    x_train = encode(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = encode(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    # One output unit per label column (14 when every category is present).
    n_classes = y_train.shape[1]

    logging.info('building model ...')
    model = Sequential()
    # +1 because id 0 is reserved for padding.
    model.add(Embedding(vocab_size + 1, 300, input_length=max_length))
    # (filter_length, followed_by_pooling) for each convolution, in order.
    conv_layout = ((7, True), (7, True), (3, False), (3, False), (3, False), (3, True))
    for filter_length, pool_after in conv_layout:
        model.add(Convolution1D(nb_filter=300, filter_length=filter_length,
                                border_mode='valid', activation='relu',
                                subsample_length=1))
        if pool_after:
            model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(n_classes, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5,
              validation_data=(x_test, y_test))

    # Column index of the non-zero entry in each label row is the true class.
    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))