def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0,), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo", classes=[1, 2],
                                       threshold=0)

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo", classes=[1, 2, 3],
                                       threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
                                       output_type="binary", classes=[1, 2, 3],
                                       threshold=0)

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3])
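# The test above assumes roughly the following header. The exact module of the
# private helper _inverse_binarize_thresholding depends on the scikit-learn
# version (sklearn.preprocessing.label in older releases,
# sklearn.preprocessing._label in newer ones), so treat this as a sketch:
import numpy as np
import pytest
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding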
class KmeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, binarize_labels=True, return_distances=False, **kwargs):
        self.binarize_labels = binarize_labels
        self.return_distances = return_distances
        self.kmeans_params = kwargs

    def fit(self, y):
        self.kmeans = KMeans(**self.kmeans_params)
        self.kmeans.fit(y)
        if self.binarize_labels:
            self.binarizer = LabelBinarizer(sparse_output=True)
            self.binarizer.fit(self.kmeans.labels_)
        return self

    def transform(self, y):
        labels = self.kmeans.predict(y)
        if self.binarize_labels:
            ret_labels = self.binarizer.transform(labels)
        else:
            ret_labels = labels
        if self.return_distances:
            centroids = self.kmeans.cluster_centers_[labels]
            # Squared Euclidean distance of each sample to its assigned centroid
            dist = np.sum((y - centroids) ** 2, axis=1)
            if self.binarize_labels:
                dist = sp.csr_matrix(dist[:, None])
                return sp.hstack((ret_labels, dist))
            return np.hstack((np.expand_dims(ret_labels, axis=1),
                              np.expand_dims(dist, axis=1)))
        return ret_labels
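# A minimal usage sketch for KmeansTransformer, assuming the imports below.
# Extra keyword arguments (here n_clusters) are forwarded to KMeans; the
# result is a sparse matrix of one-hot cluster labels plus a distance column:
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelBinarizer

X = np.random.RandomState(0).rand(20, 4)
t = KmeansTransformer(binarize_labels=True, return_distances=True, n_clusters=3)
Xt = t.fit(X).transform(X)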
def test_label_binarizer_unseen_labels():
    lb = LabelBinarizer()

    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    got = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, got)

    expected = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0],
                         [0, 1, 0], [0, 0, 1], [0, 0, 0]])
    got = lb.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, got)
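# As the test shows, labels unseen during fit binarize to all-zero rows. A
# caveat worth noting: inverse_transform uses argmax for the multiclass case
# and ties break in favor of the first fitted class, so an all-zero row maps
# back to lb.classes_[0] rather than to the original unseen label:
lb = LabelBinarizer().fit(["b", "d", "e"])
lb.inverse_transform(lb.transform(["a"]))  # array(['b', ...]) -- not 'a'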
class MyLabelBinarizer(TransformerMixin):
    # Wrap LabelBinarizer so it accepts the two-argument fit(X, y) signature
    # expected of pipeline steps. (In newer versions of scikit-learn this
    # class should be replaced by CategoricalEncoder / OneHotEncoder.)
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)
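# Why the wrapper exists: LabelBinarizer.fit takes a single argument, so using
# it directly as a Pipeline step raises a TypeError when the pipeline calls
# fit(X, y). A minimal sketch of the wrapper inside a pipeline:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelBinarizer

cat_pipeline = Pipeline([
    ("binarize", MyLabelBinarizer()),  # in place of a bare LabelBinarizer()
])
encoded = cat_pipeline.fit_transform(["paris", "tokyo", "paris", "oslo"])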
def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    x_train = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in train_docs],
        max_length=100, padding_word=0))
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in test_docs],
        max_length=100, padding_word=0))
    y_test = binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2,
                        weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    print(accuracy_score(np.argwhere(y_test)[:, 1],
                         model.predict_classes(x_test)))
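# Side note on the evaluation at the end of dbpedia_convgemb: since
# LabelBinarizer emits exactly one 1 per row here, np.argwhere(y_test)[:, 1]
# recovers the integer class index of each row. Mapping back to category
# names via the fitted binarizer is equivalent and arguably clearer (a sketch;
# binarizer and model are the locals fitted inside the function above):
pred_names = binarizer.classes_[model.predict_classes(x_test)]
true_names = binarizer.inverse_transform(y_test)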
class CategoryBinarizer(TransformerMixin):
    def __init__(self):
        self.__encoder = LabelBinarizer(sparse_output=False)

    def fit(self, X, y=None):
        # X = X.astype(str)
        X = X.values
        self.__encoder.fit(X)
        return self

    def transform(self, X):
        X = X.values
        result = self.__encoder.transform(X)
        # Note: with exactly two categories LabelBinarizer emits a single
        # column, so the classes_ assignment below assumes three or more.
        result = pd.DataFrame(result)
        result.columns = self.__encoder.classes_
        return result
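# A minimal usage sketch with a pandas Series (data is illustrative):
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelBinarizer

colors = pd.Series(["red", "green", "blue", "green"])
enc = CategoryBinarizer().fit(colors)
one_hot = enc.transform(colors)  # DataFrame with columns 'blue', 'green', 'red'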
class LabelBinarizerImpl():
    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        self._hyperparams = {
            'neg_label': neg_label,
            'pos_label': pos_label,
            'sparse_output': sparse_output
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
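# Here SKLModel is presumably an alias for the wrapped scikit-learn class,
# e.g. "from sklearn.preprocessing import LabelBinarizer as SKLModel".
# A minimal round trip under that assumption:
from sklearn.preprocessing import LabelBinarizer as SKLModel

impl = LabelBinarizerImpl(neg_label=0, pos_label=1)
impl.fit(["yes", "no", "yes"])
impl.transform(["no", "yes"])  # [[0], [1]] -- the binary case yields one column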
class GOAMultilayerPerceptron:
    """Multilayer perceptron whose weights are tuned by Grasshopper
    Optimization (GOA), maximizing ROC AUC. Module-level helpers such as
    ACTIVATIONS, logistic, distance, s_func and eps are assumed to exist."""

    def __init__(self, N, hidden_layer_sizes, max_iter, random_state,
                 x_val, y_val, activation="relu"):
        self.N = N
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.max_iter = max_iter
        self.random_state = check_random_state(random_state)
        self.xval = x_val
        self.yval = y_val

    def _forward_pass(self, activations, coefs, intercepts):
        hidden_activation = ACTIVATIONS[self.activation]
        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs[i])
            activations[i + 1] += intercepts[i]
            # For the hidden layers
            if (i + 1) != (self.n_layers_ - 1):
                activations[i + 1] = hidden_activation(activations[i + 1])
        # For the last layer
        activations[self.n_layers_ - 1] = logistic(activations[self.n_layers_ - 1])
        return activations

    def initialize(self, y, layer_units, coefs_, intercepts_):
        self.n_outputs_ = y.shape[1]
        self.n_layers_ = len(layer_units)
        self.out_activation_ = 'logistic'
        self.n_coefs = []
        self.n_intercepts = []
        self.bound = 0
        bound = 0
        self.coefs_ = coefs_
        self.intercepts_ = intercepts_
        # Flatten the initial weights into one position vector and use its
        # largest coordinate (rounded up) as the symmetric search-space bound.
        grasshopper_vector = self.encode(coefs_, intercepts_)
        for x in grasshopper_vector:
            if abs(x) > bound:
                bound = abs(x)
        bound = math.ceil(bound)
        self.grasshopper_vector = grasshopper_vector
        self.dim = len(grasshopper_vector)
        self.ub = bound
        self.lb = -bound

    def _fitness(self, grasshopper_position, X, y):
        # Evaluate one candidate weight vector: decode it into layer weights,
        # run a forward pass and score the predictions with ROC AUC.
        coefs, intercepts = self.decode(grasshopper_position)
        y_pred = self._predict(X, coefs, intercepts).ravel()
        fpr, tpr, thresholds = roc_curve(y, y_pred)
        return auc(fpr, tpr), coefs, intercepts

    def fit(self, X, y):
        # Seed the swarm with the weights of a conventionally trained MLP.
        inicial_mlp = MLPClassifier(solver='sgd', alpha=1e-5,
                                    hidden_layer_sizes=self.hidden_layer_sizes,
                                    random_state=8997)
        inicial_mlp.fit(X, y)
        N = self.N
        max_iter = self.max_iter
        hidden_layer_sizes = list(self.hidden_layer_sizes)
        X, y = self.validate_input(X, y)
        n_samples, n_features = X.shape
        if y.ndim == 1:
            y = y.reshape((-1, 1))
        self.n_outputs_ = y.shape[1]
        layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_]
        self.initialize(y, layer_units, inicial_mlp.coefs_, inicial_mlp.intercepts_)
        y = self.label_binarizer.inverse_transform(y)

        bestauc = 0
        flag = 0
        dim = self.dim
        print("dim:", dim)
        ub = np.ones((dim, 1)) * self.ub
        lb = np.ones((dim, 1)) * self.lb
        # GOA works on an even-dimensional search space; pad with one extra
        # coordinate when the number of weights is odd.
        if dim % 2 != 0:
            dim = dim + 1
            ub = np.append(ub, self.ub)
            lb = np.append(lb, self.lb)
            flag = 1
        if flag == 1:
            self.grasshopper_vector.append(0)

        grasshopper_positions = []
        for i in range(N):
            grasshopper_positions.append(self.grasshopper_vector)
        # grasshopper_positions = initialization(N, dim, self.lb, self.ub)
        grasshopper_positions = np.array(grasshopper_positions)

        grasshopper_fitness = []
        cmax = 1
        cmin = 0.00004
        for i in range(np.size(grasshopper_positions, 0)):
            # Drop the padding coordinate before decoding, if present.
            position = grasshopper_positions[i][0:-1] if flag == 1 else grasshopper_positions[i]
            auc1, _, _ = self._fitness(position, X, y)
            grasshopper_fitness.append(auc1)
            # grasshopper_fitness.append(binary_log_loss(y, y_pred))

        # Sort grasshoppers by descending fitness (AUC is maximized); the
        # argsort must be reversed so that the indexes line up with the
        # descending-sorted fitness list.
        sorted_indexes = list(np.array(grasshopper_fitness).argsort()[::-1])
        grasshopper_fitness.sort(reverse=True)
        sorted_grasshopper = []
        for new_index in range(N):
            sorted_grasshopper.append(grasshopper_positions[sorted_indexes[new_index]])
        target_position = sorted_grasshopper[0]
        target_fitness = grasshopper_fitness[0]
        print("target_position:", target_position)
        print("target_fitness:", target_fitness)

        l = 2
        grasshopper_positions = np.array(grasshopper_positions)
        print(np.shape(grasshopper_positions))
        while l < max_iter + 1:
            print("iteration ", l)
            tp = np.array(target_position)
            # Decreasing comfort-zone coefficient
            cc = cmax - l * ((cmax - cmin) / max_iter)
            for i in range(np.size(grasshopper_positions, 0)):
                temp = np.transpose(grasshopper_positions)
                s_i = np.zeros((dim, 1))
                for j in range(N):
                    if i != j:
                        dist = distance(temp[:, j], temp[:, i])
                        r_ij_vec = (temp[:, j] - temp[:, i]) / (dist + eps(1))
                        xj_xi = 2 + dist % 2
                        s_ij = np.multiply((ub - lb) * cc / 2 * s_func(xj_xi), r_ij_vec)
                        s_i = s_i + np.transpose(s_ij)
                X_new = cc * np.transpose(s_i) + tp
                grasshopper_positions[i, :] = np.squeeze(np.transpose(X_new))
            for i in range(N):
                # Relocate grasshoppers that go outside the search space
                above_ub = np.greater(grasshopper_positions[i, :], np.transpose(ub))
                below_lb = np.less(grasshopper_positions[i, :], np.transpose(lb))
                grasshopper_positions[i, :] = (
                    grasshopper_positions[i, :] * np.logical_not(above_ub + below_lb)
                    + np.transpose(ub) * above_ub + np.transpose(lb) * below_lb)
                position = grasshopper_positions[i][0:-1] if flag == 1 else grasshopper_positions[i]
                grasshopper_fitness, coefs, intercepts = self._fitness(position, X, y)
                # grasshopper_fitness = binary_log_loss(y, y_pred)
                if grasshopper_fitness > target_fitness:
                    target_position = grasshopper_positions[i]
                    target_fitness = grasshopper_fitness
                    print("new_fitness:", target_fitness)
                    train_auc, _, _ = self._fitness(position, X, y)
                    print("training auc:", train_auc)
                    val_auc, _, _ = self._fitness(position, self.xval, self.yval)
                    if val_auc > bestauc:
                        bestauc = val_auc
                        print("best auc on validation set:", bestauc)
            l = l + 1

        if flag == 1:
            target_position = target_position[0:-1]
        coefss, interceptss = self.decode(target_position)
        self.coefs_ = coefss
        self.intercepts_ = interceptss

    def init_coef(self, fan_in, fan_out):
        # Use the initialization method recommended by Glorot et al.
        factor = 6.
        if self.activation == 'logistic':
            factor = 2.
        init_bound = np.sqrt(factor / (fan_in + fan_out))
        # Generate weights and bias:
        coef_init = self.random_state.uniform(-init_bound, init_bound,
                                              (fan_in, fan_out))
        intercept_init = self.random_state.uniform(-init_bound, init_bound,
                                                   fan_out)
        return coef_init, intercept_init, init_bound

    def encode(self, coefs, intercepts):
        # Flatten all weight matrices and bias vectors into one flat list,
        # remembering the shapes so decode() can invert the mapping.
        self.n_coefs = []
        self.n_intercepts = []
        grasshopper_position = []
        for array in coefs:
            self.n_coefs.append(np.shape(array))
            for line in array:
                grasshopper_position += list(line)
        for array in intercepts:
            self.n_intercepts.append(np.shape(array))
            grasshopper_position += list(array)
        return grasshopper_position

    def decode(self, grasshopper_position: list):
        coefs = []
        intercepts = []
        pos = 0
        for shape in self.n_coefs:
            coef = []
            for j in range(shape[0]):
                coe = []
                for k in range(shape[1]):
                    coe.append(grasshopper_position[pos])
                    pos = pos + 1
                coef.append(coe)
            coefs.append(np.array(coef))
        for shape in self.n_intercepts:
            intercept = []
            for j in range(shape[0]):
                intercept.append(grasshopper_position[pos])
                pos = pos + 1
            intercepts.append(np.array(intercept))
        return coefs, intercepts

    def _predict(self, X, coefs, intercepts):
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)
        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]
        # Initialize layers
        activations = [X]
        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # Forward propagate
        self._forward_pass(activations, coefs, intercepts)
        y_pred = activations[-1]
        return y_pred

    def predict(self, X):
        y_pred = self._predict(X, self.coefs_, self.intercepts_)
        if self.n_outputs_ == 1:
            y_pred = y_pred.ravel()
        return self.label_binarizer.inverse_transform(y_pred)

    def validate_input(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        classes = unique_labels(y)
        self.label_binarizer = LabelBinarizer()
        self.label_binarizer.fit(classes)
        y = self.label_binarizer.transform(y)
        return X, y
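# A usage sketch for the grasshopper-optimized MLP. The module-level helpers
# referenced above (distance, s_func, eps, ACTIVATIONS, logistic) and the
# arrays X_train/y_train/X_val/y_val/X_test are assumed to exist; this only
# illustrates the intended call sequence:
goa_mlp = GOAMultilayerPerceptron(N=30, hidden_layer_sizes=(10,), max_iter=50,
                                  random_state=0, x_val=X_val, y_val=y_val)
goa_mlp.fit(X_train, y_train)  # binary targets; fitness is ROC AUC
y_hat = goa_mlp.predict(X_test)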
def dbpedia_smallcharconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                  for text in train_df[['title', 'abstract']]
                  .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    binarizer = LabelBinarizer()
    x_train = np.array(pad_sentences(train_docs, max_length=1014,
                                     padding_word=CHAR_MAP.index(' ')))
    y_train = binarizer.fit_transform(train_df.category.values)
    test_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                 for text in test_df[['title', 'abstract']]
                 .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    x_test = np.array(pad_sentences(test_docs, max_length=1014, padding_word=0))
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())
    model.fit(x_train, y_train, batch_size=64, nb_epoch=5,
              validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:, 1],
                         model.predict_classes(x_test)))
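# How the character encoding in dbpedia_smallcharconv behaves, shown with a
# toy alphabet (the real CHAR_MAP is defined elsewhere in the project): known
# characters map to their index, anything else to len(alphabet), a shared
# "unknown" id.
TOY_CHAR_MAP = "abc \n"
encoded = [TOY_CHAR_MAP.index(c) if c in TOY_CHAR_MAP else len(TOY_CHAR_MAP)
           for c in "ab?c"]
# encoded == [0, 1, 5, 2]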
def dbpedia_smallwordconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()
    x_train = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in train_docs],
        max_length=100, padding_word=0))
    y_train = binarizer.fit_transform(train_df.category.values)
    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in test_docs],
        max_length=100, padding_word=0))
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5,
              validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:, 1],
                         model.predict_classes(x_test)))
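# pad_sentences is a project helper shared by the dbpedia_* functions above;
# a plausible minimal implementation, assuming it truncates each token-id
# list to max_length and right-pads shorter ones with padding_word:
def pad_sentences(docs, max_length, padding_word=0):
    return [list(doc[:max_length]) + [padding_word] * (max_length - len(doc[:max_length]))
            for doc in docs]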