def run():
    prep.preprocess_data()
    X_train = prep.data_store.get_X_train(include_valid=True)
    y_train = prep.data_store.get_y_train(include_valid=True,
                                          one_hot_encoding=False)
    X_test = prep.data_store.get_X_test()
    y_test = prep.data_store.get_y_test(one_hot_encoding=False)
    log.debug('X_train: %s, y_train: %s, X_test: %s, y_test: %s' %
              (X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    log.debug('X_train[100:105]: %s' % (X_train[100:105], ))
    log.debug('y_train[100:105]: %s' % (y_train[100:105], ))

    pipeline = SVC()
    # Evaluate the SVC on a 1000-sample subset, 10-fold cross-validation
    # at each of 10 training-set sizes.
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator=pipeline,
                       X=X_train[:1000],
                       y=y_train[:1000],
                       train_sizes=np.linspace(0.1, 1.0, 10),
                       cv=10,
                       n_jobs=1)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    log.debug('train accuracy: %s, type: %s' % (train_mean[-1], type(train_mean)))
    log.debug('test accuracy: %s, type: %s' % (test_mean[-1], type(test_mean)))
    log.debug('end')
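# For reference, a minimal sketch of how the means and standard deviations
# computed in run() are typically rendered as a learning curve with
# one-standard-deviation bands. This helper is not part of the module and
# assumes matplotlib is available.
import matplotlib.pyplot as plt

def plot_learning_curve(train_sizes, train_mean, train_std, test_mean, test_std):
    plt.plot(train_sizes, train_mean, marker='o', label='training accuracy')
    plt.fill_between(train_sizes, train_mean - train_std,
                     train_mean + train_std, alpha=0.15)
    plt.plot(train_sizes, test_mean, marker='s', label='validation accuracy')
    plt.fill_between(train_sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.15)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()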
def show_dimen_reduction(self, fig, ax):
    log.debug('#####')
    pk.preprocess_data()
    self.get_dataset()
    log.debug('X_train: %s, y_train: %s' %
              (self.X_train.shape, self.y_train.shape))
    clf, pipeline = self.build_clf_pipeline()
    # rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
    # rfecv.fit(self.X_train, self.y_train)
    # log.debug('Optimal number of features: %d' % rfecv.n_features_)

    # Feature selector; k=10 is a placeholder, overridden for every k below
    selector_k_best = SelectKBest(f_regression, k=10)
    # Build the machine learning pipeline
    pipeline_clf = Pipeline([('selector', selector_k_best), ('rf', clf)])

    best_features = None
    best_score = 0
    for k in range(2, pk.data_store.get_x_field_count() + 1):
        pipeline_clf.set_params(selector__k=k)
        # Train the classifier on the k best features
        pipeline_clf.fit(self.X_train, self.y_train)
        # Score on the held-out test set
        cur_score = pipeline_clf.score(self.X_test, self.y_test)
        log.debug('Feature Count: %d, Score: %f' % (k, cur_score))
        # Log the feature indices chosen by the selector
        features_status = pipeline_clf.named_steps['selector'].get_support()
        selected_features = [
            count for count, item in enumerate(features_status) if item
        ]
        log.debug('Selected features: %s' %
                  (','.join([str(x) for x in selected_features])))
        if cur_score > best_score:
            best_features = selected_features
            best_score = cur_score
    mdata.set_reduced_features(best_features)
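# A self-contained sketch of the selector-in-pipeline pattern used above.
# Note that show_dimen_reduction scores features with f_regression even
# though the downstream estimator is a classifier; for classification,
# f_classif (used here) is the usual score function. The dataset and
# estimator below are illustrative, not part of the project.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
pipe = Pipeline([('selector', SelectKBest(f_classif, k=5)),
                 ('rf', RandomForestClassifier(random_state=0))])
pipe.fit(X, y)
print(pipe.named_steps['selector'].get_support())  # boolean mask of kept columns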
def __init__(self, main_view=None, parent=None):
    super(PreprocessSettingView, self).__init__(parent)
    self.central_view = main_view
    self.initialized = False
    widget = QWidget()
    layout = QVBoxLayout()
    widget.setLayout(layout)
    self.setWidget(widget)
    self.combos = []
    prep.preprocess_data()
    self.X, self.y = prep.data_store.get_original_data()
    self.central_view.set_data(self.X, self.y)
    self.columns = self.X.columns.tolist()
    # One combo box per column, defaulting to the trailing 'None' entry.
    # partial() binds the row index so the slot receives (idx, new_text).
    for idx in range(len(self.columns)):
        combo = QComboBox()
        combo.addItems(self.columns + ['None'])
        combo.setCurrentIndex(len(self.columns))
        combo.currentTextChanged.connect(
            partial(self.on_combo_changed, idx))
        layout.addWidget(combo)
        self.combos.append(combo)
def predict(self):
    pk.preprocess_data()
    self.get_dataset()
    X_train, y_train, X_test, y_test = self.reduce_dataset()
    log.debug('X_train: %s, y_train: %s, X_test: %s, y_test: %s' %
              (X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf, pipeline = self.build_clf_pipeline()
    self.set_best_params(clf, pipeline)
    # Estimate model performance on the held-out test set
    pipeline.fit(X_train, y_train)
    predicted = pipeline.predict(X_test)
    score = AlgBase.accuracy(predicted, y_test)
    log.debug('Test accuracy: %.2f' % score)
    sel_features = mdata.get_reduced_features()
    log.debug('Selected feature indices: %s' % sel_features)
    # Restrict the unseen data to the same reduced feature columns
    X_unseen = pk.data_store.get_X_unseen()
    if sel_features is not None:
        X_unseen = X_unseen[:, sel_features]
    predictions = pipeline.predict(X_unseen).astype(int)
    pk.data_store.save_predictions(predictions)
    log.debug('end')
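# AlgBase.accuracy is defined elsewhere in the project; the following is
# only a plausible minimal sketch of such a helper, assuming plain label
# arrays, shown to make the scoring step above concrete.
import numpy as np

def accuracy(predicted, actual):
    # Fraction of predictions that match the true labels
    return np.mean(np.asarray(predicted) == np.asarray(actual))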
def show_learning_curve(self, fig, ax, best_params):
    prep.preprocess_data()
    batch_size = prep.data_store.get_batch_size()
    num_features = prep.data_store.get_x_field_count()
    num_labels = prep.data_store.get_y_value_count()
    hidden_size = prep.data_store.get_hidden_size()
    log.debug('num_features: %d, num_labels: %d' %
              (num_features, num_labels))
    train_dataset = prep.data_store.get_X_train(include_valid=False)
    train_labels = prep.data_store.get_y_train(include_valid=False,
                                               one_hot_encoding=True)
    valid_dataset = prep.data_store.get_X_valid()
    valid_labels = prep.data_store.get_y_valid(True)
    test_dataset = prep.data_store.get_X_test()
    test_labels = prep.data_store.get_y_test(True)
    # With a single label the target vectors need an explicit column axis
    if num_labels == 1:
        train_labels = train_labels[:, None]
        valid_labels = valid_labels[:, None]
        test_labels = test_labels[:, None]
    log.debug('shape of train_labels: %s' % (train_labels.shape, ))

    graph = tf.Graph()
    with graph.as_default():
        tf_train_dataset = tf.placeholder(tf.float32,
                                          shape=(batch_size, num_features))
        tf_train_labels = tf.placeholder(tf.float32,
                                         shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Two-layer network: input -> ReLU hidden layer -> output logits
        weights_l1 = tf.Variable(
            tf.truncated_normal([num_features, hidden_size]))
        biases_l1 = tf.Variable(tf.zeros([hidden_size]))
        logits_l1 = tf.matmul(tf_train_dataset, weights_l1) + biases_l1
        activation_l1 = tf.nn.relu(logits_l1)
        weights_l2 = tf.Variable(
            tf.truncated_normal([hidden_size, num_labels]))
        biases_l2 = tf.Variable(tf.zeros([num_labels]))
        logits_l2 = tf.matmul(activation_l1, weights_l2) + biases_l2

        if num_labels > 1:
            # Multi-class: softmax cross-entropy loss
            loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits_l2, labels=tf_train_labels))
            optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
            # Predictions for the training, validation, and test data
            train_prediction = tf.nn.softmax(logits_l2)
            valid_layer1 = tf.matmul(tf_valid_dataset, weights_l1) + biases_l1
            valid_prediction = tf.nn.softmax(
                tf.matmul(tf.nn.relu(valid_layer1), weights_l2) + biases_l2)
            test_layer1 = tf.matmul(tf_test_dataset, weights_l1) + biases_l1
            test_prediction = tf.nn.softmax(
                tf.matmul(tf.nn.relu(test_layer1), weights_l2) + biases_l2)
        else:
            # Binary: sigmoid output with manual cross-entropy
            # (the 1e-12 terms guard against log(0))
            y_conv = tf.nn.sigmoid(logits_l2)
            loss = -(tf_train_labels * tf.log(y_conv + 1e-12) +
                     (1 - tf_train_labels) * tf.log(1 - y_conv + 1e-12))
            loss = tf.reduce_mean(loss)
            optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
            # Predictions for the training data
            train_prediction = tf.greater(y_conv, 0.5)
            # Predictions for the validation data
            valid_layer1 = tf.matmul(tf_valid_dataset, weights_l1) + biases_l1
            valid_layer2 = tf.matmul(tf.nn.relu(valid_layer1),
                                     weights_l2) + biases_l2
            y_conv = tf.nn.sigmoid(valid_layer2)
            valid_prediction = tf.greater(y_conv, 0.5)
            # Predictions for the test data
            test_layer1 = tf.matmul(tf_test_dataset, weights_l1) + biases_l1
            test_layer2 = tf.matmul(tf.nn.relu(test_layer1),
                                    weights_l2) + biases_l2
            y_conv = tf.nn.sigmoid(test_layer2)
            test_prediction = tf.greater(y_conv, 0.5)

    num_steps = prep.data_store.get_step_size()
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print('Initialized')
        accuracy = self.get_accuracy_func(num_labels)
        for step in range(num_steps):
            # Cycle through the training data one mini-batch at a time
            offset = ((step * batch_size) %
                      (train_labels.shape[0] - batch_size))
            batch_data = train_dataset[offset:(offset + batch_size), :]
            # log.debug('offset: %d, step: %d, shape: %s' % (offset, step, batch_data.shape))
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {
                tf_train_dataset: batch_data,
                tf_train_labels: batch_labels
            }
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if step % 200 == 0:
                print('Minibatch loss at step %d: %f' % (step, l))
                print('Minibatch accuracy: %.1f%%' %
                      accuracy(predictions, batch_labels))
                print('Validation accuracy: %.1f%%' %
                      accuracy(valid_prediction.eval(), valid_labels))
        print('Test accuracy: %.1f%%' %
              accuracy(test_prediction.eval(), test_labels))
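# get_accuracy_func is defined elsewhere; the following is only a plausible
# sketch of the two helpers it might return, inferred from how predictions
# and labels are shaped above (softmax rows vs. boolean sigmoid thresholds).
import numpy as np

def softmax_accuracy(predictions, labels):
    # Percentage of rows whose argmax matches the one-hot label
    return 100.0 * np.mean(np.argmax(predictions, 1) == np.argmax(labels, 1))

def sigmoid_accuracy(predictions, labels):
    # Percentage of boolean predictions matching the 0/1 labels
    return 100.0 * np.mean(predictions == (labels > 0.5))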
def get_titanic_data():
    global dataset, labels
    prep.preprocess_data(force_process=True)
    dataset, labels = prep.data_store.get_original_data()
    return dataset, labels
def get_dataset(self):
    pk.preprocess_data()
    self.X_train = pk.data_store.get_X_train(include_valid=True)
    self.y_train = pk.data_store.get_y_train(include_valid=True)
    self.X_test = pk.data_store.get_X_test()
    self.y_test = pk.data_store.get_y_test()