Esempio n. 1
0
def run():
    """Compute an SVC learning curve on the preprocessed data and log it.

    The data is preprocessed through ``prep``, then training data (with
    the validation split included) and test data are fetched with
    integer (non one-hot) labels.  A default ``SVC`` is evaluated with
    ``learning_curve`` on the first 1000 training samples, using ten
    training-set sizes from 10% to 100% and 10-fold cross-validation.
    The mean train/validation scores at the largest size are logged.
    """
    prep.preprocess_data()
    X_train = prep.data_store.get_X_train(include_valid=True)
    y_train = prep.data_store.get_y_train(include_valid=True,
                                          one_hot_encoding=False)
    X_test = prep.data_store.get_X_test()
    y_test = prep.data_store.get_y_test(one_hot_encoding=False)
    log.debug('X_train: %s, y_train: %s, X_test: %s, y_test: %s' %
              (X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    # Log a small window of samples; the label names the slice actually
    # shown (rows 100..104), and the 1-tuple keeps %-formatting safe for
    # any sequence type.
    log.debug('X_train[100:105]: %s' % (X_train[100:105], ))
    log.debug('y_train[100:105]: %s' % (y_train[100:105], ))

    pipeline = SVC()
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator=pipeline,
                       X=X_train[:1000],
                       y=y_train[:1000],
                       train_sizes=np.linspace(0.1, 1.0, 10),
                       cv=10,
                       n_jobs=1)
    # Aggregate across the CV folds (axis=1): one value per training size.
    train_mean = np.mean(train_scores, axis=1)
    # Spread of the *training* scores (previously computed from
    # test_scores by mistake, which made it a duplicate of test_std).
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Scores at the largest training-set size.
    log.debug('accuracy: %s, type: %s' % (train_mean[-1], type(train_mean)))
    log.debug('accuracy: %s, type: %s' % (test_mean[-1], type(test_mean)))
    log.debug("end")
Esempio n. 2
0
    def show_dimen_reduction(self, fig, ax):
        """Search for the feature count whose selection scores best.

        For every ``k`` from 2 up to the number of input fields, a
        ``SelectKBest(f_regression)`` selector followed by the classifier
        from ``build_clf_pipeline`` is fitted on the training data and
        scored on the test data.  The column indices chosen for the
        best-scoring ``k`` seen so far are handed to
        ``mdata.set_reduced_features`` after every iteration.

        ``fig`` and ``ax`` are accepted but not used by this method.
        """
        log.debug('#####')
        pk.preprocess_data()
        self.get_dataset()
        log.debug('X_train: %s, y_train: %s' %
                  (self.X_train.shape, self.y_train.shape))

        # Only the bare classifier is needed; the ready-made pipeline
        # returned alongside it is ignored here.
        clf, _prebuilt_pipeline = self.build_clf_pipeline()

        # Univariate selector chained in front of the classifier.  The
        # initial k is a placeholder; it is overridden on every iteration.
        pipeline_clf = Pipeline([
            ('selector', SelectKBest(f_regression, k=10)),
            ('rf', clf),
        ])

        best_features, best_score = None, 0
        candidate_counts = range(2, pk.data_store.get_x_field_count() + 1)
        for k in candidate_counts:
            # Refit with the current number of retained features.
            pipeline_clf.set_params(selector__k=k)
            pipeline_clf.fit(self.X_train, self.y_train)

            cur_score = pipeline_clf.score(self.X_test, self.y_test)
            log.debug('Feature Count: %d, Score:%f' % (k, cur_score))

            # Translate the selector's boolean mask into column indices.
            support_mask = pipeline_clf.named_steps['selector'].get_support()
            selected_features = [
                column for column, kept in enumerate(support_mask) if kept
            ]
            log.debug('Selected features:%s' %
                      (','.join(map(str, selected_features))))

            # Strictly-greater comparison: ties keep the earlier
            # (smaller) feature set.
            if cur_score > best_score:
                best_score = cur_score
                best_features = selected_features

            # Publish the best selection found so far.
            mdata.set_reduced_features(best_features)
Esempio n. 3
0
    def __init__(self, main_view=None, parent=None):
        """Build the settings panel with one combo box per data column.

        Args:
            main_view: Optional central view; when given, it receives
                the original data through ``set_data``.
            parent: Optional parent passed to the base-class constructor.
        """
        super(PreprocessSettingView, self).__init__(parent)
        self.central_view = main_view
        self.initialized = False

        widget = QWidget()
        layout = QVBoxLayout()
        widget.setLayout(layout)
        self.setWidget(widget)

        self.combos = []
        prep.preprocess_data()
        self.X, self.y = prep.data_store.get_original_data()
        # main_view defaults to None, so only forward the data when a
        # view was actually supplied (avoids AttributeError on None).
        if self.central_view is not None:
            self.central_view.set_data(self.X, self.y)
        self.columns = self.X.columns.tolist()

        # Every combo offers all column names plus a trailing 'None'
        # entry, which is the initial selection.
        choices = self.columns + ['None']
        none_index = len(self.columns)
        for idx in range(none_index):
            combo = QComboBox()
            combo.addItems(choices)
            combo.setCurrentIndex(none_index)
            # Bind the combo's position so the handler knows which one
            # changed.
            combo.currentTextChanged.connect(
                partial(self.on_combo_changed, idx))
            layout.addWidget(combo)
            self.combos.append(combo)
Esempio n. 4
0
    def predict(self):
        """Train the tuned pipeline, log test accuracy, save predictions.

        The reduced train/test split is used to fit the pipeline from
        ``build_clf_pipeline`` (after ``set_best_params`` is applied) and
        to log its test accuracy.  The unseen samples are then restricted
        to the stored reduced feature indices, if any, and their integer
        predictions are saved through ``pk.data_store``.
        """
        pk.preprocess_data()
        self.get_dataset()
        X_train, y_train, X_test, y_test = self.reduce_dataset()
        shapes = (X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        log.debug('X_train: %s, y_train: %s, X_test: %s, y_test: %s' % shapes)

        clf, pipeline = self.build_clf_pipeline()
        self.set_best_params(clf, pipeline)

        # Fit on the training split and measure accuracy on the test split.
        pipeline.fit(X_train, y_train)
        test_predictions = pipeline.predict(X_test)
        score = AlgBase.accuracy(test_predictions, y_test)
        log.debug('Test accuracy: %.2f' % score)

        # Apply the same column subset (if one was stored) to unseen data.
        sel_features = mdata.get_reduced_features()
        log.debug('Selected features indeces: %s' % sel_features)
        store = pk.data_store
        X_unseen = store.get_X_unseen()
        if sel_features is not None:
            X_unseen = X_unseen[:, sel_features]

        unseen_predictions = pipeline.predict(X_unseen).astype(int)
        store.save_predictions(unseen_predictions)
        log.debug("end")
Esempio n. 5
0
    def show_learning_curve(self, fig, ax, best_params):
        """Train a one-hidden-layer network and print its accuracies.

        Builds a TensorFlow graph with a single ReLU hidden layer, trains
        it with mini-batch gradient descent (learning rate 0.2) for the
        number of steps configured in ``prep.data_store``, and prints the
        minibatch loss/accuracy and validation accuracy every 200 steps,
        followed by the final test accuracy.

        With more than one label column a softmax / cross-entropy head is
        used; with exactly one label column a sigmoid head with a 0.5
        decision threshold is used instead.

        ``fig``, ``ax`` and ``best_params`` are accepted but not used by
        this method.
        """
        prep.preprocess_data()
        batch_size = prep.data_store.get_batch_size()
        num_features = prep.data_store.get_x_field_count()
        num_labels = prep.data_store.get_y_value_count()
        hidden_size = prep.data_store.get_hidden_size()
        log.debug('num_features: %d, num_labels: %d' %
                  (num_features, num_labels))

        train_dataset = prep.data_store.get_X_train(include_valid=False)
        train_labels = prep.data_store.get_y_train(include_valid=False,
                                                   one_hot_encoding=True)
        valid_dataset = prep.data_store.get_X_valid()
        valid_labels = prep.data_store.get_y_valid(True)
        test_dataset = prep.data_store.get_X_test()
        test_labels = prep.data_store.get_y_test(True)

        if num_labels == 1:
            # Add a trailing axis so the labels are 2-D, matching the
            # (batch_size, num_labels) placeholder below.
            train_labels = train_labels[:, None]
            valid_labels = valid_labels[:, None]
            test_labels = test_labels[:, None]
        log.debug('shape of train_labels:  %s' % (train_labels.shape, ))

        graph = tf.Graph()
        with graph.as_default():
            # Training batches are fed per step; validation and test data
            # are baked into the graph as constants.
            tf_train_dataset = tf.placeholder(tf.float32,
                                              shape=(batch_size, num_features))
            tf_train_labels = tf.placeholder(tf.float32,
                                             shape=(batch_size, num_labels))
            tf_valid_dataset = tf.constant(valid_dataset)
            tf_test_dataset = tf.constant(test_dataset)

            # Hidden layer.
            weights_l1 = tf.Variable(
                tf.truncated_normal([num_features, hidden_size]))
            biases_l1 = tf.Variable(tf.zeros([hidden_size]))
            logits_l1 = tf.matmul(tf_train_dataset, weights_l1) + biases_l1
            activation_l1 = tf.nn.relu(logits_l1)

            # Output layer.
            weights_l2 = tf.Variable(
                tf.truncated_normal([hidden_size, num_labels]))
            biases_l2 = tf.Variable(tf.zeros([num_labels]))
            logits_l2 = tf.matmul(activation_l1, weights_l2) + biases_l2

            if num_labels > 1:
                # Keyword arguments: newer TensorFlow releases reject the
                # positional form of this call.
                loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits_l2, labels=tf_train_labels))
                optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(
                    loss)

                # Predictions for the training, validation, and test data
                train_prediction = tf.nn.softmax(logits_l2)
                valid_layer1 = tf.matmul(tf_valid_dataset,
                                         weights_l1) + biases_l1
                valid_prediction = tf.nn.softmax(
                    tf.matmul(tf.nn.relu(valid_layer1), weights_l2) +
                    biases_l2)
                test_layer1 = tf.matmul(tf_test_dataset,
                                        weights_l1) + biases_l1
                test_prediction = tf.nn.softmax(
                    tf.matmul(tf.nn.relu(test_layer1), weights_l2) + biases_l2)
            else:
                # Binary cross-entropy on a sigmoid output; the 1e-12
                # terms keep log() away from zero.
                y_conv = tf.nn.sigmoid(logits_l2)
                loss = -(tf_train_labels * tf.log(y_conv + 1e-12) +
                         (1 - tf_train_labels) * tf.log(1 - y_conv + 1e-12))
                loss = tf.reduce_mean(loss)
                optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(
                    loss)

                # Predictions for the training data
                train_prediction = tf.greater(y_conv, 0.5)

                # Predictions for the validation data
                valid_layer1 = tf.matmul(tf_valid_dataset,
                                         weights_l1) + biases_l1
                valid_layer2 = tf.matmul(tf.nn.relu(valid_layer1),
                                         weights_l2) + biases_l2
                y_conv = tf.nn.sigmoid(valid_layer2)
                valid_prediction = tf.greater(y_conv, 0.5)

                # Predictions for the test data
                test_layer1 = tf.matmul(tf_test_dataset,
                                        weights_l1) + biases_l1
                test_layer2 = tf.matmul(tf.nn.relu(test_layer1),
                                        weights_l2) + biases_l2
                y_conv = tf.nn.sigmoid(test_layer2)
                test_prediction = tf.greater(y_conv, 0.5)

        num_steps = prep.data_store.get_step_size()
        # Resolve the accuracy function once, before the loop: it does not
        # depend on the step, and it must exist for the final test report
        # even when num_steps is 0.
        accuracy = self.get_accuracy_func(num_labels)
        # Range of valid batch start offsets.  Clamped to 1 so a training
        # set exactly one batch long does not cause a modulo by zero.
        offset_span = max(train_labels.shape[0] - batch_size, 1)
        with tf.Session(graph=graph) as session:
            tf.initialize_all_variables().run()
            print('Initialized')
            for step in range(num_steps):
                # Walk through the training data, wrapping around.
                offset = (step * batch_size) % offset_span
                batch_data = train_dataset[offset:(offset + batch_size), :]
                batch_labels = train_labels[offset:(offset + batch_size), :]
                feed_dict = {
                    tf_train_dataset: batch_data,
                    tf_train_labels: batch_labels
                }
                _, l, predictions = session.run(
                    [optimizer, loss, train_prediction], feed_dict=feed_dict)
                if (step % 200 == 0):
                    print('Minibatch loss at step: %d: %f' % (step, l))
                    print('Minibatch accuracy: %.1f%%' %
                          accuracy(predictions, batch_labels))
                    print('Validation accuracy: %.1f%%' %
                          accuracy(valid_prediction.eval(), valid_labels))
            print('Test accuracy: %.1f%%' %
                  accuracy(test_prediction.eval(), test_labels))
Esempio n. 6
0
def get_titanic_data():
    """Force a fresh preprocessing pass and return the original data.

    The result is also stored in the module-level ``dataset`` and
    ``labels`` globals.

    Returns:
        The ``(dataset, labels)`` pair obtained from
        ``prep.data_store.get_original_data()``.
    """
    global dataset, labels
    # force_process=True asks prep to redo the preprocessing.
    prep.preprocess_data(force_process=True)
    original = prep.data_store.get_original_data()
    dataset, labels = original
    return dataset, labels
Esempio n. 7
0
 def get_dataset(self):
     """Preprocess the data and cache the train/test splits on ``self``.

     Sets ``X_train`` / ``y_train`` (fetched with the validation split
     included) and ``X_test`` / ``y_test``.
     """
     pk.preprocess_data()
     store = pk.data_store

     # Training data, requested with include_valid=True.
     self.X_train = store.get_X_train(include_valid=True)
     self.y_train = store.get_y_train(include_valid=True)

     # Held-out test data.
     self.X_test = store.get_X_test()
     self.y_test = store.get_y_test()