Example 1
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import random

from sklearn import datasets, metrics, cross_validation

import skflow

random.seed(42)

# Load dataset.
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42)

# Build a 3-layer DNN with 10, 20, 10 units respectively.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            steps=200)

# Fit and predict.
classifier.fit(X_train, y_train)
score = metrics.accuracy_score(y_test, classifier.predict(X_test))
print('Accuracy: {0:f}'.format(score))
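
# The classifier also exposes class probabilities via predict_proba
# (Example 9 below uses it); a small sketch:
proba = classifier.predict_proba(X_test)  # shape: (n_samples, n_classes)
print(proba[:3])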
Example 2
print "--------------------------------------------------"
print "GradientBoostingClassifier"
print "--------------------------------------------------"
print 'Accuracy:', accuracy_score(y_test.flatten(), pred.flatten())
print 'F1 score:', f1_score(y_test.flatten(), pred.flatten())
print 'Recall:', recall_score(y_test.flatten(), pred.flatten())
print 'Precision:', precision_score(y_test.flatten(), pred.flatten())
print '\n clasification report:\n', classification_report(
    y_test.flatten(), pred.flatten())

# print('Gradient boosting score: %f' % accuracy_score(y_test.flatten(), pred.flatten()))

# Deep Learning - a DNN using the skflow library, which is built on TensorFlow

import numpy as np
import skflow

model = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                       n_classes=17,
                                       batch_size=100,
                                       steps=3000,
                                       optimizer="SGD",
                                       learning_rate=0.01)

model.fit(X_train, y_train.values)

#y_test = test_data['Y']
y_prediction = model.predict(X_test)

print "prediction accuracy:", np.sum(
    y_test.flatten() == y_prediction) * 1. / len(y_test.flatten())
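# (equivalent to accuracy_score(y_test.flatten(), y_prediction))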
Example 3
def do_system_training(dataset,
                       model_path,
                       feature_normalizer_path,
                       feature_path,
                       classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm',
                       overwrite=False):
    """System training

    model container format:

    {
        'normalizer': normalizer class
        'models' :
            {
                'office' : mixture.GMM class
                'home' : mixture.GMM class
                ...
            }
    }

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    classifier_params : dict
        parameter dict

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode; with 'full', all available material is treated as one fold.
        (Default value='folds')

    classifier_method : str ['gmm', 'dnn']
        classifier method, 'gmm' and 'dnn' are supported
        (Default value='gmm')

    overwrite : bool
        overwrite existing models
        (Default value=False)

    Returns
    -------
    nothing

    Raises
    ------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """

    if classifier_method not in ('gmm', 'dnn'):
        raise ValueError("Unknown classifier method [" + classifier_method +
                         "]")

    # Check that target path exists, create if not
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        if not os.path.isfile(current_model_file) or overwrite:
            # Load normalizer
            feature_normalizer_filename = get_feature_normalizer_filename(
                fold=fold, path=feature_normalizer_path)
            if os.path.isfile(feature_normalizer_filename):
                normalizer = load_data(feature_normalizer_filename)
            else:
                raise IOError("Feature normalizer not found [%s]" %
                              feature_normalizer_filename)

            # Initialize model container
            model_container = {'normalizer': normalizer, 'models': {}}

            # Collect training examples
            file_count = len(dataset.train(fold))
            data = {}
            for item_id, item in enumerate(dataset.train(fold)):
                progress(title_text='Collecting data',
                         fold=fold,
                         percentage=(float(item_id) / file_count),
                         note=os.path.split(item['file'])[1])

                # Load features
                feature_filename = get_feature_filename(
                    audio_file=item['file'], path=feature_path)
                if os.path.isfile(feature_filename):
                    feature_data = load_data(feature_filename)['feat']
                else:
                    raise IOError("Features not found [%s]" % (item['file']))

                # Scale features
                feature_data = model_container['normalizer'].normalize(
                    feature_data)

                # Store features per class label
                if item['scene_label'] not in data:
                    data[item['scene_label']] = feature_data
                else:
                    data[item['scene_label']] = numpy.vstack(
                        (data[item['scene_label']], feature_data))

            le = pp.LabelEncoder()
            tot_data = {}

            # Train models for each class
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                if classifier_method == 'gmm':
                    model_container['models'][label] = mixture.GMM(
                        **classifier_params).fit(data[label])
                elif classifier_method == 'dnn':
                    if 'x' not in tot_data:
                        tot_data['x'] = data[label]
                        tot_data['y'] = numpy.repeat(label,
                                                     len(data[label]),
                                                     axis=0)
                    else:
                        tot_data['x'] = numpy.vstack(
                            (tot_data['x'], data[label]))
                        tot_data['y'] = numpy.hstack(
                            (tot_data['y'],
                             numpy.repeat(label, len(data[label]), axis=0)))
                else:
                    raise ValueError("Unknown classifier method [" +
                                     classifier_method + "]")

            if classifier_method == 'dnn':
                clf = skflow.TensorFlowDNNClassifier(**classifier_params)
                tot_data['y'] = le.fit_transform(tot_data['y'])
                clf.fit(tot_data['x'], tot_data['y'])
                clf.save('dnn/dnnmodel1')

            # Save models
            save_data(current_model_file, model_container)
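
# A minimal usage sketch (an assumption, not in the original): read a trained
# container back with the same load_data/get_model_filename helpers and score
# normalized features against each per-class GMM.
model_container = load_data(get_model_filename(fold=1, path=model_path))
feature_data = model_container['normalizer'].normalize(feature_data)
for scene_label, gmm in model_container['models'].items():
    print(scene_label, gmm.score(feature_data).sum())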
Example 4

from sklearn import datasets, metrics
from sklearn.cross_validation import train_test_split

import skflow
import tensorflow as tf

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=42)


# setup exponential decay function
def exp_decay(global_step):
    return tf.train.exponential_decay(learning_rate=0.1,
                                      global_step=global_step,
                                      decay_steps=100,
                                      decay_rate=0.001)
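
# For reference, tf.train.exponential_decay computes
#   learning_rate * decay_rate ** (global_step / decay_steps),
# so these arguments decay the rate from 0.1 by a factor of 1000 every
# 100 steps.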


# Use the customized decay function as the learning_rate.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            steps=800,
                                            learning_rate=exp_decay)
classifier.fit(X_train, y_train)
score = metrics.accuracy_score(y_test, classifier.predict(X_test))
print('Accuracy: {0:f}'.format(score))
Example 5
def main():
    """Run experiment with multiple classifiers."""
    data = get_data()

    print("Got %i training samples and %i test samples." %
          (len(data['train']['X']), len(data['test']['X'])))

    # Get classifiers
    classifiers = [
        ('Logistic Regression (C=1)', LogisticRegression(C=1)),
        ('Logistic Regression (C=10000)', LogisticRegression(C=10000)),
        ('RBM 200, n_iter=40, LR=0.01, Reg: C=1',
         Pipeline(steps=[(
             'rbm',
             BernoulliRBM(
                 n_components=200, n_iter=40, learning_rate=0.01, verbose=True)
         ), ('logistic', LogisticRegression(C=1))])),
        ('RBM 200, n_iter=40, LR=0.01, Reg: C=10000',
         Pipeline(steps=[(
             'rbm',
             BernoulliRBM(
                 n_components=200, n_iter=40, learning_rate=0.01, verbose=True)
         ), ('logistic', LogisticRegression(C=10000))])),
        ('RBM 100',
         Pipeline(steps=[('rbm', BernoulliRBM(
             n_components=100)), ('logistic', LogisticRegression(C=1))])),
        ('RBM 100, n_iter=20',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, n_iter=20)
                          ), ('logistic', LogisticRegression(C=1))])),
        ('RBM 256',
         Pipeline(steps=[('rbm', BernoulliRBM(
             n_components=256)), ('logistic', LogisticRegression(C=1))])),
        ('RBM 512, n_iter=10',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=512, n_iter=10)
                          ), ('logistic', LogisticRegression(C=1))])),
        ('NN 20:5',
         skflow.TensorFlowDNNClassifier(hidden_units=[20, 5],
                                        n_classes=data['n_classes'],
                                        steps=500)),
        # ('NN 500:200 dropout',
        #  skflow.TensorFlowEstimator(model_fn=dropout_model,
        #                             n_classes=10,
        #                             steps=20000)),
        # ('CNN', skflow.TensorFlowEstimator(model_fn=conv_model,
        #                                    n_classes=10,
        #                                    batch_size=100,
        #                                    steps=20000,
        #                                    learning_rate=0.001)),
        ('SVM, adj.',
         SVC(probability=False,
             kernel="rbf",
             C=2.8,
             gamma=.0073,
             cache_size=200)),
        ('SVM, linear', SVC(kernel="linear", C=0.025, cache_size=200)),
        ('k nn', KNeighborsClassifier(3)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=50, n_jobs=10)),
        ('Random Forest 2',
         RandomForestClassifier(max_depth=5,
                                n_estimators=10,
                                max_features=1,
                                n_jobs=10)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis())
    ]

    # Fit them all
    classifier_data = {}
    for clf_name, clf in classifiers:
        print("#" * 80)
        print("Start fitting '%s' classifier." % clf_name)
        examples = 100000  # Reduce data to make training faster
        t0 = time.time()
        clf.fit(data['train']['X'][:examples], data['train']['y'][:examples])
        t1 = time.time()
        an_data = analyze(clf, data, t1 - t0, clf_name=clf_name)
        classifier_data[clf_name] = {
            'training_time': t1 - t0,
            'testing_time': an_data['testing_time'],
            'accuracy': an_data['accuracy']
        }

    print_website(classifier_data)
Example 6
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print "Accuracy:", accuracy.eval({x: X_test, y: y_test})

# Skflow

y, X = train['Survived'], train[['Age', 'SibSp', 'Fare']].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
X_train = numpy.array(X_train)
X_test = numpy.array(X_test)
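# Reshape the label Series into (n_samples, 1) numpy column vectors.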
y_train = numpy.array(y_train).reshape(y_train.shape[-1], 1)
y_test = numpy.array(y_test).reshape(y_test.shape[-1], 1)

classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=2,
                                            batch_size=128,
                                            steps=500,
                                            optimizer='Adam',
                                            learning_rate=0.05)
classifier.fit(X_train, y_train)
print(accuracy_score(y_test, classifier.predict(X_test)))

# TensorFlowDNNClassifier(batch_size=128, class_weight=None,
#	continue_training=False, early_stopping_rounds=None,
#	hidden_units=[10, 20, 10], keep_checkpoint_every_n_hours=10000,
#	learning_rate=0.05, max_to_keep=5, n_classes=2, num_cores=4,
#	optimizer='SGD', steps=500, tf_master='', tf_random_seed=42,
#	verbose=1)
Example 7
classifier = skflow.TensorFlowLinearClassifier(n_classes=10,
                                               batch_size=100,
                                               steps=1000,
                                               learning_rate=0.01)
classifier.fit(X_train, y_train)
linear_y_predict = classifier.predict(X_test)
linear_submission = pd.DataFrame({
    'ImageId': range(1, 28001),
    'Label': linear_y_predict
})
linear_submission.to_csv('../Datasets/MNIST/linear_submission.csv',
                         index=False)

classifier = skflow.TensorFlowDNNClassifier(hidden_units=[200, 50, 10],
                                            n_classes=10,
                                            steps=5000,
                                            learning_rate=0.01,
                                            batch_size=50)
classifier.fit(X_train, y_train)
dnn_y_predict = classifier.predict(X_test)
dnn_submission = pd.DataFrame({
    'ImageId': range(1, 28001),
    'Label': dnn_y_predict
})
dnn_submission.to_csv('../Datasets/MNIST/dnn_submission.csv', index=False)


def max_pool_2x2(tensor_in):
    return tf.nn.max_pool(tensor_in,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')  # 'SAME' padding assumed for the truncated call
Example 8
import random

from sklearn import datasets, metrics
from sklearn.cross_validation import train_test_split

import skflow


random.seed(42)

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=42)

# Classifier without early stopping - prone to overfitting.
classifier1 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                             n_classes=3, steps=800)
classifier1.fit(X_train, y_train)
score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))

# Classifier with early stopping - improved accuracy on the test set.
classifier2 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                             n_classes=3, steps=1000,
                                             early_stopping_rounds=200)
classifier2.fit(X_train, y_train)
score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))

# You can expect early stopping to improve the score.
print(score2 > score1)
Example 9
import numpy as np
import pandas as pd
from scipy.cluster.vq import whiten  # whiten() assumed to come from scipy.cluster.vq
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

import skflow

# Split columns by dtype into categorical and numeric features.
catagorical_features = []
numeric_features = []
for f in data.columns:
    if data[f].dtype == 'object':
        catagorical_features.append(f)
    else:
        numeric_features.append(f)

data_num = whiten(data[numeric_features])
data_cat = pd.get_dummies(data[catagorical_features],
                          columns=catagorical_features)

trlen = train_data.shape[0]
train = np.hstack((data_num[:trlen], data_cat[:trlen]))
test = np.hstack((data_num[trlen:], data_cat[trlen:]))
labels = label_data.astype(int)

xtrain, xtest, ytrain, ytest = train_test_split(train, labels, train_size=0.7)

model = skflow.TensorFlowDNNClassifier(hidden_units=[128, 128, 128],
                                       learning_rate=0.01,
                                       n_classes=2,
                                       batch_size=128,
                                       steps=10000)
model.fit(xtrain, ytrain)
p = model.predict_proba(xtest)[:, 1]
print("TensorFlowDNNClassifier log_loss: %0.5f" % (log_loss(ytest, p)))

model.fit(train, labels)
preds = model.predict_proba(test)[:, 1]
sample = pd.read_csv("results/sample_submission.csv")
sample.PredictedProb = preds
sample.to_csv("results/simple_skflow_results.csv", index=False)
Example 10

X_train = read_wiki_content("doc2vec_train_content.txt")
X_test = read_wiki_content("doc2vec_test_content.txt")

print("Dimension of input: ", len(X_train[0]))

print('Using DNN')
hidden_units = [2000, 1000, 500, 200]
steps = 50000
early_stopping_rounds = 5000

print("Parameters: ", hidden_units, " steps = ", steps,
      "   early_stopping_rounds = ", early_stopping_rounds)
classifier = skflow.TensorFlowDNNClassifier(
    hidden_units=hidden_units,
    n_classes=6,
    steps=steps,
    early_stopping_rounds=early_stopping_rounds)
print('Fit model')
classifier.fit(X_train, Y_train, logdir="./logdir/doc2vec_dnn")

print('Predicting')
prediction = classifier.predict(X_test)

score2 = metrics.accuracy_score(Y_test, prediction)

confusion_matrix = metrics.confusion_matrix(Y_test, prediction)
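# (rows of the confusion matrix are true labels, columns are predictions)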

print(confusion_matrix)
print(score2)