Example #1
    def test_init_with_correct_file_input(self):
        """
        Ensure a valid dict is created from a correct json during init
        """
        settings = utils.Settings(self.check_settings_file)
        self.assertIs(settings.user_input.__class__, dict)
        self.assertTrue(len(settings.user_input) > 0)
Example #2
    def test_resolves_to_correct_dir(self):
        """
        Make sure settings parser resolves to dir containing test and train dirs
        """
        settings_string_with_2_dirs = io.StringIO(
            u'{"data_dir": ["fake", "TestSettingsParserDir"]}')
        settings = utils.Settings(settings_string_with_2_dirs)
        self.assertEqual(settings.data_dir, os.path.abspath(self.data_dir))
Example #3
    def test_error_if_required_missing(self):
        """
        Ensure an error is raised if a required setting is omitted
        """
        setting_string_without_required = io.StringIO(
            u'{"foo": 5, "bar": "duck"}')
        with self.assertRaises(ValueError):
            utils.Settings(setting_string_without_required)
Example #4
    def check_default_values_during_init(self):
        """
        Make sure default values are set for r_seed and classes
        """
        settings = utils.Settings(self.check_settings_file)
        classes = constants.classes

        self.assertEqual(settings.random_seed, 42)
        self.assertEqual(settings.classes, classes)
Example #5
    def test_init_with_correct_stringIO(self):
        """
        Ensure a valid dict is created from a correct json io.StringIO during init
        """
        string_settings = io.StringIO(
            u'{"data_dir": ["TestSettingsParserDir"]}')
        settings = utils.Settings(string_settings)
        self.assertIs(settings.user_input.__class__, dict)
        self.assertTrue(len(settings.user_input) > 0)
Example #6
def main():
    
    # this should be parsed from json, but hardcoded for now
    
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeLR.pkl'
    
    #attributes_settings = ['numpixels','aspectratio']
    #pkl_file = 'imsizeLR_alt.pkl'
    
    attributes_settings = [
        'width', 'height', 'mean', 'stderr', 'propwhite', 'propbool',
        'propblack'
    ]
    pkl_file = 'imattr1.pkl'
    
    # Load the global settings
    settings = utils.Settings('settings.json')
    
    # Make the wrapper function
    processing = highlevelfeatures.BasicAttributes(attributes_settings)
    
    # Load the training data, with the processing applied
    X, y = utils.load_data(settings.image_fnames, classes=settings.classes,
                           processing=processing)
    
    # Encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)
    
    # just a dummy uniform probability classifier for working purposes
    #clf = sklearn.dummy.DummyClassifier(strategy='uniform')
    
    #clf = sklearn.linear_model.SGDClassifier(n_jobs=-1,
    #                                         loss='log')
    
    #clf = sklearn.ensemble.RandomForestClassifier(n_jobs=-1,
    #                                              n_estimators=100,
    #                                              verbose=1)
    
    # clf = sklearn.svm.SVC(probability=True)
    
    clf = sklearn.linear_model.LogisticRegression()
    
    cv = sklearn.cross_validation.StratifiedShuffleSplit(y)
    
    # Try cross-validating
    results = []
    for train, test in cv:
        clf.fit(X[train], y[train])
        p = clf.predict_proba(X[test])
        results.append(sklearn.metrics.log_loss(y[test], p))
    
    print(results)
    print('CV average = {}'.format(np.mean(results)))
    
    # Train on the whole thing and save model for later
    clf.fit(X, y)
    
    joblib.dump(clf, pkl_file, compress=3)
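
The pickle written here is picked up again at prediction time; a minimal sketch of the reload step, which is exactly what Example #10 below does with the same imattr1.pkl file:

# Reload the fitted classifier later and predict, as in Example #10
clf = joblib.load(pkl_file)
p = clf.predict_proba(X)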
Example #7
def main(run_settings_path, verbose=False, force=False):
    # load the non-run-specific settings
    settings = utils.Settings('settings.json')
    # load the run-specific settings
    run_settings = utils.load_run_settings(run_settings_path,
                                           settings,
                                           settings_path='settings.json',
                                           force=force)
    if run_settings['model type'] == 'sklearn':
        train_sklearn(run_settings, verbose=verbose, force=force)
    elif run_settings['model type'] == 'pylearn2':
        train_pylearn2(run_settings, verbose=verbose, force=force)
    else:
        raise NotImplementedError("Unsupported model type.")
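
As far as this dispatcher is concerned, the run settings only need a 'model type' key; a hypothetical minimal value (any other keys are model-specific and not shown in the example) might be:

# Hypothetical minimal run settings; only the "model type" key is read by the
# dispatcher above, and its value must be either "sklearn" or "pylearn2".
example_run_settings = {"model type": "sklearn"}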
Example #8
def main(run_settings_path, verbose=False, altdata=None, augment=1, split=1):
    # this should just run either function depending on the run settings
    settings = utils.Settings('settings.json')
    # test script won't overwrite the pickle, so always force load
    run_settings = utils.load_run_settings(run_settings_path,
                                           settings,
                                           settings_path='settings.json',
                                           force=True)
    # HELLO BOILERPLATE
    if run_settings['model type'] == 'sklearn':
        test_sklearn(run_settings, verbose=verbose)
    elif run_settings['model type'] == 'pylearn2':
        #train_pylearn2(run_settings)
        test_pylearn2(run_settings, verbose=verbose, altdata=altdata,
                      augment=augment, split=split)
    else:
        raise NotImplementedError("Unsupported model type.")
Example #9
def main():

    # yeah, that ought to parse it
    settings = utils.Settings('settings.json')

    # loading in mnist
    train_path = os.path.join(settings.data_dir, "mnist_train.npz")
    test_path = os.path.join(settings.data_dir, "mnist_test.npz")
    train_npz = np.load(train_path)
    test_npz = np.load(test_path)

    # sticking it all together
    X = np.vstack([train_npz['arr_0'], test_npz['arr_0']])
    y = np.hstack([train_npz['arr_1'], test_npz['arr_1']])

    # Testing out the mlp function
    mlp = neukrill_net.nk_mlp.MLP_sk_interface(verbose=True)

    mlp.fit(X, y)
Example #10
def main():
    # this should be parsed from json, but hardcoded for now
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeLR.pkl'
    #out_fname = 'submission_imsizeLR.csv'

    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeSVM.pkl'
    #out_fname = 'submission_imsizeSVM.csv'

    #attributes_settings = ['numpixels','aspectratio']
    #pkl_file = 'imsizeLR_alt.pkl'
    #out_fname = 'submission_imsizeLR_alt.csv'

    attributes_settings = [
        'width', 'height', 'mean', 'stderr', 'propwhite', 'propbool',
        'propblack'
    ]
    pkl_file = 'imattr1.pkl'
    out_fname = 'submission_imattr1.csv'

    # Get global settings, providing file names of test data
    settings = utils.Settings('settings.json')

    # Make the wrapper function
    processing = highlevelfeatures.attributes_wrapper(attributes_settings)

    # Load the test data, with the processing applied
    X, names = utils.load_data(settings.image_fnames,
                               processing=processing,
                               verbose=False)

    clf = joblib.load(pkl_file)
    p = clf.predict_proba(X)

    utils.write_predictions(out_fname, p, names, settings.classes)
Example #11
def main():
    out_fname = 'submission_priorprobs.csv'
    settings = utils.Settings('settings.json')

    # Get names of test data files
    names = [
        os.path.basename(fpath) for fpath in settings.image_fnames['test']
    ]

    # Score expected from training data (not a CV score because no folds)
    labels = []
    for class_index, class_name in enumerate(settings.classes):
        num_images = len(settings.image_fnames['train'][class_name])
        # generate the class labels and add them to the list
        labels += num_images * [class_name]

    p = settings.class_priors[np.newaxis, :]
    p = np.tile(p, (len(labels), 1))

    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(labels)

    cv = sklearn.metrics.log_loss(y, p)
    print('CV = {}'.format(cv))

    # Write output
    with open(out_fname, 'w') as csv_out:
        out_writer = csv.writer(csv_out, delimiter=',')
        out_writer.writerow(['image'] + list(settings.classes))
        for index in range(len(names)):
            out_writer.writerow([names[index]] + list(settings.class_priors))

    with open(out_fname, 'rb') as f_in:
        f_out = gzip.open(out_fname + '.gz', 'wb')
        f_out.writelines(f_in)
        f_out.close()
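
The hand-rolled CSV above has the same layout as the submission files produced elsewhere in this collection, so the writer loop could presumably be replaced by the helper used in Examples #10 and #12 (the gzip step at the end would still be separate); a sketch assuming that helper accepts the tiled prior probabilities:

# Equivalent to the csv.writer loop above, using the helper seen in
# Examples #10 and #12; p already holds one row of class priors per test image.
utils.write_predictions(out_fname, p, names, settings.classes)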
Example #12
def main():

    # this should be parsed from json, but hardcoded for now
    bow_options = {
        'verbose': True,
        'normalise_hist': False,
        'n_features_max': 100,
        'patch_size': 15,
        'clusteralgo': 'kmeans',
        'n_clusters': 20,
        'random_seed': 42
    }

    # Load the settings, providing
    settings = utils.Settings('settings.json')

    # Load the raw data
    print('Loading the raw training data')
    rawdata, labels = utils.load_rawdata(settings.image_fnames,
                                         classes=settings.classes)

    # Encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(labels)

    # Probably not the best classifier
    clf = sklearn.linear_model.LogisticRegression()

    cv = sklearn.cross_validation.StratifiedShuffleSplit(y)

    bow = highlevelfeatures.BagOfWords(**bow_options)
    sample = np.random.random_integers(
        0, len(rawdata) - 1,
        size=(1000))  # Subsample so we can do this in sensible time
    bow.fit([rawdata[i] for i in sample])
    #bow.fit(rawdata)
    print('Bagging words for raw training data')
    X = bow.extractfeatures(rawdata)
    X = np.squeeze(X)

    # Try cross-validating
    print('Cross-validating')
    results = []
    for train, test in cv:
        # Make a new BOW encoding
        #bow = highlevelfeatures.BagOfWords(**bow_options)
        #bow.fit([rawdata[i] for i in train])
        #X = bow.extractfeatures(rawdata)

        clf.fit(X[train, :], y[train])
        p = clf.predict_proba(X[test])
        res = sklearn.metrics.log_loss(y[test], p)
        print(res)
        results.append(res)

    print(results)
    print('CV average = {}'.format(np.mean(results)))

    # Train on the whole thing and save model for later
    #bow = highlevelfeatures.BagOfWords(**bow_options)
    #bow.fit(rawdata)
    #X = bow.extractfeatures(rawdata)

    clf.fit(X, y)

    print('Loading the raw test data')
    rawtest, names = utils.load_rawdata(settings.image_fnames)
    print('Bagging words for raw test data')
    X2 = bow.extractfeatures(rawtest)
    X2 = np.squeeze(X2)

    p = clf.predict_proba(X2)

    utils.write_predictions('submission_bow_initial.csv', p, names,
                            settings.classes)
Example #13
    def test_error_if_file_does_not_exist(self):
        """
        Ensure a ValueError is raised if the settings file doesn't exist
        """
        with self.assertRaises(ValueError):
            utils.Settings('fake_file')
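
Taken together, the tests in Examples #1-#5 and #13 pin down a fair amount of the Settings parser's behaviour. Below is a minimal sketch of a class that would satisfy them; it is an illustration inferred from the tests, not the project's actual utils.Settings, and names such as REQUIRED_KEYS are assumptions.

import json
import os


class Settings(object):
    """Minimal illustrative settings parser (sketch only)."""

    REQUIRED_KEYS = ['data_dir']  # assumed set of required settings

    def __init__(self, settings_source):
        # Accept either a path to a JSON file or a file-like object
        # (the tests pass both a filename and an io.StringIO)
        if hasattr(settings_source, 'read'):
            self.user_input = json.load(settings_source)
        else:
            if not os.path.isfile(settings_source):
                raise ValueError(
                    'Settings file does not exist: {}'.format(settings_source))
            with open(settings_source) as f:
                self.user_input = json.load(f)

        # Any missing required setting is an error (Example #3)
        for key in self.REQUIRED_KEYS:
            if key not in self.user_input:
                raise ValueError('Missing required setting: {}'.format(key))

        # Resolve data_dir to the first candidate directory that exists
        # (Example #2 lists a fake directory before the real one)
        self.data_dir = None
        for candidate in self.user_input['data_dir']:
            if os.path.isdir(candidate):
                self.data_dir = os.path.abspath(candidate)
                break
        if self.data_dir is None:
            raise ValueError('No candidate data directory exists.')

        # Default values (Example #4); classes would come from a constants
        # module in the real project and is omitted here
        self.random_seed = self.user_input.get('r_seed', 42)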