Example #1
def main():
    # this should be parsed from json, but hardcoded for now
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeLR.pkl'
    #out_fname = 'submission_imsizeLR.csv'
    
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeSVM.pkl'
    #out_fname = 'submission_imsizeSVM.csv'
    
    #attributes_settings = ['numpixels','aspectratio']
    #pkl_file = 'imsizeLR_alt.pkl'
    #out_fname = 'submission_imsizeLR_alt.csv'
    
    attributes_settings = ['width','height','mean','stderr','propwhite','propbool','propblack']
    pkl_file = 'imattr1.pkl'
    out_fname = 'submission_imattr1.csv'
    
    # Get global settings, providing file names of test data
    settings = utils.Settings('settings.json')
    
    # Make the wrapper function
    processing = highlevelfeatures.attributes_wrapper(attributes_settings)
    
    # Load the test data, with the processing applied
    X, names = utils.load_data(settings.image_fnames, processing=processing,
                               verbose=False)
    
    clf = joblib.load(pkl_file)
    p = clf.predict_proba(X)
    
    utils.write_predictions(out_fname, p, names, settings.classes)
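
For orientation: the processing callable built by highlevelfeatures.attributes_wrapper is what utils.load_data applies to every image to produce one row of the design matrix X. A minimal stand-in in plain NumPy (toy_attributes_processing is a hypothetical name, not the project's implementation) might look like this:

import numpy as np

# Hypothetical stand-in for the attributes processing: map one greyscale
# image array to a flat vector with one value per requested attribute.
def toy_attributes_processing(image):
    height, width = image.shape[:2]
    return np.array([
        float(width),           # 'width'
        float(height),          # 'height'
        image.mean(),           # 'mean'
        image.std(),            # rough stand-in for 'stderr'
        (image >= 250).mean(),  # rough stand-in for 'propwhite'
        (image <= 5).mean(),    # rough stand-in for 'propblack'
    ])

utils.load_data stacks one such row per test image into X and keeps names aligned with the rows, so the predicted probabilities can be written out per image.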
Example #2
def train_sklearn(run_settings, verbose=False, force=False):
    # unpack settings
    settings = run_settings['settings']

    # get all training file paths and class names
    image_fname_dict = settings.image_fnames

    # now being parsed from json
    augment_settings = run_settings["preprocessing"]

    # build processing function
    processing = augment.augmentation_wrapper(**augment_settings)

    # load data as design matrix, applying processing function
    X, y = utils.load_data(image_fname_dict,
                           classes=settings.classes,
                           processing=processing,
                           verbose=verbose)

    # make a label encoder and encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    if run_settings['classifier'] == 'dummy':
        # just a dummy uniform probability classifier for working purposes
        clf = sklearn.dummy.DummyClassifier(strategy='uniform')
    elif run_settings['classifier'] == 'logistic regression':
        clf = sklearn.linear_model.SGDClassifier(n_jobs=-1, loss='log')
    elif run_settings['classifier'] == 'random forest':
        forest = sklearn.ensemble.RandomForestClassifier(
            n_jobs=-1,
            n_estimators=100,
            #                                          verbose=1,
            max_depth=5)
        scaler = sklearn.preprocessing.StandardScaler()
        clf = sklearn.pipeline.Pipeline((("scl", scaler), ("clf", forest)))

    # only supporting stratified shuffle split for now
    cv = sklearn.cross_validation.StratifiedShuffleSplit(
        y, **run_settings['cross validation'])

    results = []
    for train, test in cv:
        clf.fit(X[train], y[train])
        p = clf.predict_proba(X[test])
        results.append(sklearn.metrics.log_loss(y[test], p))

    print("Average CV: {0} +/- {1}".format(np.mean(results),
                                           np.sqrt(np.var(results))))

    # save the model in the data directory, in a "models" subdirectory
    # with the name of the run_settings as the name of the pkl
    joblib.dump(clf, run_settings["pickle abspath"], compress=3)

    # store the raw log loss results back in the run settings json
    run_settings["crossval results"] = results
    # along with the other things we've added
    utils.save_run_settings(run_settings)
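
Note that run_settings['cross validation'] is unpacked directly into StratifiedShuffleSplit, so in the run-settings JSON it presumably holds keyword arguments of the old sklearn.cross_validation API. A sketch with assumed values (not taken from the project's actual settings files):

# Assumed contents of run_settings['cross validation']: keyword arguments
# forwarded to sklearn.cross_validation.StratifiedShuffleSplit(y, ...)
cross_validation_settings = {
    'n_iter': 5,         # number of re-shuffled train/test splits
    'test_size': 0.2,    # fraction of samples held out in each split
    'random_state': 42,  # fixed seed so the splits are reproducible
}

The same pattern applies to run_settings["preprocessing"], whose dict is unpacked as keyword arguments for augment.augmentation_wrapper.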
Example #3
def main():
    
    # this should be parsed from json, but hardcoded for now
    
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeLR.pkl'
    
    #attributes_settings = ['numpixels','aspectratio']
    #pkl_file = 'imsizeLR_alt.pkl'
    
    attributes_settings = ['width','height','mean','stderr','propwhite','propbool','propblack']
    pkl_file = 'imattr1.pkl'
    
    # Load the settings, providing file names of the training data
    settings = utils.Settings('settings.json')
    
    # Make the wrapper function
    processing = highlevelfeatures.BasicAttributes(attributes_settings)
    
    # Load the training data, with the processing applied
    X, y = utils.load_data(settings.image_fnames, classes=settings.classes,
                           processing=processing)
    
    # Encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)
    
    # just a dummy uniform probability classifier for working purposes
    #clf = sklearn.dummy.DummyClassifier(strategy='uniform')
    
    #clf = sklearn.linear_model.SGDClassifier(n_jobs=-1,
    #                                         loss='log')
    
    #clf = sklearn.ensemble.RandomForestClassifier(n_jobs=-1,
    #                                              n_estimators=100,
    #                                              verbose=1)
    
    # clf = sklearn.svm.SVC(probability=True)
    
    clf = sklearn.linear_model.LogisticRegression()
    
    cv = sklearn.cross_validation.StratifiedShuffleSplit(y)
    
    # Try cross-validating
    results = []
    for train, test in cv:
        clf.fit(X[train], y[train])
        p = clf.predict_proba(X[test])
        results.append(sklearn.metrics.log_loss(y[test], p))
    
    print(results)
    print('CV average = {}'.format(np.mean(results)))
    
    # Train on the whole thing and save model for later
    clf.fit(X, y)
    
    joblib.dump(clf, pkl_file, compress=3)
Example #4
def train_sklearn(run_settings, verbose=False, force=False):
    # unpack settings
    settings = run_settings['settings']

    # get all training file paths and class names
    image_fname_dict = settings.image_fnames

    # now being parsed from json
    augment_settings = run_settings["preprocessing"]

    # build processing function
    processing = augment.augmentation_wrapper(**augment_settings)
    
    # load data as design matrix, applying processing function
    X, y = utils.load_data(image_fname_dict, classes=settings.classes,
                           processing=processing, verbose=verbose)

    # make a label encoder and encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    if run_settings['classifier'] == 'dummy':
        # just a dummy uniform probability classifier for working purposes
        clf = sklearn.dummy.DummyClassifier(strategy='uniform')
    elif run_settings['classifier'] == 'logistic regression':
        clf = sklearn.linear_model.SGDClassifier(n_jobs=-1,
                                                 loss='log')
    elif run_settings['classifier'] == 'random forest':
        forest = sklearn.ensemble.RandomForestClassifier(
            n_jobs=-1,
            n_estimators=100,
            #verbose=1,
            max_depth=5)
        scaler = sklearn.preprocessing.StandardScaler()
        clf = sklearn.pipeline.Pipeline((("scl", scaler), ("clf", forest)))

    # only supporting stratified shuffle split for now
    cv = sklearn.cross_validation.StratifiedShuffleSplit(
        y, **run_settings['cross validation'])

    results = []
    for train, test in cv:
        clf.fit(X[train], y[train])
        p = clf.predict_proba(X[test])
        results.append(sklearn.metrics.log_loss(y[test], p))

    print("Average CV: {0} +/- {1}".format(np.mean(results),
                                    np.sqrt(np.var(results))))

    # save the model in the data directory, in a "models" subdirectory
    # with the name of the run_settings as the name of the pkl
    joblib.dump(clf, run_settings["pickle abspath"], compress=3)

    # store the raw log loss results back in the run settings json
    run_settings["crossval results"] = results
    # along with the other things we've added
    utils.save_run_settings(run_settings)
Example #5
    def test_loading_test_data_with_processing(self):
        """
        Check whether data and names are correct when loading test data
        with dummy zeros((10,10)) processing
        """
        data, names = utils.load_data(self.image_fname_dict,
                                      processing=self.processing)

        self.assertEqual(names, ['136177.jpg', '27712.jpg', '81949.jpg'])
        self.assertEqual(data.shape, (3, 100))
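
The fixtures self.image_fname_dict and self.processing are defined elsewhere in the test case. A minimal setUp sketch with an assumed layout (the class name and dict structure here are illustrative, not copied from the project):

import unittest

import numpy as np

class TestLoadTestData(unittest.TestCase):
    def setUp(self):
        # assumed layout: test-image file names keyed under 'test'
        self.image_fname_dict = {
            'test': ['136177.jpg', '27712.jpg', '81949.jpg'],
        }
        # dummy processing from the docstring: every image becomes a
        # 10x10 zero array, which load_data flattens to a 100-element row
        self.processing = lambda image: np.zeros((10, 10))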
Example #6
    def test_load_test_data_name_correspondence_is_correct(self):
        """
        Make sure the names match up to the correct row in the data for test
        data
        """
        single_val_processing = lambda images: images.min()
        data, names = utils.load_data(self.image_fname_dict,
                                      processing=single_val_processing)

        self.assertEqual(names, ['136177.jpg', '27712.jpg', '81949.jpg'])
        self.assertIs(int(data[0][0]), 63)
        self.assertIs(int(data[1][0]), 5)
        self.assertIs(int(data[2][0]), 46)
Example #7
    def test_loading_train_data_with_processing(self):
        """
        Ensure load_data with training data returns the correct data
        """
        data, labels = utils.load_data(self.image_fname_dict,
                                       classes=self.classes,
                                       processing=self.processing)

        self.assertIs(len(labels), 10)
        self.assertEqual(['acantharia_protist'] * 3 + \
                         ['acantharia_protist_halo'] * 2 + \
                         ['artifacts_edge'] * 4 + \
                         ['fecal_pellet'], list(labels))
        self.assertEqual(data.shape, (10, 100))
Example #8
    def test_loading_train_with_augmentation(self):
        """
        Use a custom processing function that returns multiple images
        and test to make sure the image and data arrays still match.
        """
        # will return a list of 4 zero arrays
        dummy_augment = lambda image: [np.zeros((10,10)) for i in range(4)]

        data, labels = utils.load_data(self.image_fname_dict,
                                       classes=self.classes,
                                       processing=dummy_augment)

        self.assertIs(len(labels), 40)
        self.assertEqual(['acantharia_protist'] * 12 + \
                         ['acantharia_protist_halo'] * 8 + \
                         ['artifacts_edge'] * 16 + \
                         ['fecal_pellet']*4, list(labels))
        self.assertEqual(data.shape, (40, 100))
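
The shape assertions above imply the augmentation contract: when the processing function returns a list of arrays, each array becomes its own flattened row and the label is repeated once per row. A self-contained sketch of that stacking behaviour (toy code, not the project's utils.load_data):

import numpy as np

def toy_load_with_augmentation(images, labels, processing):
    # each augmented copy contributes one flattened row and one label copy
    rows, row_labels = [], []
    for image, label in zip(images, labels):
        for augmented in processing(image):
            rows.append(augmented.ravel())
            row_labels.append(label)
    return np.vstack(rows), row_labels

dummy_augment = lambda image: [np.zeros((10, 10)) for i in range(4)]
X, y = toy_load_with_augmentation([np.zeros((20, 20))] * 10,
                                  ['acantharia_protist'] * 10,
                                  dummy_augment)
assert X.shape == (40, 100) and len(y) == 40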
Example #9
    def test_loading_train_with_augmentation(self):
        """
        Use a custom processing function that returns multiple images
        and test to make sure the image and data arrays still match.
        """
        # will return a list of 4 zero arrays
        dummy_augment = lambda image: [np.zeros((10, 10)) for i in range(4)]

        data, labels = utils.load_data(self.image_fname_dict,
                                       classes=self.classes,
                                       processing=dummy_augment)

        self.assertIs(len(labels), 40)
        self.assertEqual(['acantharia_protist'] * 12 + \
                         ['acantharia_protist_halo'] * 8 + \
                         ['artifacts_edge'] * 16 + \
                         ['fecal_pellet']*4, list(labels))
        self.assertEqual(data.shape, (40, 100))
Example #10
    def test_load_train_data_name_correspondence_is_correct(self):
        """
        Ensure the correspondence of labels to data is maintained
        on load
        """
        single_val_processing = lambda images: images.min()

        data, labels = utils.load_data(self.image_fname_dict,
                                       classes=self.classes,
                                       processing=single_val_processing)

        self.assertIs(len(labels), 10)
        self.assertEqual(['acantharia_protist'] * 3 + \
                         ['acantharia_protist_halo'] * 2 + \
                         ['artifacts_edge'] * 4 + \
                         ['fecal_pellet'], list(labels))
        self.assertEqual(data.shape, (10, 1))

        self.assertEqual([[int(x[0])] for x in data], [[73], [65], [51], [35],
                                                [37], [0], [202], [0],
                                                [0], [158]])
Example #11
    def test_load_train_data_name_correspondence_is_correct(self):
        """
        Ensure the correspondence of labels to data is maintained
        on load
        """
        single_val_processing = lambda images: images.min()

        data, labels = utils.load_data(self.image_fname_dict,
                                       classes=self.classes,
                                       processing=single_val_processing)

        self.assertIs(len(labels), 10)
        self.assertEqual(['acantharia_protist'] * 3 + \
                         ['acantharia_protist_halo'] * 2 + \
                         ['artifacts_edge'] * 4 + \
                         ['fecal_pellet'], list(labels))
        self.assertEqual(data.shape, (10, 1))

        self.assertEqual(
            [[int(x[0])] for x in data],
            [[73], [65], [51], [35], [37], [0], [202], [0], [0], [158]])
Example #12
def test_sklearn(run_settings, verbose=False):
    # some more boilerplate here
    # unpack settings
    settings = run_settings['settings']

    # get all training file paths and class names
    image_fname_dict = settings.image_fnames

    # parsed from the json; the preprocessing settings are a dict
    augment_settings = run_settings["preprocessing"]
    processing = augment.augmentation_wrapper(**augment_settings)

    X, names = utils.load_data(image_fname_dict, processing=processing,
                               verbose=verbose)
    
    # load the model from where it's _expected_ to be saved
    clf = joblib.load(run_settings['pickle abspath'])
    p = clf.predict_proba(X)
   
    utils.write_predictions(run_settings['submissions abspath'], p,
                            names, settings.classes)
Example #13
def main():
    # this should be parsed from json, but hardcoded for now
    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeLR.pkl'
    #out_fname = 'submission_imsizeLR.csv'

    #attributes_settings = ['width','height']
    #pkl_file = 'imsizeSVM.pkl'
    #out_fname = 'submission_imsizeSVM.csv'

    #attributes_settings = ['numpixels','aspectratio']
    #pkl_file = 'imsizeLR_alt.pkl'
    #out_fname = 'submission_imsizeLR_alt.csv'

    attributes_settings = [
        'width', 'height', 'mean', 'stderr', 'propwhite', 'propbool',
        'propblack'
    ]
    pkl_file = 'imattr1.pkl'
    out_fname = 'submission_imattr1.csv'

    # Get global settings, providing file names of test data
    settings = utils.Settings('settings.json')

    # Make the wrapper function
    processing = highlevelfeatures.attributes_wrapper(attributes_settings)

    # Load the test data, with the processing applied
    X, names = utils.load_data(settings.image_fnames,
                               processing=processing,
                               verbose=False)

    clf = joblib.load(pkl_file)
    p = clf.predict_proba(X)

    utils.write_predictions(out_fname, p, names, settings.classes)