Example #1
def sample_bottom_up(infile='presents_revorder.csv', outfile='sub_bottomup_1.csv', write=True, check=True):
    """
    Replicate the sample bottom-up approach
    """
    sleigh = classes.LayerSleigh()
    layer = classes.Layer()

    presents_file = os.path.join('data', infile)
    outfile = os.path.join('data', outfile)
    logger.info("Reading and placing presents")
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            if not layer.place_present(present):
                # Can't place the present on the layer, so close the layer and start a new one
                sleigh.add_layer(layer)
                layer = classes.Layer(z=sleigh.max_z + 1)
                res = layer.place_present(present)
        # Add the final layer
        sleigh.add_layer(layer)

    if check and not sleigh.check_all():
        logger.error('There is an error in the Sleigh')
        return sleigh

    if write:
        sleigh.write_to_file(outfile)
    return sleigh
Example #2
    def sendEmails(self, receiverEmails: List[str], msg: str, subject: str) -> None:
        try:
            smtpServer = "smtp.gmail.com"
            senderEmail = environ.get('SENDEREMAIL')
            password = environ.get('EMAILPWD')

            if senderEmail and password:
                with smtplib.SMTP_SSL(smtpServer, 465) as server:
                    server.login(senderEmail, password)

                    for email in receiverEmails:
                        try:
                            emailMsg = EmailMessage()
                            emailMsg.set_content(msg)
                            emailMsg['Subject'] = subject
                            emailMsg['From'] = senderEmail
                            emailMsg['To'] = email
                            server.send_message(emailMsg)
                            info("{} Email sent to {}".format(subject, email))
                        except Exception as e:
                            # log error
                            error(str(e))
            else:
                raise Exception("Missing sender email and/or password")
        except Exception as e:
            error(str(e))
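A minimal usage sketch for the method above (not from the original source): it assumes the SENDEREMAIL and EMAILPWD environment variables are exported before the call, and that the method lives on a finder/notifier class such as the AppointmentFinder used later in this listing.

from os import environ

# The sendEmails method above reads its credentials from the environment, so these must be
# set before calling it (placeholder values shown here).
environ['SENDEREMAIL'] = 'alerts@example.com'      # placeholder sender address
environ['EMAILPWD'] = 'app-specific-password'      # placeholder Gmail app password

# finder = AppointmentFinder()   # assumed host class; see the __main__ example later in this listing
# finder.sendEmails(['recipient@example.com'], msg='Appointments open', subject='CVS alert')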
Example #3
def delete_db():  #wipe db
    try:
        tableNameList = [
            "User", "Playlists", "Tracks", "Albums", "Artists", "TrackAlbum",
            "TrackArtists", "TrackPlaylists", "AlbumArtists"
        ]  #list of tables names
        logger.info("DROPPING DATABASE; User Call")
        try:
            for i in tableNameList:
                get_db().execute(f"DROP TABLE {i}")  #drop tables
                logger.debug(
                    f"DROPPING {i}; REASON: User Call [Delete Database]")
        except sqlite3.OperationalError as e:
            logger.warning("RuntimeError; " + e.__str__() +
                           "; User calling DeleteDB whilst no tables; Ignore")
    except RuntimeError as e:  #except error if its out of request context
        logger.warning("RuntimeError; " + e.__str__() +
                       "; POSSIBLE REASON: FIRST TIME STARTUP")
        logger.warning("Re calling delete_db out of application context")
        db = sqlite3.connect(DATABASE)  # connect to db out of reqauest context
        try:
            for i in tableNameList:
                db.execute(f"DROP TABLE {i}")
                logger.debug(
                    f"DROPPING {i}; REASON: User Call [Delete Database]")
            db.commit()
        except sqlite3.OperationalError as e:
            logger.debug(e.__str__())
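As an aside, SQLite also supports DROP TABLE IF EXISTS, which avoids relying on the OperationalError handler when some tables are missing; a minimal sketch under that assumption, reusing the table list from the function above:

import sqlite3

# Hypothetical alternative sketch, not from the original: DROP TABLE IF EXISTS lets SQLite
# skip missing tables, so no sqlite3.OperationalError handling is needed for that case.
def drop_all_tables(database):
    tables = ["User", "Playlists", "Tracks", "Albums", "Artists",
              "TrackAlbum", "TrackArtists", "TrackPlaylists", "AlbumArtists"]
    db = sqlite3.connect(database)
    for table in tables:
        db.execute(f"DROP TABLE IF EXISTS {table}")
    db.commit()
    db.close()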
Example #4
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size=0.2, test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500, 'verbose': 1}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)

    """
Example #5
0
    def fit(self, X, y):
        self.ridge_estimator_ = self._get_ridge_model()
        self.rf_estimator_ = self._get_rf_model()
        logger.info("Fitting Ridge model")
        self.ridge_estimator_.fit(X, y)
        ridge_y = self.ridge_estimator_.predict(X)
        logger.info("Fitting RF model")
        self.rf_estimator_.fit(ridge_y, y)
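The fit above chains two models: a Ridge regression fit on X, then a random forest fit on the Ridge predictions. The matching predict is not shown here; a minimal sketch of what it would presumably look like, given that design (an assumption, not the author's code):

    # Hypothetical sketch: push X through the same two-stage pipeline that fit() builds above.
    def predict(self, X):
        ridge_y = self.ridge_estimator_.predict(X)    # first stage: Ridge predictions
        return self.rf_estimator_.predict(ridge_y)    # second stage: RF trained on those predictions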
Example #6
    def predict_test(self, average_responses):
        logger.info("Calculating predictions for test set")

        # Now calculate the test set responses
        test_predictors = self.build_test_predictors()
        test_clusters = self.estimator.predict(test_predictors)
        test_averages = average_responses[test_clusters]
        return test_averages
Example #7
    def build_features(self, files, training=True):
        """
        Utility method that loops over every image and applies self.process_image
        Returns a numpy array of dimensions (n_observations, n_features)
        """
        logger.info("Building predictors")
        predictors = self.do_for_each_image(files, self.process_image, self.n_features, training)
        return predictors
Example #8
    def fit(self, X, y):
        self.ridge_estimator_ = self._get_ridge_model()
        self.rf_estimator_ = self._get_rf_model()
        logger.info("Fitting Ridge model")
        self.ridge_estimator_.fit(X, y)
        ridge_y = self.ridge_estimator_.predict(X)
        logger.info("Fitting RF model")
        self.rf_estimator_.fit(ridge_y, y)
Example #9
    def fit_estimator(self):
        # Fit a k-means clustering estimator
        # We use 37 centers initially because there are 37 classes
        # Seems like the sample submission used 6 clusters
        start_time = time.time()
        logger.info("Fitting kmeans estimator")
        self.estimator = KMeans(init='k-means++', n_clusters=37)
        self.estimator.fit(self.predictors)
        logger.info("Finished fitting model in {}".format(time.time() - start_time))
Example #10
    def _transform(training, features, file_list):
        filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
        res = []

        for i, f in enumerate(file_list):
            if i % 5000 == 0:
                logger.info("Processing image {} of {}".format(i, len(file_list)))
            img = RawImage(os.path.join(filepath, f))
            feats = [func(img) for func in features]  # don't overwrite the list of feature functions
            res.append(np.hstack(feats))
        return res
Example #11
    def get_cluster_averages(self):
        # Get the average response for each cluster in the training set
        # This is a 37 x 37 array, one row for each cluster, and one column for each class
        logger.info("Calculating cluster averages")
        average_responses = np.zeros((37, 37))
        for cluster in range(37):
            idx = self.estimator.labels_ == cluster
            responses = self.train_y[idx, :]
            average_responses[cluster] = responses.mean(axis=0)
        logger.info("Finished calculating cluster averages")
        return average_responses
Example #12
def _parallel_sampler(file_list, steps, step_size, training):
    filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
    rows = []
    counter = 0
    for i, f in enumerate(file_list):
        counter += 1
        if counter % 5000 == 0:
            logger.info("Processed {} images".format(counter))
        image = RawImage(os.path.join(filepath, f))
        rows.append(image.grid_sample(step_size, steps).flatten().astype('float64') / 255)
    return np.vstack(rows)
Example #13
    def _transform(self, file_list):
        filepath = TRAIN_IMAGE_PATH if self.training else TEST_IMAGE_PATH
        out = np.zeros((len(file_list), self.scaled_size, self.scaled_size, 3))
        factor = self.scaled_size / self.crop_size

        for i, f in enumerate(file_list):
            if i % 5000 == 0:
                logger.info("Processing image {} of {}".format(i, len(file_list)))
            img = RawImage(os.path.join(filepath, f))
            img.crop(self.crop_size).rescale(factor)
            out[i] = img.data * 255
        return out
Example #14
    def _transform(training, features, file_list):
        filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
        res = []

        for i, f in enumerate(file_list):
            if i % 5000 == 0:
                logger.info("Processing image {} of {}".format(
                    i, len(file_list)))
            img = RawImage(os.path.join(filepath, f))
            feats = [func(img) for func in features]  # don't overwrite the list of feature functions
            res.append(np.hstack(feats))
        return res
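For context, `features` here is evidently a list of callables, each mapping a RawImage to a 1-D feature array that gets stacked per image; a tiny illustrative sketch (the feature functions are assumptions, and RawImage is assumed to expose a .data ndarray as in the crop/scale example above):

import numpy as np

# Hypothetical feature functions for illustration only.
def mean_per_channel(img):
    return np.asarray(img.data).mean(axis=(0, 1))   # one value per colour channel

def std_per_channel(img):
    return np.asarray(img.data).std(axis=(0, 1))

feature_funcs = [mean_per_channel, std_per_channel]
# rows = _transform(training=True, features=feature_funcs, file_list=train_files)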
Example #15
    def run(self, check=True, write=True):
        presents_file = os.path.join('data', self.infile)
        outfile = os.path.join('data', self.outfile)
        logger.info("Reading and placing presents")
        counter = 0
        with open(presents_file, 'rb') as presents:
            presents.readline()  # skip header
            read = csv.reader(presents)
            for row in read:
                present = classes.Present(*row)
                position = self.sleigh.place_present(present)
                counter += 1
                if counter % self.log_at == 0:
                    logger.info("Placed {} presents".format(counter))
                    logger.info("Current min z is {}".format(np.min(self.sleigh.z_map)))

        logger.info("Finished placing presents")

        if write:
            self.write()

        if check:
            self.check()

        return self
Example #16
    def perform_cross_validation(self, *args, **kwargs):
        """
        Performs cross validation using the main estimator.  In some cases, when we don't need to search
        across a grid of hyperparameters, we may want to perform cross validation only.
        """
        start_time = time.time()
        if self.cv_sample is not None:
            logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(self.cv_folds, self.cv_sample))
            self.cv_x,\
            self.cv_x_test,\
            self.cv_y,\
            self.cv_y_test = cross_validation.train_test_split(self.train_x, self.train_y, train_size=self.cv_sample)
        else:
            logger.info("Performing {}-fold cross validation with full training set".format(self.cv_folds))
            self.cv_x = self.train_x
            self.cv_y = self.train_y
        self.cv_iterator = self.cv_class(self.cv_x.shape[0], n_folds=self.cv_folds)
        params = {
            'cv': self.cv_iterator,
            'scoring': rmse_scorer,
            'verbose': 2,
            'n_jobs': self.n_jobs
        }
        params.update(kwargs)
        # Make sure to not parallelize the estimator
        if 'n_jobs' in self.estimator.get_params().keys():
            self.estimator.set_params(n_jobs=1)
        self.cv_scores = cross_validation.cross_val_score(self.estimator,
                                                          self.cv_x,
                                                          self.cv_y,
                                                          *args, **params)
        logger.info("Cross validation completed in {}.  Scores:".format(time.time() - start_time))
        logger.info("{}".format(self.cv_scores))
Example #17
    def do_for_each_image(self, files, func, n_features, training):
        """
        Function that iterates over a list of files, applying func to the image indicated by each filename.
        Returns an (n_samples, n_features) ndarray
        """
        dims = (N_TRAIN if training else N_TEST, n_features)
        predictors = np.zeros(dims)
        counter = 0
        for row, f in enumerate(files):
            filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
            image = RawImage(os.path.join(filepath, f))
            predictors[row] = func(image)
            counter += 1
            if counter % 1000 == 0:
                logger.info("Processed {} images".format(counter))
        return predictors
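A usage sketch for the loop above (illustrative only): any callable mapping a RawImage to a fixed-length feature vector can be passed as func, with n_features matching its output length.

# Hypothetical usage sketch; mean_intensity is an illustrative feature function and `self`
# stands for whichever class defines do_for_each_image (assumes RawImage exposes .data).
def mean_intensity(image):
    return image.data.mean()          # a single scalar feature per image

# predictors = self.do_for_each_image(train_files, mean_intensity, n_features=1, training=True)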
Example #18
    def build_test_predictors(self):
        """
        Builds the test predictors

        Returns:
            None
        """
        if self.test_x is None:
            test_files = sorted(os.listdir(TEST_IMAGE_PATH))
            if os.path.exists(self.test_predictors_file):
                logger.info("Test predictors already exists, loading from file {}".format(self.test_predictors_file))
                res = np.load(self.test_predictors_file)
            else:
                res = self.build_features(test_files, False)
                logger.info("Caching test predictors to {}".format(self.test_predictors_file))
                np.save(self.test_predictors_file, res)
            self.test_x = res
Example #19
def multi_request(request):  #backend endpoint for js
    logger.info(f"Multi Request Received; {request.__str__()}")
    qVar1 = request.args['qVar1']
    qVar2 = request.args['qVar2']
    table1 = request.args['table1']
    table2 = request.args['table2']
    logger.debug(", ".join([qVar1, qVar2, table1, table2]))
    searchVar = f'{table1}.{qVar1}, {table2}.{qVar2}'
    tableName = f"{table1} INNER JOIN {table2}"
    baseQ = f"SELECT * FROM {tableName} ON {table1}.{qVar1} == {table2}.{qVar2}"
    query = baseQ

    logger.info(query)
    queryOut = get_db().execute(query).fetchall()

    output = {}
    output['results'] = queryOut
    return jsonify(output)
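One caveat worth noting: the endpoint interpolates user-supplied table and column names directly into SQL, and identifiers cannot be bound as query parameters. A minimal hardening sketch one might add (not part of the original; the whitelist contents are assumptions drawn from the table list elsewhere in this listing):

# Hypothetical sketch: reject identifiers that are not in a known whitelist before they
# are interpolated into the query string.
ALLOWED_TABLES = {"User", "Playlists", "Tracks", "Albums", "Artists"}   # assumed subset

def validate_identifier(name, allowed):
    if name not in allowed:
        raise ValueError("Unexpected identifier: {}".format(name))
    return name

# table1 = validate_identifier(request.args['table1'], ALLOWED_TABLES)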
Example #20
    def build_train_predictors(self):
        """
        Builds the training predictors.  Once the predictors are built, they are cached to a file.
        If the file already exists, the predictors are loaded from file.
        Couldn't use the @cache_to_file decorator because the decorator factory doesn't have access to self at compilation

        Returns:
            None
        """
        if self.train_x is None:
            file_list = train_solutions.filenames
            if os.path.exists(self.train_predictors_file):
                logger.info("Training predictors already exists, loading from file {}".format(self.train_predictors_file))
                res = np.load(self.train_predictors_file)
            else:
                res = self.build_features(file_list, True)
                logger.info("Caching training predictors to {}".format(self.train_predictors_file))
                np.save(self.train_predictors_file, res)
            self.train_x = res
Example #21
    def run(self, method, *args, **kwargs):
        """
        Primary entry point for executing tasks with the model

        Arguments:
        ----------
        method: string
            Must be one of 'grid_search', 'cv', 'train', or 'predict'

        *args:
            Additional arguments to be passed to the job

        **kwargs:
            Additional arguments to be passed to the job

        """

        jobs = {'grid_search', 'cv', 'train', 'predict'}

        if method not in jobs:
            raise RuntimeError("{} is not a valid job".format(method))

        start_time = time.time()
        self.build_train_predictors()
        res = None

        if method == 'grid_search':
            logger.info("Performing grid search")
            res = self.perform_grid_search_and_cv(*args, **kwargs)
        elif method == 'cv':
            logger.info("Performing cross validation")
            res = self.perform_cross_validation(*args, **kwargs)
        elif method == 'train':
            logger.info("Performing training")
            res = self.train(*args, **kwargs)
        elif method == 'predict':
            logger.info("Performing prediction")
            res = self.predict(*args, **kwargs)

        end_time = time.time()
        logger.info("Model completed in {}".format(end_time - start_time))
        return res
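A brief usage sketch for the dispatcher above (illustrative; `model` stands for an instance of whichever class defines run()): the method string selects the job, and any extra arguments are forwarded to it.

# Hypothetical usage sketch -- the job names come from the docstring above.
# result = model.run('cv')             # cross validate the configured estimator
# result = model.run('grid_search')    # or: 'train', 'predict'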
Example #22
    def transform(self, X=None):
        if self.training:
            files = train_solutions.filenames
        else:
            files = sorted(os.listdir(TEST_IMAGE_PATH))

        if os.path.exists(self.result_path) and not self.force_rerun:
            logger.info("File already exists.  Loading from {}".format(self.result_path))
            if self.memmap:
                return joblib.load(self.result_path, mmap_mode='r+')
            else:
                return joblib.load(self.result_path)
        else:
            res = np.vstack(Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_parallel_crop_scale)(self, files) for files in chunks(files, self.n_jobs)
            ))
            logger.info("Saving results to file {}".format(self.result_path))
            joblib.dump(res, self.result_path)
            if self.memmap:
                res = joblib.load(self.result_path, mmap_mode='r+')
            return res
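The chunks helper passed to Parallel above is not included in this listing; a minimal sketch under the assumption that it splits the file list into n roughly equal consecutive slices, one per worker:

# Hypothetical sketch of the `chunks` helper referenced above (assumed behaviour).
def chunks(items, n):
    n = max(1, n)
    size = (len(items) + n - 1) // n          # ceiling division
    return [items[i:i + size] for i in range(0, len(items), size)]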
Example #23
    def run(self, check=True, write=True):
        layer = self.layer_class()

        presents_file = os.path.join('data', self.infile)
        outfile = os.path.join('data', self.outfile)
        logger.info("Reading and placing presents")
        counter = 0
        with open(presents_file, 'rb') as presents:
            presents.readline()  # skip header
            read = csv.reader(presents)
            for row in read:
                present = classes.Present(*row)
                layer = self.process_present(present, layer)
                counter += 1
                if counter % self.log_at == 0:
                    logger.info("Placed {} presents".format(counter))

            self.process_last_layer(layer)

        logger.info("Finished placing presents")

        if write:
            self.write()

        if check:
            self.check()
        return self
Example #24
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x,
                                                        train_y,
                                                        train_size=0.2,
                                                        test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500,
        'verbose': 1
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
    """
Example #25
0
def sample_top_down(infile='presents_revorder.csv', outfile='sub_topdown_1.csv', write=True, check=True):
    """
    Replicate the MatLab top-down approach

    Strategy is basically the same as bottom_up, but before closing each layer,
    align all of the presents to the top of the layer.

    Actually this strategy is not quite the same, since it reads the present in a different order.
    Result is a slightly higher score than the benchmark
    """
    sleigh = classes.LayerSleigh()
    layer = classes.Layer()

    presents_file = os.path.join('data', infile)
    outfile = os.path.join('data', outfile)
    logger.info("Reading and placing presents")
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            if not layer.place_present(present):
                # Can't place the present on the layer, so close the layer and start a new one
                # Before closing, re-align all of the presents in the layer to the top of the layer
                align_presents_to_layer_top(layer)
                sleigh.add_layer(layer)
                layer = classes.Layer(z=sleigh.max_z + 1)
                res = layer.place_present(present)
        # Add the final layer
        align_presents_to_layer_top(layer)
        sleigh.add_layer(layer)

    if check and not sleigh.check_all():
        logger.error('There is an error in the Sleigh')
        return sleigh

    if write:
        sleigh.write_to_file(outfile)
    return sleigh
Example #26
    def train(self, *args, **kwargs):
        start_time = time.time()
        logger.info("Fitting estimator")
        if 'n_jobs' in self.estimator.get_params().keys():
            self.estimator.set_params(n_jobs=self.n_jobs)
        self.estimator.fit(self.train_x, self.train_y)
        logger.info("Finished fitting model in {}".format(time.time() - start_time))

        # Get an in sample RMSE
        logger.info("Calculating in-sample RMSE")
        self.training_predict = self.estimator.predict(self.train_x)
        self.rmse = rmse(self.training_predict, self.train_y)
        return self.estimator
Example #27
    def fit(self, X, y=None):
        start_time = time.time()
        logger.info("Fitting estimator")
        self.estimator_ = self.get_estimator()
        if 'n_jobs' in self.estimator_.get_params().keys():
            self.estimator_.set_params(n_jobs=self.n_jobs)

        self.estimator_.fit(X, y)
        logger.info("Finished fitting model in {}".format(time.time() - start_time))

        # Get an in sample RMSE
        logger.info("Calculating in-sample RMSE")
        self.training_predict = self.estimator_.predict(X)
        self.rmse = rmse(self.training_predict, y)
Example #28
    def cross_validation(self, X, y, n_folds=2, cv_class=None, sample=None, parallel_estimator=False):
        cls = cv_class or cross_validation.KFold

        start_time = time.time()
        if sample is not None:
            logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(n_folds, sample))
            self.cv_x,\
            _,\
            self.cv_y,\
            _ = cross_validation.train_test_split(X, y, train_size=sample)
            del _
        else:
            logger.info("Performing {}-fold cross validation with full training set".format(n_folds))
            self.cv_x = X
            self.cv_y = y
        self.cv_iterator = cls(self.cv_x.shape[0], n_folds=n_folds)

        params = {
            'cv': self.cv_iterator,
            'scoring': rmse_scorer,
            'verbose': 3,
            'n_jobs': self.n_jobs
        }

        estimator = self.get_estimator()
        # Make sure to not parallelize the estimator
        if 'n_jobs' in estimator.get_params().keys():
            if parallel_estimator:
                estimator.set_params(n_jobs=self.n_jobs)
                params['n_jobs'] = 1
            else:
                estimator.set_params(n_jobs=1)

        self.cv_scores = cross_validation.cross_val_score(estimator, self.cv_x, self.cv_y, **params)
        logger.info("Cross validation completed in {}.  Scores:".format(time.time() - start_time))
        logger.info("{}".format(self.cv_scores))
Example #29
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(
            training=False,
            result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.
            format(n_centroids),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
Example #30
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
Example #31
"""
Last quoted plan benchmark
"""
import classes
from classes import logger

train = classes.get_train_data()
actuals = classes.get_actual_plan(train)

scores = []
# Score seems to be a bit high on training, about .547-.548
# Leaderboard score is 0.53793, so seems like 0.01 difference, which is pretty substantial in this competition
for n in range(5):
    truncated = classes.truncate(train)
    prediction = classes.get_last_observed_plan(truncated)
    score = classes.score_df(prediction, actuals)
    scores.append(score)
    logger.info("Run {}, score: {}".format(n+1, score))

test = classes.get_test_data()
pred = classes.get_last_observed_plan(test)

classes.make_submission(pred, 'benchmark_001.csv')
Example #32
from classes.logger import error, info

if __name__ == "__main__":
    try:
        finder = AppointmentFinder()
        contacts = finder.getContactsByState()
        statesUpdated = finder.getUpdatedTsByState()

        for state in contacts:
            cvsData = finder.getApptDataForStateCvs(state)
            aptsAvailable = finder.openStateApts(cvsData)
            parsedTs = finder.parseCvsTs(cvsData)
            tsIsOld = True

            if state in statesUpdated:
                curTs = statesUpdated[state]
                tsIsOld = parsedTs > curTs

                if tsIsOld:
                    finder.updateStateTs(state, parsedTs)
                    statesUpdated[state] = parsedTs
                    info("{} appointment data timestamp updated to {}".format(
                        state.upper(), parsedTs))

            if tsIsOld and aptsAvailable:
                emails = contacts[state]
                finder.sendApptAvailableEmailCvs(emails, state)

        info("Appointment finder run complete!")
    except Exception as e:
        error(str(e))
Example #33
    def perform_grid_search_and_cv(self, *args, **kwargs):
        """
        Performs cross validation and grid search to identify optimal parameters and to score the estimator
        The grid search space is defined by self.grid_search_parameters.

        If grid_search_sample is defined, then a downsample of the full train_x is used to perform the grid search

        Cross validation is parallelized at the CV level, not the estimator level, because not all estimators
        can be parallelized.

        Parameters:
        ----------
        refit: boolean, default True
            If true, the grid search estimator is refit on the grid search set, and then is used to calculate a score
            on the holdout set.

            Really only useful if grid_search_sample < 1, otherwise the calculated score will basically be an in-sample
            error (since the training and the testing were the same dataset)

        grid_search_parameters: set on model instantiation
            The grid search parameters -- should set this when you instantiate the Model, not when you call run('grid_search')
        """
        if self.grid_search_parameters is not None:
            logger.info("Performing grid search")
            start_time = time.time()
            params = {
                'scoring': rmse_scorer,
                'verbose': 3,
                'refit':  True,
                'n_jobs': self.n_jobs,
                'cv': 2
            }
            params.update(kwargs)
            # Make sure to not parallelize the estimator if it can be parallelized
            if 'n_jobs' in self.estimator.get_params().keys():
                self.estimator.set_params(n_jobs=1)

            self.grid_search_estimator = self.grid_search_class(self.estimator,
                                                                self.grid_search_parameters,
                                                                *args, **params)
            if self.grid_search_sample is not None:
                logger.info("Using {} of the train set for grid search".format(self.grid_search_sample))
                # Downsample if a sampling rate is defined
                self.grid_search_x, \
                self.grid_search_x_test, \
                self.grid_search_y, \
                self.grid_search_y_test = cross_validation.train_test_split(self.train_x,
                                                                            self.train_y,
                                                                            train_size=self.grid_search_sample)
            else:
                logger.info("Using full train set for the grid search")
                # Otherwise use the full set
                self.grid_search_x = self.grid_search_x_test = self.train_x
                self.grid_search_y = self.grid_search_y_test = self.train_y
            self.grid_search_estimator.fit(self.grid_search_x, self.grid_search_y)
            logger.info("Found best parameters:")
            logger.info(self.grid_search_estimator.best_params_)

            if params['refit']:
                logger.info("Predicting on holdout set")
                pred = self.grid_search_estimator.predict(self.grid_search_x_test)
                res = rmse(self.grid_search_y_test, pred)
                logger.info("RMSE on holdout set: {}".format(res))

            logger.info("Grid search completed in {}".format(time.time() - start_time))
Example #34
    def perform_cross_validation(self, *args, **kwargs):
        start_time = time.time()
        if self.cv_sample is not None:
            logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(self.cv_folds, self.cv_sample))
            self.cv_x,\
            self.cv_x_test,\
            self.cv_y,\
            self.cv_y_test = cross_validation.train_test_split(self.train_x, self.train_y, train_size=self.cv_sample)
        else:
            logger.info("Performing {}-fold cross validation with full training set".format(self.cv_folds))
            self.cv_x = self.train_x
            self.cv_y = self.train_y

        self.cv_iterator = self.cv_class(self.cv_x.shape[0], n_folds=self.cv_folds)

        params = {
            'cv': self.cv_iterator,
            'scoring': rmse_scorer,
            'verbose': 2,
            'n_jobs': self.n_jobs
        }
        params.update(kwargs)

        # Gotta roll our own cross validation
        # Cross validation will look like this:
        # For each fold:
        #   train estimator
        #   Predict estimator
        #   Store prediction
        #   Move onto next estimator

        overall_scores = []
        detailed_scores = [{} for _ in range(self.cv_folds)]  # one independent dict per fold
        for i, idx in enumerate(self.cv_iterator):
            logger.debug("Working on fold {}".format(i + 1))
            train = idx[0]
            test = idx[1]

            # Get the data
            # The actual cross val method uses safe_mask to index the arrays.  This is only required if
            # we might be handling sparse matrices
            this_train_x = self.cv_x[train]
            this_train_y = self.cv_y[train]
            this_test_x = self.cv_x[test]
            this_test_y = self.cv_y[test]

            logger.debug("Fold {} training X and Y shape: {}, {}".format(i + 1, this_train_x.shape, this_train_y.shape))
            logger.debug("Fold {} test X and Y shape: {}, {}".format(i + 1, this_test_x.shape, this_test_y.shape))

            test_preds = np.zeros(this_test_y.shape)
            train_preds = np.zeros(this_train_y.shape)

            # Should be able to refactor out this inner loop
            for cls in range(1, 12):
                cols = train_solutions.class_map[cls]

                logger.info("Performing CV on class {}".format(cls))

                # Clone the estimator
                # Need to do this for each fold
                estimator = clone(self.estimator[cls])

                existing_test_preds = np.any(test_preds, axis=0)
                existing_train_preds = np.any(train_preds, axis=0)
                this_x = np.hstack((this_train_x, train_preds[:, existing_train_preds]))
                test_x = np.hstack((this_test_x, test_preds[:, existing_test_preds]))
                this_y = this_train_y[:, cols]
                test_y = this_test_y[:, cols]

                logger.debug("Train X shape: {}".format(this_x.shape))
                logger.debug("Train Y shape: {}".format(this_y.shape))
                logger.debug("Test X shape: {}".format(test_x.shape))

                # Parallelize at the estimator level
                if 'n_jobs' in estimator.get_params().keys():
                    estimator.set_params(n_jobs=self.n_jobs)
                estimator.fit(this_x, this_y)

                train_pred = estimator.predict(this_x)
                test_pred = estimator.predict(test_x)

                # Scale things back
                if self.scaled:
                    # this does not work correctly because cv_y is already split
                    scale_factors = train_solutions.get_sum_for_class(cls)

                    assert train.shape[0] == scale_factors[0]
                    assert test.shape[0] == scale_factors[0]

                    train_scale_factors = scale_factors[train]
                    test_scale_factors = scale_factors[test]

                    assert train_scale_factors.shape[0] == train_pred.shape[0]
                    assert test_scale_factors.shape[0] == test_pred.shape[0]

                    train_pred = np.multiply(train_pred, train_scale_factors)
                    test_pred = np.multiply(test_pred, test_scale_factors)
                    test_y = np.multiply(test_y, test_scale_factors)

                score = rmse(test_y, test_pred)
                detailed_scores[i][cls] = score
                logger.info("RMSE on test set for class {}: {}".format(cls, score))

                train_preds[:, cols] = train_pred
                test_preds[:, cols] = test_pred

            if self.scaled:
                pass
            else:
                fold_rmse = rmse(this_test_y, test_preds)

            overall_scores.append(fold_rmse)
            logger.info("Overall score for fold {}: {}".format(i + 1, fold_rmse))

        self.cv_scores = np.array(overall_scores)
        logger.info("Cross validation completed in {}.  Scores:".format(time.time() - start_time))
        logger.info(detailed_scores)
        logger.info("Overall scores:")
        logger.info(overall_scores)
Example #35
from datetime import datetime
from os import environ, path
from pathlib import Path

from classes.logger import error, info


if __name__ == "__main__":
    try:
        logFile = environ.get('LOGFILE')
        
        if logFile:
            # construct new file name, get current log directory, construct
            # new file absolute path
            curDate = datetime.strftime(datetime.now(), "%Y-%m-%d")
            newFileName = "app.logger.{}".format(curDate)
            logDir = path.dirname(logFile)
            newFile = path.join(logDir, newFileName)
            exist = path.isfile(newFile)

            if not exist:
                # if the new file doesn't already exist
                # then rename the old one to the new file 
                Path(logFile).rename(newFile)
                info("Logs successfully archived to file {}".format(newFileName))
            else:
                # if the file exists already don't archive
                info("Log file was not archived as {} already exists.".format(newFileName))
        else:
            info("Unable to archive log file. No log file environment variable found.")

    except Exception as e:
        error(str(e))
Example #36
    def train(self, *args, **kwargs):
        start_time = time.time()
        logger.info("Fitting estimator")
        preds = np.zeros(self.train_y.shape)
        # This currently just goes from 1 to 11, but the tree doesn't actually progress in that order.
        # Maybe experiment with a more fine-grained control over which predictions get passed in
        for cls in range(1, 12):
            cols = train_solutions.class_map[cls]

            # Select the correct estimator, and get the right subsets of the data to use in training
            logger.info("Fitting estimator for class {}".format(cls))
            estimator = self.estimator[cls]

            existing_preds = np.any(preds, axis=0)  # Boolean array of which columns are populated in preds
            # X is concatenated with any predictions that have already been made
            logger.debug("Adding columns {} of predictions to X".format(np.where(existing_preds)[0]))
            this_x = np.hstack((self.train_x, preds[:, existing_preds]))
            this_y = self.train_y[:, cols]
            logger.debug("X is of shape {}".format(this_x.shape))
            logger.debug("Y is of shape {}".format(this_y.shape))

            # Train the current estimator
            if 'n_jobs' in estimator.get_params().keys():
                estimator.set_params(n_jobs=self.n_jobs)
            estimator.fit(this_x, this_y)

            # Make predictions with the current estimator, and store those predictions
            logger.info("Making predictions for class {}".format(cls))
            y_pred = estimator.predict(this_x)
            logger.debug("Ypred is of shape {}".format(this_y.shape))
            logger.info("RMSE of class {} is {}".format(cls, rmse(y_pred, this_y)))
            preds[:, cols] = y_pred

        logger.info("Finished fitting model in {}".format(time.time() - start_time))

        # Get an in sample RMSE
        logger.info("Calculating overall in-sample RMSE")
        self.training_predict = preds
        self.rmse = rmse(self.training_predict, self.train_y)
        return self.estimator
Example #37
    def predict(self, X):
        if not hasattr(self, 'estimator_'):
            raise RuntimeError("Estimator has not been trained")

        logger.info("Predicting")
        return self.estimator_.predict(X)
Example #38
import os
from classes import logger
import csv
import classes

infile = 'presents_short.csv'

layer = classes.Layer()

presents_file = os.path.join('data', infile)
logger.info("Reading and placing presents")
with open(presents_file, 'rb') as presents:
    presents.readline()  # skip header
    read = csv.reader(presents)
    for row in read:
        present = classes.Present(*row)
        if not layer.place_present(present):
            break


areas = []
presents_file = os.path.join('data', infile)
logger.info("Reading and placing presents")
with open(presents_file, 'rb') as presents:
    presents.readline()  # skip header
    read = csv.reader(presents)
    for row in read:
        present = classes.Present(*row)
        areas.append((present.pid, present.x * present.y))

import math
Example #39
    def grid_search(self, X, y, grid_search_params, grid_search_class=None, sample=None, n_folds=2, refit=True, parallel_estimator=False):
        cls = grid_search_class or grid_search.GridSearchCV
        logger.info("Performing grid search")
        start_time = time.time()
        params = {
            'scoring': rmse_scorer,
            'verbose': 3,
            'refit':  refit,
            'n_jobs': self.n_jobs,
            'cv': n_folds
        }
        estimator = self.get_estimator()

        if 'n_jobs' in estimator.get_params().keys():
            # If the estimator can be parallelized, and parallel_estimator is True, then parallelize at that level
            # otherwise parallelize at the grid search level
            if parallel_estimator:
                estimator.set_params(n_jobs=self.n_jobs)
                params['n_jobs'] = 1
            else:
                estimator.set_params(n_jobs=1)

        self.grid_search_estimator = cls(estimator, grid_search_params, **params)

        if sample is not None:
            logger.info("Using {} of the train set for grid search".format(sample))
            # Downsample if a sampling rate is defined
            self.grid_search_x, \
            self.grid_search_x_test, \
            self.grid_search_y, \
            self.grid_search_y_test = cross_validation.train_test_split(X,
                                                                        y,
                                                                        train_size=sample)
        else:
            logger.info("Using full train set for the grid search")
            # Otherwise use the full set
            self.grid_search_x = self.grid_search_x_test = X
            self.grid_search_y = self.grid_search_y_test = y

        self.grid_search_estimator.fit(self.grid_search_x, self.grid_search_y)
        logger.info("Found best parameters:")
        logger.info(self.grid_search_estimator.best_params_)
        logger.info("All results:")
        logger.info(pprint.pformat(self.grid_search_estimator.grid_scores_))

        if params['refit']:
            logger.info("Predicting on holdout set")
            pred = self.grid_search_estimator.predict(self.grid_search_x_test)
            res = rmse(self.grid_search_y_test, pred)
            logger.info("RMSE on holdout set: {}".format(res))

        logger.info("Grid search completed in {}".format(time.time() - start_time))