def sample_bottom_up(infile='presents_revorder.csv', outfile='sub_bottomup_1.csv', write=True, check=True):
    """
    Replicate the sample bottom-up approach
    """
    sleigh = classes.LayerSleigh()
    layer = classes.Layer()

    presents_file = os.path.join('data', infile)
    outfile = os.path.join('data', outfile)
    logger.info("Reading and placing presents")
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            if not layer.place_present(present):
                # Can't place the present on the layer, so close the layer and start a new one
                sleigh.add_layer(layer)
                layer = classes.Layer(z=sleigh.max_z + 1)
                res = layer.place_present(present)
    # Add the final layer
    sleigh.add_layer(layer)

    if check and not sleigh.check_all():
        logger.error('There is an error in the Sleigh')
        return sleigh

    if write:
        sleigh.write_to_file(outfile)
    return sleigh
def sendEmails(self, receiverEmails: List[str], msg: str, subject: str) -> None:
    try:
        smtpServer = "smtp.gmail.com"
        senderEmail = environ.get('SENDEREMAIL')
        password = environ.get('EMAILPWD')
        if senderEmail and password:
            with smtplib.SMTP_SSL(smtpServer, 465) as server:
                server.login(senderEmail, password)
                for email in receiverEmails:
                    try:
                        emailMsg = EmailMessage()
                        emailMsg.set_content(msg)
                        emailMsg['Subject'] = subject
                        emailMsg['From'] = senderEmail
                        emailMsg['To'] = email
                        server.send_message(emailMsg)
                        info("{} Email sent to {}".format(subject, email))
                    except Exception as e:
                        # log the per-recipient error and continue with the remaining emails
                        error(str(e))
        else:
            raise Exception("Missing sender email and/or password")
    except Exception as e:
        error(str(e))
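# A minimal usage sketch for sendEmails, assuming it is a method of the
# AppointmentFinder class used in the __main__ block later in this section,
# and that the SENDEREMAIL / EMAILPWD environment variables are already set.
# The recipient addresses below are illustrative.
finder = AppointmentFinder()
finder.sendEmails(["alice@example.com", "bob@example.com"],
                  msg="Appointments are available in your state",
                  subject="CVS")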
def delete_db():  # wipe db
    try:
        tableNameList = [
            "User", "Playlists", "Tracks", "Albums", "Artists",
            "TrackAlbum", "TrackArtists", "TrackPlaylists", "AlbumArtists"
        ]  # list of table names
        logger.info("DROPPING DATABASE; User Call")
        try:
            for i in tableNameList:
                get_db().execute(f"DROP TABLE {i}")  # drop tables
                logger.debug(f"DROPPING {i}; REASON: User Call [Delete Database]")
        except sqlite3.OperationalError as e:
            logger.warning("OperationalError; " + e.__str__() + "; User calling DeleteDB whilst no tables; Ignore")
    except RuntimeError as e:  # raised when called outside of a request context
        logger.warning("RuntimeError; " + e.__str__() + "; POSSIBLE REASON: FIRST TIME STARTUP")
        logger.warning("Re-calling delete_db outside of application context")
        db = sqlite3.connect(DATABASE)  # connect to the db outside of the request context
        try:
            for i in tableNameList:
                db.execute(f"DROP TABLE {i}")
                logger.debug(f"DROPPING {i}; REASON: User Call [Delete Database]")
            db.commit()
        except sqlite3.OperationalError as e:
            logger.debug(e.__str__())
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size=0.2, test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500, 'verbose': 1}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111

    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
def fit(self, X, y):
    self.ridge_estimator_ = self._get_ridge_model()
    self.rf_estimator_ = self._get_rf_model()

    logger.info("Fitting Ridge model")
    self.ridge_estimator_.fit(X, y)
    ridge_y = self.ridge_estimator_.predict(X)

    logger.info("Fitting RF model")
    self.rf_estimator_.fit(ridge_y, y)
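# The fit above stacks two models: a ridge regression fit on X, then a random
# forest fit on the ridge predictions. The matching predict would chain them in
# the same order. The project's actual predict method is not shown in this
# section, so this is a sketch of its assumed shape:
def predict(self, X):
    ridge_y = self.ridge_estimator_.predict(X)
    return self.rf_estimator_.predict(ridge_y)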
def predict_test(self, average_responses):
    logger.info("Calculating predictions for test set")
    # Now calculate the test set responses
    test_predictors = self.build_test_predictors()
    test_clusters = self.estimator.predict(test_predictors)
    test_averages = average_responses[test_clusters]
    return test_averages
def build_features(self, files, training=True):
    """
    Utility method that loops over every image and applies self.process_image.
    Returns a numpy array of dimensions (n_observations, n_features)
    """
    logger.info("Building predictors")
    predictors = self.do_for_each_image(files, self.process_image, self.n_features, training)
    return predictors
def fit_estimator(self):
    # Fit a k-means clustering estimator
    # We use 37 centers initially because there are 37 classes
    # Seems like the sample submission used 6 clusters
    start_time = time.time()
    logger.info("Fitting kmeans estimator")
    self.estimator = KMeans(init='k-means++', n_clusters=37)
    self.estimator.fit(self.predictors)
    logger.info("Finished fitting model in {}".format(time.time() - start_time))
def _transform(training, features, file_list):
    filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
    res = []
    for i, f in enumerate(file_list):
        if i % 5000 == 0:
            logger.info("Processing image {} of {}".format(i, len(file_list)))
        img = RawImage(os.path.join(filepath, f))
        # Use a new name here -- rebinding `features` would clobber the list of
        # feature functions after the first image
        img_features = [func(img) for func in features]
        res.append(np.hstack(img_features))
    return res
def get_cluster_averages(self):
    # Get the average response for each cluster in the training set
    # This is a 37 x 37 array, one row for each cluster, and one column for each class
    logger.info("Calculating cluster averages")
    average_responses = np.zeros((37, 37))
    for cluster in range(37):
        idx = self.estimator.labels_ == cluster
        responses = self.train_y[idx, :]
        average_responses[cluster] = responses.mean(axis=0)
    logger.info("Finished calculating cluster averages")
    return average_responses
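# Taken together with fit_estimator and predict_test above, the cluster-average
# pipeline would run roughly as follows. `model` is a hypothetical instance of
# the surrounding class; the method names are the ones defined in this section.
model.fit_estimator()                       # k-means with 37 clusters on the training predictors
averages = model.get_cluster_averages()     # 37 x 37 matrix of mean responses per cluster
predictions = model.predict_test(averages)  # look up each test image's cluster average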
def _parallel_sampler(file_list, steps, step_size, training):
    filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH
    rows = []
    counter = 0
    for i, f in enumerate(file_list):
        counter += 1
        if counter % 5000 == 0:
            logger.info("Processed {} images".format(counter))
        image = RawImage(os.path.join(filepath, f))
        rows.append(image.grid_sample(step_size, steps).flatten().astype('float64') / 255)
    return np.vstack(rows)
def _transform(self, file_list):
    filepath = TRAIN_IMAGE_PATH if self.training else TEST_IMAGE_PATH
    out = np.zeros((len(file_list), self.scaled_size, self.scaled_size, 3))
    # Force float division -- under Python 2, scaled_size / crop_size truncates
    # to 0 whenever scaled_size < crop_size
    factor = float(self.scaled_size) / self.crop_size
    for i, f in enumerate(file_list):
        if i % 5000 == 0:
            logger.info("Processing image {} of {}".format(i, len(file_list)))
        img = RawImage(os.path.join(filepath, f))
        img.crop(self.crop_size).rescale(factor)
        out[i] = img.data * 255
    return out
def run(self, check=True, write=True):
    presents_file = os.path.join('data', self.infile)
    outfile = os.path.join('data', self.outfile)
    logger.info("Reading and placing presents")
    counter = 0
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            position = self.sleigh.place_present(present)
            counter += 1
            if counter % self.log_at == 0:
                logger.info("Placed {} presents".format(counter))
                logger.info("Current min z is {}".format(np.min(self.sleigh.z_map)))
    logger.info("Finished placing presents")
    if write:
        self.write()
    if check:
        self.check()
    return self
def perform_cross_validation(self, *args, **kwargs):
    """
    Performs cross validation using the main estimator. In some cases, when we don't need
    to search across a grid of hyperparameters, we may want to perform cross validation only.
    """
    start_time = time.time()
    if self.cv_sample is not None:
        logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(self.cv_folds, self.cv_sample))
        self.cv_x, self.cv_x_test, self.cv_y, self.cv_y_test = cross_validation.train_test_split(self.train_x, self.train_y, train_size=self.cv_sample)
    else:
        logger.info("Performing {}-fold cross validation with full training set".format(self.cv_folds))
        self.cv_x = self.train_x
        self.cv_y = self.train_y
    self.cv_iterator = self.cv_class(self.cv_x.shape[0], n_folds=self.cv_folds)
    params = {
        'cv': self.cv_iterator,
        'scoring': rmse_scorer,
        'verbose': 2,
        'n_jobs': self.n_jobs
    }
    params.update(kwargs)
    # Make sure to not parallelize the estimator
    if 'n_jobs' in self.estimator.get_params().keys():
        self.estimator.set_params(n_jobs=1)
    self.cv_scores = cross_validation.cross_val_score(self.estimator, self.cv_x, self.cv_y, *args, **params)
    logger.info("Cross validation completed in {}. Scores:".format(time.time() - start_time))
    logger.info("{}".format(self.cv_scores))
def do_for_each_image(self, files, func, n_features, training):
    """
    Iterates over a list of files, applying func to the image named by each file.
    Returns an (n_samples, n_features) ndarray
    """
    dims = (N_TRAIN if training else N_TEST, n_features)
    predictors = np.zeros(dims)
    counter = 0
    filepath = TRAIN_IMAGE_PATH if training else TEST_IMAGE_PATH  # constant across the loop
    for row, f in enumerate(files):
        image = RawImage(os.path.join(filepath, f))
        predictors[row] = func(image)
        counter += 1
        if counter % 1000 == 0:
            logger.info("Processed {} images".format(counter))
    return predictors
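# A usage sketch for do_for_each_image: any callable that maps a RawImage to an
# n_features-long vector can be passed as func. mean_brightness below is a
# hypothetical single-feature example, not a function from the source.
def mean_brightness(image):
    return np.array([image.data.mean()])

predictors = model.do_for_each_image(files, mean_brightness, n_features=1, training=True)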
def build_test_predictors(self):
    """
    Builds the test predictors

    Returns:
        None
    """
    if self.test_x is None:
        test_files = sorted(os.listdir(TEST_IMAGE_PATH))
        if os.path.exists(self.test_predictors_file):
            logger.info("Test predictors already exist, loading from file {}".format(self.test_predictors_file))
            res = np.load(self.test_predictors_file)
        else:
            res = self.build_features(test_files, False)
            logger.info("Caching test predictors to {}".format(self.test_predictors_file))
            np.save(self.test_predictors_file, res)
        self.test_x = res
def multi_request(request):  # backend endpoint for js
    logger.info(f"Multi Request Received; {request.__str__()}")
    qVar1 = request.args['qVar1']
    qVar2 = request.args['qVar2']
    table1 = request.args['table1']
    table2 = request.args['table2']
    logger.debug(", ".join([qVar1, qVar2, table1, table2]))
    searchVar = f'{table1}.{qVar1}, {table2}.{qVar2}'
    tableName = f"{table1} INNER JOIN {table2}"
    # NOTE: table and column names are interpolated directly into the SQL
    # string, so this endpoint trusts its callers completely
    baseQ = f"SELECT * FROM {tableName} ON {table1}.{qVar1} == {table2}.{qVar2}"
    query = baseQ
    logger.info(query)
    queryOut = get_db().execute(query).fetchall()
    output = {}
    output['results'] = queryOut
    return jsonify(output)
def build_train_predictors(self):
    """
    Builds the training predictors. Once the predictors are built, they are cached to a file.
    If the file already exists, the predictors are loaded from that file.

    Couldn't use the @cache_to_file decorator because the decorator factory doesn't have
    access to self at compile time

    Returns:
        None
    """
    if self.train_x is None:
        file_list = train_solutions.filenames
        if os.path.exists(self.train_predictors_file):
            logger.info("Training predictors already exist, loading from file {}".format(self.train_predictors_file))
            res = np.load(self.train_predictors_file)
        else:
            res = self.build_features(file_list, True)
            logger.info("Caching training predictors to {}".format(self.train_predictors_file))
            np.save(self.train_predictors_file, res)
        self.train_x = res
def run(self, method, *args, **kwargs):
    """
    Primary entry point for executing tasks with the model

    Arguments:
    ----------
    method: string
        Must be one of 'grid_search', 'cv', 'train', or 'predict'

    *args:
        Additional arguments to be passed to the job

    **kwargs:
        Additional arguments to be passed to the job
    """
    jobs = {'grid_search', 'cv', 'train', 'predict'}
    if method not in jobs:
        raise RuntimeError("{} is not a valid job".format(method))

    start_time = time.time()
    self.build_train_predictors()

    res = None
    if method == 'grid_search':
        logger.info("Performing grid search")
        res = self.perform_grid_search_and_cv(*args, **kwargs)
    elif method == 'cv':
        logger.info("Performing cross validation")
        res = self.perform_cross_validation(*args, **kwargs)
    elif method == 'train':
        logger.info("Performing training")
        res = self.train(*args, **kwargs)
    elif method == 'predict':
        logger.info("Performing prediction")
        res = self.predict(*args, **kwargs)

    end_time = time.time()
    logger.info("Model completed in {}".format(end_time - start_time))
    return res
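# A sketch of driving a model through this entry point. SomeModel stands in for
# a concrete subclass; the constructor arguments are assumptions.
model = SomeModel(n_jobs=-1)
model.run('cv')                      # cross-validate with the configured folds
model.run('train')                   # fit the estimator on the full training set
predictions = model.run('predict')   # predict on the test set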
def transform(self, X=None):
    if self.training:
        files = train_solutions.filenames
    else:
        files = sorted(os.listdir(TEST_IMAGE_PATH))

    if os.path.exists(self.result_path) and not self.force_rerun:
        logger.info("File already exists. Loading from {}".format(self.result_path))
        if self.memmap:
            return joblib.load(self.result_path, mmap_mode='r+')
        else:
            return joblib.load(self.result_path)
    else:
        # Process one chunk of files per job in parallel, then stack the results.
        # The loop variable is named file_chunk to avoid shadowing `files`
        res = np.vstack(Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_parallel_crop_scale)(self, file_chunk) for file_chunk in chunks(files, self.n_jobs)
        ))
        logger.info("Saving results to file {}".format(self.result_path))
        joblib.dump(res, self.result_path)
        if self.memmap:
            res = joblib.load(self.result_path, mmap_mode='r+')
        return res
def run(self, check=True, write=True):
    layer = self.layer_class()
    presents_file = os.path.join('data', self.infile)
    outfile = os.path.join('data', self.outfile)
    logger.info("Reading and placing presents")
    counter = 0
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            layer = self.process_present(present, layer)
            counter += 1
            if counter % self.log_at == 0:
                logger.info("Placed {} presents".format(counter))
    self.process_last_layer(layer)
    logger.info("Finished placing presents")
    if write:
        self.write()
    if check:
        self.check()
    return self
def sample_top_down(infile='presents_revorder.csv', outfile='sub_topdown_1.csv', write=True, check=True):
    """
    Replicate the MatLab top-down approach.
    The strategy is basically the same as bottom-up, but before closing each layer,
    align all of the presents to the top of the layer.

    Actually this strategy is not quite the same, since it reads the presents in a
    different order. The result is a slightly higher score than the benchmark.
    """
    sleigh = classes.LayerSleigh()
    layer = classes.Layer()

    presents_file = os.path.join('data', infile)
    outfile = os.path.join('data', outfile)
    logger.info("Reading and placing presents")
    with open(presents_file, 'rb') as presents:
        presents.readline()  # skip header
        read = csv.reader(presents)
        for row in read:
            present = classes.Present(*row)
            if not layer.place_present(present):
                # Can't place the present on the layer, so close the layer and start a new one.
                # Before closing, re-align all of the presents in the layer to the top of the layer
                align_presents_to_layer_top(layer)
                sleigh.add_layer(layer)
                layer = classes.Layer(z=sleigh.max_z + 1)
                res = layer.place_present(present)
    # Add the final layer
    align_presents_to_layer_top(layer)
    sleigh.add_layer(layer)

    if check and not sleigh.check_all():
        logger.error('There is an error in the Sleigh')
        return sleigh

    if write:
        sleigh.write_to_file(outfile)
    return sleigh
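# align_presents_to_layer_top is not defined in this section. Under the layer
# model above it would shift each present so its top face touches the top of
# the layer; the attribute names below (layer.presents, layer.top_z, present.z,
# present.dz) are assumptions, not the project's actual API.
def align_presents_to_layer_top(layer):
    for present in layer.presents:
        present.z += layer.top_z - (present.z + present.dz - 1)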
def train(self, *args, **kwargs):
    start_time = time.time()
    logger.info("Fitting estimator")
    if 'n_jobs' in self.estimator.get_params().keys():
        self.estimator.set_params(n_jobs=self.n_jobs)
    self.estimator.fit(self.train_x, self.train_y)
    logger.info("Finished fitting model in {}".format(time.time() - start_time))

    # Get an in-sample RMSE
    logger.info("Calculating in-sample RMSE")
    self.training_predict = self.estimator.predict(self.train_x)
    self.rmse = rmse(self.training_predict, self.train_y)
    return self.estimator
def fit(self, X, y=None):
    start_time = time.time()
    logger.info("Fitting estimator")
    self.estimator_ = self.get_estimator()
    if 'n_jobs' in self.estimator_.get_params().keys():
        self.estimator_.set_params(n_jobs=self.n_jobs)
    self.estimator_.fit(X, y)
    logger.info("Finished fitting model in {}".format(time.time() - start_time))

    # Get an in-sample RMSE
    logger.info("Calculating in-sample RMSE")
    self.training_predict = self.estimator_.predict(X)
    self.rmse = rmse(self.training_predict, y)
    return self  # follow the sklearn convention of fit returning self
def cross_validation(self, X, y, n_folds=2, cv_class=None, sample=None, parallel_estimator=False):
    cls = cv_class or cross_validation.KFold
    start_time = time.time()

    if sample is not None:
        logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(n_folds, sample))
        self.cv_x, _, self.cv_y, _ = cross_validation.train_test_split(X, y, train_size=sample)
        del _
    else:
        logger.info("Performing {}-fold cross validation with full training set".format(n_folds))
        self.cv_x = X
        self.cv_y = y

    self.cv_iterator = cls(self.cv_x.shape[0], n_folds=n_folds)
    params = {
        'cv': self.cv_iterator,
        'scoring': rmse_scorer,
        'verbose': 3,
        'n_jobs': self.n_jobs
    }

    estimator = self.get_estimator()
    # Make sure to not parallelize the estimator and the cross validation at the same time
    if 'n_jobs' in estimator.get_params().keys():
        if parallel_estimator:
            estimator.set_params(n_jobs=self.n_jobs)
            params['n_jobs'] = 1
        else:
            estimator.set_params(n_jobs=1)

    self.cv_scores = cross_validation.cross_val_score(estimator, self.cv_x, self.cv_y, **params)
    logger.info("Cross validation completed in {}. Scores:".format(time.time() - start_time))
    logger.info("{}".format(self.cv_scores))
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1)
        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
""" Last quoted plan benchmark """ import classes from classes import logger train = classes.get_train_data() actuals = classes.get_actual_plan(train) scores = [] # Score seems to be a bit high on training, about .547-.548 # Leaderboard score is 0.53793, so seems like 0.01 difference, which is pretty substantial in this competition for n in range(5): truncated = classes.truncate(train) prediction = classes.get_last_observed_plan(truncated) score = classes.score_df(prediction, actuals) scores.append(score) logger.info("Run {}, score: {}".format(n+1, score)) test = classes.get_test_data() pred = classes.get_last_observed_plan(test) classes.make_submission(pred, 'benchmark_001.csv')
from classes.logger import error, info

if __name__ == "__main__":
    try:
        finder = AppointmentFinder()
        contacts = finder.getContactsByState()
        statesUpdated = finder.getUpdatedTsByState()
        for state in contacts:
            cvsData = finder.getApptDataForStateCvs(state)
            aptsAvailable = finder.openStateApts(cvsData)
            parsedTs = finder.parseCvsTs(cvsData)
            # tsIsOld means the stored timestamp is older than the newly parsed
            # one, i.e. the fetched appointment data is fresh
            tsIsOld = True
            if state in statesUpdated:
                curTs = statesUpdated[state]
                tsIsOld = parsedTs > curTs
            if tsIsOld:
                finder.updateStateTs(state, parsedTs)
                statesUpdated[state] = parsedTs
                info("{} appointment data timestamp updated to {}".format(state.upper(), parsedTs))
            if tsIsOld and aptsAvailable:
                emails = contacts[state]
                finder.sendApptAvailableEmailCvs(emails, state)
        info("Appointment finder run complete!")
    except Exception as e:
        error(str(e))
def perform_grid_search_and_cv(self, *args, **kwargs):
    """
    Performs cross validation and grid search to identify optimal parameters and to score the estimator

    The grid search space is defined by self.grid_search_parameters.
    If grid_search_sample is defined, then a downsample of the full train_x is used to perform the grid search

    Cross validation is parallelized at the CV level, not the estimator level, because not all
    estimators can be parallelized.

    Parameters:
    ----------
    refit: boolean, default True
        If true, the grid search estimator is refit on the grid search set, and then is used to
        calculate a score on the holdout set. Really only useful if grid_search_sample < 1,
        otherwise the calculated score will basically be an in-sample error (since the training
        and the testing were the same dataset)

    grid_search_parameters: set on model instantiation
        The grid search parameters -- should set this when you instantiate the Model,
        not when you call run('grid_search')
    """
    if self.grid_search_parameters is not None:
        logger.info("Performing grid search")
        start_time = time.time()
        params = {
            'scoring': rmse_scorer,
            'verbose': 3,
            'refit': True,
            'n_jobs': self.n_jobs,
            'cv': 2
        }
        params.update(kwargs)
        # Make sure to not parallelize the estimator if it can be parallelized
        if 'n_jobs' in self.estimator.get_params().keys():
            self.estimator.set_params(n_jobs=1)
        self.grid_search_estimator = self.grid_search_class(self.estimator, self.grid_search_parameters, *args, **params)
        if self.grid_search_sample is not None:
            logger.info("Using {} of the train set for grid search".format(self.grid_search_sample))
            # Downsample if a sampling rate is defined
            self.grid_search_x, self.grid_search_x_test, self.grid_search_y, self.grid_search_y_test = cross_validation.train_test_split(self.train_x, self.train_y, train_size=self.grid_search_sample)
        else:
            logger.info("Using full train set for the grid search")
            # Otherwise use the full set
            self.grid_search_x = self.grid_search_x_test = self.train_x
            self.grid_search_y = self.grid_search_y_test = self.train_y

        self.grid_search_estimator.fit(self.grid_search_x, self.grid_search_y)
        logger.info("Found best parameters:")
        logger.info(self.grid_search_estimator.best_params_)

        if params['refit']:
            logger.info("Predicting on holdout set")
            pred = self.grid_search_estimator.predict(self.grid_search_x_test)
            res = rmse(self.grid_search_y_test, pred)
            logger.info("RMSE on holdout set: {}".format(res))

        logger.info("Grid search completed in {}".format(time.time() - start_time))
def perform_cross_validation(self, *args, **kwargs):
    start_time = time.time()

    if self.cv_sample is not None:
        logger.info("Performing {}-fold cross validation with {:.0%} of the sample".format(self.cv_folds, self.cv_sample))
        self.cv_x, self.cv_x_test, self.cv_y, self.cv_y_test = cross_validation.train_test_split(self.train_x, self.train_y, train_size=self.cv_sample)
    else:
        logger.info("Performing {}-fold cross validation with full training set".format(self.cv_folds))
        self.cv_x = self.train_x
        self.cv_y = self.train_y

    self.cv_iterator = self.cv_class(self.cv_x.shape[0], n_folds=self.cv_folds)
    params = {
        'cv': self.cv_iterator,
        'scoring': rmse_scorer,
        'verbose': 2,
        'n_jobs': self.n_jobs
    }
    params.update(kwargs)

    # Gotta roll our own cross validation
    # Cross validation will look like this:
    # For each fold:
    #   Train estimator
    #   Predict estimator
    #   Store prediction
    #   Move onto next estimator
    overall_scores = []
    # Build a fresh dict per fold -- [{}] * n would alias a single shared dict
    detailed_scores = [{} for _ in range(self.cv_folds)]
    for i, idx in enumerate(self.cv_iterator):
        logger.debug("Working on fold {}".format(i + 1))
        train = idx[0]
        test = idx[1]

        # Get the data
        # The actual cross val method uses safe_mask to index the arrays. This is only required if
        # we might be handling sparse matrices
        this_train_x = self.cv_x[train]
        this_train_y = self.cv_y[train]
        this_test_x = self.cv_x[test]
        this_test_y = self.cv_y[test]
        logger.debug("Fold {} training X and Y shape: {}, {}".format(i + 1, this_train_x.shape, this_train_y.shape))
        logger.debug("Fold {} test X and Y shape: {}, {}".format(i + 1, this_test_x.shape, this_test_y.shape))

        test_preds = np.zeros(this_test_y.shape)
        train_preds = np.zeros(this_train_y.shape)

        # Should be able to refactor out this inner loop
        for cls in range(1, 12):
            cols = train_solutions.class_map[cls]
            logger.info("Performing CV on class {}".format(cls))

            # Clone the estimator
            # Need to do this for each fold
            estimator = clone(self.estimator[cls])

            existing_test_preds = np.any(test_preds, axis=0)
            existing_train_preds = np.any(train_preds, axis=0)
            this_x = np.hstack((this_train_x, train_preds[:, existing_train_preds]))
            test_x = np.hstack((this_test_x, test_preds[:, existing_test_preds]))
            this_y = this_train_y[:, cols]
            test_y = this_test_y[:, cols]

            logger.debug("Train X shape: {}".format(this_x.shape))
            logger.debug("Train Y shape: {}".format(this_y.shape))
            logger.debug("Test X shape: {}".format(test_x.shape))

            # Parallelize at the estimator level
            if 'n_jobs' in estimator.get_params().keys():
                estimator.set_params(n_jobs=self.n_jobs)

            estimator.fit(this_x, this_y)
            train_pred = estimator.predict(this_x)
            test_pred = estimator.predict(test_x)

            # Scale things back
            if self.scaled:
                # this does not work correctly because cv_y is already split
                scale_factors = train_solutions.get_sum_for_class(cls)
                assert train.shape[0] == scale_factors.shape[0]
                assert test.shape[0] == scale_factors.shape[0]
                train_scale_factors = scale_factors[train]
                test_scale_factors = scale_factors[test]
                assert train_scale_factors.shape[0] == train_pred.shape[0]
                assert test_scale_factors.shape[0] == test_pred.shape[0]
                train_pred = np.multiply(train_pred, train_scale_factors)
                test_pred = np.multiply(test_pred, test_scale_factors)
                test_y = np.multiply(test_y, test_scale_factors)

            score = rmse(test_y, test_pred)
            detailed_scores[i][cls] = score
            logger.info("RMSE on test set for class {}: {}".format(cls, score))

            train_preds[:, cols] = train_pred
            test_preds[:, cols] = test_pred

        if self.scaled:
            pass
        else:
            fold_rmse = rmse(this_test_y, test_preds)
            overall_scores.append(fold_rmse)
            logger.info("Overall score for fold {}: {}".format(i + 1, fold_rmse))

    self.cv_scores = np.array(overall_scores)
    logger.info("Cross validation completed in {}. Scores:".format(time.time() - start_time))
    logger.info(detailed_scores)
    logger.info("Overall scores:")
    logger.info(overall_scores)
from datetime import datetime
from os import environ, path
from pathlib import Path

from classes.logger import error, info

if __name__ == "__main__":
    try:
        logFile = environ.get('LOGFILE')
        if logFile:
            # construct the new file name, get the current log directory, and
            # build the new file's absolute path
            curDate = datetime.strftime(datetime.now(), "%Y-%m-%d")
            newFileName = "app.logger.{}".format(curDate)
            logDir = path.dirname(logFile)
            newFile = path.join(logDir, newFileName)
            exist = path.isfile(newFile)
            if not exist:
                # if the new file doesn't already exist,
                # rename the old one to the new file
                Path(logFile).rename(newFile)
                info("Logs successfully archived to file {}".format(newFileName))
            else:
                # if the file already exists, don't archive
                info("Log file was not archived as {} already exists.".format(newFileName))
        else:
            info("Unable to archive log file. No log file environment variable found.")
    except Exception as e:
        error(str(e))
def train(self, *args, **kwargs):
    start_time = time.time()
    logger.info("Fitting estimator")

    preds = np.zeros(self.train_y.shape)
    # This currently just goes from 1 to 11, but the tree doesn't actually progress in that order.
    # Maybe experiment with a more fine-grained control over which predictions get passed in
    for cls in range(1, 12):
        cols = train_solutions.class_map[cls]

        # Select the correct estimator, and get the right subsets of the data to use in training
        logger.info("Fitting estimator for class {}".format(cls))
        estimator = self.estimator[cls]
        existing_preds = np.any(preds, axis=0)  # Boolean array of which columns are populated in preds

        # X is concatenated with any predictions that have already been made
        logger.debug("Adding columns {} of predictions to X".format(np.where(existing_preds)[0]))
        this_x = np.hstack((self.train_x, preds[:, existing_preds]))
        this_y = self.train_y[:, cols]
        logger.debug("X is of shape {}".format(this_x.shape))
        logger.debug("Y is of shape {}".format(this_y.shape))

        # Train the current estimator
        if 'n_jobs' in estimator.get_params().keys():
            estimator.set_params(n_jobs=self.n_jobs)
        estimator.fit(this_x, this_y)

        # Make predictions with the current estimator, and store those predictions
        logger.info("Making predictions for class {}".format(cls))
        y_pred = estimator.predict(this_x)
        logger.debug("Ypred is of shape {}".format(y_pred.shape))
        logger.info("RMSE of class {} is {}".format(cls, rmse(y_pred, this_y)))
        preds[:, cols] = y_pred

    logger.info("Finished fitting model in {}".format(time.time() - start_time))

    # Get an in-sample RMSE
    logger.info("Calculating overall in-sample RMSE")
    self.training_predict = preds
    self.rmse = rmse(self.training_predict, self.train_y)
    return self.estimator
def predict(self, X):
    if not hasattr(self, 'estimator_'):
        raise RuntimeError("Estimator has not been trained")
    logger.info("Predicting")
    return self.estimator_.predict(X)
import csv
import math
import os

import classes
from classes import logger

infile = 'presents_short.csv'
layer = classes.Layer()

presents_file = os.path.join('data', infile)
logger.info("Reading and placing presents")
with open(presents_file, 'rb') as presents:
    presents.readline()  # skip header
    read = csv.reader(presents)
    for row in read:
        present = classes.Present(*row)
        if not layer.place_present(present):
            break

areas = []
presents_file = os.path.join('data', infile)
logger.info("Reading presents and computing base areas")
with open(presents_file, 'rb') as presents:
    presents.readline()  # skip header
    read = csv.reader(presents)
    for row in read:
        present = classes.Present(*row)
        areas.append((present.pid, present.x * present.y))
def grid_search(self, X, y, grid_search_params, grid_search_class=None, sample=None, n_folds=2, refit=True, parallel_estimator=False):
    cls = grid_search_class or grid_search.GridSearchCV
    logger.info("Performing grid search")
    start_time = time.time()
    params = {
        'scoring': rmse_scorer,
        'verbose': 3,
        'refit': refit,
        'n_jobs': self.n_jobs,
        'cv': n_folds
    }

    estimator = self.get_estimator()
    if 'n_jobs' in estimator.get_params().keys():
        # If the estimator can be parallelized, and parallel_estimator is True, then parallelize
        # at that level, otherwise parallelize at the grid search level
        if parallel_estimator:
            estimator.set_params(n_jobs=self.n_jobs)
            params['n_jobs'] = 1
        else:
            estimator.set_params(n_jobs=1)

    self.grid_search_estimator = cls(estimator, grid_search_params, **params)

    if sample is not None:
        logger.info("Using {} of the train set for grid search".format(sample))
        # Downsample if a sampling rate is defined
        self.grid_search_x, self.grid_search_x_test, self.grid_search_y, self.grid_search_y_test = cross_validation.train_test_split(X, y, train_size=sample)
    else:
        logger.info("Using full train set for the grid search")
        # Otherwise use the full set
        self.grid_search_x = self.grid_search_x_test = X
        self.grid_search_y = self.grid_search_y_test = y

    self.grid_search_estimator.fit(self.grid_search_x, self.grid_search_y)
    logger.info("Found best parameters:")
    logger.info(self.grid_search_estimator.best_params_)
    logger.info("All results:")
    logger.info(pprint.pformat(self.grid_search_estimator.grid_scores_))

    if params['refit']:
        logger.info("Predicting on holdout set")
        pred = self.grid_search_estimator.predict(self.grid_search_x_test)
        res = rmse(self.grid_search_y_test, pred)
        logger.info("RMSE on holdout set: {}".format(res))

    logger.info("Grid search completed in {}".format(time.time() - start_time))
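# A usage sketch in the style of the kmeans_006 experiment above: grid search a
# ridge alpha on a 10% subsample. The parameter grid is illustrative, not taken
# from the source.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {}, n_jobs=-1)
wrapper.grid_search(train_x, train_y,
                    grid_search_params={'alpha': [100, 500, 1000]},
                    sample=0.1,
                    n_folds=2,
                    parallel_estimator=True)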