longitude = None

if (violation_code is not None and issue_month is not None and issue_weekday is not None
        and issue_hour is not None and issue_time is not None and car_state is not None
        and latitude is not None and longitude is not None):
    # With required source fields scrubbed, it is time to engineer a couple of features.
    out_of_state = 0
    if car_state != 5:
        out_of_state = 1
    luxury_make = luxury_make_types[car_make]
    domestic_make = domestic_make_types[car_make]

    writer.writerow([
        violation_code, issue_month, issue_weekday, issue_hour, car_state, car_make,
        car_color, latitude, longitude, out_of_state, luxury_make, domestic_make
    ])
    processed_records += 1

logger.time_log('Data Pre-Processing Complete.\n')
logger.log('    Total Records: %s' % raw_record_count)
logger.log('Processed Records: %s\n' % processed_records)

logger.log('Records with missing or incomplete data:')
for column in column_names:
    logger.log("%s: %s" % (column, missing_data[column]))

logger.close()
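# A minimal sketch of the make-type lookup tables assumed by the feature
# engineering above. The real tables are built earlier in the pipeline; the
# keys and 0/1 values here are illustrative assumptions only. Likewise,
# car_state == 5 is assumed to encode the home state (New York), which is why
# any other value flips out_of_state to 1.
luxury_make_types = {'BMW': 1, 'LEXUS': 1, 'FORD': 0, 'TOYOT': 0}
domestic_make_types = {'BMW': 0, 'LEXUS': 0, 'FORD': 1, 'TOYOT': 0}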
def run_classification_search_experiment(
        self, scoring, sample=None, random_state=None, test_size=0.25,
        n_jobs=-1, n_iter=2, cv=5, verbose=3, multiclass=False,
        record_predict_proba=False):
    """
    The classification search uses a Bayesian search to find the best
    hyper-parameters.
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)

    search = BayesSearchCV(
        self.estimator,
        self.hyper_parameters.search_space,
        n_jobs=n_jobs,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        scoring=scoring,
        return_train_score=True
    )

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)

    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    logger.time_log('Starting HyperParameter Search...')
    results = search.fit(x_train, y_train)
    logger.time_log('Search Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(results.best_estimator_, x_train)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(results.best_estimator_, x_test)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(results.best_estimator_, x_test)
        test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        results,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass
    )

    logger.close()

    self.hyper_parameters.params = results.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = results.best_estimator_
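# Hypothetical usage sketch for the search experiment above. The Experiment
# and HyperParameters wiring is assumed (their constructors are not shown in
# this fragment); BayesSearchCV and the skopt search-space types are real.
from skopt.space import Integer, Real
from sklearn.linear_model import LogisticRegression

hyper_parameters = HyperParameters(search_space={
    'C': Real(1e-3, 1e3, prior='log-uniform'),  # inverse regularization strength
    'max_iter': Integer(100, 500),
})
experiment = Experiment('lr_search', LogisticRegression(), data_frame,
                        'is_luxury', hyper_parameters)
experiment.run_classification_search_experiment(scoring='f1_weighted',
                                                sample=100000,
                                                random_state=42, n_iter=16)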
def run_classification_experiment(self, sample=None, random_state=None, test_size=0.20,
                                  multiclass=False, record_predict_proba=False, sampling=None,
                                  cv=5, verbose=True, transformer=None, fit_increment=None,
                                  warm_start=False, max_iters=None, n_jobs=-1):
    use_project_path()

    logger = Logger('%s.txt' % self.name)
    evaluator = Evaluator(logger)

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)

    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    if transformer is not None:
        logger.time_log('Fitting Transformer...')
        transformer.fit(x_train)
        logger.time_log('Transformer Fit Complete.\n')

    if sampling is not None:
        logger.time_log('Starting Data Re-Sampling...')
        logger.log('Original Training Shape is %s' % Counter(y_train))
        x_new, y_new = sampling.fit_resample(x_train, y_train)
        logger.log('Balanced Training Shape is %s' % Counter(y_new))
        if hasattr(x_train, 'columns'):
            x_new = pd.DataFrame(x_new, columns=x_train.columns)
        x_train, y_train = x_new, y_new
        logger.time_log('Re-Sampling Complete.\n')
        logger.time_log('Shuffling Re-Sampled Data.\n')
        x_train, y_train = shuffle(x_train, y_train, random_state=random_state)
        logger.time_log('Shuffling Complete.\n')

    if self.hyper_parameters is not None:
        self.estimator.set_params(**self.hyper_parameters.params)

    if cv is not None:
        # shuffle=True is required for random_state to take effect in recent
        # versions of scikit-learn.
        kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
        logger.time_log('Cross Validating Model...')
        fold_scores = Parallel(n_jobs=n_jobs, verbose=3)(
            delayed(crossfold_classifier)(
                clone(self.estimator), transformer, x_train, y_train,
                train_index, test_index, record_predict_proba, verbose,
                fit_increment, warm_start, max_iters, random_state)
            for train_index, test_index in kfold.split(x_train, y_train))
        logger.time_log('Cross Validation Complete.\n')

    logger.time_log('Training Model...')
    if fit_increment is not None:
        if max_iters is not None:
            for iteration in range(max_iters):
                x_iter_train, y_iter_train = shuffle(
                    x_train, y_train, random_state=random_state)
                batch_fit_classifier(self.estimator, x_iter_train, y_iter_train,
                                     transformer=transformer, increment=fit_increment,
                                     verbose=verbose)
        else:
            batch_fit_classifier(self.estimator, x_train, y_train, transformer=transformer,
                                 increment=fit_increment, verbose=verbose)
    else:
        if transformer is not None:
            x_train_transformed = transformer.transform(x_train)
            self.estimator.fit(x_train_transformed, y_train)
        else:
            self.estimator.fit(x_train, y_train)
    logger.time_log('Training Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(self.estimator, x_train, transformer=transformer,
                                    verbose=verbose)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(self.estimator, x_test, transformer=transformer,
                                   verbose=verbose)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(self.estimator, x_test,
                                                   transformer=transformer, verbose=verbose)
        test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    if cv is not None:
        evaluator.evaluate_fold_scores(fold_scores)

    evaluator.evaluate_classifier_result(
        self.estimator,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass)

    logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = self.estimator
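# Hypothetical usage sketch of the full-featured variant above. SMOTE,
# MaxAbsScaler, and SGDClassifier are real library classes; the Experiment
# wiring and target column are assumptions used only to show the hooks.
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MaxAbsScaler

experiment = Experiment('sgd_balanced', SGDClassifier(loss='log_loss'),
                        data_frame, 'is_luxury')
experiment.run_classification_experiment(
    sample=250000,
    random_state=42,
    sampling=SMOTE(random_state=42),  # rebalance the training partition only
    transformer=MaxAbsScaler(),       # fit once on the training split, reused in each fold
    fit_increment=10000,              # feed the estimator in partial-fit style batches
    max_iters=3,                      # three shuffled passes over the training data
)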
def run_classification_experiment(
        self, sample=None, random_state=None, test_size=0.25,
        multiclass=False, record_predict_proba=False):
    """
    Run a classification experiment when only a single model fit is necessary.
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)

    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    if self.hyper_parameters is not None:
        self.estimator.set_params(**self.hyper_parameters.params)

    logger.time_log('Training Model...')
    self.estimator.fit(x_train, y_train)
    logger.time_log('Training Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(self.estimator, x_train)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(self.estimator, x_test)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(self.estimator, x_test)
        test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        self.estimator,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass
    )

    logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)
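# Hypothetical usage sketch for the single-fit path: reuse hyper-parameters
# found by an earlier search. The load_hyper_parameters helper shown here is
# an assumed project utility, not a library call.
from sklearn.ensemble import RandomForestClassifier

experiment = Experiment('rf_single', RandomForestClassifier(n_estimators=200),
                        data_frame, 'is_luxury')
experiment.hyper_parameters = load_hyper_parameters('rf_single_params.p')  # assumed helper
experiment.run_classification_experiment(sample=50000, random_state=42,
                                         record_predict_proba=True)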
def run_classification_search_experiment(self, scoring, sample=None, random_state=None,
                                         test_size=0.20, n_jobs=-1, n_iter=2, cv=5,
                                         verbose=3, multiclass=False,
                                         record_predict_proba=False, sampling=None):
    use_project_path()

    logger = Logger('%s.txt' % self.name)

    search = BayesSearchCV(self.estimator, self.hyper_parameters.search_space,
                           n_jobs=n_jobs, n_iter=n_iter, cv=cv, verbose=verbose,
                           scoring=scoring, return_train_score=True)

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)

    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    if sampling is not None:
        logger.time_log('Starting Data Re-Sampling...')
        logger.log('Original Training Shape is %s' % Counter(y_train))
        x_new, y_new = sampling.fit_resample(x_train, y_train)
        logger.log('Balanced Training Shape is %s' % Counter(y_new))
        if hasattr(x_train, 'columns'):
            x_new = pd.DataFrame(x_new, columns=x_train.columns)
        x_train, y_train = x_new, y_new
        logger.time_log('Re-Sampling Complete.\n')
        logger.time_log('Shuffling Re-Sampled Data.\n')
        x_train, y_train = shuffle(x_train, y_train, random_state=random_state)
        logger.time_log('Shuffling Complete.\n')

    logger.time_log('Starting HyperParameter Search...')
    results = search.fit(x_train, y_train)
    logger.time_log('Search Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(results.best_estimator_, x_train)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(results.best_estimator_, x_test)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(results.best_estimator_, x_test)
        test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        results,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass)

    logger.close()

    self.hyper_parameters.params = results.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = results.best_estimator_
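# Hypothetical sketch of the sampling hook in this search variant: the
# training partition is re-balanced before the Bayesian search ever sees it,
# so every candidate is tuned on the same balanced data. RandomUnderSampler
# is real (imbalanced-learn); the experiment object reuses the assumed
# Experiment/HyperParameters wiring from the earlier search sketch.
from imblearn.under_sampling import RandomUnderSampler

experiment.run_classification_search_experiment(
    scoring='roc_auc',
    sampling=RandomUnderSampler(random_state=42),
    n_iter=20,
)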