Example #1
                    longitude = None

                if (violation_code is not None and issue_month is not None
                        and issue_weekday is not None
                        and issue_hour is not None and issue_time is not None
                        and car_state is not None and latitude is not None
                        and longitude is not None):

                    # With the required source fields validated, engineer the
                    # derived features. car_state value 5 is assumed to encode
                    # an in-state plate, so anything else is out of state.
                    out_of_state = 0
                    if car_state != 5:
                        out_of_state = 1

                    luxury_make = luxury_make_types[car_make]
                    domestic_make = domestic_make_types[car_make]

                    writer.writerow([
                        violation_code, issue_month, issue_weekday, issue_hour,
                        car_state, car_make, car_color, latitude, longitude,
                        out_of_state, luxury_make, domestic_make
                    ])
                    processed_records += 1

    logger.time_log('Data Pre-Processing Complete.\n')
    logger.log('    Total Records: %s' % raw_record_count)
    logger.log('Processed Records: %s\n' % processed_records)
    logger.log('Records with missing or incomplete data:')
    for column in column_names:
        logger.log("%s: %s" % (column, missing_data[column]))
    logger.close()
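
    # A minimal sketch (an assumption, not from this excerpt) of the lookup
    # tables referenced above: `luxury_make_types` and `domestic_make_types`
    # are taken to be dict-like maps from an encoded car make to a 0/1 flag,
    # with unseen makes defaulting to 0. The make codes below are invented
    # for illustration only.
    #
    #     from collections import defaultdict
    #
    #     luxury_make_types = defaultdict(int, {12: 1, 27: 1, 33: 1})
    #     domestic_make_types = defaultdict(int, {3: 1, 8: 1, 19: 1})
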
    def run_classification_search_experiment(
            self,
            scoring,
            sample=None,
            random_state=None,
            test_size=0.25,
            n_jobs=-1,
            n_iter=2,
            cv=5,
            verbose=3,
            multiclass=False,
            record_predict_proba=False):
        """
        The classification search makes use of a bayesian search to find the best hyper-parameters.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        search = BayesSearchCV(
            self.estimator,
            self.hyper_parameters.search_space,
            n_jobs=n_jobs,
            n_iter=n_iter,
            cv=cv,
            verbose=verbose,
            scoring=scoring,
            return_train_score=True
        )

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # Note: the full frame (including the target column) is passed as the
        # feature matrix; downstream code is assumed to handle or ignore it.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size)

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(results.best_estimator_, x_test)
            test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )

        logger.close()

        self.hyper_parameters.params = results.best_params_
        self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = results.best_estimator_
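
    # A sketch of the search space BayesSearchCV consumes above, written with
    # scikit-optimize's dimension types. The parameter names are illustrative
    # assumptions; the real space lives in `self.hyper_parameters.search_space`.
    #
    #     from skopt.space import Categorical, Integer, Real
    #
    #     search_space = {
    #         'n_estimators': Integer(50, 500),
    #         'max_depth': Integer(3, 30),
    #         'learning_rate': Real(1e-3, 1.0, prior='log-uniform'),
    #         'criterion': Categorical(['gini', 'entropy']),
    #     }
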
    def run_classification_experiment(self,
                                      sample=None,
                                      random_state=None,
                                      test_size=0.20,
                                      multiclass=False,
                                      record_predict_proba=False,
                                      sampling=None,
                                      cv=5,
                                      verbose=True,
                                      transformer=None,
                                      fit_increment=None,
                                      warm_start=False,
                                      max_iters=None,
                                      n_jobs=-1):
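        """
        Run a full classification experiment with optional re-sampling,
        input transformation, cross validation, and incremental (batch)
        fitting of the estimator.
        """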
        use_project_path()

        logger = Logger('%s.txt' % self.name)
        evaluator = Evaluator(logger)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size)

        if transformer is not None:
            logger.time_log('Fitting Transformer...')
            transformer.fit(x_train)
            logger.time_log('Transformer Fit Complete.\n')

        if sampling is not None:
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(x_train,
                                       y_train,
                                       random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        if cv is not None:
            # shuffle=True is required for random_state to take effect in
            # StratifiedKFold; recent scikit-learn raises without it.
            kfold = StratifiedKFold(n_splits=cv, shuffle=True,
                                    random_state=random_state)
            logger.time_log('Cross Validating Model...')
            fold_scores = Parallel(n_jobs=n_jobs, verbose=3)(
                delayed(crossfold_classifier)
                (clone(self.estimator), transformer, x_train, y_train,
                 train_index, test_index, record_predict_proba, verbose,
                 fit_increment, warm_start, max_iters, random_state)
                for train_index, test_index in kfold.split(x_train, y_train))
            logger.time_log('Cross Validation Complete.\n')

        logger.time_log('Training Model...')
        if fit_increment is not None:
            if max_iters is not None:
                for _ in range(max_iters):  # iteration counter is unused
                    x_iter_train, y_iter_train = shuffle(
                        x_train, y_train, random_state=random_state)
                    batch_fit_classifier(self.estimator,
                                         x_iter_train,
                                         y_iter_train,
                                         transformer=transformer,
                                         increment=fit_increment,
                                         verbose=verbose)
            else:
                batch_fit_classifier(self.estimator,
                                     x_train,
                                     y_train,
                                     transformer=transformer,
                                     increment=fit_increment,
                                     verbose=verbose)
        else:
            if transformer is not None:
                x_train_transformed = transformer.transform(x_train)
                self.estimator.fit(x_train_transformed, y_train)
            else:
                self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator,
                                        x_train,
                                        transformer=transformer,
                                        verbose=verbose)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator,
                                       x_test,
                                       transformer=transformer,
                                       verbose=verbose)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(self.estimator,
                                                       x_test,
                                                       transformer=transformer,
                                                       verbose=verbose)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        if cv is not None:
            evaluator.evaluate_fold_scores(fold_scores)

        evaluator.evaluate_classifier_result(
            self.estimator,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass)

        logger.close()

        if self.hyper_parameters is not None:
            self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = self.estimator
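
    # The `transformer` argument above only needs fit/transform, so any
    # scikit-learn transformer satisfies the contract. StandardScaler and the
    # `experiment` instance below are illustrative assumptions.
    #
    #     from sklearn.preprocessing import StandardScaler
    #
    #     experiment.run_classification_experiment(
    #         transformer=StandardScaler(), cv=5, random_state=42)
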
    def run_classification_experiment(
            self,
            sample=None,
            random_state=None,
            test_size=0.25,
            multiclass=False,
            record_predict_proba=False):
        """
        Running a classification experiment is used when only a single model run and fit is necessary.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size)

        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        logger.time_log('Training Model...')
        self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(self.estimator, x_test)
            test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            self.estimator,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )

        logger.close()

        if self.hyper_parameters is not None:
            self.hyper_parameters.save('%s_params.p' % self.name)
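
    # A plausible sketch (an assumption) of the `batch_predict` helper used
    # throughout: predict in fixed-size chunks to bound memory on large
    # frames. The real helper's chunk size and return type may differ.
    #
    #     import numpy as np
    #
    #     def batch_predict(estimator, x, transformer=None, increment=10_000,
    #                       verbose=False):
    #         predictions = []
    #         for start in range(0, len(x), increment):
    #             batch = x[start:start + increment]
    #             if transformer is not None:
    #                 batch = transformer.transform(batch)
    #             predictions.append(estimator.predict(batch))
    #         return np.concatenate(predictions)
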
    def run_classification_search_experiment(self,
                                             scoring,
                                             sample=None,
                                             random_state=None,
                                             test_size=0.20,
                                             n_jobs=-1,
                                             n_iter=2,
                                             cv=5,
                                             verbose=3,
                                             multiclass=False,
                                             record_predict_proba=False,
                                             sampling=None):
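        """
        Bayesian hyper-parameter search (BayesSearchCV), with optional
        re-sampling of the training partition before the search runs.
        """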
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        search = BayesSearchCV(self.estimator,
                               self.hyper_parameters.search_space,
                               n_jobs=n_jobs,
                               n_iter=n_iter,
                               cv=cv,
                               verbose=verbose,
                               scoring=scoring,
                               return_train_score=True)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size)

        if sampling is not None:
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(x_train,
                                       y_train,
                                       random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(results.best_estimator_,
                                                       x_test)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass)

        logger.close()

        self.hyper_parameters.params = results.best_params_
        self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = results.best_estimator_
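
# A runnable sketch of the re-sampling step the methods above perform with
# imbalanced-learn; SMOTE and the synthetic data are illustrative assumptions.
# Any sampler exposing fit_resample fits the `sampling` argument.
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

x_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=42)
print('Original Training Shape is %s' % Counter(y_demo))
x_res, y_res = SMOTE(random_state=42).fit_resample(x_demo, y_demo)
print('Balanced Training Shape is %s' % Counter(y_res))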