Example #1
    def _run_single_model(self, model_def, x, y):
        model = model_def.model()

        # search for the best hyperparameters
        param_dist = model_def.params_to_tune()

        # choose which search strategy to apply
        if SEARCH == 'randomized':
            search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=N_ITER_RANDOM_SEARCH,
                cv=StratifiedKFold(n_splits=N_CV_SEARCH, shuffle=True),
                iid=False,  # 'iid' was removed in scikit-learn 0.24; drop it on newer versions
                n_jobs=-1)
        elif SEARCH == 'grid':
            search = GridSearchCV(model,
                                  param_dist,
                                  cv=StratifiedKFold(n_splits=N_CV_SEARCH,
                                                     shuffle=True),
                                  iid=False,  # see note above
                                  n_jobs=-1)
        else:
            raise ValueError('Unknown search strategy: ' + repr(SEARCH))

        log("Search started at %s\n" % now())
        search.fit(x, y)
        log(format_best_parameters(search))
        best_estimator = search.best_estimator_  # unused below; cross_validate refits with the tuned parameters

        # cross-validation
        log("Cross validation started at %s\n" % now())

        scoring = {
            'tp': make_scorer(tp),
            'tn': make_scorer(tn),
            'fp': make_scorer(fp),
            'fn': make_scorer(fn),
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall'
        }

        scores = cross_validate(model_def.model(search.best_params_),
                                x,
                                y,
                                cv=StratifiedKFold(n_splits=N_CV,
                                                   shuffle=True),
                                n_jobs=-1,
                                scoring=scoring)

        # now, train a final model with all the data, so that we can use it
        # for the comparison among the datasets
        super_model = model_def.model(search.best_params_)
        super_model.fit(x, y)

        # return the cross-validation scores and the model trained on all data
        return (scores['test_precision'], scores['test_recall'],
                scores['test_accuracy'], scores['test_tn'], scores['test_fp'],
                scores['test_fn'], scores['test_tp'], super_model)
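The scoring dictionary above passes custom functions tp, tn, fp and fn through make_scorer; their definitions are not part of this example. A minimal sketch of how such scorers are commonly written, relying on the fact that confusion_matrix(...).ravel() yields (tn, fp, fn, tp) for binary labels (this sketch is an assumption, not the original source):

from sklearn.metrics import confusion_matrix

# each helper extracts one cell of the flattened binary confusion matrix,
# which ravel() returns in the order tn, fp, fn, tp
def tn(y_true, y_pred):
    return confusion_matrix(y_true, y_pred).ravel()[0]

def fp(y_true, y_pred):
    return confusion_matrix(y_true, y_pred).ravel()[1]

def fn(y_true, y_pred):
    return confusion_matrix(y_true, y_pred).ravel()[2]

def tp(y_true, y_pred):
    return confusion_matrix(y_true, y_pred).ravel()[3]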
Example #2
def _build_production_model(model_def, best_params, x, y):
    log("Production model build started at %s\n" % now())

    super_model = model_def.model(best_params)
    super_model.fit(x, y)

    return super_model
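model_def is an abstraction over a concrete learner and is not defined in these examples. A plausible minimal shape, assuming a scikit-learn estimator underneath (the class name and parameter grid here are purely illustrative):

from sklearn.ensemble import RandomForestClassifier

class RandomForestModelDef:
    # hypothetical implementation of the model_def interface used above

    def model(self, best_params=None):
        # build a fresh estimator, optionally preconfigured with tuned parameters
        return RandomForestClassifier(**(best_params or {}))

    def params_to_tune(self):
        # search space consumed by RandomizedSearchCV / GridSearchCV
        return {'n_estimators': [10, 50, 100], 'max_depth': [3, 6, None]}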
Example #3
        def clean_results():
            if self._autoclean_stopped:
                return

            if os.path.exists(parent_folder):
                for user_folder in os.listdir(parent_folder):
                    for timed_folder in os.listdir(
                            os.path.join(parent_folder, user_folder)):
                        # folder names must be pure millisecond timestamps;
                        # fullmatch (rather than match) prevents int() from
                        # failing on names that merely start with digits
                        if not re.fullmatch(r'\d+', timed_folder):
                            continue

                        millis = int(timed_folder)
                        folder_date = ms_to_datetime(millis)
                        now = date_utils.now()

                        if (now - folder_date) > datetime.timedelta(
                                milliseconds=lifetime_ms):
                            folder_path = os.path.join(parent_folder,
                                                       user_folder,
                                                       timed_folder)

                            LOGGER.info('Cleaning old folder: ' + folder_path)
                            shutil.rmtree(folder_path)

            timer = threading.Timer(period_sec, clean_results)
            timer.daemon = True  # setDaemon() is deprecated since Python 3.10
            timer.start()
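The function re-arms itself with a daemon threading.Timer, so the cleanup keeps repeating in the background without blocking interpreter shutdown. The same pattern in isolation (names here are illustrative):

import threading

def schedule_periodically(period_sec, task):
    # run the task, then arm a daemon timer that calls this wrapper again
    def wrapper():
        task()
        timer = threading.Timer(period_sec, wrapper)
        timer.daemon = True
        timer.start()
    wrapper()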
Example #4
    def __init__(self, models_to_run,
                 refactorings: Iterable[LowLevelRefactoring],
                 datasets: Iterable[str]):
        self._models_to_run = models_to_run
        self._refactorings: Iterable[LowLevelRefactoring] = refactorings
        self._datasets: Iterable[str] = datasets
        self._start_datetime = None
        self.start_pipeline_date_time: str = now()
        self._current_execution_number: int = 0
Example #5
    def _run_ordered(self, model_def, x_train, y_train, x_test, y_test):
        model = model_def.model()

        # search for the best hyperparameters
        param_dist = model_def.params_to_tune()

        # choose which search strategy to apply
        if SEARCH == 'randomized':
            search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=N_ITER_RANDOM_SEARCH,
                cv=StratifiedKFold(n_splits=N_CV_SEARCH, shuffle=True),
                iid=False,  # 'iid' was removed in scikit-learn 0.24; drop it on newer versions
                n_jobs=-1)
        elif SEARCH == 'grid':
            search = GridSearchCV(model,
                                  param_dist,
                                  cv=StratifiedKFold(n_splits=N_CV_SEARCH,
                                                     shuffle=True),
                                  iid=False,  # see note above
                                  n_jobs=-1)
        else:
            raise ValueError('Unknown search strategy: ' + repr(SEARCH))

        log("Search started at %s\n" % now())
        search.fit(x_train, y_train)
        log(format_best_parameters(search))

        log("Training again started at %s\n" % now())
        final_model = model_def.model(search.best_params_)
        final_model.fit(x_train, y_train)

        # what's the accuracy of the model?
        log("Test started at %s\n" % now())
        y_pred = final_model.predict(x_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()

        # return the scores and the final model
        return precision, recall, accuracy, tn, fp, fn, tp, final_model
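A sketch of how such an ordered run might be driven from within the surrounding class, assuming the data is split once into a training and a held-out test portion (the split below, with shuffle=False to preserve ordering, is an assumption):

from sklearn.model_selection import train_test_split

# hypothetical driver: hold out the last 20% of the data as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    shuffle=False)
precision, recall, accuracy, tn, fp, fn, tp, final_model = \
    self._run_ordered(model_def, x_train, y_train, x_test, y_test)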
Example #6
        def scheduler_loop():
            while not self.stopped:
                try:
                    self.scheduler.run(blocking=False)
                except Exception:
                    LOGGER.exception('Failed to execute scheduled job')

                now = date_utils.now()
                sleep_delta = timedelta(minutes=1) - timedelta(microseconds=now.microsecond, seconds=now.second)
                _sleep(sleep_delta.total_seconds())
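The sleep computation aligns each iteration to the next wall-clock minute: subtracting the seconds and microseconds that have already elapsed from one full minute leaves exactly the time remaining until :00. Standalone, with datetime.now() in place of date_utils.now():

from datetime import datetime, timedelta

now = datetime.now()  # e.g. 12:34:56.250000
sleep_delta = timedelta(minutes=1) - timedelta(seconds=now.second,
                                               microseconds=now.microsecond)
# at 12:34:56.25 this yields 3.75 seconds, i.e. a wake-up at 12:35:00.00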
Example #7
def _evaluate_model(search, x_train, x_tests, y_train, y_tests):
    log("Test search started at %s\n" % now(), False)
    search.fit(x_train, y_train)
    log(format_best_parameters(search), False)
    best_estimator = search.best_estimator_

    test_scores = {'accuracy': [], 'precision': [], 'recall': [], 'tn': [], 'fp': [], 'fn': [], 'tp': []}
    # Predict unseen results for all validation sets
    for index, x_test in enumerate(x_tests):
        y_pred = best_estimator.predict(x_test)
        y_test = y_tests[index]
        test_scores["accuracy"].append(accuracy_score(y_test, y_pred))
        test_scores["precision"].append(precision_score(y_test, y_pred))
        test_scores["recall"].append(recall_score(y_test, y_pred))
        # compute the confusion matrix once per test set instead of four times
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        test_scores["tn"].append(tn)
        test_scores["fp"].append(fp)
        test_scores["fn"].append(fn)
        test_scores["tp"].append(tp)

    return test_scores
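A sketch of how _evaluate_model might be invoked, with a configured search object and parallel lists of validation sets (the estimator, parameter grid and random data below are all illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
x_train, y_train = rng.rand(100, 4), rng.randint(0, 2, 100)
x_tests = [rng.rand(20, 4), rng.rand(20, 4)]
y_tests = [rng.randint(0, 2, 20), rng.randint(0, 2, 20)]

search = GridSearchCV(LogisticRegression(), {'C': [0.1, 1.0, 10.0]}, cv=5)
scores = _evaluate_model(search, x_train, x_tests, y_train, y_tests)
# each list in scores, e.g. scores['precision'], holds one value per test set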
Example #8
    def get_next_time(self):
        if not self.repeatable:
            return self.start_datetime

        # resolve the current time once, up front: the get_initial_multiplier
        # lambdas below close over this variable
        now = date_utils.now(tz=timezone.utc)

        if self.repeat_unit == 'minutes':
            next_time_func = lambda start, iteration_index: \
                start + timedelta(minutes=self.repeat_period * iteration_index)
            get_initial_multiplier = lambda start: \
                ((now - start).seconds // 60 + (now - start).days * 1440) \
                // self.repeat_period
        elif self.repeat_unit == 'hours':
            next_time_func = lambda start, iteration_index: start + timedelta(
                hours=self.repeat_period * iteration_index)

            get_initial_multiplier = lambda start: \
                ((now - start).seconds // 3600 + (now - start).days * 24) \
                // self.repeat_period
        elif self.repeat_unit == 'days':
            next_time_func = lambda start, iteration_index: start + timedelta(days=self.repeat_period * iteration_index)
            get_initial_multiplier = lambda start: (now - start).days // self.repeat_period
        elif self.repeat_unit == 'months':
            next_time_func = lambda start, iteration_index: date_utils.add_months(start,
                                                                                  self.repeat_period * iteration_index)
            get_initial_multiplier = lambda start: (now - start).days // 28 // self.repeat_period
        elif self.repeat_unit == 'weeks':
            start_weekday = self.start_datetime.weekday()
            offset = 0
            for weekday in self.weekdays:
                index = ALLOWED_WEEKDAYS.index(weekday)
                if index < start_weekday:
                    offset += 1

            def next_weekday(start: datetime, iteration_index):
                weeks_multiplier = (iteration_index + offset) // len(self.weekdays)
                next_weekday_index = (iteration_index + offset) % len(self.weekdays)
                next_weekday_name = self.weekdays[next_weekday_index]
                next_weekday = ALLOWED_WEEKDAYS.index(next_weekday_name)

                return start \
                       + timedelta(weeks=self.repeat_period * weeks_multiplier) \
                       + timedelta(days=(next_weekday - start.weekday()))

            next_time_func = next_weekday

            get_initial_multiplier = lambda start: (now - start).days // 7 // self.repeat_period * len(
                self.weekdays) - 1
        else:
            raise Exception('Unknown unit: ' + repr(self.repeat_unit))

        max_iterations = 10000
        initial_multiplier = max(0, get_initial_multiplier(self.start_datetime))
        i = 0
        while True:
            resolved_time = next_time_func(self.start_datetime, i + initial_multiplier)
            if resolved_time >= now:
                return resolved_time

            i += 1
            if i > max_iterations:
                raise Exception('Endless loop in calc next time')
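get_initial_multiplier keeps the loop from stepping through every period since start_datetime: it estimates how many whole periods have already elapsed, so only a few iterations remain. A worked example for the 'hours' branch (dates are illustrative):

from datetime import datetime, timedelta, timezone

start = datetime(2020, 1, 1, 0, 0, tzinfo=timezone.utc)
now = datetime(2020, 1, 2, 5, 0, tzinfo=timezone.utc)
period = 2  # repeat every two hours

elapsed = now - start  # 1 day, 5 hours
multiplier = (elapsed.seconds // 3600 + elapsed.days * 24) // period  # 29 // 2 = 14
candidate = start + timedelta(hours=period * multiplier)        # Jan 2, 04:00 (still in the past)
next_time = start + timedelta(hours=period * (multiplier + 1))  # Jan 2, 06:00 >= now, returned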
Example #9
    def _start_time(self):
        self._count_execution()
        self._start_hour = now()
        log("Started at %s" % self._start_hour)

    def _finish_time(self, dataset, model, refactoring):
        finish_hour = now()
        log("Finished at %s" % finish_hour)
        log("TIME,%s,%s,%s,%s,%s" % (dataset, refactoring.name(), model.name(),
                                     self._start_hour, finish_hour))