def _run_single_model(self, model_def, x, y):
    """Tune hyperparameters, cross-validate, and fit a final model on all data.

    Runs the configured hyperparameter search (SEARCH is either
    'randomized' or 'grid'), cross-validates a fresh model built with the
    best parameters found, and finally trains a "super model" on the full
    dataset for later cross-dataset comparison.

    :param model_def: project model definition exposing model() and
        params_to_tune() — assumed sklearn-compatible; TODO confirm.
    :param x: feature matrix.
    :param y: labels.
    :return: (precision, recall, accuracy, tn, fp, fn, tp, super_model)
        where the score entries are per-fold arrays from cross_validate.
    """
    model = model_def.model()

    # Perform the search for the best hyperparameters.
    param_dist = model_def.params_to_tune()
    search = None

    # Choose which search strategy to apply.
    if SEARCH == 'randomized':
        search = RandomizedSearchCV(
            model, param_dist, n_iter=N_ITER_RANDOM_SEARCH,
            cv=StratifiedKFold(n_splits=N_CV_SEARCH, shuffle=True),
            iid=False, n_jobs=-1)
    elif SEARCH == 'grid':
        search = GridSearchCV(
            model, param_dist,
            cv=StratifiedKFold(n_splits=N_CV_SEARCH, shuffle=True),
            iid=False, n_jobs=-1)

    log("Search started at %s\n" % now())
    search.fit(x, y)
    log(format_best_parameters(search))
    # NOTE: the original also stored search.best_estimator_ in an unused
    # local; dropped — only best_params_ is consumed below.

    # Cross-validation of a fresh model built with the tuned parameters.
    log("Cross validation started at %s\n" % now())
    scoring = {
        'tp': make_scorer(tp), 'tn': make_scorer(tn),
        'fp': make_scorer(fp), 'fn': make_scorer(fn),
        'accuracy': 'accuracy', 'precision': 'precision',
        'recall': 'recall'
    }
    scores = cross_validate(
        model_def.model(search.best_params_), x, y,
        cv=StratifiedKFold(n_splits=N_CV, shuffle=True),
        n_jobs=-1, scoring=scoring)

    # Now, train a final model with all the data, so that we can use it
    # for the comparison among the datasets.
    super_model = model_def.model(search.best_params_)
    super_model.fit(x, y)

    # Return the scores and the final model.
    return (scores['test_precision'], scores['test_recall'],
            scores['test_accuracy'], scores['test_tn'], scores['test_fp'],
            scores['test_fn'], scores['test_tp'], super_model)
def _build_production_model(model_def, best_params, x, y):
    """Train a production model on the entire dataset.

    :param model_def: project model definition exposing model(params).
    :param best_params: hyperparameters chosen by a prior search.
    :param x: feature matrix.
    :param y: labels.
    :return: the fitted model.
    """
    log("Production model build started at %s\n" % now())
    production_model = model_def.model(best_params)
    production_model.fit(x, y)
    return production_model
def clean_results():
    """Delete timestamped result folders older than lifetime_ms, then
    re-schedule itself to run again after period_sec seconds.

    NOTE(review): closure — relies on self, parent_folder, lifetime_ms,
    period_sec, LOGGER, ms_to_datetime and date_utils from the enclosing
    scope.
    """
    if self._autoclean_stopped:
        return

    if os.path.exists(parent_folder):
        for user_folder in os.listdir(parent_folder):
            for timed_folder in os.listdir(
                    os.path.join(parent_folder, user_folder)):
                # Folder names are expected to start with a millisecond
                # timestamp; skip anything else. (Raw string fixes the
                # invalid-escape DeprecationWarning of '\d+'.)
                if not re.match(r'\d+', timed_folder):
                    continue

                millis = int(timed_folder)
                folder_date = ms_to_datetime(millis)
                now = date_utils.now()
                if (now - folder_date) > datetime.timedelta(
                        milliseconds=lifetime_ms):
                    folder_path = os.path.join(parent_folder, user_folder,
                                               timed_folder)
                    LOGGER.info('Cleaning old folder: ' + folder_path)
                    shutil.rmtree(folder_path)

    # Re-schedule. daemon=True so the timer never blocks interpreter exit;
    # Timer.setDaemon() is deprecated since Python 3.10.
    timer = threading.Timer(period_sec, clean_results)
    timer.daemon = True
    timer.start()
def __init__(self, models_to_run, refactorings: Iterable[LowLevelRefactoring],
             datasets: Iterable[str]):
    """Set up the pipeline run configuration.

    :param models_to_run: models to train/evaluate.
    :param refactorings: refactoring types to process.
    :param datasets: dataset identifiers to iterate over.
    """
    # What to run.
    self._models_to_run = models_to_run
    self._refactorings: Iterable[LowLevelRefactoring] = refactorings
    self._datasets: Iterable[str] = datasets

    # Bookkeeping for timing and progress reporting.
    self._start_datetime = None
    self._current_execution_number: int = 0
    self.start_pipeline_date_time: str = now()
def _run_ordered(self, model_def, x_train, y_train, x_test, y_test):
    """Tune on the training split, refit, and score on the held-out split.

    Runs the configured hyperparameter search (SEARCH: 'randomized' or
    'grid') on the training data, trains a fresh model with the best
    parameters, and evaluates it on the test split.

    :return: (precision, recall, accuracy, tn, fp, fn, tp, final_model).
    """
    estimator = model_def.model()
    tuning_space = model_def.params_to_tune()
    search = None

    # Pick the search strategy; both use the same stratified CV splitter.
    cv_splitter = StratifiedKFold(n_splits=N_CV_SEARCH, shuffle=True)
    if SEARCH == 'randomized':
        search = RandomizedSearchCV(
            estimator, tuning_space, n_iter=N_ITER_RANDOM_SEARCH,
            cv=cv_splitter, iid=False, n_jobs=-1)
    elif SEARCH == 'grid':
        search = GridSearchCV(estimator, tuning_space, cv=cv_splitter,
                              iid=False, n_jobs=-1)

    log("Search started at %s\n" % now())
    search.fit(x_train, y_train)
    log(format_best_parameters(search))

    # Refit a fresh model with the winning parameters on the full
    # training split.
    log("Training again started at %s\n" % now())
    final_model = model_def.model(search.best_params_)
    final_model.fit(x_train, y_train)

    # What's the accuracy of the model on unseen data?
    log("Test started at %s\n" % now())
    y_pred = final_model.predict(x_test)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()

    # Return the scores and the final model.
    return (metrics.precision_score(y_test, y_pred),
            metrics.recall_score(y_test, y_pred),
            metrics.accuracy_score(y_test, y_pred),
            tn, fp, fn, tp, final_model)
def scheduler_loop():
    """Run pending scheduler jobs, then sleep until the next whole minute,
    repeating until self.stopped is set.

    NOTE(review): closure — relies on self, LOGGER, date_utils, _sleep and
    timedelta from the enclosing scope.
    """
    while not self.stopped:
        try:
            self.scheduler.run(blocking=False)
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; Exception keeps the best-effort logging
            # while letting shutdown signals propagate.
            LOGGER.exception('Failed to execute scheduled job')

        # Sleep exactly until the start of the next wall-clock minute.
        now = date_utils.now()
        sleep_delta = timedelta(minutes=1) - timedelta(
            microseconds=now.microsecond, seconds=now.second)
        _sleep(sleep_delta.total_seconds())
def _evaluate_model(search, x_train, x_tests, y_train, y_tests):
    """Fit the search on the training data and score the best estimator on
    each validation set.

    :param search: an sklearn search object (e.g. GridSearchCV).
    :param x_train: training features.
    :param x_tests: list of validation feature matrices.
    :param y_train: training labels.
    :param y_tests: list of validation label vectors, parallel to x_tests.
    :return: dict of per-validation-set score lists with keys
        'accuracy', 'precision', 'recall', 'tn', 'fp', 'fn', 'tp'.
    """
    log("Test search started at %s\n" % now(), False)
    search.fit(x_train, y_train)
    log(format_best_parameters(search), False)
    best_estimator = search.best_estimator_

    test_scores = {'accuracy': [], 'precision': [], 'recall': [],
                   'tn': [], 'fp': [], 'fn': [], 'tp': []}

    # Predict unseen results for all validation sets.
    for index, x_test in enumerate(x_tests):
        y_test = y_tests[index]
        y_pred = best_estimator.predict(x_test)
        test_scores["accuracy"] += [accuracy_score(y_test, y_pred)]
        test_scores["precision"] += [precision_score(y_test, y_pred)]
        test_scores["recall"] += [recall_score(y_test, y_pred)]
        # Compute the confusion matrix once per split — the original
        # recomputed it four times for the four cells.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        test_scores["tn"] += [tn]
        test_scores["fp"] += [fp]
        test_scores["fn"] += [fn]
        test_scores["tp"] += [tp]

    return test_scores
def get_next_time(self):
    """Compute the next scheduled execution time for this job.

    Non-repeatable jobs simply return their start time. Repeatable jobs
    pick a unit-specific step function plus an initial-multiplier
    estimator, jump close to "now" using the estimator, then walk forward
    until a time >= now is found.

    NOTE(review): the lambdas below reference `now`, which is only
    assigned after all the branches — this works because they are called
    after the assignment (late binding), but it is fragile to reordering.
    """
    if not self.repeatable:
        return self.start_datetime

    if self.repeat_unit == 'minutes':
        next_time_func = lambda start, iteration_index: \
            start + timedelta(minutes=self.repeat_period * iteration_index)
        # How many whole periods fit between start and now.
        get_initial_multiplier = lambda start: \
            ((now - start).seconds // 60 + (now - start).days * 1440) \
            // self.repeat_period
    elif self.repeat_unit == 'hours':
        next_time_func = lambda start, iteration_index: start + timedelta(
            hours=self.repeat_period * iteration_index)
        get_initial_multiplier = lambda start: \
            ((now - start).seconds // 3600 + (now - start).days * 24) \
            // self.repeat_period
    elif self.repeat_unit == 'days':
        next_time_func = lambda start, iteration_index: start + timedelta(
            days=self.repeat_period * iteration_index)
        get_initial_multiplier = lambda start: \
            (now - start).days // self.repeat_period
    elif self.repeat_unit == 'months':
        next_time_func = lambda start, iteration_index: \
            date_utils.add_months(start, self.repeat_period * iteration_index)
        # 28 is a conservative month length, so the estimate undershoots
        # and the forward walk below corrects it.
        get_initial_multiplier = lambda start: \
            (now - start).days // 28 // self.repeat_period
    elif self.repeat_unit == 'weeks':
        start_weekday = self.start_datetime.weekday()
        # Count configured weekdays that fall before the start's weekday,
        # so indexing into self.weekdays lines up with the start date.
        offset = 0
        for weekday in self.weekdays:
            index = ALLOWED_WEEKDAYS.index(weekday)
            if index < start_weekday:
                offset += 1

        def next_weekday(start: datetime, iteration_index):
            # Map a flat iteration index onto (week number, weekday)
            # within the repeating weekday pattern.
            weeks_multiplier = (iteration_index + offset) // len(self.weekdays)
            next_weekday_index = (iteration_index + offset) % len(self.weekdays)
            next_weekday_name = self.weekdays[next_weekday_index]
            next_weekday = ALLOWED_WEEKDAYS.index(next_weekday_name)
            return start \
                + timedelta(weeks=self.repeat_period * weeks_multiplier) \
                + timedelta(days=(next_weekday - start.weekday()))

        next_time_func = next_weekday
        # -1 backs the estimate up one step to avoid overshooting.
        get_initial_multiplier = lambda start: \
            (now - start).days // 7 // self.repeat_period * len(
                self.weekdays) - 1
    else:
        raise Exception('Unknown unit: ' + repr(self.repeat_unit))

    now = date_utils.now(tz=timezone.utc)
    max_iterations = 10000
    # Jump close to the present, then walk forward step by step.
    initial_multiplier = max(0,
                             get_initial_multiplier(self.start_datetime))
    i = 0
    while True:
        resolved_time = next_time_func(self.start_datetime,
                                       i + initial_multiplier)
        if resolved_time >= now:
            return resolved_time
        i += 1
        # Safety valve in case the estimator or step function misbehaves.
        if i > max_iterations:
            raise Exception('Endless loop in calc next time')
def _start_time(self):
    """Record and log the wall-clock start of the current execution."""
    self._count_execution()
    started_at = now()
    self._start_hour = started_at
    log("Started at %s" % started_at)
def _finish_time(self, dataset, model, refactoring):
    """Log the finish time and a CSV-style timing row for this run."""
    finish_hour = now()
    log("Finished at %s" % finish_hour)
    timing_row = "TIME,%s,%s,%s,%s,%s" % (
        dataset, refactoring.name(), model.name(),
        self._start_hour, finish_hour)
    log(timing_row)