def fit(self, seasons=['2019-20'], season_train='2018-19'):
    if self.fitted:
        raise AlreadyFittedError()
    self.fitted = True

    fd_noregobs = ForecastDataset(regobs_types=[], seasons=seasons)
    fd_regobs = ForecastDataset(regobs_types=regobs_types, seasons=seasons)
    fd_noregobs_test = ForecastDataset(regobs_types=[], seasons=[season_train])
    fd_regobs_test = ForecastDataset(regobs_types=regobs_types, seasons=[season_train])

    for days, varsom, regobs, temp in setup:
        if varsom and not self.with_varsom:
            continue
        if temp and self.stretch_temp is not None and not self.stretch_temp:
            continue
        if not temp and self.stretch_temp:
            continue

        if regobs:
            labeled_data = fd_regobs.label(days=days, with_varsom=varsom)
            test_data = fd_regobs_test.label(days=days, with_varsom=varsom)
        else:
            labeled_data = fd_noregobs.label(days=days, with_varsom=varsom)
            test_data = fd_noregobs_test.label(days=days, with_varsom=varsom)

        labeled_data.data = labeled_data.data.loc[:, [
            not re.search(r"cause", col)
            for col in labeled_data.data.columns.get_level_values(0)
        ]]
        test_data.data = test_data.data.loc[:, [
            not re.search(r"cause", col)
            for col in test_data.data.columns.get_level_values(0)
        ]]

        if temp:
            labeled_data = labeled_data.stretch_temperatures()
            test_data = test_data.stretch_temperatures()
        labeled_data = labeled_data.drop_regions()
        test_data = test_data.drop_regions()
        if days > 2:
            labeled_data = labeled_data.to_time_parameters(orig_days=1)
            test_data = test_data.to_time_parameters(orig_days=1)
        labeled_data = labeled_data.normalize()
        test_data = test_data.normalize(by=labeled_data)

        for m_tag, create_machine in [("SKClustering", createClustering),
                                      ("SKClassifier", createClassifier)]:
            tag = f"{m_tag}_{days}_noregions_{'' if varsom else 'no'}varsom_{'-'.join(regobs)}{'_temp' if temp else ''}"
            print(f"Training {tag}, size {labeled_data.data.shape}")
            machine = create_machine()
            machine.fit(labeled_data)

            print("Saving machine")
            self.machines[tag] = machine

            print("Testing machine")
            predicted_data = machine.predict(test_data)
            results_machine = predicted_data.f1()
            real = results_machine.index.get_level_values(0) == "REAL"
            # We need to "reverse" the rmse to be sorted together with f1.
            results_machine.loc[real, "rmse"] = results_machine\
                .loc[real, "rmse"]\
                .rdiv(1, fill_value=0)\
                .replace(np.inf, 0)
            f1_machine = results_machine[["f1", "rmse"]]\
                .apply(lambda x: pd.Series(x.dropna().to_numpy()), axis=1)\
                .squeeze()\
                .rename(tag)
            self.f1 = f1_machine if self.f1 is None else pd.concat(
                [self.f1, f1_machine], axis=1).fillna(0)
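# Aside, not part of the class above: a minimal, self-contained pandas sketch
# of the RMSE "reversal" used in fit(). Inverting the RMSE turns a
# lower-is-better metric into a higher-is-better one so it can be ranked
# alongside F1; a zero (or missing) RMSE would invert to infinity and is
# therefore zeroed out. The series values below are made up.
import numpy as np
import pandas as pd

rmse = pd.Series([0.5, 2.0, 0.0, np.nan], index=["a", "b", "c", "d"])
inverted = rmse.rdiv(1, fill_value=0).replace(np.inf, 0)
print(inverted)  # a: 2.0, b: 0.5, c: 0.0, d: 0.0 -- higher now means better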
def predict(self, seasons=["2020-21"], csv_tag=None):
    if csv_tag is None:
        fd_noregobs = ForecastDataset(regobs_types=[], seasons=seasons)
        fd_regobs = ForecastDataset(regobs_types=regobs_types, seasons=seasons)
    all_data = {}

    machine_scores = self.f1
    ms_idx = machine_scores.index.to_frame().fillna("")
    machine_scores.index = pd.MultiIndex.from_frame(ms_idx)
    empty_indices = machine_scores.index[np.logical_and(
        np.logical_or(
            machine_scores.index.get_level_values(3) == "0",
            machine_scores.index.get_level_values(3) == ""),
        machine_scores.index.get_level_values(0) != "REAL")]
    machine_scores = machine_scores.drop(empty_indices)
    groupby = machine_scores.groupby(level=[0, 1, 2])
    grouped_scores = groupby.mean() + groupby.min()

    for days, varsom, regobs, temp in setup:
        if varsom and not self.with_varsom:
            continue
        if temp and self.stretch_temp is not None and not self.stretch_temp:
            continue
        if not temp and self.stretch_temp:
            continue

        d_tag = f"{days}_noregions_{'' if varsom else 'no'}varsom_{'-'.join(regobs)}{'_temp' if temp else ''}"
        print(d_tag)
        try:
            print("Collecting data")
            if csv_tag is None:
                fd = fd_regobs if regobs else fd_noregobs
                data = fd.label(days=days, with_varsom=varsom)
            else:
                data = LabeledData.from_csv(days, regobs, False, varsom, csv_tag)
            data = data.normalize()
            data = data.drop_regions()
            if temp:
                data = data.stretch_temperatures()
            collected = True
        except expected_errors:
            print("Failed to collect data")
            collected = False

        for m_tag, machine_class in [("SKClustering", SKClusteringMachine),
                                     ("SKClassifier", SKClassifierMachine)]:
            tag = f"{m_tag}_{d_tag}"
            if collected:
                machine = self.machines[tag]
                all_data[tag] = data
            else:
                grouped_scores.drop(columns=tag, inplace=True)

    predictions = {}
    best_models = pd.DataFrame(
        grouped_scores.columns.values[np.argsort(-grouped_scores)],
        index=grouped_scores.index)
    ld = None
    for tag in np.unique(best_models.values.flatten()):
        print(tag)
        labeled_data = self.machines[tag].predict(all_data[tag], force_subprobs=True)
        labeled_data.pred = labeled_data.pred.astype(str)
        predictions[tag] = labeled_data.pred
        if ld is None:
            ld = labeled_data
            ld.data = None
        elif ld.label is not None and labeled_data.label is not None:
            combined = ld.label.combine_first(labeled_data.label)
            ld.label = ld.label.reindex(
                ld.label.index.union(labeled_data.label.index))
            ld.label.loc[combined.index] = combined
        elif ld.label is None and labeled_data.label is not None:
            ld.label = labeled_data.label

    pred = None
    for label in best_models.index:
        for _, tag in best_models.loc[label].items():
            pred_tag = predictions[tag][label].replace("0", np.nan)
            if label[1] != "":
                pred_tag = pred_tag.replace("", np.nan)
            if pred is not None and label in pred.columns:
                combined = pred[label].combine_first(pred_tag)
                pred = pred.reindex(combined.index)
                pred[label] = combined
            elif pred is not None:
                pred = pred.reindex(pred.index.union(pred_tag.index))
                pred.loc[predictions[tag].index, label] = pred_tag
            else:
                pred = pred_tag.to_frame()

    # Remove values that shouldn't exist.
    ld.pred = pred
    return ld.valid_pred()
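# Aside, not part of the class above: a small sketch (with made-up tags and
# scores) of how predict() ranks model tags per label. np.argsort over the
# negated scores orders each row's columns from best to worst, and indexing
# the column names with that ordering yields the per-label ranking stored in
# best_models.
import numpy as np
import pandas as pd

scores = pd.DataFrame({"tag_a": [0.7, 0.2], "tag_b": [0.5, 0.9]},
                      index=["danger_level", "problem_1"])
ranked = pd.DataFrame(scores.columns.values[np.argsort(-scores)],
                      index=scores.index)
print(ranked)
# danger_level: tag_a, tag_b  (0.7 > 0.5)
# problem_1:    tag_b, tag_a  (0.9 > 0.2)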
varsom = False

try:
    bm = SKClusteringMachine.load(model_prefix)
except FileNotFoundError:
    try:
        print("Reading csv")
        labeled_data = LabeledData.from_csv(
            seasons=["2017-18", "2018-19", "2019-20"],
            days=days,
            regobs_types=regobs_types,
            with_varsom=varsom,
        )
    except CsvMissingError:
        print("Csv missing. Fetching online data. (This takes a long time.)")
        labeled_data = ForecastDataset(regobs_types=regobs_types).label(
            days=days, with_varsom=varsom)
        labeled_data.to_csv()
    labeled_data = labeled_data.normalize()

    f1 = None
    importances = None
    strat = ("CLASS", "", "danger_level")

    print("Training model")
    dt = DecisionTreeClassifier(max_depth=7, class_weight={
        "1": 1,
        "2": 1,
        "3": 1,
        "4": 1
    classifier_creator,
    classifier_creator,
    regressor_creator,
    sk_prim_class_weight={},
    sk_class_weight={},
)

f1 = None
for days, varsom, regobs in setup:
    try:
        labeled_data = LabeledData.from_csv(days=days,
                                            regobs_types=regobs,
                                            with_varsom=varsom)
    except CsvMissingError:
        labeled_data = ForecastDataset(regobs_types=regobs).label(
            days=days, with_varsom=varsom)
        labeled_data.to_csv()
    labeled_data = labeled_data.normalize()
    labeled_data = labeled_data.drop_regions()

    for m_tag, create_machine in [("SKClustering", createClustering),
                                  ("SKClassifier", createClassifier)]:
        tag = f"{m_tag}_{days}_noregions_{'' if varsom else 'no'}varsom_{'-'.join(regobs)}"
        print(f"Training {tag}, size {labeled_data.data.shape}")
        machine = create_machine()
        machine.fit(labeled_data)
        print("Saving machine")
        machine.dump(tag)
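# Aside: the tags built above are used as the argument to machine.dump() here
# and as keys in self.machines in the fit() method earlier. A tiny
# illustration (with invented setup entries) of what they look like:
for days, varsom, regobs in [(3, True, []), (7, False, ["Faretegn", "Tester"])]:
    for m_tag in ["SKClustering", "SKClassifier"]:
        print(f"{m_tag}_{days}_noregions_{'' if varsom else 'no'}varsom_{'-'.join(regobs)}")
# SKClustering_3_noregions_varsom_
# SKClassifier_3_noregions_varsom_
# SKClustering_7_noregions_novarsom_Faretegn-Tester
# SKClassifier_7_noregions_novarsom_Faretegn-Tester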
days = 7
regobs_types = [
    "Faretegn", "Tester", "Skredaktivitet", "Snødekke", "Skredproblem",
    "Skredfarevurdering"
]

try:
    print("Reading training csv")
    training_data = LabeledData.from_csv(seasons=train_seasons,
                                         days=days,
                                         regobs_types=regobs_types,
                                         with_varsom=True)
except CsvMissingError:
    print("Csv missing. Fetching online data. (This takes a long time.)")
    training_data = ForecastDataset(seasons=train_seasons,
                                    regobs_types=regobs_types).label(
                                        days=days, with_varsom=True)
    training_data.to_csv()

try:
    print("Reading testing csv")
    testing_data = LabeledData.from_csv(seasons=test_seasons,
                                        days=days,
                                        regobs_types=regobs_types,
                                        with_varsom=True)
except CsvMissingError:
    print("Csv missing. Fetching online data. (This takes a long time.)")
    testing_data = ForecastDataset(seasons=test_seasons,
                                   regobs_types=regobs_types).label(
                                       days=days, with_varsom=True)
    testing_data.to_csv()
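# Aside, a hypothetical refactoring (not part of the project): the
# try-read-csv / fall-back-to-fetching-and-caching pattern above is repeated
# for the training and testing sets. It could be factored into a helper using
# only the calls already shown in these scripts (LabeledData.from_csv,
# ForecastDataset(...).label, .to_csv); the name load_or_fetch is invented.
def load_or_fetch(seasons, days, regobs_types, with_varsom=True):
    try:
        return LabeledData.from_csv(seasons=seasons,
                                    days=days,
                                    regobs_types=regobs_types,
                                    with_varsom=with_varsom)
    except CsvMissingError:
        data = ForecastDataset(seasons=seasons,
                               regobs_types=regobs_types).label(
                                   days=days, with_varsom=with_varsom)
        data.to_csv()  # cache the fetched data for the next run
        return data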
# Fetch data
days = 0
regobs_types = []
varsom = False
seasons = ["2017-18", "2018-19", "2019-20"]
try:
    print("Reading csv")
    labeled_data = LabeledData.from_csv(seasons=seasons,
                                        days=days,
                                        regobs_types=regobs_types,
                                        with_varsom=varsom)
except CsvMissingError:
    print("Csv missing. Fetching online data. (This takes a long time.)")
    labeled_data = ForecastDataset(
        seasons=seasons,
        regobs_types=regobs_types).label(days=days, with_varsom=varsom)
    labeled_data.to_csv()

print("Calculating scores")
# Score needs something at .pred, otherwise it won't run.
labeled_data.pred = labeled_data.label
score = Score(labeled_data)
vectors = score.label_vectors
concatenated_score = pd.concat(
    [vectors["problem_1"], vectors["problem_2"], vectors["problem_3"]])
non_empty = concatenated_score["freq"] != 0
concatenated_problems = concatenated_score.loc[non_empty]


def calc_problem(problems):