def read_trajectories(part, Model, mode='all', N=0):
    """Read generated 2D trajectories from data/part{part}/model{Model}/generated/task1.txt.

    Each line of task1.txt is ';'-separated: a leading label field followed by
    T x-coordinates and then T y-coordinates. Returns a list of [x, y] pairs
    of float lists.

    mode: 'all' (default) reads every line, 'start' keeps only the first N
    lines, 'end' skips the first N lines.
    """
    print('Ładowanie trajektorii')
    # NOTE(review): `path` is only assigned for part >= 1; calling this with
    # part < 1 would raise NameError below — presumably never happens upstream.
    if part >= 1:
        path = f'data/part{part}/model{Model}/generated/'
    trajectories = []
    log(f'GEN - odczyt trajektorii - start [{path+"task1.txt"}]')
    start = datetime.now()
    file_path = path + 'task1.txt'
    with open(file_path) as f:
        trajs = f.readlines()
    if mode == 'start':
        trajs = trajs[:N]
    if mode == 'end':
        trajs = trajs[N:]
    l_t = len(trajs)
    for n, line in enumerate(trajs, start=1):
        fields = line.strip().split(';')[1:]
        T = len(fields) // 2
        sx, sy = fields[:T], fields[T:]
        x = [float(v) for v in sx]
        # sy[:T] keeps the original truncation behaviour for odd-length rows
        # (the old loop iterated len(sx) times, dropping a trailing y value).
        y = [float(v) for v in sy[:T]]
        trajectories.append([x, y])
        if n % 500 == 0:
            print(f'czytanie trajektorii - {n}/{l_t}')
    stop = datetime.now()
    log(f'GEN - odczyt trajektorii - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')
    return trajectories
def linear_regression(features, part, Model):
    """Train, persist and evaluate a multivariate linear-regression model.

    Writes the fitted model to data/part{part}/model{Model}/ML/linear_regression/model.pk1
    and a CSV of true vs. estimated exponents to .../estimated.csv.
    """
    # Training-set size: fixed global budget for parts 0-6, 10**(part-6) beyond.
    if part in [0, 1, 2, 3, 4, 5, 6]:
        train_data, train_labels, test_data, test_label = split_data(
            features, number_to_learn)
    else:
        train_data, train_labels, test_data, test_label = split_data(
            features, floor(10**(part - 6)))

    print('Wyznaczanie modelu wilowymiarowej regresji liniowej...')
    log('ML - regresja liniowa - nauczanie - start')
    start = datetime.now()
    # NOTE(review): the `normalize` keyword was removed from LinearRegression
    # in scikit-learn 1.2 — this code requires an older pinned sklearn, or a
    # StandardScaler pipeline if the dependency is ever upgraded.
    model = LinearRegression(normalize=True, n_jobs=-1)
    model = model.fit(train_data, train_labels)
    stop = datetime.now()
    log(f'ML - regresja liniowa - nauczanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    path = f'data/part{part}/model{Model}/ML/linear_regression/model.pk1'
    # FIX: create the output directory first, as the sibling model functions
    # (decision_tree, gradient_boosting, random_forest) do — without it
    # save_model/to_csv fail on a fresh run.
    dirmake(f'data/part{part}/model{Model}/ML/linear_regression')
    print('Zapisywanie modelu do pliku {}'.format(path))
    save_model(model, path)
    print(' --- ZAKOŃCZONO')

    print('Testowanie modelu wielowymiarowej regresji liniowej...')
    log('ML - regresja liniowa - przewidywanie - start')
    start = datetime.now()
    predicted_labels = model.predict(test_data)
    stop = datetime.now()
    log(f'ML - regresja liniowa - przewidywanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    print('Translacja przewidywań...')
    results = pd.DataFrame({'expo': test_label, 'expo_est': predicted_labels})
    print(' --- ZAKOŃCZONO')
    print('Translacja wyników do pliku...')
    results.to_csv(
        f'data/part{part}/model{Model}/ML/linear_regression/estimated.csv')
    print(' --- ZAKOŃCZONO')
def decision_tree(features, part, Model):
    """Tune (randomized search), train, persist and evaluate a decision-tree
    regressor for exponent estimation.

    Side effects: writes model_params.csv, model.pk1, tree.pdf, full_tree.pdf
    and estimated.csv under data/part{part}/model{Model}/ML/decision_tree/.
    """
    # Training-set size: fixed global budget for parts 0-6, 10**(part-6) beyond.
    if part in [0, 1, 2, 3, 4, 5, 6]:
        train_data, train_labels, test_data, test_label = split_data(
            features, number_to_learn)
    else:
        train_data, train_labels, test_data, test_label = split_data(
            features, floor(10**(part - 6)))
    # Hyper-parameter search runs on a 10% subset to keep it cheap.
    hiperparam_data = train_data[:floor(number_to_learn / 10)]
    hiperparam_labels = train_labels[:floor(number_to_learn / 10)]

    print('Wyznaczanie drzewa decyzyjnego...')
    max_depth = list(range(2, 20, 1))
    # FIX: min_samples_split must be >= 2 — scikit-learn raises ValueError for
    # the integer 1; range(2, 11) also matches the sibling model functions.
    min_samples_split = list(range(2, 11))
    min_samples_leaf = list(range(1, 6))
    # NOTE(review): 'auto' was removed for tree estimators in scikit-learn 1.3
    # — confirm the pinned sklearn version still accepts it.
    max_features = ['auto', 'sqrt']
    random_grid = {
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }

    log('ML - drzewo decyzyjne - szukanie superparametrów - start')
    start = datetime.now()
    model = DecisionTreeRegressor()
    model = RandomizedSearchCV(estimator=model,
                               param_distributions=random_grid,
                               n_iter=30,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=3,
                               return_train_score=True,
                               refit=True)
    model = model.fit(hiperparam_data, hiperparam_labels)
    stop = datetime.now()
    log(f'ML - drzewo decyzyjne - szukanie superparametrów - koniec {stop - start}'
        )

    model_params = pd.DataFrame(model.best_params_, index=['decision tree'])
    dirmake(f'data/part{part}/model{Model}/ML/decision_tree')
    model_params.to_csv(
        f'data/part{part}/model{Model}/ML/decision_tree/model_params.csv')

    # Retrain on the full training set with the best parameters found above.
    model = DecisionTreeRegressor(**model.best_params_)
    log('ML - drzewo decyzyjne - nauczanie - start')
    start = datetime.now()
    model.fit(train_data, train_labels)
    stop = datetime.now()
    log(f'ML - drzewo decyzyjne - nauczanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    path = f'data/part{part}/model{Model}/ML/decision_tree/model.pk1'
    print('Zapisywanie modelu do pliku {}'.format(path))
    save_model(model, path)

    # Shallow preview (top 3 levels) of the fitted tree.
    plt.cla()
    plt.figure(figsize=(10, 6.5))
    plot_tree(model,
              max_depth=3,
              feature_names=list(test_data),
              fontsize=10,
              filled=True)
    path = f'data/part{part}/model{Model}/ML/decision_tree/tree.pdf'
    plt.savefig(path, transparent=True, bbox_inches='tight')
    # FIX: close the figure after saving — plt.cla() alone leaves every figure
    # object alive, leaking memory across repeated calls.
    plt.close()
    # Full (unpruned) rendering of the same tree.
    plt.cla()
    plt.figure(figsize=(15, 15))
    plot_tree(model, feature_names=list(test_data), filled=True)
    path = f'data/part{part}/model{Model}/ML/decision_tree/full_tree.pdf'
    plt.savefig(path, transparent=True, bbox_inches='tight')
    plt.close()
    print(' --- ZAKOŃCZONO')

    print('Testowanie modelu drzewa decyzyjnego...')
    log('ML - drzewo decyzyjne - przewidywanie - start')
    start = datetime.now()
    predicted_labels = model.predict(test_data)
    stop = datetime.now()
    log(f'ML - drzewo decyzyjne - przewidywanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    print('Translacja przewidywań...')
    results = pd.DataFrame({'expo': test_label, 'expo_est': predicted_labels})
    print(' --- ZAKOŃCZONO')
    print('Translacja wyników do pliku...')
    results.to_csv(
        f'data/part{part}/model{Model}/ML/decision_tree/estimated.csv')
    print(' --- ZAKOŃCZONO')
def gradient_boosting(features, part, Model):
    """Tune (randomized search), train, persist and evaluate a
    GradientBoostingRegressor for exponent estimation.

    Side effects: writes model_params.csv, model.pk1 and estimated.csv under
    data/part{part}/model{Model}/ML/gradient_boosting/.
    """
    # Training-set size: fixed global budget for parts 0-6, 10**(part-6) beyond.
    if part in [0, 1, 2, 3, 4, 5, 6]:
        train_data, train_labels, test_data, test_label = split_data(
            features, number_to_learn)
    else:
        train_data, train_labels, test_data, test_label = split_data(
            features, floor(10**(part - 6)))
    # Hyper-parameter search runs on a 10% subset to keep it cheap.
    hiperparam_data = train_data[:floor(number_to_learn / 10)]
    hiperparam_labels = train_labels[:floor(number_to_learn / 10)]

    print('Wyznaczanie modelu gradient boosting...')
    # Search space adapted from:
    # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
    learning_rate = [0.001 * i for i in range(1, 21)]
    n_estimators = list(range(100, 1001, 100))
    max_depth = list(range(2, 20, 1))
    min_samples_split = list(range(2, 11))
    min_samples_leaf = list(range(1, 6))
    max_features = ['auto', 'sqrt']
    random_grid = {
        'learning_rate': learning_rate,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }

    model = GradientBoostingRegressor()
    log('ML - wzmocnienie gradientowe - szukanie superparametrów - start')
    start = datetime.now()
    model = RandomizedSearchCV(estimator=model,
                               param_distributions=random_grid,
                               n_iter=30,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=3,
                               return_train_score=True,
                               refit=True)
    model.fit(hiperparam_data, hiperparam_labels)
    stop = datetime.now()
    # NOTE(review): 'wzmocenienie' below is a typo for 'wzmocnienie'; left
    # untouched because existing log-processing may match the current text.
    log(f'ML - wzmocenienie gradientowe - szukanie superparametrów - koniec {stop - start}'
        )

    model_params = pd.DataFrame(model.best_params_,
                                index=['gradient boosting'])
    dirmake(f'data/part{part}/model{Model}/ML/gradient_boosting')
    model_params.to_csv(
        f'data/part{part}/model{Model}/ML/gradient_boosting/model_params.csv')

    # Retrain on the full training set with the best parameters found above.
    # (Removed: dead commented-out code that re-loaded the params from CSV.)
    model = GradientBoostingRegressor(**model.best_params_)
    log('ML - wzmocnienie gradientowe - nauczanie - start')
    start = datetime.now()
    model.fit(train_data, train_labels)
    stop = datetime.now()
    log(f'ML - wzmocenienie gradientowe - nauczanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    path = f'data/part{part}/model{Model}/ML/gradient_boosting/model.pk1'
    print('Zapisywanie modelu do pliku {} oraz jego parametrów'.format(path))
    save_model(model, path)
    print(' --- ZAKOŃCZONO')

    print('Testowanie modelu gradient boosting...')
    log('ML - wzmocnienie gradientowe - przewidywanie - start')
    start = datetime.now()
    predicted_labels = model.predict(test_data)
    stop = datetime.now()
    log(f'ML - wzmocenienie gradientowe - przewidywanie - koniec {stop - start}'
        )
    print(' --- ZAKOŃCZONO')

    print('Translacja przewidywań...')
    results = pd.DataFrame({'expo': test_label, 'expo_est': predicted_labels})
    print(' --- ZAKOŃCZONO')
    print('Zapisywanie wyników do pliku...')
    results.to_csv(
        f'data/part{part}/model{Model}/ML/gradient_boosting/estimated.csv')
    print(' --- ZAKOŃCZONO')
def random_forest(features, part, Model):
    """Tune (randomized search), train, persist and evaluate a random-forest
    regressor for exponent estimation.

    Side effects: writes model_params.csv, model.pk1 and estimated.csv under
    data/part{part}/model{Model}/ML/random_forest/.
    """
    # Training-set size: fixed global budget for parts 0-6, 10**(part-6) beyond.
    if part in [0, 1, 2, 3, 4, 5, 6]:
        learn_count = number_to_learn
    else:
        learn_count = floor(10**(part - 6))
    train_data, train_labels, test_data, test_label = split_data(
        features, learn_count)
    # The hyper-parameter search runs on a 10% subset to keep it cheap.
    tune_size = floor(number_to_learn / 10)
    tune_data = train_data[:tune_size]
    tune_labels = train_labels[:tune_size]

    print('Wyznaczanie modelu random forest...')
    # Search space adapted from:
    # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
    search_space = {
        'n_estimators': list(range(100, 1001, 100)),
        'max_depth': list(range(2, 20, 1)),
        'min_samples_split': list(range(2, 11)),
        'min_samples_leaf': list(range(1, 6)),
        'max_features': ['auto', 'sqrt'],
        'max_samples': [0.1 * i for i in range(1, 10)]
    }

    base_forest = RandomForestRegressor(bootstrap=True, max_samples=0.5)
    log(f'ML - las losowy - szukanie superparametrów - start')
    start = datetime.now()
    search = RandomizedSearchCV(estimator=base_forest,
                                param_distributions=search_space,
                                n_iter=30,
                                cv=3,
                                verbose=2,
                                random_state=42,
                                n_jobs=3,
                                return_train_score=True,
                                refit=True)
    search.fit(tune_data, tune_labels)
    stop = datetime.now()
    log(f'ML - las losowy - szukanie superparametrów - koniec {stop - start}')

    # Persist the winning hyper-parameters.
    out_dir = f'data/part{part}/model{Model}/ML/random_forest'
    best_params = search.best_params_
    dirmake(out_dir)
    pd.DataFrame(best_params, index=['random forest']).to_csv(
        out_dir + '/model_params.csv')

    # Retrain on the full training set with those parameters.
    log(f'ML - las losowy - nauczanie - start')
    start = datetime.now()
    forest = RandomForestRegressor(**best_params)
    forest.fit(train_data, train_labels)
    stop = datetime.now()
    log(f'ML - las losowy - nauczanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    model_path = out_dir + '/model.pk1'
    print('Zapisywanie modelu do pliku {} oraz jego parametrów'.format(
        model_path))
    save_model(forest, model_path)
    print(' --- ZAKOŃCZONO')

    print('Testowanie modelu random forest...')
    log(f'ML - las losowy - przewidywanie - start')
    start = datetime.now()
    predictions = forest.predict(test_data)
    stop = datetime.now()
    log(f'ML - las losowy - przewidywanie - koniec {stop - start}')
    print(' --- ZAKOŃCZONO')

    print('Translacja przewidywań...')
    results = pd.DataFrame({'expo': test_label, 'expo_est': predictions})
    print(' --- ZAKOŃCZONO')
    print('Zapisywanie wyników do pliku...')
    results.to_csv(out_dir + '/estimated.csv')
    print(' --- ZAKOŃCZONO')
def generate_trajectories(N, part, Model):
    # Generate 2D trajectory datasets with the ANDI tools and save them under
    # data/part{part}/model{Model}/generated/.
    #
    # Trajectory lengths by Model (from the min_T/max_T arguments below):
    #   'A' -> T in [99, 100), 'B' -> T in [20, 21), 'C' -> T in [20, 100).
    # Parts select the generator: 1 and 7-10 use AD.andi_dataset, 2 uses
    # andi_dataset_2 (FBM only), 3-6 use andi_dataset_3 with increasing noise.
    print('Generowanie trajektorii')
    if part == 1:
        path = f'data/part1/model{Model}/generated/'
        dirmake(path)
        start = datetime.now()
        log(f'GEN - generowanie trajektorii - start [{path}]')
        if Model == 'A':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=99, max_T=100, path_datasets=path)
        if Model == 'B':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=20, max_T=21, path_datasets=path)
        if Model == 'C':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=20, max_T=100, path_datasets=path)
        stop = datetime.now()
        log(f'GEN - generowanie trajektorii - koniec [{stop - start}]')
    if part == 2:
        path = f'data/part2/model{Model}/generated/'
        dirmake(path)
        models = [2]  # FBM
        if Model == 'A':
            T = 100
        if Model == 'B':
            T = 20
        log(f'GEN - generowanie trajektorii - start [{path}]')
        start = datetime.now()
        if Model in ['A', 'B']:
            andi_dataset_2(N=N,
                           tasks=[1],
                           dimensions=[2],
                           save_dataset=True,
                           min_T=T - 1,
                           max_T=T,
                           path_datasets=path,
                           models=models)
        else:
            # Model 'C': mixed trajectory lengths.
            andi_dataset_2(N=N,
                           tasks=[1],
                           dimensions=[2],
                           save_dataset=True,
                           min_T=20,
                           max_T=100,
                           path_datasets=path,
                           models=models)
        stop = datetime.now()
        log(f'GEN - generowanie trajektorii - koniec [{stop - start}]')
    if part in [3, 4, 5, 6]:
        path = f'data/part{part}/model{Model}/generated/'
        dirmake(path)
        log(f'GEN - generowanie trajektorii - start [{path}]')
        start = datetime.now()
        if Model == 'A':
            T = 100
        if Model == 'B':
            T = 20
        # NOTE(review): Model 'C' leaves T undefined here and would raise
        # NameError below — presumably parts 3-6 are only run with 'A'/'B'.
        # Noise level grows with the part number: part 3 -> 0, ..., part 6 -> 1.
        noise = [0, 0.1, 0.3, 1][part - 3]
        andi_dataset_3(N=N,
                       tasks=[1],
                       dimensions=[2],
                       save_dataset=True,
                       min_T=T - 1,
                       max_T=T,
                       path_datasets=path,
                       noise=[noise])
        stop = datetime.now()
        log(f'GEN - generowanie trajektorii - koniec [{stop - start}]')
    if part in [7, 8, 9, 10]:
        path = f'data/part{part}/model{Model}/generated/'
        dirmake(path)
        start = datetime.now()
        log(f'GEN - generowanie trajektorii - start [{path}]')
        # The caller's N is overridden: 10% head-room over the 10**(part-6)
        # samples the ML stage requests via split_data for these parts.
        N = floor(1.1 * 10**(part - 6))
        if Model == 'A':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=99, max_T=100, path_datasets=path)
        if Model == 'B':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=20, max_T=21, path_datasets=path)
        if Model == 'C':
            AD.andi_dataset(N=N, tasks=1, dimensions=2, save_dataset=True,
                            min_T=20, max_T=100, path_datasets=path)
        stop = datetime.now()
        log(f'GEN - generowanie trajektorii - koniec [{stop - start}]')
    print(' --- ZAKOŃCZONO')