# Imports needed by this snippet; the project-local module paths are assumptions.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

from src.features import preprocessing  # project module (path assumed)
from src.models import train_model  # project module (path assumed)


def main():
    # Retrieve data
    file_path = "data/interim/train_interim.csv"
    housing_prices = pd.read_csv(file_path)

    # Separating predictors and target
    input_feats, output_feats = preprocessing.make_dataset(
        housing_prices, "SalePrice")

    # Subsetting columns of interest
    feature_names = [
        "LotArea",
        "YearBuilt",
        "1stFlrSF",
        "2ndFlrSF",
        "FullBath",
        "BedroomAbvGr",
        "TotRmsAbvGrd",
        "HouseStyle",
    ]
    features = input_feats[feature_names]

    # Data processing
    preprocess_pipeline = preprocessing.preprocess_pipeline(features)

    # Generating the pipeline for the model
    model = make_pipeline(preprocess_pipeline, LinearRegression())

    # Train the model
    model, predictions, actual = train_model.train_model(
        features, output_feats, model)

    # Evaluating the model
    train_model.evaluate_model(predictions, actual, model)
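# For reference: train_model.train_model / evaluate_model above are
# project-local helpers not shown in this section. A minimal sketch of what
# such a module might contain, assuming a simple hold-out split and RMSE
# reporting (the split ratio and metric are assumptions; only the function
# names match the calls above):
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def train_model(features, target, model, test_size=0.2, random_state=0):
    """Fit `model` on a train split; return it with hold-out predictions."""
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    return model, model.predict(X_test), y_test


def evaluate_model(predictions, actual, model):
    """Print the hold-out RMSE for the fitted estimator."""
    rmse = np.sqrt(mean_squared_error(actual, predictions))
    print(f"{type(model).__name__} RMSE: {rmse:,.2f}")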
def parameterized_test(self, model, mode):
    # given:
    data_dir = "test-data"
    interim_dir = self.test_dir + "/interim"
    processed_dir = self.test_dir + "/processed"
    model_dir = self.test_dir + "/model"
    model_path = model_dir + ("" if mode == "full" else "_" + mode) + "/0001.txt"
    submission_dir = self.test_dir + "/submissions"
    submission_path = submission_dir + "/submission.csv"

    # data preparation
    # when:
    make_dataset(data_dir, interim_dir)
    # then:
    self.assertTrue(os.path.exists(interim_dir + "/test_data.pkl"))

    # feature engineering
    # when:
    build_features(data_dir, processed_dir)
    # then:
    self.assertTrue(os.path.exists(processed_dir + "/test_data.pkl"))

    # model training
    # when:
    train_model(model, mode, processed_dir, model_dir)
    # then:
    self.assertTrue(os.path.exists(model_path))

    # model prediction
    # when:
    predict_model(processed_dir, model, model_path, submission_path)
    # then:
    self.assertTrue(os.path.exists(submission_path))
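# A minimal way to drive parameterized_test from a unittest.TestCase,
# assuming it is defined as a method of the class; the model name and the
# "quick" mode value are illustrative assumptions based on how model_path is
# built above:
import shutil
import tempfile
import unittest


class PipelineEndToEndTest(unittest.TestCase):
    def setUp(self):
        # Fresh scratch directory per test run
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_full_mode(self):
        self.parameterized_test(model="lgb", mode="full")

    def test_quick_mode(self):
        self.parameterized_test(model="lgb", mode="quick")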
def main_test():
    start_time = time.time()
    # Load and preprocess the raw competition data
    train = read_train_data(nrows=None)
    test = read_test_data()
    train, test = process_data(train, test)
    X = train.drop(['ID_code', 'target'], axis=1)
    y = train['target']
    X_test = test.drop(['ID_code'], axis=1)
    # Cross-validated training with out-of-fold predictions
    oof, predictions, scores, feature_importance = train_model(
        X, X_test, y, params, plot_feature_importance=True)
    # Encode the AUC into a tag such as auc_08932
    str_metric_score = metric + '_0' + str(
        int(scores['auc_score'].iloc[0] * 10000))
    # submit(test, predictions, str_metric_score)
    comment = 'starter removed statistics feature, remove also 0 score, bagging_fraction1'
    # storage_src(str_metric_score, scores, feature_importance, comment)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
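# main_test relies on module-level `params` and `metric` defined elsewhere in
# the project. A plausible sketch of that configuration for a LightGBM binary
# classifier (all values here are illustrative assumptions, not the project's
# actual settings):
metric = 'auc'
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 13,
    'bagging_fraction': 1.0,
    'feature_fraction': 0.05,
    'verbosity': -1,
}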
# Imports as in the snippet above, plus the grid-search pieces.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor


def main():
    # Retrieve data
    file_path = "data/interim/train_interim.csv"
    housing_prices = pd.read_csv(file_path)

    # Separating predictors and target
    input_feats, output_feats = preprocessing.make_dataset(
        housing_prices, "SalePrice")

    # Subsetting columns of interest
    feature_names = [
        "LotArea",
        "YearBuilt",
        "1stFlrSF",
        "2ndFlrSF",
        "FullBath",
        "BedroomAbvGr",
        "TotRmsAbvGrd",
        "HouseStyle",
    ]
    features = input_feats[feature_names]

    # Data processing
    preprocess_pipeline = preprocessing.preprocess_pipeline(features)

    # Generating the pipeline for the model
    pipeline = make_pipeline(preprocess_pipeline, KNeighborsRegressor())

    # Defining the parameter grid for the grid search
    params = {
        "kneighborsregressor__n_neighbors": range(2, 21),
        "kneighborsregressor__weights": ["uniform", "distance"],
    }
    model = GridSearchCV(pipeline, params, cv=10,
                         scoring="neg_mean_squared_error")

    # Train the model
    model, predictions, actual = train_model.train_model(
        features, output_feats, model)

    # Check the best parameters that were chosen
    print(f"Best parameters chosen: {model.best_params_}")

    # Evaluating the model
    train_model.evaluate_model(predictions, actual, model)
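# The grid keys above must match the step name that make_pipeline
# auto-generates (the lowercased class name) followed by '__' and the
# estimator parameter. The valid names can be listed directly from the
# pipeline; StandardScaler stands in here for the project's preprocessing
# step:
from sklearn.preprocessing import StandardScaler

demo_pipeline = make_pipeline(StandardScaler(), KNeighborsRegressor())
# Prints keys such as 'kneighborsregressor__n_neighbors'.
print(sorted(demo_pipeline.get_params().keys()))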
def main_submit():
    start_time = time.time()
    train = read_train_data(nrows=None)
    test = read_test_data()
    train, test = process_data(train, test)
    X = train.drop(['ID_code', 'target'], axis=1)
    y = train['target']
    X_test = test.drop(['ID_code'], axis=1)
    oof, predictions, scores, feature_importance = train_model(
        X, X_test, y, params, n_fold=10, plot_feature_importance=True,
        model_type='lgb_sklearn')
    str_metric_score = metric + '_0' + str(
        int(scores['auc_score'].iloc[0] * 10000))
    submit(test, predictions, str_metric_score)
    comment = 'add 5 max min feature before standard scale'
    storage_src(str_metric_score, scores, feature_importance, comment)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
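# submit is project-local; a minimal sketch consistent with the Kaggle-style
# ID_code/target format used above (the output file name pattern is an
# assumption):
import pandas as pd


def submit(test, predictions, str_metric_score):
    """Write predictions as a submission CSV tagged with the metric score."""
    submission = pd.DataFrame({
        'ID_code': test['ID_code'],
        'target': predictions,
    })
    submission.to_csv(f'submission_{str_metric_score}.csv', index=False)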
import numpy as np
import pandas as pd

from src.models.train_model import train_model

dic = train_model()
model = dic['model']
vect = dic['vect']


def predict_class(row):
    """Predict the grammatical article for the noun in row['feat_name']."""
    transformed = vect.transform([row['feat_name']])
    row['class_cat'] = model.predict(transformed)[0]
    probabilities = list(model.predict_proba(transformed))[0]
    row['probabilities'] = round(max(probabilities), 2)
    if row['class_cat'] == 0:
        row['class'] = 'Das'
    if row['class_cat'] == 1:
        row['class'] = 'Der'
    if row['class_cat'] == 2:
        row['class'] = 'Die'
    return row


def gen_df_results():
    """Rank features by the weight the model assigns to them."""
    feat_value = model.coef_[0]
    order_of_importance = (-feat_value).argsort()
    feat_names = np.array(vect.get_feature_names())  # get_feature_names_out() in newer scikit-learn
    dic_results = {'feat_name': feat_names[order_of_importance],
                   # The original snippet is truncated here; a plausible
                   # completion (assumed) pairs each name with its weight.
                   'feat_value': feat_value[order_of_importance]}
    return pd.DataFrame(dic_results)
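# Illustrative usage of predict_class: applied row-wise, it annotates a
# DataFrame of nouns with the predicted article (the sample words are
# placeholders):
sample = pd.DataFrame({'feat_name': ['Haus', 'Mann', 'Frau']})
labelled = sample.apply(predict_class, axis=1)
print(labelled[['feat_name', 'class', 'probabilities']])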
def run(all_code_types, d_embedding, embedding_dropout_p, min_count,
        batch_size, verbose, epochs, lr, wd, logsig, sig_depth, run_name,
        patience, add_time, leadlag, t_scale, t_max, use_timestamps,
        feedforward_num_layers, feedforward_hidden_dims,
        feedforward_activations, feedforward_dropout, training_proportion=1,
        testing_subsample_size=None, split_paths=False, tensorboard_log=False,
        evaluate_on_test=True):
    """Run the experiment for either cross validation or testing"""
    dataset, dataset_test, vocab = generate_ml_data(
        all_code_types, min_count, batch_size, verbose=verbose,
        allen_mode=True, dataset_path=None,
        training_proportion=training_proportion,
        testing_subsample_size=testing_subsample_size,
        split_paths=split_paths)

    logger.info("Using k-fold cross validation")  # Allen kfold
    metrics_by_fold = []
    cross_validator = StratifiedKFold(n_splits=K_FOLDS, shuffle=True)
    n_splits = cross_validator.get_n_splits(dataset)
    # StratifiedKFold splits on the class labels; the original code called the
    # splitter object directly, which sklearn's API does not support. How the
    # per-instance labels are recovered from the dataset is an assumption.
    labels = [instance.fields['label'].label for instance in dataset]
    for fold_index, (train_indices, validation_indices) in enumerate(
            cross_validator.split(dataset, labels)):
        logger.info(f"Fold {fold_index}/{n_splits - 1}")
        train_dataset = Subset(dataset, train_indices)
        validation_dataset = Subset(dataset, validation_indices)
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size, shuffle=True)
        validation_loader = DataLoader(dataset=validation_dataset,
                                       batch_size=batch_size, shuffle=True)
        if tensorboard_log or evaluate_on_test:
            serialization_dir = os.path.join(TENSORBOARD_DIR, run_name,
                                             str(uuid.uuid4()),
                                             str(fold_index))
        else:
            serialization_dir = None
        model = init_sig(vocab, d_embedding, embedding_dropout_p, sig_depth,
                         logsig, all_code_types, feedforward_num_layers,
                         feedforward_hidden_dims, feedforward_activations,
                         feedforward_dropout, leadlag, add_time, t_max,
                         t_scale, use_timestamps, split_paths)
        if torch.cuda.is_available():
            cuda_device = 0
            model = model.cuda(cuda_device)
            logger.info('USING CUDA GPU')
        else:
            cuda_device = -1
        fold_metrics, model = train_model(model, lr, wd, train_loader,
                                          validation_loader, patience, epochs,
                                          cuda_device, serialization_dir)
        if serialization_dir is not None:
            # Add file location to sacred log
            ex.add_artifact(os.path.join(serialization_dir, 'best.th'))
        metrics_by_fold.append(fold_metrics)

    if evaluate_on_test:
        if serialization_dir is None:
            raise Exception(
                'serialization_dir needed to load best model from validation')
        # Held out test data
        test_dataloader = DataLoader(dataset=dataset_test,
                                     batch_size=batch_size, shuffle=True)
        metrics = evaluate(model, test_dataloader, cuda_device)
        return metrics

    torch.cuda.empty_cache()
    metrics = reformat_metrics(metrics_by_fold, ex)
    return metrics
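# reformat_metrics is project-local; a minimal sketch of the per-fold
# aggregation it might perform, assuming each fold's metrics dict shares the
# same numeric keys (the Sacred log_scalar call mirrors how `ex` is used
# above, but is an assumption):
import numpy as np


def reformat_metrics(metrics_by_fold, ex):
    """Summarise per-fold metric dicts as mean/std across folds."""
    summary = {}
    for key in metrics_by_fold[0]:
        values = [fold_metrics[key] for fold_metrics in metrics_by_fold]
        summary[f"{key}_mean"] = float(np.mean(values))
        summary[f"{key}_std"] = float(np.std(values))
        ex.log_scalar(f"{key}_mean", summary[f"{key}_mean"])
    return summary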
def main():
    # Sidebar section:
    page_selection = st.sidebar.radio("Select a market:",
                                      ["Nikkey", "Bovespa"])
    dct_market = {
        "Nikkey": {
            "country": "Japan",
            "continent": "Asia",
            "index_name": "^N225"
        },
        "Bovespa": {
            "country": "Brazil",
            "continent": "America",
            "index_name": "^BVSP"
        }
    }
    st.markdown(f"# {page_selection}")

    end_date = date.today()
    start_date = end_date - timedelta(days=3150)
    # start_date = datetime.strptime('2004-11-02', '%Y-%m-%d')
    # end_date = datetime.strptime('2008-11-28', '%Y-%m-%d')
    start_date = st.sidebar.date_input('Start date', start_date)
    end_date = st.sidebar.date_input('End date', end_date)

    df = yf.download(dct_market[page_selection]["index_name"],
                     start=start_date, end=end_date)
    # Log returns in percent
    df["rt"] = (np.log(df["Close"]) -
                np.log(df["Close"].shift(periods=1))) * 100
    df = create_shifted_rt(df, [1, 5, 37])
    df_clustered = uniform_clustering(
        df[["Close", "rt", "rt-1", "rt-5", "rt-37"]],
        ["rt", "rt-1", "rt-5", "rt-37"])
    df_clustered.dropna(how="any", axis=0, inplace=True)

    lst_relations = [('cluster_rt-37', 'cluster_rt'),
                     ('cluster_rt-5', 'cluster_rt'),
                     ('cluster_rt-1', 'cluster_rt')]
    df_clustered = df_clustered[[
        "rt", "cluster_rt-37", "cluster_rt-5", "cluster_rt-1", "cluster_rt"
    ]]

    predict_n_days = 20
    model = train_model(df_clustered.iloc[:-predict_n_days], lst_relations)
    evidence = {
        'cluster_rt-37': df_clustered.iloc[-37]['cluster_rt'],
        'cluster_rt-5': df_clustered.iloc[-5]['cluster_rt'],
        'cluster_rt-1': df_clustered.iloc[-1]['cluster_rt']
    }
    predict = predict_model(model, evidence=evidence)
    st.text(f"Forecast for tomorrow: {predict[0]}")

    resultado = {}
    for i in np.arange(1, predict_n_days + 1):
        evidence = {
            'cluster_rt-37': df_clustered.iloc[-37 - i]['cluster_rt'],
            'cluster_rt-5': df_clustered.iloc[-5 - i]['cluster_rt'],
            'cluster_rt-1': df_clustered.iloc[-1 - i]['cluster_rt']
        }
        predict = predict_model(model, evidence=evidence)
        resultado[i] = [
            predict[0]['cluster_rt'],
            df_clustered.iloc[i]['cluster_rt'],
            df_clustered.iloc[i]['rt']
        ]

    resultado = pd.DataFrame.from_dict(resultado, orient='index')
    # Column names kept from the original Portuguese:
    # 'Previsão' = predicted cluster, 'Real' = observed cluster
    resultado.rename(columns={0: 'Previsão', 1: 'Real', 2: 'rt'},
                     inplace=True)

    # Mean return per observed cluster, used as the numeric forecast
    rt_mean = round(
        resultado.groupby(by=["Real"]).agg(
            {"rt": ["min", "max", "count", "mean"]}), 2)[("rt", "mean")]
    if page_selection == "Nikkey":
        conditions = [
            resultado["Previsão"] == 1.0, resultado["Previsão"] == 2.0,
            resultado["Previsão"] == 3.0, resultado["Previsão"] == 4.0,
            resultado["Previsão"] == 5.0, resultado["Previsão"] == 6.0
        ]
    elif page_selection == "Bovespa":
        conditions = [
            resultado["Previsão"] == 1.0, resultado["Previsão"] == 2.0,
            resultado["Previsão"] == 3.0, resultado["Previsão"] == 4.0
        ]
    choices = rt_mean.tolist()
    resultado["rt_predict"] = np.select(conditions, choices, default=np.nan)

    resultado = resultado[::-1]
    resultado["rt_predict_acumulado"] = resultado["rt_predict"].cumsum()
    resultado["rt_acumulado"] = resultado["rt"].cumsum()
    st.dataframe(resultado)

    rmse_uniform = mean_squared_error(resultado["rt"],
                                      resultado["rt_predict"],
                                      squared=False)
    acuracia = accuracy_score(resultado["Real"], resultado["Previsão"],
                              normalize=True)
    st.text(f"Accuracy: {round(acuracia * 100, 2)}%")
    st.text(f"RMSE: {round(rmse_uniform, 2)}%")

    # fig = plt.figure(figsize=(20, 4))
    # ax = fig.add_subplot(111)
    # ax.plot(df['Close'], label=dct_market[page_selection]["index_name"])
    # date_min = df.index.min()
    # date_max = df.index.max()
    # ax.xaxis.set_major_locator(plt.MaxNLocator(30))
    # ax.set_xlim(left=date_min, right=date_max)
    # ax.legend(loc='lower left', frameon=False)
    # plt.xticks(rotation=90)
    # st.pyplot(fig)
    st.line_chart(df[['Close']])
    st.line_chart(df["rt"])
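# create_shifted_rt is not shown in this section; from its usage above it
# adds lagged copies of the return column. A plausible sketch, assuming
# simple shift-based lags:
def create_shifted_rt(df, lags):
    """For each lag n, add a column 'rt-n' holding rt shifted by n rows."""
    for lag in lags:
        df[f"rt-{lag}"] = df["rt"].shift(periods=lag)
    return df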
"""Use prediction model and evaluate it""" from os import path from sklearn.metrics import classification_report, confusion_matrix, accuracy_score from src.models import train_model from src.visualizations import Visualize base_path ='/home/chpatola/Desktop/Skola/Python/cookie_nlp/' #1. Test model and look into results test_X, test_y, bag, tf_idf = train_model.train_model(base_path) print("Mean accurancy in validation: {:.2f} %".format(100*bag.best_score_)) predictions = bag.predict(test_X) print("Predictions:\n {} \nTruth:\n {}".format(predictions[0:3], test_y[0:3])) print(test_X[0:3]) #2. Save classification report and confusion matrix to file classi_rep = Visualize._plot_classification_report(test_y, predictions) classi_rep.savefig( path.join(base_path,'reports/figures/classificationReport.png'), bbox_inches='tight') parties = test_y.sort_values().unique() Visualize.cm_analysis(test_y, predictions, path.join(base_path,'reports/figures/confusion_matrix.png'), labels=parties ) #3. Print results print(confusion_matrix(test_y, predictions)) print(classification_report(test_y, predictions))