# Assumes streamlit, pandas, and os at module level; load_data, train_model, load_model,
# build_portfolio, generate_leads, process_leads, build_leads_df, save_leads, colorize_df,
# build_charts and show_charts are project helpers imported from the package.
import os

import pandas as pd
import streamlit as st


def main():
    raw_market, processed_market = load_data()

    if not os.path.exists('model/leads-recommender-model.pkl'):
        with st.spinner(
                'Pre-trained model not found in your directory, training it again...'
        ):
            train_model()
        st.success(
            'Model trained! This step will not need to be repeated on future runs.'
        )

    st.title('Leads recommender')
    fileup = st.file_uploader('Upload your portfolio')
    model = load_model()

    if fileup is not None:
        try:
            portfolio = pd.read_csv(fileup, index_col='id')
            flag = 1
        except ValueError:
            st.error(
                'The selected portfolio does not follow the required format; adjust it and try again. \n'
                'See the documentation for more details.')
            flag = 0

        if flag == 1:
            with st.spinner('Generating recommendations...'):
                processed_portfolio = build_portfolio(processed_market, portfolio)
                leads = generate_leads(processed_portfolio, processed_market, model)
                processed_leads = process_leads(leads, processed_market)
                raw_leads, df_leads = build_leads_df(raw_market, leads)
                save_leads(raw_leads, df_leads)
                df_leads_colorized = colorize_df(df_leads, processed_portfolio,
                                                 processed_leads)

            st.header('Dashboard:')
            st.dataframe(df_leads_colorized)
            st.success(
                'Recommendation complete! The results are saved in the "output" folder.'
            )
            st.subheader(
                'Explore key data about the recommended companies:'
            )
            show_charts(build_charts(df_leads))
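# Hedged entry-point sketch: the __main__ guard below is an assumption about how this
# module is run, not part of the original script. Streamlit apps are typically launched
# from the command line, e.g. `streamlit run app.py` (the file name app.py is hypothetical).
if __name__ == '__main__':
    main()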
def main(year, tgt):
    # Module-level imports assumed: logging, numpy as np, pathlib.Path,
    # sklearn.model_selection.ParameterGrid and TimeSeriesSplit, tqdm.trange,
    # plus the project's KNN wrapper and train_model helper.
    import pickle
    import time

    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")  # timestamp string (not used below)
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # e.g. tgt = 'final.output.recovery' or 'rougher.output.recovery'; year = 2016
    note = '_1'  # suffix appended to the output file name

    root = Path(__file__).resolve().parents[2]
    print(root)

    # Get raw features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    # X_train is used for training and validation; X_test is for the final predictions
    # (we have no labels for it)
    X = data_dict[year]['X_train']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')
    X_test = data_dict[year]['X_test']

    # Keep rows with a meaningful zinc feed and a target inside the physically plausible range
    inds = X[X['rougher.input.feed_zn'] > 0.5].index
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    inds_common = inds_y.intersection(inds)
    X = X.loc[inds_common, :]
    y = y.loc[inds_common, tgt]

    param_grids = {
        'n_neighbors': [5, 10],
    }
    default = {'n_jobs': 6}

    grids = ParameterGrid(param_grids)
    cv = TimeSeriesSplit(n_splits=5)

    mus = []
    sds = []
    grids_full = []
    for i in trange(len(grids)):
        g = {**grids[i], **default}
        scores, mu, sd, m = train_model(X, y, cv, model=KNN, params=g)
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )

    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test)
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})
    preds.to_csv(f'{root}/results/KNN_{tgt}_{year}_{note}.csv', index=False)
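# Hedged sketch: train_model is a project helper not shown here. Under the assumption
# that it scores one parameter grid with cross-validated error over the supplied
# splitter and returns the per-fold scores, their mean, their standard deviation, and
# the fitted wrapper, it could look roughly like the sketch below. The constructor
# call `model(params=...)`, the wrapper's `fit`/`predict` methods, and the use of mean
# absolute error are all assumptions for illustration; the project may use a different
# metric (e.g. sMAPE) and wrapper API.
def train_model_sketch(X, y, cv, model, params):
    import numpy as np
    from sklearn.metrics import mean_absolute_error

    scores = []
    m = model(params=params)  # assumed thin wrapper around the underlying estimator
    for train_idx, val_idx in cv.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        m.fit(X_tr, y_tr)  # assumed wrapper fit on the training fold
        scores.append(mean_absolute_error(y_val, m.predict(X_val)))
    return scores, np.mean(scores), np.std(scores), m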
def main(year, tgt):
    # Module-level imports assumed: logging, numpy as np, matplotlib.pyplot as plt,
    # pathlib.Path, sklearn.model_selection.ParameterGrid, tqdm.trange, plus the
    # project's TimeSeriesSplitImproved, QuantileRF wrapper and train_model helper.
    import pickle
    import time

    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")  # timestamp string (not used below)
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # e.g. tgt = 'final.output.recovery' or 'rougher.output.recovery'; year = 2017
    note = '_pca'  # suffix appended to the output file names

    root = Path(__file__).resolve().parents[2]
    print(root)

    # Get PCA-transformed features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    # X_train is used for training and validation; X_test is for the final predictions
    # (we have no labels for it)
    X = data_dict[year]['X_train_pca']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')
    X_test = data_dict[year]['X_test_pca']

    # Keep rows whose target lies in the physically plausible range
    # (the raw zinc-feed filter is skipped here, since the PCA features no longer
    # carry the 'rougher.input.feed_zn' column)
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    inds_common = inds_y
    X = X.loc[inds_common, :]
    y = y.loc[inds_common, tgt]

    # (default RandomForestRegressor constructor arguments were kept in the original
    # script as a tuning reference)
    param_grids = {
        'n_estimators': [1000],
        # 'min_samples_leaf': [2, 5, 10],
        'max_features': [0.8],  # tuned
        'max_depth': [14],  # tuned
    }
    default = {'criterion': 'mae', 'n_jobs': -1, 'random_state': 123}

    grids = ParameterGrid(param_grids)

    # Rolling-window CV: ~8 months of data split into 25-day folds, with at least
    # 2.5 months of training data and 4 months reserved for testing
    Nmonths_total = 8
    Nspl = int(Nmonths_total * 30 / 25)
    Nmonths_test = 4
    Nmonths_min_train = 2.5
    cv = TimeSeriesSplitImproved(n_splits=Nspl)

    mus = []
    sds = []
    grids_full = []
    for i in trange(len(grids)):
        g = {**grids[i], **default}
        scores, mu, sd, m = train_model(
            X, y, cv,
            model=QuantileRF,
            params=g,
            fixed_length=False,
            train_splits=Nspl // Nmonths_total * Nmonths_min_train,
            test_splits=int(Nmonths_test / Nmonths_total * Nspl))
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    # Plot the mean CV score +- one standard deviation for each grid point
    mus = np.array(mus)
    sds = np.array(sds)
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.fill_between(np.arange(len(grids)), y1=mus - sds, y2=mus + sds)
    ax.plot(np.arange(len(grids)), mus, '-r')
    labs = [str(g) for g in grids]
    ax.set_xticks(np.arange(len(grids)))
    ax.set_xticklabels(labs, rotation=90)
    fig.savefig(f'{root}/results/qrf_{tgt}_{year}_{note}.png')

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )

    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test)
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})
    preds.to_csv(f'{root}/results/qrf_{tgt}_{year}_{note}.csv', index=False)
    with open(f'{root}/results/qrf_{tgt}_{year}_{note}.pkl', 'wb') as f:
        pickle.dump(m.model, f)
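# Hedged follow-up sketch: reloading the estimator pickled above and re-scoring the
# held-out test features. The paths mirror those used in main; the function name and
# the assumption that the pickled `m.model` exposes a scikit-learn-style `predict`
# are illustrative, not part of the original script.
def reload_and_predict(root, year, tgt, note='_pca'):
    import pickle

    import pandas as pd

    with open(f'{root}/results/qrf_{tgt}_{year}_{note}.pkl', 'rb') as f:
        est = pickle.load(f)
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    X_test = data_dict[year]['X_test_pca']
    return pd.DataFrame({'date': X_test.index, tgt: est.predict(X_test)})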
def main(year, tgt):
    # Module-level imports assumed: logging, numpy as np, matplotlib.pyplot as plt,
    # pathlib.Path, sklearn.model_selection.ParameterGrid, tqdm.trange, plus the
    # project's TimeSeriesSplitImproved, XGBoost wrapper and train_model helper.
    import pickle
    import time

    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")  # timestamp string (not used below)
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # e.g. tgt = 'final.output.recovery' or 'rougher.output.recovery'; year = 2017
    note = '_cv_roll_pca'  # suffix appended to the output file names

    root = Path(__file__).resolve().parents[2]
    print(root)

    # Get PCA-transformed features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    # X_train is used for training and validation; X_test is for the final predictions
    # (we have no labels for it)
    X = data_dict[year]['X_train_pca']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')
    X_test = data_dict[year]['X_test_pca']

    # Keep rows whose target lies in the physically plausible range
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    inds_common = inds_y
    X = X.loc[inds_common, :]
    y = y.loc[inds_common, tgt]

    param_grids = {
        'max_depth': [6, 10, 14],
        # 'max_leaves': [255, 511, 1023, 4095, 8100],
        'subsample': [0.1, 0.3, 0.6],
        'colsample_bytree': [0.3, 0.5, 0.8],
        'gamma': [5, 1e1, 20]
    }
    default = {
        'eta': 0.2,
        'objective': 'reg:linear',
        'early_stopping_rounds': 10,
        'verbose_eval': 4000,
        'n_thread': -1,
        'num_round': 550,
        'silent': True
    }

    grids = ParameterGrid(param_grids)

    # Rolling-window CV: ~8 months of data split into 15-day folds, with at least
    # 2.5 months of training data and 4 months reserved for testing
    Nmonths_total = 8
    Nspl = int(Nmonths_total * 30 / 15)
    Nmonths_test = 4
    Nmonths_min_train = 2.5
    cv = TimeSeriesSplitImproved(n_splits=Nspl)

    mus = []
    sds = []
    grids_full = []
    for i in trange(len(grids)):
        g = grids[i]
        print(g)
        g = {**g, **default}
        scores, mu, sd, m = train_model(
            X, y, cv,
            model=XGBoost,
            params=g,
            fixed_length=False,
            train_splits=Nspl // Nmonths_total * Nmonths_min_train,
            test_splits=int(Nmonths_test / Nmonths_total * Nspl))
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    # Plot the mean CV score +- one standard deviation for each grid point
    mus = np.array(mus)
    sds = np.array(sds)
    fig, ax = plt.subplots(figsize=(20, 20))
    ax.fill_between(np.arange(len(grids)), y1=mus - sds, y2=mus + sds)
    ax.plot(np.arange(len(grids)), mus, '-r')
    labs = [str(g) for g in grids]
    ax.set_xticks(np.arange(len(labs)))
    ax.set_xticklabels(labs, rotation=90)
    fig.savefig(f'{root}/results/xgb_{tgt}_{year}_{note}.png', bbox_inches='tight')

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )

    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test, params={"ntree_limit": m.bst.best_ntree_limit})
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})
    preds.to_csv(f'{root}/results/xgb_{tgt}_{year}_{note}.csv', index=False)
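# Hedged usage sketch: looping over the two recovery targets and the two dataset years
# referenced in the example comments of the scripts above. The __main__ guard and the
# exact year/target combinations are assumptions, not part of the original script.
if __name__ == '__main__':
    for year in (2016, 2017):
        for tgt in ('rougher.output.recovery', 'final.output.recovery'):
            main(year, tgt)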