Example #1
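# Assumed context (defined elsewhere in the project, not shown in this snippet):
# `import os`, `import pandas as pd`, `import streamlit as st`, plus the project
# helpers load_data(), train_model(), load_model(), build_portfolio(),
# generate_leads(), process_leads(), build_leads_df(), save_leads(),
# colorize_df(), build_charts() and show_charts().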
def main():
    raw_market, processed_market = load_data()

    if not os.path.exists('model/leads-recommender-model.pkl'):
        with st.spinner(
                'Pre-trained model not found in your directory; training it again...'
        ):
            train_model()
        st.success(
            'Model trained! On future runs this step will not need to be repeated.'
        )

    st.title('Leads recommender')
    fileup = st.file_uploader('Upload your portfolio')

    model = load_model()

    if fileup is not None:
        try:
            portfolio = pd.read_csv(fileup, index_col='id')
            flag = 1
        except ValueError:
            st.error(
                'The selected portfolio does not follow the required format; fix it and try again. \n'
                'For details, see the documentation.')
            flag = 0

        if flag == 1:
            with st.spinner('Generating recommendations...'):
                processed_portfolio = build_portfolio(processed_market,
                                                      portfolio)
                leads = generate_leads(processed_portfolio, processed_market,
                                       model)

                processed_leads = process_leads(leads, processed_market)
                raw_leads, df_leads = build_leads_df(raw_market, leads)
                save_leads(raw_leads, df_leads)

                df_leads_colorized = colorize_df(df_leads, processed_portfolio,
                                                 processed_leads)

                st.header('Dashboard:')
                st.dataframe(df_leads_colorized)

                st.success(
                    'Recommendation complete! The results are saved in the "output" folder.'
                )

                st.subheader(
                    'Visualize important data about the recommended companies:'
                )
                show_charts(build_charts(df_leads))
Example #2
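# Assumed module-level context (not shown in this snippet): `import logging`,
# `import numpy as np`, `from pathlib import Path`,
# `from sklearn.model_selection import ParameterGrid, TimeSeriesSplit`,
# `from tqdm import trange`, plus the project's KNN model wrapper and
# train_model() helper.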
def main(year, tgt):
    import pickle
    import time
    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")

    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    # tgt = 'final.output.recovery'
    # tgt = 'rougher.output.recovery'

    # year = 2016
    note = '_1'
    # project root; used below to locate the data and results directories
    root = Path(__file__).resolve().parents[2]
    print(root)
    # Get raw features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    # X_train is used for training and validation; X_test is for the final
    # predictions (we have no labels for it).

    X = data_dict[year]['X_train']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')

    X_test = data_dict[year]['X_test']
    # Apply the mask before taking .index: calling .index on a boolean Series
    # returns ALL row labels, not just the True ones.
    inds = X[X['rougher.input.feed_zn'] > 0.5].index
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    inds_common = inds_y.intersection(inds)

    X = X.loc[inds_common]
    y = y.loc[inds_common, tgt]

    param_grids = {
        'n_neighbors': [5, 10],
    }
    default = {'n_jobs': 6}

    grids = ParameterGrid(param_grids)
    cv = TimeSeriesSplit(n_splits=5)

    mus = []
    sds = []
    grids_full = []
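    # Manual grid search: score each candidate with time-series CV, recording
    # the mean and std of the fold scores (lower mean is better; see np.argmin
    # below).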
    for i in trange(len(grids)):
        g = grids[i]
        g = {**g, **default}
        scores, mu, sd, m = train_model(X, y, cv, model=KNN, params=g)
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )
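    # Refit on the full training set with the winning grid (fit_final is
    # assumed to retrain the wrapper from scratch), then predict the unlabeled
    # test set.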
    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test)
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})

    preds.to_csv(f'{root}/results/KNN_{tgt}_{year}_{note}.csv', index=False)
Example #3
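# Assumed module-level context (not shown in this snippet): `import logging`,
# `import numpy as np`, `import matplotlib.pyplot as plt`,
# `from pathlib import Path`, `from sklearn.model_selection import ParameterGrid`,
# `from tqdm import trange`, plus the project's TimeSeriesSplitImproved
# splitter, QuantileRF model wrapper and train_model() helper.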
def main(year, tgt):
    import pickle
    import time
    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")

    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    # tgt = 'final.output.recovery'
    # tgt = 'rougher.output.recovery'

    # year = 2017
    note = '_pca'
    # project root; used below to locate the data and results directories
    root = Path(__file__).resolve().parents[2]
    print(root)
    # Get raw features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    # X_train is used for training and validation; X_test is for the final
    # predictions (we have no labels for it).

    X = data_dict[year]['X_train_pca']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')

    X_test = data_dict[year]['X_test_pca']
    # inds = (X['rougher.input.feed_zn'] > 0.5).index
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    inds_common = inds_y

    X = X.loc[inds_common]
    y = y.loc[inds_common, tgt]

    param_grids = {
        'n_estimators': [1000],
        # 'min_samples_leaf': [2, 5, 10],
        'max_features': [0.8],  # tuned
        'max_depth': [14],  # tuned
    }
    default = {'criterion': 'mae', 'n_jobs': -1, 'random_state': 123}

    grids = ParameterGrid(param_grids)
    Nmonths_total = 8
    Nspl = int(Nmonths_total * 30 / 25)
    Nmonths_test = 4
    Nmonths_min_train = 2.5
    cv = TimeSeriesSplitImproved(n_splits=Nspl)
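    # Rolling time-series CV: roughly 8 months of data cut into Nspl folds
    # (~25 days each); the train/test fold counts passed to train_model() below
    # are derived from these month figures.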

    mus = []
    sds = []
    grids_full = []
    for i in trange(len(grids)):
        g = grids[i]
        g = {**g, **default}
        scores, mu, sd, m = train_model(
            X,
            y,
            cv,
            model=QuantileRF,
            params=g,
            fixed_length=False,
            # int() cast: the splitter presumably expects whole fold counts
            train_splits=int(Nspl // Nmonths_total * Nmonths_min_train),
            test_splits=int(Nmonths_test / Nmonths_total * Nspl))
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    # Plot mean ± CV-score std for every grid point:
    mus = np.array(mus)
    sds = np.array(sds)
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.fill_between(np.arange(len(grids)), y1=mus - sds, y2=mus + sds)
    ax.plot(np.arange(len(grids)), mus, '-r')

    labs = [str(g) for g in grids]
    ax.set_xticks(np.arange(len(grids)))
    ax.set_xticklabels(labs, rotation=90)

    fig.savefig(f'{root}/results/qrf_{tgt}_{year}_{note}.png',
                bbox_inches='tight')

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )
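    # As above: refit on all training data with the best grid before predicting
    # the test set.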
    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test)
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})

    preds.to_csv(f'{root}/results/qrf_{tgt}_{year}_{note}.csv', index=False)
    with open(f'{root}/results/qrf_{tgt}_{year}_{note}.pkl', 'wb') as f:
        pickle.dump(m.model, f)
Example #4
def main(year, tgt):
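    # Assumed module-level context (not shown in this snippet): `import logging`,
    # `import numpy as np`, `import matplotlib.pyplot as plt`,
    # `from pathlib import Path`, `from sklearn.model_selection import ParameterGrid`,
    # `from tqdm import trange`, plus the project's TimeSeriesSplitImproved
    # splitter, XGBoost model wrapper and train_model() helper.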
    import pickle
    import time
    import pandas as pd

    timestr = time.strftime("%Y%m%d-%H%M%S")

    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    # tgt = 'final.output.recovery'
    # tgt = 'rougher.output.recovery'

    # year = 2017
    note = '_cv_roll_pca'
    # project root; used below to locate the data and results directories
    root = Path(__file__).resolve().parents[2]
    print(root)
    # Get raw features
    with open(f'{root}/data/processed/data_dict_all.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    # X_train is used for training and validation; X_test is for the final
    # predictions (we have no labels for it).

    X = data_dict[year]['X_train_pca']
    y = data_dict[year]['y_train']
    print(f'X_train shape: {X.shape}, y_train: {y.shape}')

    X_test = data_dict[year]['X_test_pca']
    # inds = (X['rougher.input.feed_zn'] > 0.5).index
    inds_y = y[(y[tgt] > 5) & (y[tgt] < 100)].index
    # inds_common = inds_y.intersection(inds)
    inds_common = inds_y
    X = X.loc[inds_common]
    y = y.loc[inds_common, tgt]

    param_grids = {
        'max_depth': [6, 10, 14],
        # 'max_leaves': [255, 511, 1023, 4095, 8100],
        'subsample': [0.1, 0.3, 0.6],
        'colsample_bytree': [0.3, 0.5, 0.8],
        'gamma': [5, 10, 20]
    }
    default = {
        'eta': 0.2,
        'objective': 'reg:linear',  # deprecated alias of 'reg:squarederror' in newer XGBoost
        'early_stopping_rounds': 10,
        'verbose_eval': 4000,
        'n_thread': -1,  # XGBoost's native name is 'nthread'; assumed handled by the wrapper
        'num_round': 550,
        'silent': True
    }

    grids = ParameterGrid(param_grids)

    Nmonths_total = 8
    Nspl = int(Nmonths_total * 30 / 15)
    Nmonths_test = 4
    Nmonths_min_train = 2.5
    cv = TimeSeriesSplitImproved(n_splits=Nspl)
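    # Same rolling-CV idea as before, but with finer (~15-day) folds, giving
    # Nspl = 16 splits over the roughly 8 months of data.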

    mus = []
    sds = []
    grids_full = []
    for i in trange(len(grids)):
        g = grids[i]
        print(g)
        g = {**g, **default}
        scores, mu, sd, m = train_model(
            X,
            y,
            cv,
            model=XGBoost,
            params=g,
            fixed_length=False,
            # int() cast as above: whole fold counts for the splitter
            train_splits=int(Nspl // Nmonths_total * Nmonths_min_train),
            test_splits=int(Nmonths_test / Nmonths_total * Nspl))
        grids_full.append(g)
        mus.append(mu)
        sds.append(sd)

    # Plot mean ± CV-score std for every grid point:
    mus = np.array(mus)
    sds = np.array(sds)
    fig, ax = plt.subplots(figsize=(20, 20))
    ax.fill_between(np.arange(len(grids)), y1=mus - sds, y2=mus + sds)
    ax.plot(np.arange(len(grids)), mus, '-r')

    labs = [str(g) for g in grids]
    ax.set_xticks(np.arange(len(labs)))
    ax.set_xticklabels(labs, rotation=90)

    fig.savefig(f'{root}/results/xgb_{tgt}_{year}_{note}.png',
                bbox_inches='tight')

    id_grid = np.argmin(mus)
    grid_best = grids_full[id_grid]
    print(
        f'Best score: {mus[id_grid]} +- {sds[id_grid]} at grid = {grid_best}, {tgt} -- {year}'
    )
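    # Refit on all training data with the best grid, then predict using the
    # tree count selected by early stopping (ntree_limit is assumed to be
    # forwarded to the underlying booster).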
    m.fit_final(X, y, params=grid_best)
    ypred = m.predict(X_test, params={"ntree_limit": m.bst.best_ntree_limit})
    preds = pd.DataFrame(data={'date': X_test.index, tgt: ypred})

    preds.to_csv(f'{root}/results/xgb_{tgt}_{year}_{note}.csv', index=False)