def main():
    """Fit and evaluate a linear-regression pipeline on the housing data.

    Reads the interim training CSV, keeps a fixed subset of predictor
    columns, chains the project's preprocessing step with a
    ``LinearRegression`` estimator, then hands the pipeline to
    ``train_model.train_model`` and its outputs to
    ``train_model.evaluate_model``.
    """
    # Load the interim training set
    housing_prices = pd.read_csv("data/interim/train_interim.csv")

    # Separate the predictors from the "SalePrice" target
    predictors, target = preprocessing.make_dataset(
        housing_prices, "SalePrice")

    # Only these columns are passed on to the model
    selected_columns = [
        "LotArea",
        "YearBuilt",
        "1stFlrSF",
        "2ndFlrSF",
        "FullBath",
        "BedroomAbvGr",
        "TotRmsAbvGrd",
        "HouseStyle",
    ]
    features = predictors[selected_columns]

    # Preprocessing step followed by the estimator
    regression_pipeline = make_pipeline(
        preprocessing.preprocess_pipeline(features),
        LinearRegression(),
    )

    # Train, then evaluate using what the training helper returned
    fitted_model, predictions, actual = train_model.train_model(
        features, target, regression_pipeline)
    train_model.evaluate_model(predictions, actual, fitted_model)
Exemple #2
0
    def parameterized_test(self, model, mode):
        """Run the four pipeline stages in order and check their artefacts.

        Each stage (dataset creation, feature building, training,
        prediction) is invoked once and followed by an existence check on
        the file it is expected to leave under ``self.test_dir``.

        Args:
            model: forwarded to ``train_model`` and ``predict_model``.
            mode: forwarded to ``train_model``; any value other than
                ``"full"`` is appended (as ``_<mode>``) to the model
                directory name used for the expected model file.
        """
        # given: all working directories live under self.test_dir
        data_dir = "test-data"
        root = self.test_dir
        interim_dir = f"{root}/interim"
        processed_dir = f"{root}/processed"
        model_dir = f"{root}/model"
        submission_dir = f"{root}/submissions"

        if mode == "full":
            mode_suffix = ""
        else:
            mode_suffix = "_" + mode
        model_path = f"{model_dir}{mode_suffix}/0001.txt"
        submission_path = f"{submission_dir}/submission.csv"

        # data preparation -- when / then
        make_dataset(data_dir, interim_dir)
        interim_artifact = f"{interim_dir}/test_data.pkl"
        # NOTE(review): the same file is asserted twice; presumably one of
        # the checks was meant for a different artefact -- confirm.
        self.assertTrue(os.path.exists(interim_artifact))
        self.assertTrue(os.path.exists(interim_artifact))

        # feature engineering -- when / then
        build_features(data_dir, processed_dir)
        processed_artifact = f"{processed_dir}/test_data.pkl"
        # NOTE(review): duplicated check, same remark as above.
        self.assertTrue(os.path.exists(processed_artifact))
        self.assertTrue(os.path.exists(processed_artifact))

        # model training -- when / then
        train_model(model, mode, processed_dir, model_dir)
        self.assertTrue(os.path.exists(model_path))

        # model prediction -- when / then
        predict_model(processed_dir, model, model_path, submission_path)
        self.assertTrue(os.path.exists(submission_path))
def main_test():
    """Train on the full training set and print the elapsed wall time.

    Same flow as a submission run, but the submission and result-storage
    calls are disabled, so nothing is written by this function itself.
    """
    started_at = time.time()

    train = read_train_data(nrows=None)
    test = read_test_data()
    train, test = process_data(train, test)

    # Build the target and the design matrices without id/target columns
    y = train['target']
    X = train.drop(['ID_code', 'target'], axis=1)
    X_test = test.drop(['ID_code'], axis=1)

    oof, predictions, scores, feature_importance = train_model(
        X, X_test, y, params, plot_feature_importance=True)

    # Tag made of the metric name and the first AUC score scaled by 10000
    auc_scaled = int(scores['auc_score'].iloc[0] * 10000)
    str_metric_score = metric + '_0' + str(auc_scaled)
    comment = 'starter removed statistics feature, remove also 0 score, bagging_fraction1'

    # Disabled on purpose in this variant:
    # submit(test, predictions, str_metric_score)
    # storage_src(str_metric_score, scores, feature_importance, comment)

    print(time.time() - started_at)
Exemple #4
0
def main():
    """Grid-search a k-nearest-neighbours regressor on the housing data.

    Reads the interim training CSV, keeps a fixed subset of predictor
    columns and wraps a preprocessing + ``KNeighborsRegressor`` pipeline in
    a ``GridSearchCV`` (10 folds, scored by negative mean squared error)
    over ``n_neighbors`` in 2..20 and both ``weights`` options. The search
    object is handed to ``train_model.train_model``; the chosen parameters
    are printed and the result is passed to ``train_model.evaluate_model``.
    """
    # Load the interim training set
    housing_prices = pd.read_csv("data/interim/train_interim.csv")

    # Separate the predictors from the "SalePrice" target
    predictors, target = preprocessing.make_dataset(housing_prices, "SalePrice")

    # Only these columns are passed on to the model
    selected_columns = [
        "LotArea",
        "YearBuilt",
        "1stFlrSF",
        "2ndFlrSF",
        "FullBath",
        "BedroomAbvGr",
        "TotRmsAbvGrd",
        "HouseStyle",
    ]
    features = predictors[selected_columns]

    # Preprocessing step followed by the estimator
    knn_pipeline = make_pipeline(
        preprocessing.preprocess_pipeline(features),
        KNeighborsRegressor(),
    )

    # Hyper-parameter grid; keys follow make_pipeline's step naming
    param_grid = {
        "kneighborsregressor__n_neighbors": range(2, 21),
        "kneighborsregressor__weights": ["uniform", "distance"],
    }
    search = GridSearchCV(
        knn_pipeline, param_grid, cv=10, scoring="neg_mean_squared_error")

    # Train, report the selected parameters, then evaluate
    fitted_search, predictions, actual = train_model.train_model(
        features, target, search)
    print(f"Best parameters chosen: {fitted_search.best_params_}")
    train_model.evaluate_model(predictions, actual, fitted_search)
def main_submit():
    """Train with 10 folds, submit the predictions and archive the run.

    After training, the predictions are passed to ``submit`` and the
    scores, feature importances and a free-text comment to
    ``storage_src``. The elapsed wall time is printed at the end.
    """
    started_at = time.time()

    train = read_train_data(nrows=None)
    test = read_test_data()
    train, test = process_data(train, test)

    # Build the target and the design matrices without id/target columns
    y = train['target']
    X = train.drop(['ID_code', 'target'], axis=1)
    X_test = test.drop(['ID_code'], axis=1)

    training_options = {
        'n_fold': 10,
        'plot_feature_importance': True,
        'model_type': 'lgb_sklearn',
    }
    oof, predictions, scores, feature_importance = train_model(
        X, X_test, y, params, **training_options)

    # Tag made of the metric name and the first AUC score scaled by 10000
    auc_scaled = int(scores['auc_score'].iloc[0] * 10000)
    str_metric_score = metric + '_0' + str(auc_scaled)

    submit(test, predictions, str_metric_score)

    comment = 'add 5 max min feature before standard scale'
    storage_src(str_metric_score, scores, feature_importance, comment)

    print(time.time() - started_at)
import numpy as np
import pandas as pd
from src.models.train_model import train_model

# Training runs at import time; the returned mapping provides the objects
# stored under its 'model' and 'vect' keys, used by the functions below.
dic = train_model()

model, vect = dic['model'], dic['vect']


def predict_class(row):
    """Annotate *row* with the prediction for its ``feat_name`` entry.

    ``row['feat_name']`` is vectorized once with the module-level ``vect``
    and scored with the module-level ``model``. Three entries are written:

    * ``class_cat`` -- first element returned by ``model.predict``.
    * ``probabilities`` -- largest value of the first ``predict_proba``
      row, rounded to two decimals.
    * ``class`` -- ``'Das'``, ``'Der'`` or ``'Die'`` when ``class_cat``
      equals 0, 1 or 2; left untouched for any other value.

    Args:
        row: mapping-like object supporting item get/set (mutated in place).

    Returns:
        The same *row* object, after mutation.
    """
    # Vectorize once and reuse the result for both model calls.
    features = vect.transform([row['feat_name']])

    row['class_cat'] = model.predict(features)[0]
    row['probabilities'] = round(max(model.predict_proba(features)[0]), 2)

    # Category code -> article; compared with == so any numeric type that
    # equals 0/1/2 matches, exactly as a plain equality check would.
    articles = {0: 'Das', 1: 'Der', 2: 'Die'}
    for category, article in articles.items():
        if row['class_cat'] == category:
            row['class'] = article
    return row


def gen_df_results():

    feat_value = model.coef_[0]
    order_of_importance = (-feat_value).argsort()
    feat_names = np.array(vect.get_feature_names())

    dic_results = {'feat_name': feat_names[order_of_importance],
Exemple #7
0
def run(all_code_types,
        d_embedding,
        embedding_dropout_p,
        min_count,
        batch_size,
        verbose,
        epochs,
        lr,
        wd,
        logsig,
        sig_depth,
        run_name,
        patience,
        add_time,
        leadlag,
        t_scale,
        t_max,
        use_timestamps,
        feedforward_num_layers,
        feedforward_hidden_dims,
        feedforward_activations,
        feedforward_dropout,
        training_proportion=1,
        testing_subsample_size=None,
        split_paths=False,
        tensorboard_log=False,
        evaluate_on_test=True):
    """Run the experiment for either cross validation or testing.

    Data is built once with ``generate_ml_data``; for every fold a fresh
    model is created with ``init_sig`` and trained with ``train_model``.

    Args:
        all_code_types, min_count, verbose, training_proportion,
            testing_subsample_size: forwarded to ``generate_ml_data``
            (``all_code_types`` is also forwarded to ``init_sig``).
        batch_size: forwarded to ``generate_ml_data`` and used as the batch
            size of every ``DataLoader`` created here.
        d_embedding, embedding_dropout_p, sig_depth, logsig,
            feedforward_num_layers, feedforward_hidden_dims,
            feedforward_activations, feedforward_dropout, leadlag,
            add_time, t_max, t_scale, use_timestamps: forwarded to
            ``init_sig``.
        split_paths: forwarded to both ``generate_ml_data`` and
            ``init_sig``.
        lr, wd, patience, epochs: forwarded to ``train_model``.
        run_name: path component of the per-fold serialization directory.
        tensorboard_log: when truthy, a serialization directory is created
            for each fold.
        evaluate_on_test: when truthy, a serialization directory is created
            and the function returns the held-out test metrics right after
            the first fold.

    Returns:
        The result of ``evaluate`` on the test data when
        ``evaluate_on_test`` is truthy; otherwise the result of
        ``reformat_metrics`` over the metrics of all folds.
    """

    dataset, dataset_test, vocab = generate_ml_data(
        all_code_types,
        min_count,
        batch_size,
        verbose=verbose,
        allen_mode=True,
        dataset_path=None,
        training_proportion=training_proportion,
        testing_subsample_size=testing_subsample_size,
        split_paths=split_paths)

    logger.info("Using k-fold cross validation")
    # Allen kfold
    metrics_by_fold = []
    cross_validator = StratifiedKFold(n_splits=K_FOLDS, shuffle=True)

    n_splits = cross_validator.get_n_splits(dataset)

    # NOTE(review): the splitter object is called directly here. If this is
    # scikit-learn's StratifiedKFold, folds come from `.split(X, y)` and the
    # object itself is not callable -- confirm which class is imported.
    for fold_index, (train_indices, validation_indices) in enumerate(
            cross_validator(dataset)):
        logger.info(f"Fold {fold_index}/{n_splits - 1}")
        train_dataset = Subset(
            dataset,
            train_indices,
        )
        validation_dataset = Subset(dataset, validation_indices)
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
        validation_loader = DataLoader(dataset=validation_dataset,
                                       batch_size=batch_size,
                                       shuffle=True)
        # A fresh uuid is drawn per fold, so each fold gets its own
        # directory under TENSORBOARD_DIR/run_name.
        if tensorboard_log or evaluate_on_test:
            serialization_dir = os.path.join(TENSORBOARD_DIR, run_name,
                                             str(uuid.uuid4()),
                                             str(fold_index))
        else:
            serialization_dir = None

        # The model is re-initialised for every fold.
        model = init_sig(vocab, d_embedding, embedding_dropout_p, sig_depth,
                         logsig, all_code_types, feedforward_num_layers,
                         feedforward_hidden_dims, feedforward_activations,
                         feedforward_dropout, leadlag, add_time, t_max,
                         t_scale, use_timestamps, split_paths)
        # Device index passed on to train_model/evaluate: 0 when CUDA is
        # available, -1 otherwise.
        if torch.cuda.is_available():
            cuda_device = 0
            model = model.cuda(cuda_device)
            logger.info('USING CUDA GPU')
        else:
            cuda_device = -1

        fold_metrics, model = train_model(model, lr, wd, train_loader,
                                          validation_loader, patience, epochs,
                                          cuda_device, serialization_dir)
        if serialization_dir is not None:
            ex.add_artifact(
                os.path.join(serialization_dir,
                             'best.th'))  # Add file location to sacred log

        metrics_by_fold.append(fold_metrics)

        if evaluate_on_test:
            # NOTE(review): serialization_dir is always set above when
            # evaluate_on_test is truthy, so this guard cannot trigger as
            # the code stands.
            if serialization_dir is None:
                raise Exception(
                    'serialization_dir needed to load best model from validation'
                )
            test_dataloader = DataLoader(dataset=dataset_test,
                                         batch_size=batch_size,
                                         shuffle=True)  # Held out test data
            metrics = evaluate(model, test_dataloader, cuda_device)
            # Returns inside the loop: only the first fold is trained when
            # evaluating on the test set.
            return metrics
        torch.cuda.empty_cache()

    # Cross-validation path: combine the metrics of all folds.
    metrics = reformat_metrics(metrics_by_fold, ex)
    return metrics
Exemple #8
0
def main():
    """Render the Streamlit page: forecast and backtest for a market index.

    Flow:
      1. The sidebar selects a market and a date range (default: the last
         3150 days up to today).
      2. Index prices are downloaded with ``yf.download`` and percentage
         log returns ``rt`` are computed from ``Close``.
      3. ``create_shifted_rt`` and ``uniform_clustering`` prepare the
         lagged / clustered columns used below (``cluster_rt``,
         ``cluster_rt-1``, ``cluster_rt-5``, ``cluster_rt-37``).
      4. A model is trained on everything except the last
         ``predict_n_days`` rows; a prediction for the day after the last
         row is shown.
      5. Each of the last ``predict_n_days`` rows is predicted from its own
         lags and compared with what actually happened on that row;
         accuracy and RMSE are displayed together with price/return charts.
    """
    # Sidebar section:
    page_selection = st.sidebar.radio("Select a market:",
                                      ["Nikkey", "Bovespa"])

    dct_market = {
        "Nikkey": {
            "country": "Japan",
            "continent": "Asia",
            "index_name": "^N225"
        },
        "Bovespa": {
            "country": "Brazil",
            "continent": "America",
            "index_name": "^BVSP"
        }
    }

    st.markdown(f"# {page_selection}")

    # Default window, overridable from the sidebar.
    end_date = date.today()
    start_date = end_date - timedelta(days=3150)

    start_date = st.sidebar.date_input('Start date', start_date)
    end_date = st.sidebar.date_input('End date', end_date)

    df = yf.download(dct_market[page_selection]["index_name"],
                     start=start_date,
                     end=end_date)

    # Daily log return, in percent.
    df["rt"] = (np.log(df["Close"]) -
                np.log(df["Close"].shift(periods=1))) * 100

    df = create_shifted_rt(df, [1, 5, 37])

    df_clustered = uniform_clustering(
        df[["Close", "rt", "rt-1", "rt-5", "rt-37"]],
        ["rt", "rt-1", "rt-5", "rt-37"])
    df_clustered.dropna(how="any", axis=0, inplace=True)

    # (parent, child) pairs handed to train_model.
    lst_relations = [('cluster_rt-37', 'cluster_rt'),
                     ('cluster_rt-5', 'cluster_rt'),
                     ('cluster_rt-1', 'cluster_rt')]

    df_clustered = df_clustered[[
        "rt", "cluster_rt-37", "cluster_rt-5", "cluster_rt-1", "cluster_rt"
    ]]

    predict_n_days = 20

    # Hold the last predict_n_days rows out of training for the backtest.
    model = train_model(df_clustered.iloc[:-predict_n_days], lst_relations)

    # Evidence for the day after the last row: its lags of 37, 5 and 1 days
    # are the rows at positions -37, -5 and -1.
    evidence = {
        'cluster_rt-37': df_clustered.iloc[-37]['cluster_rt'],
        'cluster_rt-5': df_clustered.iloc[-5]['cluster_rt'],
        'cluster_rt-1': df_clustered.iloc[-1]['cluster_rt']
    }

    predict = predict_model(model, evidence=evidence)

    st.text(f"Previsão para amanhã: {predict[0]}")

    resultado = {}

    for i in np.arange(1, predict_n_days + 1):

        # Evidence for the row at position -i (its 37/5/1-day lags).
        evidence = {
            'cluster_rt-37': df_clustered.iloc[-37 - i]['cluster_rt'],
            'cluster_rt-5': df_clustered.iloc[-5 - i]['cluster_rt'],
            'cluster_rt-1': df_clustered.iloc[-1 - i]['cluster_rt']
        }

        predict = predict_model(model, evidence=evidence)

        # Compare against the same row the evidence targets, i.e. position
        # -i counted from the end (not the i-th row from the start).
        resultado[i] = [
            predict[0]['cluster_rt'], df_clustered.iloc[-i]['cluster_rt'],
            df_clustered.iloc[-i]['rt']
        ]

    resultado = pd.DataFrame.from_dict(resultado, orient='index')
    resultado.rename(columns={0: 'Previsão', 1: 'Real', 2: 'rt'}, inplace=True)

    # Mean return per realised cluster within the backtest window.
    rt_mean = round(
        resultado.groupby(by=["Real"]).agg(
            {"rt": ["min", "max", "count", "mean"]}), 2)[("rt", "mean")]

    if page_selection == "Nikkey":
        conditions = [
            resultado["Previsão"] == 1.0, resultado["Previsão"] == 2.0,
            resultado["Previsão"] == 3.0, resultado["Previsão"] == 4.0,
            resultado["Previsão"] == 5.0, resultado["Previsão"] == 6.0
        ]
    elif page_selection == "Bovespa":
        conditions = [
            resultado["Previsão"] == 1.0, resultado["Previsão"] == 2.0,
            resultado["Previsão"] == 3.0, resultado["Previsão"] == 4.0
        ]

    # NOTE(review): np.select needs as many choices as conditions; choices
    # has one entry per distinct "Real" cluster in the window, so this
    # presumably relies on every cluster occurring there -- confirm.
    choices = rt_mean.tolist()

    resultado["rt_predict"] = np.select(conditions, choices, default=np.nan)

    # Rows were collected newest-first; flip to oldest-first before
    # accumulating.
    resultado = resultado[::-1]

    resultado["rt_predict_acumulado"] = resultado["rt_predict"].cumsum()
    resultado["rt_acumulado"] = resultado["rt"].cumsum()

    st.dataframe(resultado)

    rmse_uniform = mean_squared_error(resultado["rt"],
                                      resultado["rt_predict"],
                                      squared=False)

    acuracia = accuracy_score(resultado["Real"],
                              resultado["Previsão"],
                              normalize=True)

    st.text(f"Acurácia: {round(acuracia*100, 2)}%")
    st.text(f"RMSE: {round(rmse_uniform, 2)}%")

    st.line_chart(df[['Close']])

    st.line_chart(df["rt"])
"""Use prediction model and evaluate it"""
from os import path
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from src.models import train_model
from src.visualizations import Visualize

# Project root; passed to the training helper and used as the base folder
# for the report figures written below.
base_path = '/home/chpatola/Desktop/Skola/Python/cookie_nlp/'

# 1. Train, then inspect the model on the returned test split
#    (tf_idf is returned by the helper but not used in this script).
test_X, test_y, bag, tf_idf = train_model.train_model(base_path)

print(f"Mean accurancy in validation: {100*bag.best_score_:.2f} %")

predictions = bag.predict(test_X)
print(f"Predictions:\n {predictions[:3]} \nTruth:\n {test_y[:3]}")
print(test_X[:3])

# 2. Save the classification report and the confusion matrix as figures
classi_rep = Visualize._plot_classification_report(test_y, predictions)
classi_rep.savefig(
    path.join(base_path, 'reports/figures/classificationReport.png'),
    bbox_inches='tight',
)

# Sorted distinct labels, used as the confusion-matrix label order
parties = test_y.sort_values().unique()
Visualize.cm_analysis(
    test_y,
    predictions,
    path.join(base_path, 'reports/figures/confusion_matrix.png'),
    labels=parties,
)

# 3. Print the same results to stdout
print(confusion_matrix(test_y, predictions))
print(classification_report(test_y, predictions))