Esempio n. 1
0
def test_get_data_jb():
    """Round-trip a small list through ``save_data`` and ``get_data``.

    The list is saved with ``method='jb'`` under the sample project's
    processed-data directory and then loaded back from the ``.jbl`` path
    the test expects the save to have produced. Both the value and the
    exact type of the loaded object must match what was saved.
    """
    expected = [1, 2, 3, 4]
    save_data(expected,
              name='tests/sampletest/data/processed/proce',
              method='jb')
    output = get_data(path='tests/sampletest/data/processed/proce.jbl')

    assert expected == output
    # Exact type identity is intended here (not isinstance): the loaded
    # object must come back as the very same type that was saved.
    assert type(expected) is type(output)
Esempio n. 2
0
#       format_version: '1.5'
#       jupytext_version: 1.5.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pandas as pd
import numpy as np
import seaborn as sns
import datasist.project as dp
import datasist as ds

# Load the training set from the raw data directory through datasist
data = dp.get_data('train.csv', loc='raw', method='csv')
ds.structdata.describe(data)

# +
# Show which columns have missing values
ds.structdata.display_missing(data)

# Separate the label from the features: keep the Rating column as the
# target, then remove it from the frame in place
label = data['Rating']
data.drop(columns=['Rating'], inplace=True)


# Encoder intended for label-encoding the categorical features
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
Esempio n. 3
0
#       jupytext_version: 1.5.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import datasist.project as dp
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

#retrieve the processed features and their labels
data = dp.get_data("train_proc.csv", method='csv')
label = dp.get_data("train_labels.csv", method='csv')

#base model with random forest
rf = RandomForestRegressor(n_estimators=10, random_state=2)

# 5-fold cross-validation. scikit-learn reports this metric as the
# *negated* mean squared error so that "higher is better" holds.
fold_scores = cross_val_score(estimator=rf,
                              X=data,
                              y=label.Rating,
                              cv=5,
                              scoring="neg_mean_squared_error",
                              n_jobs=-1)

# Flip the sign back to get the mean MSE, then take the square root so
# that the value printed below really is a root-mean-squared error.
score = np.sqrt(-1 * np.mean(fold_scores))
print("RMSE is {}".format(score))

# cross_val_score fits clones of the estimator and leaves `rf` itself
# unfitted; fit it on the full training data so the saved model can
# actually be used for prediction.
rf.fit(data, label.Rating)

#save the model
dp.save_model(rf, name='rf_model_n10')