Ejemplo n.º 1
0
def test_get_data_jb():
    """Round-trip a plain list through ``save_data`` and ``get_data``.

    The list is written with ``method='jb'`` under the sample project's
    ``data/processed`` folder and read back from the matching ``.jbl`` path.
    Both the value and the concrete type must survive the round trip.
    """
    original = [1, 2, 3, 4]
    save_data(
        original,
        name='tests/sampletest/data/processed/proce',
        method='jb',
    )

    loaded = get_data(path='tests/sampletest/data/processed/proce.jbl')

    assert original == loaded
    # Same concrete class on both sides (identity check on the type objects).
    assert type(original) is type(loaded)
Ejemplo n.º 2
0
def test_save_data_before_init():
    """Test data saving in an un-initialized project.

    A small DataFrame is saved twice with ``method='jb'`` under ``tests/``;
    the test passes when both ``.jbl`` files appear in that directory.

    The generated files are removed in a ``finally`` clause, so a failing
    assertion (or a failing save) does not leave artifacts behind that could
    make a later run pass spuriously.
    """
    expected1 = 'proc_file.jbl'
    expected2 = 'raw_file.jbl'

    aa = pd.DataFrame([1, 2, 3, 4, 5])

    try:
        save_data(aa, name='tests/proc_file', method='jb')
        save_data(aa, name='tests/raw_file', method='jb')

        # List the directory once; both checks look at the same snapshot.
        created = os.listdir('tests')
        assert expected1 in created
        assert expected2 in created
    finally:
        # Remove only what was actually written. The exists() guard keeps a
        # failed save from being masked by a FileNotFoundError raised here.
        for leftover in (expected1, expected2):
            leftover_path = os.path.join('tests', leftover)
            if os.path.exists(leftover_path):
                os.remove(leftover_path)
Ejemplo n.º 3
0
def test_save_data_jbl():
    """Test data saving in a directory structure created with datasist's
    start_project function.

    The data path is read from ``tests/sampletest/config.txt`` (JSON, key
    ``'datapath'``). One DataFrame is saved with ``method='jb'`` into the
    ``processed`` and ``raw`` sub-folders, and each folder must then contain
    the corresponding ``.jbl`` file.
    """
    with open(os.path.join('tests/sampletest', 'config.txt')) as fh:
        settings = json.load(fh)

    frame = pd.DataFrame([1, 2, 3, 4, 5])

    # (loc argument, target folder, file stem) -- processed first, then raw,
    # which is the order in which the files are saved and checked.
    targets = (
        ('processed', os.path.join(settings['datapath'], 'processed'), 'proc_file'),
        ('raw', os.path.join(settings['datapath'], 'raw'), 'raw_file'),
    )

    for loc, folder, stem in targets:
        save_data(frame, name='/'.join((folder, stem)), method='jb', loc=loc)

    for _, folder, stem in targets:
        assert stem + '.jbl' in os.listdir(folder)
Ejemplo n.º 4
0
# Read 'train.csv' from the raw data directory using datasist
# (`dp.get_data` with loc='raw', method='csv'), then show a summary of it.
data = dp.get_data('train.csv', loc='raw', method='csv')
ds.structdata.describe(data)

# +
# Check for missing values.
ds.structdata.display_missing(data)

# Separate the label (the 'Rating' column) from the features.
# The column is dropped from `data` in place, so `data` holds only features
# from here on.
label = data.Rating
data.drop(columns=['Rating'], inplace=True)


# Label-encode the features.
# NOTE: the loop below encodes *every* remaining column, not only the
# categorical ones. The single LabelEncoder is re-fitted on each column by
# fit_transform, so after the loop `lb` only reflects the last column.
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

for col in data.columns:
    data[col] = lb.fit_transform(data[col])


# Preview the encoded frame.
data.head()
# -

# Export the processed features and the label via datasist's save_data
# (presumably into the project's processed folder -- the default `loc` is
# not visible here; confirm against datasist).
dp.save_data(data, 'train_proc')
dp.save_data(label, 'train_labels')