def test_get_data_jb():
    """Round-trip a plain list through ``save_data`` and ``get_data``.

    The list is saved with ``method='jb'`` under the sample project's
    processed-data folder and read back from the resulting ``.jbl`` file;
    both the value and the exact type must survive the round trip.
    """
    stem = 'tests/sampletest/data/processed/proce'
    expected = [1, 2, 3, 4]

    save_data(expected, name=stem, method='jb')
    output = get_data(path=stem + '.jbl')

    assert expected == output
    # Exact type match (not just isinstance): a list must come back as a list.
    assert type(expected) is type(output)
def test_save_data_before_init():
    """Test data saving in an un-initialized project.

    Saves the same DataFrame twice with ``method='jb'`` under ``tests/`` and
    checks that ``proc_file.jbl`` and ``raw_file.jbl`` show up in that
    directory.

    The generated files are removed in a ``finally`` clause so that a failing
    assertion (or a failing second save) does not leave artifacts behind in
    the ``tests`` directory.
    """
    expected_files = ('proc_file.jbl', 'raw_file.jbl')
    aa = pd.DataFrame([1, 2, 3, 4, 5])

    try:
        save_data(aa, name='tests/proc_file', method='jb')
        save_data(aa, name='tests/raw_file', method='jb')

        created = os.listdir('tests')
        for filename in expected_files:
            assert filename in created
    finally:
        # Only remove what actually exists: if save_data itself failed, a
        # FileNotFoundError here would mask the real error.
        for filename in expected_files:
            leftover = os.path.join('tests', filename)
            if os.path.exists(leftover):
                os.remove(leftover)
def test_save_data_jbl():
    """Test data saving in a directory structure created with datasist's
    start_project function.

    Reads ``datapath`` from ``tests/sampletest/config.txt`` (parsed as JSON),
    saves one DataFrame into its ``processed`` and ``raw`` sub-folders with
    ``method='jb'``, and checks that the ``.jbl`` files are listed there.
    """
    frame = pd.DataFrame([1, 2, 3, 4, 5])

    with open(os.path.join('tests/sampletest', 'config.txt')) as handle:
        settings = json.load(handle)

    data_root = settings['datapath']
    raw_dir = os.path.join(data_root, 'raw')
    proc_dir = os.path.join(data_root, 'processed')

    save_data(frame, name=proc_dir + '/proc_file', method='jb', loc='processed')
    save_data(frame, name=raw_dir + '/raw_file', method='jb', loc='raw')

    assert 'proc_file.jbl' in os.listdir(proc_dir)
    assert 'raw_file.jbl' in os.listdir(raw_dir)
# Read data from the raw data directory using datasist.
data = dp.get_data('train.csv', loc='raw', method='csv')
ds.structdata.describe(data)

# +
from sklearn.preprocessing import LabelEncoder

# Check for missing values.
ds.structdata.display_missing(data)

# Separate the label from the data.
label = data['Rating']
data.drop(columns=['Rating'], inplace=True)

# Encode the features with label encoding. Note that every remaining column
# is passed through the encoder, which is re-fitted for each column.
lb = LabelEncoder()
for col in data.columns:
    data[col] = lb.fit_transform(data[col])

data.head()
# -

# Export the processed data and label to the processed folder.
dp.save_data(data, 'train_proc')
dp.save_data(label, 'train_labels')