# Common imports assumed by the preprocess_data variants below; TrainData is
# assumed to be a small container (e.g. a namedtuple) for the feature and
# target arrays.
from typing import Tuple, Union

import numpy as np
from sklearn.preprocessing import StandardScaler


def preprocess_data(dat, col_names, scale) -> TrainData:
    # Transform with a scaler that was already fitted elsewhere.
    proc_dat = scale.transform(dat)

    # Mask is True for feature columns and False for target columns.
    mask = np.ones(proc_dat.shape[1], dtype=bool)
    dat_cols = list(dat.columns)
    for col_name in col_names:
        mask[dat_cols.index(col_name)] = False

    feats = proc_dat[:, mask]
    targs = proc_dat[:, ~mask]

    return TrainData(feats, targs)
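# A minimal usage sketch for the pre-fitted-scaler variant above, e.g. for
# preprocessing validation data with a scaler that was fitted on the training
# set. TrainData is defined here only as a hypothetical (feats, targs)
# namedtuple, and the column names are illustrative, not taken from the
# original code.
import collections

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

TrainData = collections.namedtuple("TrainData", ["feats", "targs"])  # assumed shape

train_df = pd.DataFrame(np.random.rand(80, 3), columns=["x1", "x2", "target"])
val_df = pd.DataFrame(np.random.rand(20, 3), columns=["x1", "x2", "target"])

scaler = StandardScaler().fit(train_df)                  # fit on training data only
val_data = preprocess_data(val_df, ["target"], scaler)   # reuse the fitted scaler
print(val_data.feats.shape, val_data.targs.shape)        # (20, 2) (20, 1)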
def preprocess_data(dat, col_names) -> Tuple[TrainData, StandardScaler]:
    scale = StandardScaler().fit(dat)
    proc_dat = scale.transform(dat)

    mask = np.ones(proc_dat.shape[1], dtype=bool)
    dat_cols = list(dat.columns)
    for col_name in col_names:
        mask[dat_cols.index(col_name)] = False

    feats = proc_dat[:, mask]
    targs = proc_dat[:, ~mask]

    return TrainData(feats, targs), scale
def preprocess_data(dat, col_names) -> Tuple[TrainData, StandardScaler]: """ This function will simply standardize the data and also mask the target then split in features and target + send the scaler """ scale = StandardScaler().fit(dat) proc_dat = scale.transform(dat) mask = np.ones(proc_dat.shape[1], dtype=bool) dat_cols = list(dat.columns) for col_name in col_names: mask[dat_cols.index(col_name)] = False feats = proc_dat[:, mask] targs = proc_dat[:, ~mask] return TrainData(feats, targs), scale
def preprocess_data(
    passed_raw_data, label_col_names
) -> Tuple[TrainData, StandardScaler]:
    # "->" annotates the return type.
    # Fit the scaler: compute the mean and standard deviation of every column
    # in the raw data.
    scale = StandardScaler().fit(passed_raw_data)
    # Scale the data with the mean and standard deviation computed above;
    # processed_data.shape is e.g. (40560, 82).
    processed_data = scale.transform(passed_raw_data)

    # Array of True with one entry per column of the processed data.
    mask = np.ones(processed_data.shape[1], dtype=bool)
    # List of the column names (not the columns themselves).
    dat_cols = list(passed_raw_data.columns)
    # Set the mask entry of every label column to False.
    for col_name in label_col_names:
        mask[dat_cols.index(col_name)] = False

    # All rows, but only the columns whose mask entry is True, e.g. (40560, 81).
    features = processed_data[:, mask]
    # All rows, but only the columns whose mask entry is False.
    targets = processed_data[:, ~mask]

    # TrainData(features, targets) bundles the feature and target ndarrays.
    return TrainData(features, targets), scale
def preprocess_data(
    dat, speed_data, col_names, mean_stand=True
) -> Tuple[TrainData, Union[StandardScaler, np.ndarray], Union[StandardScaler, np.ndarray]]:
    if mean_stand:
        # Centre both data sets on their column means; the returned "scalers"
        # are the mean vectors in this branch.
        scale = np.array(dat.mean())
        scale_speed = np.array(speed_data.mean())
        proc_dat = np.array(dat - dat.mean())
        proc_dat_speed = np.array(speed_data - speed_data.mean())
    else:
        # Standardize both data sets and return the fitted scalers.
        scale = StandardScaler().fit(dat)
        proc_dat = scale.transform(dat)
        scale_speed = StandardScaler().fit(speed_data)
        proc_dat_speed = scale_speed.transform(speed_data)

    mask = np.ones(proc_dat.shape[1], dtype=bool)
    dat_cols = list(dat.columns)
    for col_name in col_names:
        mask[dat_cols.index(col_name)] = False

    feats = proc_dat[:, mask]
    targs = proc_dat[:, ~mask]

    return TrainData(feats, targs, proc_dat_speed), scale, scale_speed
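# A hypothetical call of the speed-data variant above. It assumes that in this
# variant TrainData carries a third field for the centred/scaled speed array;
# the namedtuple below and all column names are illustrative assumptions.
import collections

import numpy as np
import pandas as pd

TrainData = collections.namedtuple("TrainData", ["feats", "targs", "speed"])  # assumed 3-field form

dat = pd.DataFrame(np.random.rand(50, 3), columns=["f1", "f2", "target"])
speed = pd.DataFrame(np.random.rand(50, 2), columns=["s1", "s2"])

# mean_stand=True: both frames are centred on their column means, and the
# second and third return values are the mean vectors (ndarrays).
train_data, means, speed_means = preprocess_data(dat, speed, ["target"], mean_stand=True)

# mean_stand=False: both frames are standardized, and fitted StandardScaler
# objects are returned instead, so predictions can be inverse-transformed later.
train_data, scaler, speed_scaler = preprocess_data(dat, speed, ["target"], mean_stand=False)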