def get_transfer_inds(self, labels_or_ids):
    if labels_or_ids is None:
        return array_functions.true(self.n)
    if self.is_regression:
        return array_functions.find_set(self.data_set_ids, labels_or_ids)
    else:
        return array_functions.find_set(self.true_y, labels_or_ids)

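# Note: array_functions.find_set is not defined in these snippets. Judging by how
# its result is combined with boolean masks (&, |, ~), negated, and passed to
# get_subset, it presumably returns a boolean membership mask. A minimal sketch
# of the assumed semantics (the np.in1d-based body is a guess, not the actual
# implementation):
import numpy as np

def find_set(values, to_find):
    # True wherever values[i] is a member of to_find.
    return np.in1d(values, np.asarray(to_find))
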
def train_and_test(self, data):
    source_data = self.get_source_data(data)
    # Because the source learner is probably fully labeled, make sure we're not
    # using validation parameter tuning
    self.source_learner.configs.use_validation = False
    viz_mds = False
    if viz_mds:
        source_labels = self.configs.source_labels
        target_labels = self.configs.target_labels
        data.data_set_ids[:] = 0
        data.data_set_ids[array_functions.find_set(data.y, source_labels[0, :])] = 1
        data.data_set_ids[array_functions.find_set(data.y, source_labels[1, :])] = 2
        data.change_labels(source_labels, target_labels)
        array_functions.plot_MDS(data.x, data.true_y, data.data_set_ids)
    if self.train_source_learner:
        if self.use_stacking:
            self.source_learner.train_and_test(data)
        else:
            self.source_learner.train_and_test(source_data)
    data_copy = self._prepare_data(data, include_unlabeled=True)
    data_copy = data_copy.get_transfer_subset(self.configs.target_labels, include_unlabeled=True)
    data_copy = data_copy.get_subset(data_copy.is_target)
    return super(HypothesisTransfer, self).train_and_test(data_copy)

def train_and_test(self, data):
    source_order = self.configs.source_domain_order
    target_order = self.configs.target_domain_order
    #results = super(DomainModelShiftMethod, self).train_and_test(data_copy)
    source_to_keep = array_functions.find_set(data.data_set_ids, source_order)
    source_data = data.get_subset(source_to_keep)
    source_data.y = source_data.true_y
    source_configs = deepcopy(self.configs)
    source_configs.labels_to_keep = source_order
    source_configs.labels_to_not_sample = np.asarray([source_order[0]])
    source_configs.source_labels = np.asarray([source_order[0]])
    source_configs.target_labels = np.asarray([source_order[1]])
    source_transformation = local_transfer_methods.OffsetTransfer(source_configs)
    source_transformation.use_validation = True
    source_transformation.train_and_test(source_data)
    target_to_keep = array_functions.find_set(data.data_set_ids, [target_order[0]])
    target_data = data.get_subset(target_to_keep)
    target_data.reveal_labels(target_data.data_set_ids == target_order[0])
    target_configs = deepcopy(self.configs)
    target_configs.labels_to_keep = np.asarray([target_order[0]])
    target_configs.source_labels = np.asarray([])
    target_configs.target_labels = np.asarray([target_order[0]])
    offset_labels = source_transformation.predict(target_data).y
    target_data.y = offset_labels
    target_data.true_y = offset_labels
    self.target_learner = method.NadarayaWatsonMethod(target_configs)
    self.target_learner.use_validation = True
    self.target_learner.train_and_test(target_data)
    t = data.get_subset(data.data_set_ids == target_order[1])
    return super(DomainModelShiftMethod, self).train_and_test(t)

def get_transfer_subset(self, labels_or_ids, include_unlabeled=False):
    assert len(labels_or_ids) > 0
    if self.is_regression:
        inds = array_functions.find_set(self.data_set_ids, labels_or_ids)
        if not include_unlabeled:
            inds = inds & self.is_labeled
    else:
        inds = array_functions.find_set(self.y, labels_or_ids)
        if include_unlabeled:
            inds = inds | ~self.is_labeled
    return self.get_subset(inds)

def get_transfer_subset(self, labels_or_ids, include_unlabeled=False):
    # Wrap a scalar label/id in an array; indexing a scalar raises TypeError
    # (the original used a bare except, narrowed here).
    try:
        labels_or_ids[0]
    except (TypeError, IndexError):
        labels_or_ids = np.asarray([labels_or_ids])
    assert len(labels_or_ids) > 0
    if self.is_regression:
        inds = array_functions.find_set(self.data_set_ids, labels_or_ids)
        if not include_unlabeled:
            inds = inds & self.is_labeled
    else:
        inds = array_functions.find_set(self.y, labels_or_ids)
        if include_unlabeled:
            inds = inds | ~self.is_labeled
    return self.get_subset(inds)

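# This variant differs from the preceding get_transfer_subset only in accepting a
# scalar label or id. A hypothetical call illustrating the difference (d stands
# for any Data instance):
#   d.get_transfer_subset(3)        # works here; fails the len() assert above
#   d.get_transfer_subset([3, 7])   # works in both variants
# np.atleast_1d(labels_or_ids) would express the same wrapping more directly.
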
def test_mnist():
    num_per_class = 50
    data = helper_functions.load_object('../data_sets/mnist/raw_data.pkl')
    classes_to_use = [0, 4, 8, 7]
    I = array_functions.find_set(data.y, classes_to_use)
    data = data.get_subset(I)
    to_keep = None
    for i in classes_to_use:
        inds = (data.y == i).nonzero()[0]
        I = np.random.choice(inds, size=num_per_class, replace=False)
        if to_keep is None:
            to_keep = I
        else:
            to_keep = np.concatenate((to_keep, I))
    data.change_labels([classes_to_use[1], classes_to_use[3]], [classes_to_use[0], classes_to_use[2]])
    data.change_labels([classes_to_use[0], classes_to_use[2]], [0, 1])
    # to_keep holds integer indices, so ~to_keep would be bitwise negation rather
    # than a set complement; build a boolean mask before inverting.
    to_keep_mask = np.zeros(data.n, dtype=bool)
    to_keep_mask[to_keep] = True
    data_test = data.get_subset(~to_keep_mask)
    data = data.get_subset(to_keep_mask)
    label_names = [
        str(classes_to_use[0]) + '+' + str(classes_to_use[1]),
        str(classes_to_use[2]) + '+' + str(classes_to_use[3]),
    ]
    #data = add_label_noise_cluster(data, num_neighbors=20)
    #data = add_label_noise(data, 20)
    test_methods(data.x, data.y, data_test.x, data_test.y, label_names, mnist=True)

def create_stations(file):
    feat_names, data = create_data_set.load_csv(file, True, dtype='str', delim=',', num_rows=1000000000)
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

def get_split(self, i, num_labeled=None):
    if 'use_data_set_ids' not in self.__dict__:
        self.use_data_set_ids = True
    d = copy.deepcopy(self.data)
    split = self.splits[i]
    d.apply_split(split)
    if self.labels_to_keep is not None:
        #d = d.get_with_labels(self.labels_to_keep)
        d = d.get_transfer_subset(self.labels_to_keep)
    to_keep = None
    if hasattr(self, 'data_set_ids_to_keep') and self.data_set_ids_to_keep is not None:
        to_keep = array_functions.find_set(d.data_set_ids, self.data_set_ids_to_keep)
        d.y[to_keep] = np.nan
    if num_labeled is not None:
        if d.is_regression:
            target_labels = self.target_labels
            if target_labels is None:
                target_labels = [0]
            for label in target_labels:
                labeled_inds = np.nonzero(d.is_train & d.is_labeled)[0]
                if self.use_data_set_ids and \
                        d.data_set_ids is not None and \
                        self.target_labels is not None:
                    #labeled_inds = np.nonzero(d.is_train & (d.data_set_ids == self.target_labels))[0]
                    labeled_inds = np.nonzero(d.is_train & (d.data_set_ids == label))[0]
                if np.isfinite(num_labeled):
                    to_clear = labeled_inds[num_labeled:]
                    d.y[to_clear] = np.nan
            d.y[d.is_test] = np.nan
        else:
            d.y = d.y.astype('float32')
            d.true_y = d.true_y.astype('float32')
            classes = d.classes
            for c in classes:
                if c in self.labels_to_not_sample:
                    continue
                class_inds_train = (d.y == c) & d.is_train
                if to_keep is not None:
                    class_inds_train[to_keep] = False
                class_inds_train = np.nonzero(class_inds_train)[0]
                if np.isfinite(num_labeled):
                    assert len(class_inds_train) >= num_labeled
                    d.y[class_inds_train[num_labeled:]] = np.nan
                class_inds_test = (d.y == c) & ~d.is_train
                if to_keep is not None:
                    class_inds_test[to_keep] = False
                class_inds_test = np.nonzero(class_inds_test)[0]
                d.y[class_inds_test] = np.nan
    if to_keep is not None:
        d.y[to_keep] = d.true_y[to_keep]
    return d

def create_forest_fires():
    months = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    days = {'sun': 1, 'mon': 2, 'tue': 3, 'wed': 4, 'thu': 5, 'fri': 6, 'sat': 7}
    #month_to_season = lambda x: (months[x]-1)/3
    month_to_season = lambda x: months[x]
    day_to_int = lambda x: days[x]
    file = 'forest_fires/forestfires.csv'
    converters = {2: month_to_season, 3: day_to_int}
    field_names, forest_data = load_csv(file, dtype='float', converters=converters)
    x = forest_data
    y = forest_data[:, -1]
    i = field_names == 'month'
    domain_ids = forest_data[:, i]
    months_to_use = np.asarray([6, 7, 8])
    #months_to_use = np.asarray([1,2,3,4,5,6,7,8,9,10,11,12])
    to_use = array_functions.find_set(domain_ids, months_to_use)
    x = x[to_use, :]
    y = y[to_use]
    domain_ids = domain_ids[to_use]
    x = x[:, 4:]
    field_names = field_names[4:]
    I = (y > 0) & (y < 700)
    x = x[I, :]
    y = y[I]
    domain_ids = domain_ids[I]
    from methods import method
    learner = method.NadarayaWatsonMethod()
    viz_features(x, y, domain_ids, field_names, learner=learner)

def get_split(self, i, num_labeled=None):
    if 'use_data_set_ids' not in self.__dict__:
        self.use_data_set_ids = True
    d = copy.deepcopy(self.data)
    split = self.splits[i]
    d.apply_split(split)
    if self.labels_to_keep is not None:
        #d = d.get_with_labels(self.labels_to_keep)
        d = d.get_transfer_subset(self.labels_to_keep)
    to_keep = None
    if hasattr(self, 'data_set_ids_to_keep') and self.data_set_ids_to_keep is not None:
        to_keep = array_functions.find_set(d.data_set_ids, self.data_set_ids_to_keep)
        d.y[to_keep] = np.nan
    if num_labeled is not None:
        if d.is_regression:
            labeled_inds = np.nonzero(d.is_train & d.is_labeled)[0]
            if self.use_data_set_ids and \
                    d.data_set_ids is not None and \
                    self.target_labels is not None:
                labeled_inds = np.nonzero(d.is_train & (d.data_set_ids == self.target_labels))[0]
            to_clear = labeled_inds[num_labeled:]
            d.y[to_clear] = np.nan
            d.y[d.is_test] = np.nan
        else:
            d.y = d.y.astype('float32')
            d.true_y = d.true_y.astype('float32')
            classes = d.classes
            for c in classes:
                if c in self.labels_to_not_sample:
                    continue
                class_inds_train = (d.y == c) & d.is_train
                if to_keep is not None:
                    class_inds_train[to_keep] = False
                class_inds_train = np.nonzero(class_inds_train)[0]
                assert len(class_inds_train) >= num_labeled
                d.y[class_inds_train[num_labeled:]] = np.nan
                class_inds_test = (d.y == c) & ~d.is_train
                if to_keep is not None:
                    class_inds_test[to_keep] = False
                class_inds_test = np.nonzero(class_inds_test)[0]
                d.y[class_inds_test] = np.nan
    if to_keep is not None:
        d.y[to_keep] = d.true_y[to_keep]
    return d

def train_and_test(self, data):
    assert data.is_regression
    # The original if data.is_regression/else had identical branches, so a
    # single call suffices.
    source_data = data.get_transfer_subset(self.configs.source_labels.ravel(), include_unlabeled=False)
    source_data.set_target()
    source_data.set_train()
    source_data.reveal_labels(~source_data.is_labeled)
    if source_data.is_regression:
        source_data.data_set_ids[:] = self.configs.target_labels[0]
    if not data.is_regression:
        source_data.change_labels(self.configs.source_labels, self.configs.target_labels)
    source_data = source_data.rand_sample(.1)
    self.source_learner.train_and_test(source_data)
    data_copy = data.get_transfer_subset(self.configs.labels_to_keep, include_unlabeled=True)
    is_source = array_functions.find_set(data_copy.data_set_ids, self.configs.source_labels)
    data_copy.type[is_source] = data_lib.TYPE_SOURCE
    assert data_copy.is_source.any()
    return super(SMSTransfer, self).train_and_test(data_copy)

            continue
        source_label = data.classes[source_idx]
        I = (data_copy.true_y == source_label) | (data_copy.true_y == base_label)
        data_source = data_copy.get_subset(I)
        source_results = learner.train_and_test(data_source)
        transfer_error[source_idx, source_idx] += source_results.error_on_test_data
        for target_idx in range(num_classes):
            if target_idx == source_idx or target_idx == base_class_idx:
                continue
            target_label = data.classes[target_idx]
            if use_transfer:
                all_labels = np.asarray([base_label, target_label, source_label])
                I = array_functions.find_set(data_copy.true_y, all_labels)
                data_target = data_copy.get_subset(I)
                data_base = data_copy.get_subset(data_copy.true_y == base_label)
                # Create a new label to duplicate base data
                new_label = data.classes.max() + 1
                data_base.change_labels([base_label], [new_label])
                data_target.combine(data_base)
                data_target.data_set_ids = None
                transfer_learner.configs.source_labels = np.expand_dims(np.asarray([source_label, base_label]), 0)
                transfer_learner.configs.target_labels = np.asarray([target_label, new_label])
                transfer_results = transfer_learner.train_and_test(data_target).prediction
            else:

def load_trip_data(file_names, y_names, time_name, loc_names,
                   resolution=np.asarray([20, 20]), plot_data=True):
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype='str', delim=',', num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    y_inds = None
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        y = data[:, y_inds].astype(np.float)
    else:
        y = np.ones(data.shape[0])
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_str_to_idx = dict()
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        date_obj = to_date(date_str)
        date_str_to_idx[date_str] = date_obj.toordinal()
        date_ids[i] = date_obj.toordinal()
    date_ids = date_ids.astype(np.int)
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(np.float)
    p_min = .3
    p_max = .7
    is_in_range = array_functions.is_in_percentile(locs[:, 0], p_min, p_max) & \
        array_functions.is_in_percentile(locs[:, 1], p_min, p_max)
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    #array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    #y = trip_counts[[0, 3], :].T
    tuesday_saturday_idx = np.asarray([0, 4])
    first_tuesday_idx = np.asarray([0, 154])
    #y = trip_counts[first_tuesday_idx + 0, :].T
    '''
    y1 = trip_counts[:30,:].sum(0)
    y2 = trip_counts[154:, :].sum(0)
    '''
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)
    #y[y > 100] = 0
    #y[y > 5000] = 0
    #y[y == y.max()] == 0
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=np.float), y, np.asarray([str(xy) for xy in xy_bins])

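# load_trip_data relies on two helpers, quantize_loc and bin_to_idx, that are not
# shown in these snippets. A minimal sketch consistent with how they are called
# above (the bodies are assumptions, not the original implementations):
import numpy as np

def quantize_loc(values, num_bins):
    # Map continuous coordinates to integer bin indices in [0, num_bins).
    scaled = (values - values.min()) / (values.max() - values.min() + 1e-12)
    return np.minimum((scaled * num_bins).astype(int), num_bins - 1)

def bin_to_idx(bin_coords, resolution):
    # Flatten an (x, y) bin pair into a linear cell index, matching
    # trip_counts' num_locations = np.prod(resolution) columns.
    return bin_coords[0] * resolution[1] + bin_coords[1]
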
def get_with_labels_and_unlabeled(self, labels):
    inds = array_functions.find_set(self.true_y, labels) | ~self.is_labeled
    return self.get_subset(inds)

def get_data_set_ids(self, data_set_ids):
    inds = array_functions.find_set(self.data_set_ids, data_set_ids)
    return self.get_subset(inds)

    )
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

station_names, station_locs = create_stations(station_file_name)
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',', num_rows=1000000000)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
date_strs = data[:, find_first_element(feat_names, 'starttime')]
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)
#y_sub = y[I, :]
#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(np.str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(np.str)

def selection_subset_ids(data, ids):
    return array_functions.find_set(data.data_set_ids, ids)

def select_classes(data, classes):
    return array_functions.find_set(data.true_y, classes)

def has_label(self, labels):
    return array_functions.find_set(self.y, labels)

def get_with_labels(self, labels):
    inds = array_functions.find_set(self.true_y, labels)
    return self.get_subset(inds)

def get_transfer_inds(self, labels_or_ids):
    if self.is_regression:
        return array_functions.find_set(self.data_set_ids, labels_or_ids)
    else:
        return array_functions.find_set(self.true_y, labels_or_ids)

file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    I = array_functions.is_in_percentile(x[:, 0], .01, .99)
    I &= array_functions.is_in_percentile(x[:, 1], .01, .99)
    x = x[I, :]
    y = y[I]
    data = data[I, :]
if split_date:
    dates = array_functions.remove_quotes(data[:, feat_names == 'date'])
    date_objs = []

import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

file_name = 'kc_house_data.csv'
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(np.float)
data = (x, y)
helper_functions.save_object('processed_data.pkl', data)
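
# A hypothetical round-trip check for the script above (helper_functions.load_object
# is assumed to mirror save_object, as it is used that way in test_mnist earlier):
x, y = helper_functions.load_object('processed_data.pkl')
assert x.shape[0] == y.shape[0]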