def viz_features(x, y, domain_ids, feature_names=None, alpha=.1, learner=None):
    """Scatter-plot each column of x against y, then histogram the raw column.

    For every feature i, draws a 2d scatter of (x[:, i], y) colored by
    domain_ids. If a learner is supplied, the plotted points are replaced by
    train_on_data's output and sized by its density estimate. After each
    scatter, a 100-bin histogram of the raw (pre-learner) column is shown.

    x: feature matrix (a vector is promoted to 2d via vec_to_2d).
    y: target values plotted on the second axis.
    domain_ids: per-instance data-set ids used to color the scatter.
    feature_names: optional names appended to each plot title.
    alpha: scatter transparency; forced to 1 once a learner is used.
    learner: optional model passed to train_on_data.
    """
    #y = array_functions.normalize(y)
    x = array_functions.vec_to_2d(x)
    for i in range(x.shape[1]):
        xi = x[:, i]
        xi_train = xi  # keep the raw column for the histogram below
        yi = y
        ids_i = domain_ids
        title = str(i)
        density = None
        if feature_names is not None:
            title = str(i) + ': ' + feature_names[i]
        if learner is not None:
            xi, yi, ids_i, density = train_on_data(xi, yi, domain_ids, learner)
            density = density * 100 + 1
            # Replace invalid densities with a fixed large marker size.
            I = array_functions.is_invalid(density)
            density[I] = 200
            alpha = 1
        array_functions.plot_2d_sub(xi, yi, alpha=alpha, title=title, data_set_ids=ids_i, sizes=density)
        array_functions.plot_histogram(xi_train, 100)
def create_pollution(labels_to_use=np.arange(2), series_to_use=0, num_instances=None, normalize_xy=True, save_data=True):
    """Build a Data instance from one pollution series and optionally pickle it.

    Loads the processed pollution pickle, keeps the requested labels of the
    chosen series, smooths missing values, optionally truncates/normalizes,
    plots the result, and saves it under a name derived from the arguments.
    """
    y, ids = helper_functions.load_object("pollution/processed_data.pkl")
    print(str(series_to_use) + ": " + ids[series_to_use])
    ts = data_class.TimeSeriesData(y[:, series_to_use, :], np.asarray([ids[series_to_use]]))
    ts.is_regression = True
    ts.keep_series(labels_to_use)
    ts = ts.get_min_range()
    ts.smooth_missing()
    ts.x = ts.x.astype(np.float)
    if num_instances is not None:
        ts = ts.get_range([0, num_instances])
    if normalize_xy:
        ts.reset_x()
        ts.normalize_y()
    data = ts.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is None:
        out_path = "pollution-%d" % series_to_use
    else:
        out_path = "pollution-%d-%d" % (series_to_use, num_instances)
    if normalize_xy:
        out_path += "-norm"
    out_path += "/raw_data.pkl"
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)
def create_pollution(labels_to_use=np.arange(2), series_to_use=[0], num_instances=None, normalize_xy=True, save_data=True):
    """Build a combined Data instance from several pollution series and
    optionally pickle it.

    labels_to_use: array of labels kept within each series.
    series_to_use: list of series indices to load and combine.
        NOTE(review): mutable default argument; it is never mutated here,
        so it is kept for interface compatibility.
    num_instances: if not None, truncate each series to [0, num_instances).
    normalize_xy: if True, reset each series' x values via reset_x().
    save_data: if True, save the combined data under a name derived from
        the arguments.
    """
    y, ids = helper_functions.load_object('pollution/processed_data.pkl')
    data = None
    label_names = []
    # Print an overview of every available series before selecting any.
    for i in range(y.shape[1]):
        print(str(i) + '-' + ids[i] + ': ' + str(y[0, i, :]))
    # Renamed loop variable (was 's'), which shadowed the output-path
    # variable assigned below.
    for idx, series_idx in enumerate(series_to_use):
        for label in labels_to_use:
            label_names.append(str(label) + '-' + ids[series_idx])
        y_to_use = y[:, series_idx, :]
        print(str(series_idx) + ': ' + ids[series_idx])
        time_series_data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_idx]]))
        time_series_data.is_regression = True
        time_series_data.keep_series(labels_to_use)
        time_series_data = time_series_data.get_min_range()
        time_series_data.smooth_missing()
        time_series_data.x = time_series_data.x.astype(np.float)
        if num_instances is not None:
            time_series_data = time_series_data.get_range([0, num_instances])
        if normalize_xy:
            time_series_data.reset_x()
            #time_series_data.normalize_y()
        curr_data = time_series_data.create_data_instance()
        # Offset ids so each series' labels occupy a distinct id range.
        curr_data.data_set_ids += idx * labels_to_use.size
        if data is None:
            data = curr_data
        else:
            data.combine(curr_data)
    data.label_names = label_names
    #perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'pollution-%s-%s' % (str(series_to_use), str(num_instances))
    else:
        # BUG FIX: series_to_use is a list, so the old "'pollution-%d' %
        # series_to_use" raised TypeError; format it like the branch above.
        s = 'pollution-%s' % str(series_to_use)
    if normalize_xy:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_time_series(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True, name='CO2_emissions'):
    """Load, preprocess, and combine one or more time series from
    <name>/processed_data.pkl, plot the result, and optionally save it.

    label_to_use: label(s) kept within each loaded series.
    series_to_use: iterable of series indices to load and combine.
        NOTE(review): the default of 0 is not iterable and would fail the
        loop below; callers presumably always pass a sequence — confirm.
    num_instances: if not None, truncate each series to [0, num_instances)
        before the hard-coded window below is applied.
    normalize_x: if True, rescale each series' x into [0, 1].
    save_data: if True, pickle the combined data.
    name: data-set directory prefix used for both input and output paths.
    """
    file = name + '/processed_data.pkl'
    # Hoisted out of the loop: the pickle path never changes between
    # iterations, so one load suffices.
    y, ids = helper_functions.load_object(file)
    all_data = []
    for i in series_to_use:
        y_to_use = y[:, i, :]
        print(str(i) + ': ' + ids[i])
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        data.x = data.x.astype(np.float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
        # NOTE(review): this unconditionally overrides the truncation above
        # with a fixed window — looks like an experiment hack; confirm.
        data = data.get_range([1000, 1500])
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        # len() raises TypeError when series_to_use is a scalar; in that
        # case leave the ids untouched (was a silent bare except).
        try:
            if len(series_to_use) > 1:
                data.data_set_ids[:] = i
        except TypeError:
            pass
        all_data.append(data)
    # perc_used = data.get_perc_used()
    data = all_data[0]
    for di in all_data[1:]:
        data.combine(di)
    if num_instances is not None:
        s = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        s = name + '-%s' % str(series_to_use)
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_time_series(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True, name="CO2_emissions"):
    """Preprocess and combine time series from <name>/processed_data.pkl,
    plot the combined data, and optionally pickle it under a derived name.
    """
    pkl_path = name + "/processed_data.pkl"
    pieces = []
    for idx in series_to_use:
        y, ids = helper_functions.load_object(pkl_path)
        print(str(idx) + ": " + ids[idx])
        ts = data_class.TimeSeriesData(y[:, idx, :], np.asarray([ids[idx]]))
        ts.is_regression = True
        ts.keep_series(label_to_use)
        ts = ts.get_min_range()
        ts.smooth_missing()
        ts = ts.get_nth(7)
        ts.reset_x()
        ts.x = ts.x.astype(np.float)
        if num_instances is not None:
            ts = ts.get_range([0, num_instances])
        ts = ts.get_range([1000, 1500])
        if normalize_x:
            # Rescale x into [0, 1].
            ts.x -= ts.x.min()
            ts.x /= ts.x.max()
        instance = ts.create_data_instance()
        try:
            if len(series_to_use) > 1:
                instance.data_set_ids[:] = idx
        except:
            pass
        pieces.append(instance)
    # perc_used = data.get_perc_used()
    data = pieces.pop(0)
    for other in pieces:
        data.combine(other)
    if num_instances is None:
        s = name + "-%s" % str(series_to_use)
    else:
        s = name + "-%s-%d" % (str(series_to_use), num_instances)
    if normalize_x:
        s += "-norm"
    s += "/raw_data.pkl"
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def viz_features(x, y, domain_ids, feature_names=None, alpha=.1, learner=None):
    """Scatter-plot every column of x against y (one figure per feature),
    optionally replacing the points with a learner's output sized by its
    density estimate, then histogram the raw column values.
    """
    #y = array_functions.normalize(y)
    x = array_functions.vec_to_2d(x)
    n_features = x.shape[1]
    for col in range(n_features):
        raw_col = x[:, col]
        xi, yi, ids_i = raw_col, y, domain_ids
        density = None
        if feature_names is None:
            title = str(col)
        else:
            title = str(col) + ': ' + feature_names[col]
        if learner is not None:
            xi, yi, ids_i, density = train_on_data(xi, yi, domain_ids, learner)
            density = density * 100 + 1
            bad = array_functions.is_invalid(density)
            density[bad] = 200
            alpha = 1
        array_functions.plot_2d_sub(xi, yi, alpha=alpha, title=title, data_set_ids=ids_i, sizes=density)
        array_functions.plot_histogram(raw_col, 100)
def create_drought(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True):
    """Preprocess one drought time series, plot it, and optionally pickle it.

    Keeps the requested label of the chosen series, smooths missing values,
    optionally truncates and/or rescales x, then saves the resulting Data
    instance under a name derived from the arguments.
    """
    y, ids = helper_functions.load_object('drought/processed_data.pkl')
    print(str(series_to_use) + ': ' + ids[series_to_use])
    ts = data_class.TimeSeriesData(y[:, series_to_use, :], np.asarray([ids[series_to_use]]))
    ts.is_regression = True
    ts.keep_series(label_to_use)
    ts = ts.get_min_range()
    ts.smooth_missing()
    ts.x = ts.x.astype(np.float)
    if num_instances is not None:
        ts = ts.get_range([0, num_instances])
    if normalize_x:
        # Rescale x into [0, 1].
        ts.x -= ts.x.min()
        ts.x /= ts.x.max()
    data = ts.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is None:
        out_name = 'drought-%d' % series_to_use
    else:
        out_name = 'drought-%d-%d' % (series_to_use, num_instances)
    if normalize_x:
        out_name += '-norm'
    out_name += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(out_name, data)