Ejemplo n.º 1
0
def viz_features(x, y, domain_ids, feature_names=None, alpha=.1, learner=None):
    """Plot each feature column of ``x`` against ``y``, one column at a time.

    For every column, ``array_functions.plot_2d_sub`` is called, followed by
    ``array_functions.plot_histogram`` (100 bins) on the raw column values.

    Args:
        x: Feature data. It is passed through ``array_functions.vec_to_2d``
            and then indexed column by column.
        y: Target values plotted against each feature.
        domain_ids: Forwarded to the plot as ``data_set_ids`` and to
            ``train_on_data`` when a learner is given.
        feature_names: Optional sequence of strings. When given, the title of
            column ``i`` is ``"<i>: <feature_names[i]>"``, otherwise ``"<i>"``.
        alpha: Alpha forwarded to the plot when no learner is given.
        learner: Optional. When given, ``train_on_data`` is called for every
            column and its outputs are plotted instead of the raw data, with
            alpha fixed to 1 and marker sizes derived from the returned
            density.
    """
    x = array_functions.vec_to_2d(x)
    use_learner = learner is not None
    # Learner output is always drawn with alpha 1; the caller's alpha only
    # applies to the raw data. Computed once instead of rebinding ``alpha``.
    plot_alpha = 1 if use_learner else alpha
    for i in range(x.shape[1]):
        xi_train = x[:, i]
        xi, yi, ids_i, density = xi_train, y, domain_ids, None
        title = str(i)
        if feature_names is not None:
            title = str(i) + ': ' + feature_names[i]
        if use_learner:
            xi, yi, ids_i, density = train_on_data(xi_train, y, domain_ids,
                                                   learner)
            # Rescale the density into marker sizes; entries flagged by
            # is_invalid get the fixed size 200.
            density = density * 100 + 1
            density[array_functions.is_invalid(density)] = 200
        array_functions.plot_2d_sub(xi,
                                    yi,
                                    alpha=plot_alpha,
                                    title=title,
                                    data_set_ids=ids_i,
                                    sizes=density)
        # The histogram always uses the raw column, not the learner output.
        array_functions.plot_histogram(xi_train, 100)
Ejemplo n.º 2
0
def create_pollution(
    labels_to_use=np.arange(2), series_to_use=0, num_instances=None, normalize_xy=True, save_data=True
):
    """Build, plot and optionally save a data instance for one pollution series.

    Loads ``(y, ids)`` from ``pollution/processed_data.pkl`` with
    ``helper_functions.load_object``, wraps ``y[:, series_to_use, :]`` in a
    ``data_class.TimeSeriesData``, preprocesses it, converts it with
    ``create_data_instance()`` and plots it with
    ``array_functions.plot_2d_sub``.

    Args:
        labels_to_use: Forwarded to ``TimeSeriesData.keep_series``.
        series_to_use: Integer index into the second axis of ``y`` and into
            ``ids``. It is also formatted with ``%d`` into the output path.
        num_instances: When not None, the series is restricted with
            ``get_range([0, num_instances])`` and the value becomes part of
            the output path.
        normalize_xy: When true, ``reset_x()`` and ``normalize_y()`` are
            applied and ``-norm`` is appended to the output directory name.
        save_data: When true, the result is written with
            ``helper_functions.save_object`` to
            ``pollution-<series>[-<num_instances>][-norm]/raw_data.pkl``.
    """
    y, ids = helper_functions.load_object("pollution/processed_data.pkl")
    y_to_use = y[:, series_to_use, :]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(series_to_use) + ": " + ids[series_to_use])
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(labels_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    # ``np.float`` was only an alias of the builtin and no longer exists in
    # recent NumPy releases; the builtin gives the same dtype.
    data.x = data.x.astype(float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_xy:
        data.reset_x()
        data.normalize_y()

    data = data.create_data_instance()

    if num_instances is not None:
        out_dir = "pollution-%d-%d" % (series_to_use, num_instances)
    else:
        out_dir = "pollution-%d" % series_to_use
    if normalize_xy:
        out_dir += "-norm"
    out_path = out_dir + "/raw_data.pkl"

    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)
Ejemplo n.º 3
0
def create_pollution(labels_to_use=np.arange(2),
                     series_to_use=[0],
                     num_instances=None,
                     normalize_xy=True,
                     save_data=True):
    """Build, plot and optionally save a combined data set of pollution series.

    Loads ``(y, ids)`` from ``pollution/processed_data.pkl`` with
    ``helper_functions.load_object`` and prints one line per available series.
    Each requested series is wrapped in a ``data_class.TimeSeriesData``,
    preprocessed and converted with ``create_data_instance()``. The results
    are merged with ``combine`` and plotted with
    ``array_functions.plot_2d_sub``.

    Args:
        labels_to_use: Forwarded to ``TimeSeriesData.keep_series``. Its
            ``.size`` is used to offset ``data_set_ids`` per series, so it
            must expose ``.size`` (e.g. a NumPy array).
        series_to_use: Sequence of integer indices into the second axis of
            ``y`` and into ``ids``. The default list is never mutated here.
        num_instances: When not None, every series is restricted with
            ``get_range([0, num_instances])`` and the value becomes part of
            the output path.
        normalize_xy: When true, ``reset_x()`` is applied to every series and
            ``-norm`` is appended to the output directory name.
        save_data: When true, the combined data is written with
            ``helper_functions.save_object``.

    Raises:
        ValueError: If ``series_to_use`` is empty.
    """
    y, ids = helper_functions.load_object('pollution/processed_data.pkl')
    # Overview of every available series (index, id, first row of values).
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for i in range(y.shape[1]):
        print(str(i) + '-' + ids[i] + ': ' + str(y[0, i, :]))

    data = None
    label_names = []
    for idx, series in enumerate(series_to_use):
        for label in labels_to_use:
            label_names.append(str(label) + '-' + ids[series])
        y_to_use = y[:, series, :]
        print(str(series) + ': ' + ids[series])
        time_series_data = data_class.TimeSeriesData(
            y_to_use, np.asarray([ids[series]]))
        time_series_data.is_regression = True
        time_series_data.keep_series(labels_to_use)
        time_series_data = time_series_data.get_min_range()
        time_series_data.smooth_missing()
        # ``np.float`` was only an alias of the builtin and no longer exists
        # in recent NumPy releases; the builtin gives the same dtype.
        time_series_data.x = time_series_data.x.astype(float)
        if num_instances is not None:
            time_series_data = time_series_data.get_range([0, num_instances])
        if normalize_xy:
            time_series_data.reset_x()

        curr_data = time_series_data.create_data_instance()
        # Shift the ids so that every series gets its own block of ids.
        curr_data.data_set_ids += idx * labels_to_use.size
        if data is None:
            data = curr_data
        else:
            data.combine(curr_data)

    if data is None:
        raise ValueError('series_to_use must contain at least one series index')
    data.label_names = label_names

    # ``series_to_use`` is a sequence, so it has to be formatted with %s in
    # both branches; ``%d`` raised TypeError whenever num_instances was None.
    if num_instances is not None:
        out_dir = 'pollution-%s-%s' % (str(series_to_use), str(num_instances))
    else:
        out_dir = 'pollution-%s' % str(series_to_use)
    if normalize_xy:
        out_dir += '-norm'
    out_path = out_dir + '/raw_data.pkl'

    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)
Ejemplo n.º 4
0
def create_time_series(label_to_use=0,
                       series_to_use=0,
                       num_instances=None,
                       normalize_x=False,
                       save_data=True,
                       name='CO2_emissions'):
    """Build, plot and optionally save a data set from one or more series.

    Loads ``(y, ids)`` from ``<name>/processed_data.pkl`` with
    ``helper_functions.load_object``. Each requested series is wrapped in a
    ``data_class.TimeSeriesData``, preprocessed and converted with
    ``create_data_instance()``. The results are merged with ``combine`` and
    plotted with ``array_functions.plot_2d_sub``.

    Args:
        label_to_use: Forwarded to ``TimeSeriesData.keep_series``.
        series_to_use: A single series index or an iterable of indices into
            the second axis of ``y`` and into ``ids``. Its ``str()`` is part
            of the output path.
        num_instances: When not None, every series is restricted with
            ``get_range([0, num_instances])`` and the value becomes part of
            the output path.
        normalize_x: When true, ``x`` is shifted by its minimum and divided
            by the resulting maximum, and ``-norm`` is appended to the output
            directory name.
        save_data: When true, the combined data is written with
            ``helper_functions.save_object``.
        name: Directory holding ``processed_data.pkl`` and prefix of the
            output directory.

    Raises:
        ValueError: If ``series_to_use`` is an empty iterable.
    """
    # The default (0) is a scalar, which the loop below could not iterate;
    # accept both a single index and an iterable of indices.
    try:
        series_indices = list(series_to_use)
    except TypeError:
        series_indices = [series_to_use]
    if not series_indices:
        raise ValueError('series_to_use must contain at least one series index')
    multiple_series = len(series_indices) > 1

    # The file is the same for every series, so load it once.
    y, ids = helper_functions.load_object(name + '/processed_data.pkl')

    all_data = []
    for i in series_indices:
        y_to_use = y[:, i, :]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(str(i) + ': ' + ids[i])
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        # ``np.float`` was only an alias of the builtin and no longer exists
        # in recent NumPy releases; the builtin gives the same dtype.
        data.x = data.x.astype(float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
        # NOTE(review): this fixed range is applied after the optional
        # num_instances range above - confirm that this is intended.
        data = data.get_range([1000, 1500])
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        if multiple_series:
            # Tag every instance with the index of the series it came from.
            data.data_set_ids[:] = i
        all_data.append(data)

    data = all_data[0]
    for other in all_data[1:]:
        data.combine(other)

    if num_instances is not None:
        out_dir = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        out_dir = name + '-%s' % str(series_to_use)
    if normalize_x:
        out_dir += '-norm'
    out_path = out_dir + '/raw_data.pkl'

    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)
Ejemplo n.º 5
0
def create_time_series(
    label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True, name="CO2_emissions"
):
    """Assemble a data set from the series stored under ``name`` and plot it.

    ``(y, ids)`` is read from ``<name>/processed_data.pkl`` through
    ``helper_functions.load_object``. Every requested series goes through the
    same ``data_class.TimeSeriesData`` pipeline and ends in
    ``create_data_instance()``. The instances are merged with ``combine``,
    plotted with ``array_functions.plot_2d_sub`` and, if requested, saved.

    Args:
        label_to_use: Forwarded to ``TimeSeriesData.keep_series``.
        series_to_use: One series index, or an iterable of indices, into the
            second axis of ``y`` and into ``ids``. Its ``str()`` is embedded
            in the output path.
        num_instances: When not None, each series is cut with
            ``get_range([0, num_instances])`` and the value is embedded in
            the output path.
        normalize_x: When true, ``x`` is shifted by its minimum and divided
            by the resulting maximum, and ``-norm`` is appended to the output
            directory name.
        save_data: When true, the merged data is written with
            ``helper_functions.save_object``.
        name: Directory that holds ``processed_data.pkl``; also the prefix of
            the output directory.

    Raises:
        ValueError: If ``series_to_use`` is an empty iterable.
    """
    # A bare index (including the default 0) is not iterable; wrap it so the
    # default call works instead of failing in the for-loop.
    try:
        indices = list(series_to_use)
    except TypeError:
        indices = [series_to_use]
    if not indices:
        raise ValueError("series_to_use must contain at least one series index")
    tag_with_series = len(indices) > 1

    # Same file for every series, so it is read once up front.
    y, ids = helper_functions.load_object(name + "/processed_data.pkl")

    instances = []
    for i in indices:
        y_to_use = y[:, i, :]
        # print(...) with one argument is valid on both Python 2 and 3.
        print(str(i) + ": " + ids[i])
        series = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        series.is_regression = True
        series.keep_series(label_to_use)
        series = series.get_min_range()
        series.smooth_missing()
        series = series.get_nth(7)
        series.reset_x()
        # Builtin float replaces the ``np.float`` alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        series.x = series.x.astype(float)
        if num_instances is not None:
            series = series.get_range([0, num_instances])
        # NOTE(review): fixed range applied on top of the optional
        # num_instances range - confirm that this is intended.
        series = series.get_range([1000, 1500])
        if normalize_x:
            series.x -= series.x.min()
            series.x /= series.x.max()
        instance = series.create_data_instance()
        if tag_with_series:
            # Mark every instance with the index of its source series.
            instance.data_set_ids[:] = i
        instances.append(instance)

    data = instances[0]
    for other in instances[1:]:
        data.combine(other)

    if num_instances is not None:
        out_dir = name + "-%s-%d" % (str(series_to_use), num_instances)
    else:
        out_dir = name + "-%s" % str(series_to_use)
    if normalize_x:
        out_dir += "-norm"
    out_path = out_dir + "/raw_data.pkl"

    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)
Ejemplo n.º 6
0
def viz_features(x,y,domain_ids,feature_names=None,alpha=.1,learner=None):
    """Visualise every feature of ``x`` against ``y``.

    Each column of ``x`` (after ``array_functions.vec_to_2d``) produces one
    ``array_functions.plot_2d_sub`` call and one 100-bin
    ``array_functions.plot_histogram`` call on the raw column.

    Args:
        x: Feature data, indexed by column after ``vec_to_2d``.
        y: Target values plotted against each column.
        domain_ids: Passed to the plot as ``data_set_ids`` and to
            ``train_on_data`` when ``learner`` is given.
        feature_names: Optional sequence of strings appended to the column
            index in the plot title.
        alpha: Plot alpha used when ``learner`` is None.
        learner: Optional. When given, the outputs of ``train_on_data`` are
            plotted instead of the raw column, alpha is 1, and marker sizes
            come from the returned density.
    """
    x = array_functions.vec_to_2d(x)
    has_learner = learner is not None
    # With a learner the plot always uses alpha 1; decide once up front
    # instead of overwriting the ``alpha`` argument inside the loop.
    effective_alpha = 1 if has_learner else alpha
    for col in range(x.shape[1]):
        raw_column = x[:, col]
        title = str(col)
        if feature_names is not None:
            title = str(col) + ': ' + feature_names[col]
        if has_learner:
            xi, yi, ids_i, density = train_on_data(raw_column, y, domain_ids, learner)
            # Turn the density into marker sizes; whatever is_invalid flags
            # is replaced by the fixed size 200.
            density = density*100 + 1
            invalid = array_functions.is_invalid(density)
            density[invalid] = 200
        else:
            xi, yi, ids_i, density = raw_column, y, domain_ids, None
        array_functions.plot_2d_sub(xi, yi, alpha=effective_alpha, title=title,
                                    data_set_ids=ids_i, sizes=density)
        # The histogram is drawn from the raw column, not the learner output.
        array_functions.plot_histogram(raw_column, 100)
Ejemplo n.º 7
0
def create_drought(label_to_use=0,
                   series_to_use=0,
                   num_instances=None,
                   normalize_x=False,
                   save_data=True):
    """Build, plot and optionally save a data instance for one drought series.

    Loads ``(y, ids)`` from ``drought/processed_data.pkl`` with
    ``helper_functions.load_object``, wraps ``y[:, series_to_use, :]`` in a
    ``data_class.TimeSeriesData``, preprocesses it, converts it with
    ``create_data_instance()`` and plots it with
    ``array_functions.plot_2d_sub``.

    Args:
        label_to_use: Forwarded to ``TimeSeriesData.keep_series``.
        series_to_use: Integer index into the second axis of ``y`` and into
            ``ids``. It is also formatted with ``%d`` into the output path.
        num_instances: When not None, the series is restricted with
            ``get_range([0, num_instances])`` and the value becomes part of
            the output path.
        normalize_x: When true, ``x`` is shifted by its minimum and divided
            by the resulting maximum, and ``-norm`` is appended to the output
            directory name.
        save_data: When true, the result is written with
            ``helper_functions.save_object`` to
            ``drought-<series>[-<num_instances>][-norm]/raw_data.pkl``.
    """
    y, ids = helper_functions.load_object('drought/processed_data.pkl')
    y_to_use = y[:, series_to_use, :]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(series_to_use) + ': ' + ids[series_to_use])
    data = data_class.TimeSeriesData(y_to_use,
                                     np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(label_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    # ``np.float`` was only an alias of the builtin and no longer exists in
    # recent NumPy releases; the builtin gives the same dtype.
    data.x = data.x.astype(float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_x:
        data.x -= data.x.min()
        data.x /= data.x.max()
    data = data.create_data_instance()

    if num_instances is not None:
        out_dir = 'drought-%d-%d' % (series_to_use, num_instances)
    else:
        out_dir = 'drought-%d' % series_to_use
    if normalize_x:
        out_dir += '-norm'
    out_path = out_dir + '/raw_data.pkl'

    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(out_path, data)