def create_wine(data_to_create=WINE_RED):
    """Build one of the wine-quality regression data sets and save it.

    Both ';'-delimited CSV files are always loaded. The last CSV column is
    used as the regression target and the remaining columns, standardized
    with ``array_functions.standardize``, as the features. The result is
    written with ``helper_functions.save_object`` to
    ``wine_file % ("-" + suffix)`` where suffix is "red", "white" or
    "transfer".

    Args:
        data_to_create: ``WINE_RED`` or ``WINE_WHITE`` to use a single file,
            or ``WINE_TRANSFER`` to stack red on top of white, tag the rows
            with data-set ids (0 = red, 1 = white) and plot the features
            with ``viz_features`` before saving.

    Raises:
        AssertionError: if ``data_to_create`` is none of the three selectors.
    """
    red_file = "wine/winequality-red.csv"
    white_file = "wine/winequality-white.csv"
    field_names, red_data = load_csv(red_file, delim=";")
    white_data = load_csv(white_file, delim=";")[1]

    if data_to_create == WINE_TRANSFER:
        # Append a data-set id column (0 = red, 1 = white), then stack.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        wine_data = np.vstack(
            (
                np.hstack((red_data, red_ids)),
                np.hstack((white_data, white_ids)),
            )
        )

        # After the hstack the id is the last column and the target the
        # second to last.
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        viz = True
        if viz:
            used_field_names = field_names[:-1]
            learner = make_learner()
            viz_features(x, y, ids, used_field_names, alpha=0.01, learner=learner)
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = "red"
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = "white"
        else:
            # Raised explicitly (rather than `assert False`) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("unknown data_to_create: %r" % (data_to_create,))

        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    # Disabled single-feature variant for the transfer set:
    #   feat_idx = 1
    #   data.x = array_functions.vec_to_2d(x[:, feat_idx])

    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled sub-sampled variant:
    #   data = data.rand_sample(.25, data.data_set_ids == 0)
    #   data = data.rand_sample(.1, data.data_set_ids == 1)
    #   s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ("-" + suffix)
    helper_functions.save_object(s, data)
def create_energy():
    """Load the ENB2012 energy CSV and plot its columns with viz_features.

    Column 4 supplies the domain ids and the second-to-last column the
    plotted target; the full table (target included) is passed as the
    features. No learner is fitted and the function returns None.
    """
    names, table = load_csv('energy/ENB2012_data.csv')
    from methods import method  # only referenced by the disabled learner

    # Disabled alternative learner: method.NadarayaWatsonMethod()
    viz_features(table, table[:, -2], table[:, 4], names, learner=None)
def create_energy():
    """Visualize the ENB2012 energy table.

    Reads "energy/ENB2012_data.csv" through ``load_csv`` and hands the whole
    table to ``viz_features``, using column 4 as the domain ids and the
    second-to-last column as the target. Nothing is returned.
    """
    header, rows = load_csv("energy/ENB2012_data.csv")
    from methods import method  # kept for the commented-out learner below

    target = rows[:, -2]
    domains = rows[:, 4]
    # To fit a learner, pass method.NadarayaWatsonMethod() instead of None.
    viz_features(rows, target, domains, header, learner=None)
def create_pair(i, j, y_col):
    """Combine pair data sets ``i`` and ``j`` into one two-domain data set.

    The two tables returned by ``load_pair`` are stacked row-wise. Column
    ``y_col`` becomes the target and column ``1 - y_col`` the single
    feature. Rows from ``i`` get domain id 0 and rows from ``j`` get 1.
    The result is plotted (without a learner) and then stored through
    ``create_and_save_data`` under ``pair_file(i, j)``.
    """
    out_path = pair_file(i, j)
    first = load_pair(i)
    second = load_pair(j)
    stacked = np.vstack((first, second))

    feature = stacked[:, 1 - y_col]
    target = stacked[:, y_col]
    # Zeros for the rows of `first`, ones for the rows of `second`.
    domain_ids = np.concatenate(
        (np.zeros(first.shape[0]), np.ones(second.shape[0]))
    )

    # Pass learner=make_learner() to overlay a fitted learner on the plot.
    viz_features(feature, target, domain_ids, learner=None)
    create_and_save_data(feature, target, domain_ids, out_path)
def create_pair(i, j, y_col):
    """Build, plot and save a two-domain data set from pairs ``i`` and ``j``.

    ``load_pair(i)`` and ``load_pair(j)`` are stacked vertically; the
    target is column ``y_col`` and the feature is the other column
    (``1 - y_col``). Domain ids are 0 for the rows coming from ``i`` and 1
    for those coming from ``j``. Output goes to ``pair_file(i, j)`` via
    ``create_and_save_data``.
    """
    destination = pair_file(i, j)
    source_rows = load_pair(i)
    target_rows = load_pair(j)
    combined = np.vstack((source_rows, target_rows))

    num_source = source_rows.shape[0]
    num_total = num_source + target_rows.shape[0]
    domain_ids = np.zeros(num_total)
    domain_ids[num_source:] = 1  # everything after the first table is domain 1

    x, y = combined[:, 1 - y_col], combined[:, y_col]
    # A learner can be shown by passing learner=make_learner() here.
    viz_features(x, y, domain_ids, learner=None)
    create_and_save_data(x, y, domain_ids, destination)
def create_forest_fires():
    """Plot the June-August forest-fire records, grouped by month.

    The CSV is read with converters that turn the month and day
    abbreviations (columns 2 and 3) into integers. The last column is the
    target and the 'month' column provides the domain ids. Only months 6, 7
    and 8 are kept, the first four columns are dropped, and rows whose
    target is not strictly between 0 and 700 are discarded before the data
    is handed to ``viz_features`` with a Nadaraya-Watson learner.
    """
    month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    day_names = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']
    months = {name: number for number, name in enumerate(month_names, 1)}
    days = {name: number for number, name in enumerate(day_names, 1)}

    def month_to_season(name):
        # Season bucketing, (months[x]-1)/3, is disabled: the plain month
        # number is returned.
        return months[name]

    def day_to_int(name):
        return days[name]

    field_names, forest_data = load_csv(
        'forest_fires/forestfires.csv',
        dtype='float',
        converters={2: month_to_season, 3: day_to_int},
    )
    features = forest_data
    target = forest_data[:, -1]
    month_column = field_names == 'month'
    domain_ids = forest_data[:, month_column]

    # Restrict to the chosen months (use 1..12 to keep every month).
    months_to_use = np.asarray([6, 7, 8])
    selected = array_functions.find_set(domain_ids, months_to_use)
    features = features[selected, :]
    target = target[selected]
    domain_ids = domain_ids[selected]

    # Drop the first four columns and their names.
    features = features[:, 4:]
    field_names = field_names[4:]

    # Keep only rows with 0 < target < 700.
    in_range = (target > 0) & (target < 700)
    features = features[in_range, :]
    target = target[in_range]
    domain_ids = domain_ids[in_range]

    from methods import method
    learner = method.NadarayaWatsonMethod()
    viz_features(features, target, domain_ids, field_names, learner=learner)
def create_mpg():
    """Load the auto-mpg table and plot it with viz_features.

    The whitespace-delimited file is read as strings, the last column is
    dropped, rows containing the missing-value marker "?" are removed and
    the remainder is converted to float. Column 0 is the target and column
    1 provides the domain ids; the full table (target included) is passed
    as the features. Nothing is returned or saved.
    """
    file = "mpg/auto-mpg.data.txt"
    # header=None: the file is expected to have no header row (the earlier
    # load_csv call used has_field_names=False). Without it pandas uses the
    # first record as column names and silently drops it from the data.
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True.
    data = pd.read_csv(file, header=None, sep=r"\s+", dtype="string")
    data = np.asarray(data)[:, 0:-1]
    has_missing_values = (data == "?").any(1)
    data = data[~has_missing_values, :]
    data = data.astype("float")
    domain_ids = data[:, 1]
    x = data
    y = data[:, 0]
    viz_features(x, y, domain_ids)
def create_mpg():
    """Load the auto-mpg table and plot it with viz_features.

    The whitespace-delimited file is read as strings, the last column is
    dropped, rows containing the missing-value marker '?' are removed and
    the remainder is converted to float. Column 0 is the target and column
    1 provides the domain ids; the full table (target included) is passed
    as the features. Nothing is returned or saved.
    """
    file = 'mpg/auto-mpg.data.txt'
    # header=None: the file is expected to have no header row (the earlier
    # load_csv call used has_field_names=False). Without it pandas uses the
    # first record as column names and silently drops it from the data.
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    data = pd.read_csv(file, header=None, sep=r'\s+', dtype='string')
    data = np.asarray(data)[:, 0:-1]
    has_missing_values = (data == '?').any(1)
    data = data[~has_missing_values, :]
    data = data.astype('float')
    domain_ids = data[:, 1]
    x = data
    y = data[:, 0]
    viz_features(x, y, domain_ids)
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    The last CSV column is the target. Features are standardized with
    ``array_functions.standardize`` and the result is written with
    ``helper_functions.save_object`` to ``concrete_file % t``.

    Args:
        transfer: when true, rows are assigned domain ids from the 'age'
            column (1: age < 10, 2: 10 <= age <= 28, 3: age > 75, 0
            otherwise), the last two columns are excluded from the
            features, and the global ``concrete_num_feats`` selects either
            a single feature (index 0) or all features. When false, all
            columns but the last are used and no domain ids are set.

    Raises:
        AssertionError: in transfer mode, if ``concrete_num_feats`` is
            neither 1 nor at least the number of available features.
    """
    file = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file)

    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        data.x = concrete_data[:, 0:(concrete_data.shape[1] - 2)]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            # In this branch min(n_features, concrete_num_feats) is always
            # the number of available features.
            t = '-' + str(data.x.shape[1])
        else:
            # Raised explicitly (rather than `assert False`) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                'unsupported concrete_num_feats: %r' % (concrete_num_feats,))
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]

    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug-only plotting; it relies on domain_ids/domain_ind, which exist
    # only when transfer is true.
    viz = False
    if viz:
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a new array, so the result must be kept; the
        # matching field name is removed as well to keep names aligned.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids,
                     used_field_names)

        return
    data.x = array_functions.standardize(data.x)

    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    The last CSV column is the target. Features are standardized with
    ``array_functions.standardize`` and the result is written with
    ``helper_functions.save_object`` to ``concrete_file % t``.

    Args:
        transfer: when true, rows are assigned domain ids from the "age"
            column (1: age < 10, 2: 10 <= age <= 28, 3: age > 75, 0
            otherwise), the last two columns are excluded from the
            features, and the global ``concrete_num_feats`` selects either
            a single feature (index 0) or all features. When false, all
            columns but the last are used and no domain ids are set.

    Raises:
        AssertionError: in transfer mode, if ``concrete_num_feats`` is
            neither 1 nor at least the number of available features.
    """
    file = "concrete/Concrete_Data.csv"
    used_field_names, concrete_data = load_csv(file)

    data = data_class.Data()
    t = ""
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == "age").nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        data.x = concrete_data[:, 0 : (concrete_data.shape[1] - 2)]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = "-feat=" + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            # In this branch min(n_features, concrete_num_feats) is always
            # the number of available features.
            t = "-" + str(data.x.shape[1])
        else:
            # Raised explicitly (rather than `assert False`) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                "unsupported concrete_num_feats: %r" % (concrete_num_feats,)
            )
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]

    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug-only plotting; it relies on domain_ids/domain_ind, which exist
    # only when transfer is true.
    viz = False
    if viz:
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a new array, so the result must be kept; the
        # matching field name is removed as well to keep names aligned.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids, used_field_names)

        return
    data.x = array_functions.standardize(data.x)

    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_forest_fires():
    """Visualize forest-fire records for months 6-8 with month as domain.

    Month and day abbreviations in CSV columns 2 and 3 are converted to
    integers while loading. The target is the last column; domain ids come
    from the "month" column. After keeping only months 6, 7 and 8, the
    first four columns are removed and rows with a target outside the open
    interval (0, 700) are dropped. The remaining data is plotted through
    ``viz_features`` together with a Nadaraya-Watson learner.
    """
    month_order = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    day_order = ("sun", "mon", "tue", "wed", "thu", "fri", "sat")
    months = dict(zip(month_order, range(1, 13)))
    days = dict(zip(day_order, range(1, 8)))

    def month_to_season(abbrev):
        # The season mapping (months[x]-1)/3 is switched off; months are
        # kept as 1..12.
        return months[abbrev]

    def day_to_int(abbrev):
        return days[abbrev]

    file = "forest_fires/forestfires.csv"
    converters = {2: month_to_season, 3: day_to_int}
    field_names, forest_data = load_csv(file, dtype="float", converters=converters)

    y = forest_data[:, -1]
    is_month = field_names == "month"
    domain_ids = forest_data[:, is_month]

    # Row filter: only the listed months (list all twelve to disable it).
    months_to_use = np.asarray([6, 7, 8])
    row_filter = array_functions.find_set(domain_ids, months_to_use)
    x = forest_data[row_filter, :]
    y = y[row_filter]
    domain_ids = domain_ids[row_filter]

    # Column filter: discard the first four columns.
    x = x[:, 4:]
    field_names = field_names[4:]

    # Value filter: target strictly between 0 and 700.
    valid = (y > 0) & (y < 700)
    x = x[valid, :]
    y = y[valid]
    domain_ids = domain_ids[valid]

    from methods import method

    learner = method.NadarayaWatsonMethod()
    viz_features(x, y, domain_ids, field_names, learner=learner)
def create_bike_sharing():
    """Build a one-feature bike-sharing regression data set and save it.

    CSV columns 0 and 2-15 are loaded as numbers (column 1 is skipped).
    The "yr" column provides the domain ids and the last loaded column is
    the target. Loaded columns 8-11 are plotted with ``viz_features``; the
    second of them (``field_to_use = 1``) is then standardized and used as
    the only feature, the target is normalized, and the data is written
    with ``helper_functions.save_object`` to
    ``bike_file % "-feat=1"``.
    """
    file = "bike_sharing/day.csv"
    # list(...) is required on Python 3, where a list cannot be
    # concatenated with a range object.
    columns = [0] + list(range(2, 16))
    all_field_names = pd.read_csv(file, nrows=1, dtype="string")
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=",", usecols=columns)
    domain_ind = used_field_names == "yr"
    # Boolean-mask column selection yields shape (n, 1); flatten it.
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    # Disabled alternative: keep only the 'temp' and 'atemp' columns.
    # inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    # bike_data = bike_data[:,inds_to_keep]
    # used_field_names = used_field_names[inds_to_keep]

    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # Pass make_learner() instead of None to overlay a fitted learner.
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ("-feat=" + str(field_to_use))
    helper_functions.save_object(s, data)
def create_bike_sharing():
    """Build a one-feature bike-sharing regression data set and save it.

    CSV columns 0 and 2-15 are loaded as numbers (column 1 is skipped).
    The 'yr' column provides the domain ids and the last loaded column is
    the target. Loaded columns 8-11 are plotted with ``viz_features``; the
    second of them (``field_to_use = 1``) is then standardized and used as
    the only feature, the target is normalized, and the data is written
    with ``helper_functions.save_object`` to
    ``bike_file % '-feat=1'``.
    """
    file = 'bike_sharing/day.csv'
    # list(...) is required on Python 3, where a list cannot be
    # concatenated with a range object.
    columns = [0] + list(range(2, 16))
    all_field_names = pd.read_csv(file, nrows=1, dtype='string')
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=',', usecols=columns)
    domain_ind = used_field_names == 'yr'
    # Boolean-mask column selection yields shape (n, 1); flatten it.
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    # Disabled alternative: keep only the 'temp' and 'atemp' columns.
    #inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    #bike_data = bike_data[:,inds_to_keep]
    #used_field_names = used_field_names[inds_to_keep]

    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # Pass make_learner() instead of None to overlay a fitted learner.
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
def create_wine(data_to_create=WINE_RED):
    """Build one of the wine-quality regression data sets and save it.

    Both ';'-delimited CSV files are always loaded. The last CSV column is
    used as the regression target and the remaining columns, standardized
    with ``array_functions.standardize``, as the features. The result is
    written with ``helper_functions.save_object`` to
    ``wine_file % ('-' + suffix)`` where suffix is 'red', 'white' or
    'transfer'.

    Args:
        data_to_create: ``WINE_RED`` or ``WINE_WHITE`` to use a single file,
            or ``WINE_TRANSFER`` to stack red on top of white, tag the rows
            with data-set ids (0 = red, 1 = white) and plot the features
            with ``viz_features`` before saving.

    Raises:
        AssertionError: if ``data_to_create`` is none of the three selectors.
    """
    red_file = 'wine/winequality-red.csv'
    white_file = 'wine/winequality-white.csv'
    field_names, red_data = load_csv(red_file, delim=';')
    white_data = load_csv(white_file, delim=';')[1]

    if data_to_create == WINE_TRANSFER:
        # Append a data-set id column (0 = red, 1 = white), then stack.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        wine_data = np.vstack((
            np.hstack((red_data, red_ids)),
            np.hstack((white_data, white_ids)),
        ))

        # After the hstack the id is the last column and the target the
        # second to last.
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        viz = True
        if viz:
            used_field_names = field_names[:-1]
            learner = make_learner()
            viz_features(x,
                         y,
                         ids,
                         used_field_names,
                         alpha=.01,
                         learner=learner)
        suffix = 'transfer'
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = 'red'
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = 'white'
        else:
            # Raised explicitly (rather than `assert False`) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                'unknown data_to_create: %r' % (data_to_create,))

        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    # Disabled single-feature variant for the transfer set:
    #   feat_idx = 1
    #   data.x = array_functions.vec_to_2d(x[:,feat_idx])

    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    # Disabled sub-sampled variant:
    #   data = data.rand_sample(.25, data.data_set_ids == 0)
    #   data = data.rand_sample(.1, data.data_set_ids == 1)
    #   s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ('-' + suffix)
    helper_functions.save_object(s, data)