Ejemplo n.º 1
0
 def save_predictions(self,
                      header="NumberOfSales",
                      csv="final_sales_predictions.csv"):
     """Save ``self.final_predictions`` as a one-column table.

     :param header: name of the column that holds the predictions.
     :param csv: file name handed to ``ds.save_dataset``.
     :return: the DataFrame that was saved.
     """
     predictions_frame = pd.DataFrame({header: pd.Series(self.final_predictions)})
     ds.save_dataset(predictions_frame, csv)
     return predictions_frame
Ejemplo n.º 2
0
def full_prep_test_ds_to_cust_pred():
    """Build and save the test dataset used for customer prediction.

    Reads the imputed one-hot test and train datasets, parses the train
    ``Date`` column (``%d/%m/%Y``), derives a ``Day`` column holding the
    weekday name, hands both frames to ``prepare_ds_to_customer_prediction``
    and saves the result as ``test_dataset_for_customers_prediction.csv``.
    """
    test_datas = d.read_imputed_onehot_test_dataset()
    data_from = d.read_imputed_onehot_dataset()
    data_from['Date'] = p.to_datetime(data_from['Date'], format='%d/%m/%Y')
    # ``Series.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` is
    # its replacement (available from pandas 0.23). Fall back to the old
    # attribute so the function keeps working on older installations.
    date_accessor = data_from['Date'].dt
    if hasattr(date_accessor, 'day_name'):
        data_from['Day'] = date_accessor.day_name()
    else:
        data_from['Day'] = date_accessor.weekday_name
    datas = prepare_ds_to_customer_prediction(test_datas, data_from)
    d.save_dataset(datas, "test_dataset_for_customers_prediction.csv")
Ejemplo n.º 3
0
def build_sales_predictor_test_dataset(name):
    """Prepare the sales-only test dataset and save it under ``name``.

    The imputed one-hot test dataset is passed, together with the dataset
    read from ``final_sales_only_train.csv``, to
    ``__prepare_sales_only_test_ds``; the returned frame is what gets saved.
    """
    train_frame = d.read_dataset("final_sales_only_train.csv")
    test_frame = d.read_imputed_onehot_test_dataset()
    prepared = __prepare_sales_only_test_ds(test_frame, train_frame)
    d.save_dataset(prepared, name)
Ejemplo n.º 4
0
def select_features(name, featlist, fname):
    """Save a copy of dataset ``name`` restricted to ``featlist``.

    :param name: dataset name read through ``d.read_dataset``.
    :param featlist: selection applied to the dataset with ``[]`` indexing.
    :param fname: output name passed to ``d.save_dataset``.
    """
    full_frame = d.read_dataset(name)
    selected = full_frame[featlist]
    d.save_dataset(selected, fname)
Ejemplo n.º 5
0
                if not new_cols.__contains__(s):
                    new_cols.append(s)
    else:
        new_cols = vals
    for new in new_cols:
        ds[header + new] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if d.content_of(ds, attr, i).find(new) != -1:
                ds.set_value(i, header + new, 1)
    return ds


def one_hot_numeric(ds, attr, header):
    """Transforms the given attribute of the given DataFrame object into one hot encoding.
    If you plan to use this, don't use split attribute.

    For every value returned by ``d.values_of(ds, attr)`` a column named
    ``header + str(value)`` is added to ``ds`` (the frame is modified in
    place). The column is initialised to zeros and set to 1 on the rows whose
    ``attr`` content equals that value.

    Returns a DataFrame object."""
    for value in d.values_of(ds, attr):
        column = header + str(value)
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if d.content_of(ds, attr, i) == value:
                # DataFrame.set_value was deprecated in pandas 0.21 and
                # removed in 1.0; .at is the scalar setter that replaces it.
                ds.at[i, column] = 1
    return ds


if __name__ == '__main__':
    # Script entry point: read the test dataset, run full_preprocess on it
    # and save the result as "imputed_test_ds_one_hot.csv".
    ds = d.read_test_dataset()
    ds = full_preprocess(ds)
    d.save_dataset(ds, "imputed_test_ds_one_hot.csv")
Ejemplo n.º 6
0
# Region-level evaluation of the predictions.
# NOTE(review): `preds` is passed as both of the first two arguments of
# region_error — confirm this is intentional.
error, totp, totr, shops, months = eva.region_error(preds, preds, regions, ids, dates, True)

print(error)
print(totp)
print(shops)
print(months)


# Build the three columns of the submission table.
sh, mo, sa = gen_pandas_cols()
print(len(shops))
# The builtin int replaces the np.int alias (deprecated in NumPy 1.20 and
# removed in 1.24); np.int was the builtin int, so the dtype is unchanged.
sa = np.array(sa, int)
print(sa.min())
final_sub = pd.DataFrame()
final_sub['StoreID'] = pd.Series(sh)
final_sub['Month'] = pd.Series(mo)
final_sub['NumberOfSales'] = pd.Series(sa)

print(final_sub)
d.save_dataset(final_sub, "final_sub_new1.csv")

# Total of the NumberOfSales column.
print(d.to_numpy(final_sub['NumberOfSales']).sum())
# ADDING MORE STATISTICS
trainset = "final_for_sales_train.csv"

trainds = d.read_dataset(trainset)
final_sub_stats = preu.mean_sales_per_month_per_shop(final_sub, trainds)
# Ratio of the NumberOfSales column to the MeanSalesPerShopPerMonth column.
final_sub_stats['Ratio'] = final_sub_stats['NumberOfSales']/final_sub_stats['MeanSalesPerShopPerMonth']
print(final_sub_stats)

Ejemplo n.º 7
0
        preds += p
    preds[preds < 0] = 0
    for i in range(min(len(preds), number_print)):
        print("PRED: ", preds[i], "   y: ", y[i])

    print("R2: ", eva.r2(ds, preds, 'NumberOfSales'))


if __name__ == '__main__':
    # Run-configuration flags; of these, only LOAD and SAVE_DF are read in
    # the lines visible here.
    TRAIN = True
    LOAD = False
    SAVE_DF = False
    name = "test"
    # Load the imputed one-hot dataset, preprocess it and keep a copy on disk.
    ds = d.read_imputed_onehot_dataset()
    ds = prepare_ds(ds)
    d.save_dataset(ds, "fully_preprocessed_ds.csv")
    # Train / test frames selected by date range.
    # presumably the arguments are (month, year, month, year) — TODO confirm
    ds_train = utils.get_frame_in_range(ds, 3, 2016, 12, 2017)
    ds_test = utils.get_frame_in_range(ds, 1, 2018, 2, 2018)
    y = prepare_out(ds_train)
    real_y = np.array(y)
    # Zero array with the same shape as the training targets.
    dy = np.zeros(y.shape)
    x = drop_useless(ds_train)
    y_test = prepare_out(ds_test)
    if SAVE_DF:
        d.save_dataset(ds_test, "dataset_to_predict_sales.csv")
    x_test = drop_useless(ds_test)

    # One model per ensemble member; the builder receives the number of
    # input columns and a flag that is True only for the first model.
    models = []
    for i in range(number_of_model):
        if not LOAD:
            models.append(m.nonsequentialNNDropout(x.shape[1], i == 0))
Ejemplo n.º 8
0
def reorder_datas_cols(name, attrlist, nameout):
    """Reorder the attributes of dataset ``name`` and save it as ``nameout``.

    The reordering itself is delegated to ``reorder_attributes``, which
    receives the loaded dataset and ``attrlist``.
    """
    loaded = d.read_dataset(name)
    reordered = reorder_attributes(loaded, attrlist)
    d.save_dataset(reordered, nameout)
Ejemplo n.º 9
0
                     autoexclude=True).exclude('NumberOfSales',
                                               'Month').build()
#data = sb.SetBuilder(target='NumberOfSales').exclude('Day').build()

# Regressor configuration.
# NOTE(review): assuming this is scikit-learn's MLPRegressor, the settings
# learning_rate, momentum and nesterovs_momentum only take effect with
# solver='sgd', and validation_fraction only with early_stopping=True —
# confirm they are intentionally left inert here.
nn = neural_network.MLPRegressor(hidden_layer_sizes=(100, 5),
                                 activation='relu',
                                 solver='adam',
                                 batch_size='auto',
                                 learning_rate='adaptive',
                                 learning_rate_init=0.001,
                                 max_iter=50,
                                 shuffle=True,
                                 random_state=9,
                                 tol=0.000001,
                                 verbose=True,
                                 warm_start=False,
                                 momentum=0.9,
                                 nesterovs_momentum=True,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)

# Fit on data.xtr / data.ytr (the targets are flattened with ravel()).
n = nn.fit(data.xtr, data.ytr.ravel())

# Predict on data.xts.
ypred = nn.predict(data.xts)

# Wrap the predictions in a DataFrame and save them.
ds.save_dataset(pd.DataFrame(ypred), 'customer_pred_jan_feb_NN.csv')

# Score the predictions against data.yts.
print('R2 = %s' % eval.evaluate(data.yts, ypred))
Ejemplo n.º 10
0
# Stack the predictions collected so far into one array.
yy = np.array(yy)

# Ensemble prediction: mean of the stacked predictions along axis 0.
pred = yy.mean(axis=0)

print('Bagging R2 = %s' % eval.evaluate(data.ytr, pred))

# Region-level error of the averaged prediction against data.ytr.
re, totr, totp = ev_cust.region_error(data.ytr, pred, regions, ids, dates)
diff = totr - totp
print("REG_ERR: ", re * 100)
print("REG_MEAN_ERR: ", re.mean() * 100)
print("REAL_SUM: ", totr.sum())
print("PRED_SUM: ", totp.sum())
print("SUM_OF_DIFFS: ", diff.sum())

# Build the set to predict on from "final_sales_only_test_r.csv"; a copy of
# the loaded frame is handed to the builder.
data_t = d.read_dataset("final_sales_only_test_r.csv")
data_t = sb.SetBuilder(target='NumberOfSales', autoexclude=True, df=data_t.copy(),
                       split=(3, 2016, 2, 2018, 3, 2018, 4, 2018))\
    .exclude_list(excluded_feats())\
    .build()

# Predict with each of the first `it` models on data_t.xts.
yy = []
for i in range(it):
    y = models[i].predict(data_t.xts)
    yy.append(y)

# Average the per-model predictions along axis 0.
pred = np.array(yy).mean(axis=0)

# Save the averaged predictions under the NumberOfSales column.
tosave = pandas.DataFrame()
tosave['NumberOfSales'] = pandas.Series(pred)
d.save_dataset(tosave, "sales_only_final_pred_nn.csv")
Ejemplo n.º 11
0
if __name__ == '__main__':
    # Train n models on resampled data, report each model's R2, then
    # average their predictions on datas.xts and save the average.
    split = [[(3, 2016, 12, 2016), (3, 2017, 2, 2018)], [(1, 2017, 2, 2017)]]
    datas = build_cust_predictor_train_dataset(1, 2017, 2, 2017)
    datas = (
        sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=datas, split=split)
        .exclude("Month", "NumberOfSales")
        .build()
    )
    n = 10
    mods = []
    for i in range(n):
        print(i + 1)
        # Every model is trained on its own random sample.
        x, y = datas.random_sampling(1.0)
        mod = skc.LinearSklearn(1, model)
        mod.train(x, y)
        mods.append(mod)
        p = mod.predict(datas.xtr).squeeze()
        print("TRAIN R2: ", eval.r2_score(datas.ytr, p))
        print("TEST R2: ", eval.r2_score(datas.yts, mod.predict(datas.xts)))
        print("##########################")

    # One prediction per trained model, in training order.
    preds = [trained.predict(datas.xts) for trained in mods]

    custpred = np.array(preds).mean(axis=0)

    print("TEST R2: ", eval.r2_score(datas.yts, custpred))

    new = pandas.DataFrame({'NumberOfCustomers': pandas.Series(custpred)})
    ds.save_dataset(new, "genfeb17.csv")
Ejemplo n.º 12
0
    y_t = prepare_out(d_reg_t)
    x_t = drop_useless(d_reg_t, 1)
    mod = skc.LinearSklearn(1, model)
    mod.train(x, y)
    models[i] = mod
    p = mod.predict(x).squeeze()
    pt = mod.predict(x_t).squeeze()
    r2_t = eval.r2_score(y_t, pt)
    sum += r2_t * (len(d_reg) + len(d_reg_t))
    print("TRAIN R2: ", eval.r2_score(y, p))
    print("TEST R2: ", r2_t)
    print("##########################")

# Accumulated, weighted test R2 divided by the length of `datas`.
print("AVG TEST R2: ", sum / len(datas))

# Predict every test row with the model registered for its type.
custpred = []
for i in test.index.tolist():
    row = test.loc[i]
    # Find the type flag set to 1 for this row; if several are set, the last
    # one in `types` wins.
    # NOTE(review): if none is set, `val` stays "" and models[""] is looked
    # up below — confirm that cannot happen.
    val = ""
    for t in types:
        if row[t] == 1:
            val = t
    # Reshape the single row to shape (1, n_features) before predicting.
    row = drop_useless(row).reshape([1, -1])
    custpred.append(models[val].predict(row).squeeze())

custpred = np.array(custpred)

# Save the predictions, indexed like the test frame.
new = pandas.DataFrame(index=test.index)
new['NumberOfCustomers'] = pandas.Series(custpred, test.index)
ds.save_dataset(new, "cust_ensemble_per_region_predictions1.csv")
Ejemplo n.º 13
0
if __name__ == '__main__':
    # Run-configuration flags: TRAIN fits and saves each model, LOAD reads
    # previously saved models instead of building new ones, SAVE_DS writes
    # the test frame to disk.
    TRAIN = True
    LOAD = False
    SAVE_DS = True
    # Tag used in the model file names ("mod" + name + index + ".h5").
    name = "test2_"
    ds = d.read_imputed_onehot_dataset()
    ds = prepare_ds(ds)
    # Train / test frames selected by date range.
    # presumably the arguments are (month, year, month, year) — TODO confirm
    ds_train = utils.get_frame_in_range(ds, 3, 2016, 12, 2017)
    ds_test = utils.get_frame_in_range(ds, 1, 2018, 2, 2018)
    y = prepare_out(ds_train)
    real_y = np.array(y)
    # Zero array with the same shape as the training targets.
    dy = np.zeros(y.shape)
    x = drop_useless(ds_train)
    y_test = prepare_out(ds_test)
    if SAVE_DS:
        d.save_dataset(ds_test, "dataset_to_predict_customers.csv")
    x_test = drop_useless(ds_test)

    models = []
    for i in range(number_of_model):
        if not LOAD:
            # The builder receives the number of input columns and a flag
            # that is True only for the first model.
            models.append(m.nonsequentialNNtest(x.shape[1], i == 0))
        else:
            models.append(k.models.load_model("mod" + name + str(i) + ".h5"))
        opt = k.optimizers.adam(lr=3e-5)
        models[i].compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])
        models[i].summary()
        if TRAIN:
            # The test frame is passed as validation data during fitting.
            models[i].fit(x=x, y=y, batch_size=10000, epochs=50, verbose=2, validation_data=(x_test, y_test))
            models[i].save("mod" + name + str(i) + ".h5")
            # presumably 500 is the prediction batch size — TODO confirm
            dy = models[i].predict(x, 500)
Ejemplo n.º 14
0
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn import preprocessing
import pandas as pd


def model():
    """Return a new, unfitted ``knn`` regressor with the fixed settings below."""
    settings = {
        'n_neighbors': 10,
        'weights': 'uniform',
        'algorithm': 'ball_tree',
        'leaf_size': 15,
    }
    return knn(**settings)


if __name__ == '__main__':
    # Fit one k-NN customer predictor on "best_for_customers.csv", report
    # its R2 on the test split and save its predictions.

    datas = ds.read_dataset("best_for_customers.csv")

    datas = sb.SetBuilder(target='NumberOfCustomers',
                          autoexclude=True,
                          df=datas).exclude('NumberOfSales', 'Month').build()

    mod = skc.LinearSklearn(1, model)
    x = datas.xtr
    y = datas.ytr
    mod.train(x, y)
    custpred = mod.predict(datas.xts)
    print("TEST R2: ", eval.r2_score(datas.yts, custpred))
    print("##########################")
    # Use the `pd` alias bound by the import just above; the bare name
    # `pandas` is not bound by any import visible here.
    new = pd.DataFrame()
    new['NumberOfCustomers'] = pd.Series(custpred)
    ds.save_dataset(new, "knncustpreds1.csv")
Ejemplo n.º 15
0
def build_cust_predictor_test_dataset(name):
    """Prepare the customer-prediction test dataset and save it under ``name``.

    The imputed one-hot test dataset is passed, together with the dataset
    read from ``final_for_customer_train.csv``, to
    ``__prepare_customers_test_ds``; the returned frame is what gets saved.
    """
    train_frame = d.read_dataset("final_for_customer_train.csv")
    test_frame = d.read_imputed_onehot_test_dataset()
    prepared = __prepare_customers_test_ds(test_frame, train_frame)
    d.save_dataset(prepared, name)
Ejemplo n.º 16
0
def build_sales_predictor_train_dataset(name):
    """Prepare the sales training dataset and save it under ``name``.

    The imputed one-hot dataset is run through ``__prepare_sales_train_ds``
    and the returned frame is saved.
    """
    raw_frame = d.read_imputed_onehot_dataset()
    prepared = __prepare_sales_train_ds(raw_frame)
    d.save_dataset(prepared, name)
Ejemplo n.º 17
0
    modpreds_t = []
    for i in range(n):
        print(i + 1)
        x, y = datas.xtr, datas.ytr
        mod = skc.LinearSklearn(1, model[i])
        mod.train(x, y)
        mods.append(mod)
        p = mod.predict(x).squeeze()
        print(p)
        p_t = mod.predict(datas.xts).squeeze()
        modpreds.append(p)
        modpreds_t.append(p_t)
        print("TRAIN R2: ", eval.r2_score(y, p))
        print("TEST R2: ", eval.r2_score(datas.yts, p_t))
        print("##########################")

    modpreds = np.array(modpreds).transpose()
    modpreds_t = np.array(modpreds_t).transpose()
    x = np.hstack((datas.xtr, modpreds))
    x_t = np.hstack((datas.xts, modpreds_t))

    fin = skc.LinearSklearn(1, final)
    fin.train(x, datas.ytr)
    custpred = fin.predict(x_t)
    print(custpred)
    print("TEST R2: ", eval.r2_score(datas.yts, custpred))

    new = pandas.DataFrame()
    new['NumberOfCustomers'] = pandas.Series(custpred)
    ds.save_dataset(new, "meta_mod2.csv")