Exemple #1
0
def mean_cust_per_shop_if_holiday(df, data_from=None):
    """Add a ``MeanCustPerShopIfHoliday`` column to ``df``.

    For every row of ``df`` whose ``IsHoliday`` value is non-zero, the column
    receives the mean ``NumberOfCustomers`` over the rows of ``data_from``
    that have the same ``StoreID`` and a non-zero ``IsHoliday``.  All other
    rows keep the initial value ``0.0``.

    Cell values are read through the project helper
    ``d.content_of(frame, column, index_label)``, which is assumed to return
    the value of ``column`` at that row.

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame the means are computed from.  Defaults to ``df``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        KeyError: if a holiday row of ``df`` belongs to a store that has no
            holiday row in ``data_from``.
    """
    if data_from is None:
        data_from = df
    df['MeanCustPerShopIfHoliday'] = p.Series(np.zeros(len(df)), df.index)

    # Running sum and row count per store, keyed by the stringified StoreID.
    totals = {}
    counts = {}
    for i in data_from.index.tolist():
        is_holiday = d.content_of(data_from, 'IsHoliday', i)
        store = d.content_of(data_from, 'StoreID', i)
        customers = d.content_of(data_from, 'NumberOfCustomers', i)
        if is_holiday != 0:
            key = str(store)
            totals[key] = totals.get(key, 0) + customers
            counts[key] = counts.get(key, 0) + 1

    for i in df.index.tolist():
        is_holiday = d.content_of(df, 'IsHoliday', i)
        store = d.content_of(df, 'StoreID', i)
        if is_holiday != 0:
            key = str(store)
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # documented scalar-assignment replacement.
            df.at[i, 'MeanCustPerShopIfHoliday'] = totals[key] / counts[key]
    return df
Exemple #2
0
def mean_std_sales_per_shop_per_day(df, data_from=None):
    """Add per-(store, day) mean and standard deviation of sales to ``df``.

    Rows of ``data_from`` are grouped by the key ``str(StoreID) + Day``.  For
    each group the mean of ``NumberOfSales`` and its population standard
    deviation (squared deviations divided by the group size, not size - 1)
    are computed.  Every row of ``df`` then receives the statistics of its
    own group in the new columns ``MeanSalesPerShopPerDay`` and
    ``StdSalesPerShopPerDay``.

    Cell values are read through the project helper
    ``d.content_of(frame, column, index_label)``, which is assumed to return
    the value of ``column`` at that row.  ``Day`` values must be strings,
    because they are concatenated to the stringified store id.

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame the statistics are computed from.  Defaults to
            ``df``.

    Returns:
        The same ``df`` object, with the two new columns added.

    Raises:
        KeyError: if a row of ``df`` has a (store, day) key that never occurs
            in ``data_from``.
    """
    if data_from is None:
        data_from = df
    df['MeanSalesPerShopPerDay'] = p.Series(np.zeros(len(df)), df.index)
    df['StdSalesPerShopPerDay'] = p.Series(np.zeros(len(df)), df.index)

    source_rows = data_from.index.tolist()

    # Pass 1: sum and count of sales per (store, day) key.
    totals = {}
    counts = {}
    for i in source_rows:
        store = d.content_of(data_from, 'StoreID', i)
        day = d.content_of(data_from, 'Day', i)
        key = str(store) + day
        sales = d.content_of(data_from, 'NumberOfSales', i)
        totals[key] = totals.get(key, 0) + sales
        counts[key] = counts.get(key, 0) + 1

    # Compute each group mean once instead of re-dividing for every row.
    averages = {key: totals[key] / counts[key] for key in totals}

    # Pass 2: sum of squared deviations from the group mean.
    squared_dev = {}
    for i in source_rows:
        store = d.content_of(data_from, 'StoreID', i)
        day = d.content_of(data_from, 'Day', i)
        key = str(store) + day
        sales = d.content_of(data_from, 'NumberOfSales', i)
        deviation = sales - averages[key]
        squared_dev[key] = squared_dev.get(key, 0) + deviation * deviation

    # Write both statistics back.  DataFrame.set_value was removed in
    # pandas 1.0; .at is the documented scalar-assignment replacement.
    for i in df.index.tolist():
        store = d.content_of(df, 'StoreID', i)
        day = d.content_of(df, 'Day', i)
        key = str(store) + day
        df.at[i, 'MeanSalesPerShopPerDay'] = averages[key]
        df.at[i, 'StdSalesPerShopPerDay'] = np.sqrt(squared_dev[key] / counts[key])
    return df
Exemple #3
0
def one_hot_numeric(ds, attr, header):
    """One-hot encode the attribute ``attr`` of ``ds`` using exact equality.

    For every value ``v`` returned by ``d.values_of(ds, attr)`` a column named
    ``header + str(v)`` is added.  It is initialised to ``0.0`` and set to
    ``1`` on each row where ``d.content_of(ds, attr, row) == v``.

    ``d.values_of`` and ``d.content_of`` are project helpers; they are assumed
    to return the values present in a column and the value of a single cell
    respectively.

    Args:
        ds: DataFrame to encode; it is modified in place.
        attr: Name of the column to encode.
        header: Prefix for the names of the generated columns.

    Returns:
        The same ``ds`` object, with the indicator columns added.
    """
    row_labels = ds.index.tolist()
    for value in d.values_of(ds, attr):
        column = header + str(value)
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in row_labels:
            if d.content_of(ds, attr, i) == value:
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # documented scalar-assignment replacement.
                ds.at[i, column] = 1
    return ds
Exemple #4
0
def mean_sales_per_month_per_region(df, data_from=None):
    """Add a ``MeanSalesPerRegionPerMonth`` column to ``df``.

    Rows of ``data_from`` are grouped by the key
    ``str(int(Month)) + "_" + str(Region)`` and the mean ``NumberOfSales`` of
    each group is computed.  Every row of ``df`` receives the mean of its own
    (month, region) group.

    Cell values are read through the project helper
    ``d.content_of(frame, column, index_label)``, which is assumed to return
    the value of ``column`` at that row.

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame the means are computed from.  Defaults to ``df``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        KeyError: if a row of ``df`` has a (month, region) key that never
            occurs in ``data_from``.
    """
    if data_from is None:
        data_from = df
    df['MeanSalesPerRegionPerMonth'] = p.Series(np.zeros(len(df)), df.index)

    # Running sum and row count per "<month>_<region>" key.
    totals = {}
    counts = {}
    for i in data_from.index.tolist():
        month = int(d.content_of(data_from, 'Month', i))
        region = d.content_of(data_from, 'Region', i)
        key = str(month) + "_" + str(region)
        sales = d.content_of(data_from, 'NumberOfSales', i)
        totals[key] = totals.get(key, 0) + sales
        counts[key] = counts.get(key, 0) + 1

    for i in df.index.tolist():
        month = int(d.content_of(df, 'Month', i))
        region = d.content_of(df, 'Region', i)
        key = str(month) + "_" + str(region)
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # documented scalar-assignment replacement.
        df.at[i, 'MeanSalesPerRegionPerMonth'] = totals[key] / counts[key]
    return df
Exemple #5
0
def mean_cust_per_month_per_shop(df, data_from=None):
    """Add a ``MeanCustPerShopPerMonth`` column to ``df``.

    Rows of ``data_from`` are grouped by the key
    ``str(int(Month)) + "_" + str(StoreID)`` and the mean
    ``NumberOfCustomers`` of each group is computed.  Every row of ``df``
    receives the mean of its own (month, store) group.

    Cell values are read through the project helper
    ``d.content_of(frame, column, index_label)``, which is assumed to return
    the value of ``column`` at that row.

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame the means are computed from.  Defaults to ``df``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        KeyError: if a row of ``df`` has a (month, store) key that never
            occurs in ``data_from``.
    """
    if data_from is None:
        data_from = df
    df['MeanCustPerShopPerMonth'] = p.Series(np.zeros(len(df)), df.index)

    # Running sum and row count per "<month>_<store>" key.
    totals = {}
    counts = {}
    for i in data_from.index.tolist():
        month = int(d.content_of(data_from, 'Month', i))
        store = d.content_of(data_from, 'StoreID', i)
        key = str(month) + "_" + str(store)
        customers = d.content_of(data_from, 'NumberOfCustomers', i)
        totals[key] = totals.get(key, 0) + customers
        counts[key] = counts.get(key, 0) + 1

    for i in df.index.tolist():
        month = int(d.content_of(df, 'Month', i))
        store = d.content_of(df, 'StoreID', i)
        key = str(month) + "_" + str(store)
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # documented scalar-assignment replacement.
        df.at[i, 'MeanCustPerShopPerMonth'] = totals[key] / counts[key]
    return df
Exemple #6
0
def add_min_per_shop(df, data_from=None):
    """Add a ``min_shop`` column holding a per-store value from ``min_per_shop``.

    ``min_per_shop(data_from, store_id)`` is called once for each distinct
    stringified store id returned by ``d.values_of(data_from, 'StoreID')``.
    Every row of ``df`` then receives the value computed for its own
    ``StoreID``.  What ``min_per_shop`` measures is defined elsewhere
    (presumably a per-store minimum -- confirm against its definition).

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame passed to ``min_per_shop``.  Defaults to ``df``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        KeyError: if a row of ``df`` has a ``StoreID`` that is not among the
            store ids of ``data_from``.
    """
    if data_from is None:
        data_from = df

    # Cache one result per store so repeated ids are computed only once.
    per_store = {}
    for store in d.values_of(data_from, 'StoreID'):
        key = str(store)
        if key not in per_store:
            per_store[key] = min_per_shop(data_from, store)

    df['min_shop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        key = str(d.content_of(df, 'StoreID', i))
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # documented scalar-assignment replacement.
        df.at[i, 'min_shop'] = per_store[key]
    return df
Exemple #7
0
def add_avg_cust_per_shop(df, data_from=None):
    """Add a ``meancustshop`` column holding a per-store value.

    ``average_cust_per_shop(data_from, store_id)`` is called once for each
    distinct stringified store id returned by
    ``d.values_of(data_from, 'StoreID')``.  Every row of ``df`` then receives
    the value computed for its own ``StoreID``.  What
    ``average_cust_per_shop`` measures is defined elsewhere (presumably the
    mean customer count of the store -- confirm against its definition).

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_from: DataFrame passed to ``average_cust_per_shop``.  Defaults
            to ``df``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        KeyError: if a row of ``df`` has a ``StoreID`` that is not among the
            store ids of ``data_from``.
    """
    if data_from is None:
        data_from = df

    # Cache one result per store so repeated ids are computed only once.
    per_store = {}
    for store in d.values_of(data_from, 'StoreID'):
        key = str(store)
        if key not in per_store:
            per_store[key] = average_cust_per_shop(data_from, store)

    df['meancustshop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        key = str(d.content_of(df, 'StoreID', i))
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # documented scalar-assignment replacement.
        df.at[i, 'meancustshop'] = per_store[key]
    return df
Exemple #8
0
def one_hot(ds, attr, header, split=False):
    """One-hot encode the string attribute ``attr`` of ``ds``.

    The categories are the values returned by ``d.values_of(ds, attr)``.
    When ``split`` is true, each of those values is first split on ``"-"``
    and the distinct parts (in first-seen order) become the categories.

    For every category ``c`` a column named ``header + c`` is added.  It is
    initialised to ``0.0`` and set to ``1`` on each row whose ``attr`` value
    contains ``c``.

    NOTE: matching is by substring, not equality, so a category ``"a"`` also
    flags rows whose value is ``"ab"``.  The original author's note advised
    against using ``split``; the reason is not recorded here.

    ``d.values_of`` and ``d.content_of`` are project helpers; they are assumed
    to return the values present in a column and the value of a single cell
    respectively.

    Args:
        ds: DataFrame to encode; it is modified in place.
        attr: Name of the column to encode; its values must be strings.
        header: Prefix for the names of the generated columns.
        split: If true, derive categories from the ``"-"``-separated parts
            of the values instead of the whole values.

    Returns:
        The same ``ds`` object, with the indicator columns added.
    """
    vals = d.values_of(ds, attr)
    if split:
        new_cols = []
        for v in vals:
            # Use a separate name so the ``split`` parameter is not rebound.
            for part in v.split("-"):
                if part not in new_cols:
                    new_cols.append(part)
    else:
        new_cols = vals

    row_labels = ds.index.tolist()
    for new in new_cols:
        column = header + new
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in row_labels:
            if new in d.content_of(ds, attr, i):
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # documented scalar-assignment replacement.
                ds.at[i, column] = 1
    return ds