Beispiel #1
0
    def test_sanitize(self):
        """Check that ``sanitize`` maps the fixture headers to the expected names.

        The result for ``self.sanitize_arr`` must equal the list below,
        element for element and in the same order.
        """
        expected_columns = [
            "having_ip_address",
            "url_length",
            "shortining_service",
            "having_at_symbol",
            "double_slash_redirecting",
            "prefix_suffix",
            "having_sub_domain",
            "domain_registeration_length",
            "favicon",
            "port",
            "https_token",
            "request_url",
            "url_of_anchor",
            "links_in_tags",
            "sfh",
            "submitting_to_email",
            "abnormal_url",
            "redirect",
            "on_mouseover",
            "right_click",
            "pop_up_widnow",
            "iframe",
            "age_of_domain",
            "dns_record",
            "web_traffic",
            "page_rank",
            "google_index",
            "links_pointing_to_page",
            "statistical_report",
            "result",
            "485_5468a44_44_4_e3_c_cc_c_d",
        ]

        self.assertListEqual(sanitize(self.sanitize_arr), expected_columns)
pd.set_option('display.width', 1000)
from sqlalchemy import create_engine
import matplotlib.pylab as plt
import matplotlib.dates as mdates
plt.ioff()

# Read the whole ``mall`` table with a single query.  The ``with`` block
# closes the connection even when the query raises, and the column names are
# taken from the same result object instead of running the query a second
# time.
with create_engine(
    "mysql://*****:*****@etcinsights.nazwa.pl/etcinsights_ws"
).connect() as connection:
    mall_cols = connection.execute("select * from mall")
    # Read the column names before fetching so they are available regardless
    # of how the driver releases the cursor afterwards.
    mall_column_names = list(mall_cols.keys())
    mall = mall_cols.fetchall()
# NOTE(review): the message mentions sales_forecast although the query reads
# the ``mall`` table; the text is left unchanged on purpose.
print("Data extracted from sales_forecast successfully!")

# Passing ``columns`` directly also works when the table is empty, where
# assigning ``.columns`` afterwards would fail with a length mismatch.
mall = pd.DataFrame(mall, columns=mall_column_names)
mall.columns = cleaner.sanitize(mall.columns)
mall['traffic'] = mall['traffic'].astype(int)
mall['date'] = pd.to_datetime(mall['date'], format="%Y-%m-%d")

# Keep the per-date rows: the weekly aggregation below only retains ``year``,
# ``week`` and ``traffic``, so the date filter at the end of this section
# has to run on the un-aggregated frame.
mall_daily = mall
mall = mall_daily.groupby(by=['year', 'week'])['traffic'].sum().reset_index()

# make graph of total
fig, ax = plt.subplots(figsize=(16, 9))
ax.set(xlabel='date', ylabel='people in')
ax.grid(True)
ax.xaxis.set_tick_params(rotation=90)
# Cast to ``str`` so the "<week>-<year>" labels can be built whether the
# database returns these columns as numbers or as text.
week_labels = mall['week'].astype(str) + "-" + mall['year'].astype(str)
ax.plot(week_labels, mall['traffic'], marker='o')
plt.tight_layout()
fig.show()

# Rows dated 2019-07-29 or later.  The weekly ``mall`` frame no longer has a
# ``date`` column, so filtering it would raise KeyError.
# NOTE(review): assumes per-date granularity is what is wanted here - confirm.
mall_rf_data = mall_daily[mall_daily['date'] >= pd.Timestamp('2019-07-29')]
import pmdarima as pm
print(f"Using pmdarima {pm.__version__}")

# pd.set_option('display.max.columns', 20)
# pd.set_option('display.width', 1000)

# Read the whole ``traffic_date`` table with a single query.  The ``with``
# block closes the connection even when the query raises, and the column
# names come from the same result object instead of a second identical query.
with create_engine(
    "mysql://*****:*****@etcinsights.nazwa.pl/etcinsights_harebakken"
).connect() as connection:
    traffic_cols = connection.execute("select * from traffic_date")
    # Read the column names before fetching so they are available regardless
    # of how the driver releases the cursor afterwards.
    traffic_column_names = list(traffic_cols.keys())
    traffic = traffic_cols.fetchall()
# NOTE(review): the message mentions sales_forecast although the query reads
# the ``traffic_date`` table; the text is left unchanged on purpose.
print("Data extracted from sales_forecast successfully!")

# Passing ``columns`` directly also works when the table is empty, where
# assigning ``.columns`` afterwards would fail with a length mismatch.
traffic = pd.DataFrame(traffic, columns=traffic_column_names)
traffic.columns = cleaner.sanitize(traffic.columns)

# Reduce the frame to a single ``traffic`` column indexed by timestamp.
traffic = traffic.filter(['date', 'traffic'])
traffic.set_index(['date'], inplace=True)
traffic.index = pd.to_datetime(traffic.index)
# Shift every value by a small constant.
# NOTE(review): presumably this keeps the series strictly positive for the
# multiplicative decomposition commented out below - confirm.
traffic['traffic'] += 0.001

# result = seasonal_decompose(traffic, model='multiplicative', period=7)
# result.plot()
# plt.show()
pm.plot_acf(traffic)
stepwise_model = auto_arima(traffic,
                            start_p=1,
                            start_q=1,
                            max_p=5,
                            max_q=5,
    # Download every CASH sale of all stores for the day held in ``date`` and
    # append its line items to ``data``.
    # NOTE(review): the URL path embeds what looks like an access token;
    # consider moving it to configuration.
    response = requests.get('https://maxbo.link.express/external/api/v2/5d02982d29512bcc1729bb3964efb830/sales/query/?start_date='+date+'T00:00:00&end_date='+date+'T23:59:59&store_alias=ALL&type=CASH')
    # Fail on an HTTP error status instead of trying to parse an error page
    # as sales data.
    response.raise_for_status()
    data_req = response.json()

    # Collect one frame per transaction and concatenate once at the end;
    # growing a DataFrame with pd.concat inside the loops re-copies all
    # previously gathered rows on every iteration.
    line_item_frames = []
    for i, store in enumerate(data_req['store']):
        store_name = store['store_name']
        sales_count = store['salesCount']
        print("{0} - {1} from date {2}".format(i+1,store_name,date))
        for j, sale in enumerate(store['sales']):
            # id_tr identifies the transaction: store number + date + sale number.
            line_item_frames.append(
                pd.DataFrame(sale['lineItems']).assign(
                    store_name=store_name,
                    sales_count=sales_count,
                    date=date,
                    id_tr=str(i+1)+date.replace('-','')+str(j+1),
                )
            )

    # pd.concat rejects an empty list, so a day without any transaction
    # falls back to an empty frame, as before.
    if line_item_frames:
        data_date = pd.concat(line_item_frames, axis=0)
    else:
        data_date = pd.DataFrame()
    data = pd.concat([data, data_date], axis=0)
# Remove five line-item columns from ``data``; the frame is modified in place.
data.drop(columns=['edpNr', 'productId', 'gross', 'discountPercent', 'itemCount'], inplace=True)
# Replace the index with a fresh default one; the old index values are
# discarded rather than kept as a column (drop=True).
data.reset_index(drop=True, inplace=True)
data.columns=cleaner.sanitize(data.columns) # clean the column headers with the project's sanitize helper

# Write the frame to the file named 'sales_data'.  The context manager closes
# the handle even if pickling raises.
with open('sales_data', 'wb') as file:
    pickle.dump(data, file)
# Load the data back from the same file.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# files that this script wrote itself.
with open('sales_data', 'rb') as file:
    data = pickle.load(file)

def original_name(part_name: str) -> list:
    """Return the distinct full ``vendor_name`` values that contain *part_name*.

    Use it to recover the exact spelling of a vendor when only a fragment of
    the name is known.

    Args:
        part_name: Fragment to look for.  It is passed to
            ``Series.str.contains`` unchanged, so it is matched
            case-sensitively and interpreted as a regular expression.

    Returns:
        The matching vendor names, without duplicates, in order of first
        appearance.  Rows of the module-level ``data`` frame that have a
        missing value in any column are ignored.
    """
    # Drop incomplete rows once and reuse the result for both the mask and
    # the selection; the frame was previously copied twice per call.
    complete_rows = data.dropna()
    is_match = complete_rows['vendor_name'].str.contains(part_name)
    return complete_rows.loc[is_match, 'vendor_name'].unique().tolist()
# Example lookup: list every full vendor name containing "Jordan".  The
# returned list is not stored, so it is only visible in an interactive session.
original_name("Jordan")
                                     var_name='Day',
                                     value_name='Date')
# Load the workbook and unpivot its seven weekday columns, so that each row
# holds one value for a single (Week, Year, N_P, Day) combination.
traffic_sales_data = pd.read_excel(
    'traffic_sales_data/traffic_sales_2018_2019_2020.xlsx',
).melt(
    id_vars=['Week', 'Year', 'N_P'],
    value_vars=[
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ],
    var_name='Day',
    value_name='Value',
)

# Left-join the columns of ``calendar_weeks`` (defined outside this section)
# on year, week and weekday name, then normalise the headers.
traffic_sales_data = pd.merge(
    traffic_sales_data,
    calendar_weeks,
    how='left',
    on=['Year', 'Week', 'Day'],
)
traffic_sales_data.columns = cleaner.sanitize(traffic_sales_data.columns)

# Keep the TRAFFIC rows only and expose their value as an integer
# ``people_in`` column next to ``date``.
traffic_sales_data_date = (
    traffic_sales_data.loc[traffic_sales_data['n_p'] == "TRAFFIC"]
    .rename(columns={'value': 'people_in'})[['people_in', 'date']]
    .assign(people_in=lambda frame: frame['people_in'].astype(int))
)

# Total ``people_in`` per date from ``data_harebakken``, stacked below the
# workbook rows.
data_harebakken_date = (
    data_harebakken.groupby(by=['date'])['people_in'].sum().reset_index()
)
traffic_date = pd.concat([traffic_sales_data_date, data_harebakken_date])

# Order the rows chronologically and derive the calendar labels.
traffic_date['date'] = pd.to_datetime(traffic_date['date'], format="%Y-%m-%d")
traffic_date = traffic_date.sort_values(by=["date"])
traffic_date['week'] = traffic_date['date'].dt.strftime("%V")  # ISO 8601 week number
traffic_date['month'] = traffic_date['date'].dt.strftime("%B")  # full month name