def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading fact_order_metrics')
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics',
                              days=days,
                              s3=s3)

    cost_allocator_ppc(df_fo_source=fo_bing_filter(df_fo),
                       source_name='bing',
                       levels=['date', 'brandcode', 'countrycode', 'device'],
                       env=env,
                       days=days)
    cost_allocator_ppc(df_fo_source=fo_facebook_filter(df_fo),
                       source_name='facebook',
                       levels=['date', 'brandcode'],
                       env=env,
                       days=days)
    cost_allocator_ppc(df_fo_source=fo_adwords_filter(df_fo),
                       source_name='adwords',
                       levels=['date', 'adwords_campaign_id', 'device'],
                       env=env,
                       days=days)
    cost_allocator_ppc(df_fo_source=fo_criteo_filter(df_fo),
                       source_name='criteo',
                       levels=['date', 'brandcode', 'countrycode'],
                       env=env,
                       days=days)
    cost_allocator_sovendus(df_fo_source=fo_sovendus_filter(df_fo), env=env)
    cost_allocator_awin(df_fo_source=df_fo, env=env, days=days)

    update_table_partition(env, table_name='order_cost')
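# For context, a minimal sketch of how this entry point might be wired to the
# command line. The --env/--days flag names and defaults below are assumptions
# for illustration, not taken from the repo.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Allocate marketing cost per order')
    parser.add_argument('--env', default='dev', help='target environment (assumed values: dev/prod)')
    parser.add_argument('--days', type=int, default=7, help='lookback window in days (assumed default)')
    args = parser.parse_args()
    main(env=args.env, days=args.days)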
def cost_allocator_ppc(df_fo_source, source_name, levels, env, days):
    """
    Allocate cost for PPC sources and upload the results to S3.

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics, sliced to the orders of the given data source
    source_name: string
        Name of the data source
    levels: list
        Detail levels that cost will be allocated on
    env: string
        Target environment used to resolve the S3 bucket
    days: int
        Number of days of cost data to read
    """
    folder_source = f'tables/funnel/{source_name}'
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # reading cost table
    logger.info(f'reading {folder_source}')
    df_source = download_and_read(prefix=folder_source, days=days, s3=s3)

    # allocating cost
    logger.info('allocating cost')
    df_order_cost = cost_allocation_logic_ppc(df_fo_source, df_source, levels,
                                              source_name)

    # uploading to s3
    logger.info(f'uploading partitioned {source_name}.csv')
    s3.partition_table_upload(df=df_order_cost,
                              prefix='tables/cost-allocation/order',
                              suffix=source_name)
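# The actual allocation happens in cost_allocation_logic_ppc, which is defined
# elsewhere in the repo and not shown here. Purely as an illustrative sketch
# (not the real implementation): one plausible approach is to sum source cost
# at the requested levels and spread each group's cost evenly over the orders
# in that group. Column names 'cost' and 'referenceorder' are assumptions
# taken from the other readers in this repo.
def cost_allocation_logic_ppc_sketch(df_fo_source, df_source, levels, source_name):
    # total cost per allocation level (e.g. date/brandcode/countrycode/device)
    df_cost = df_source.groupby(levels, as_index=False)[['cost']].sum()
    # number of orders per allocation level
    df_orders = df_fo_source.groupby(levels, as_index=False)[['referenceorder']].count()
    df_orders.rename(columns={'referenceorder': 'n_orders'}, inplace=True)
    # spread each group's cost evenly across its orders
    df_alloc = df_cost.merge(df_orders, on=levels, how='inner')
    df_alloc['cpa'] = df_alloc['cost'] / df_alloc['n_orders']
    df_order_cost = df_fo_source.merge(df_alloc[levels + ['cpa']], on=levels, how='inner')
    df_order_cost['cpa_extra'] = 0
    df_order_cost['cpa_total'] = df_order_cost['cpa'] + df_order_cost['cpa_extra']
    df_order_cost['data_source'] = source_name
    return df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra', 'cpa_total', 'data_source']]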
def main(env, **date_ranges):
    t0 = time.time()
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket=bucket)
    profile_list = get_ua_profile_list('App', bucket)

    dimensions = ['ga:date', 'ga:deviceCategory']
    metrics = ['ga:sessions']

    # initialize connection and pull data from individual GA views
    ga = GoogleAnalytics()
    df_all = pd.DataFrame()
    for dim_ua_id, ua_id, name in profile_list:
        logger.info(f'Fetching {name} ({ua_id})')
        df = ga.fetch(view_id=ua_id,
                      dimensions=dimensions,
                      metrics=metrics,
                      **date_ranges)
        df['dim_ua_profile_id'] = dim_ua_id
        df_all = df_all.append(df, ignore_index=True)

    if len(df_all) == 0:
        logger.error('Empty DataFrame')
        exit()

    # pre-process data
    df_all.sessions = df_all.sessions.astype('int')
    df_all.dim_ua_profile_id = df_all.dim_ua_profile_id.astype('int')
    # convert GA's YYYYMMDD date strings to YYYY-MM-DD
    df_all.date = df_all.date.apply(
        lambda x: '-'.join([x[:4], x[4:6], x[-2:]]))
    df_all['row_last_updated_ts'] = TIMESTAMP
    dates = set(df_all.date)

    # split data into a file per day and store each day separately
    for date in dates:
        df_date = df_all.loc[df_all.date == date].copy()
        s3_key = f'tables/ga/app-sessions/app_sessions_{date}.csv'
        s3.upload_dataframe(df=df_date, s3_key=s3_key, delimiter=';')
        logger.info(
            f'Inserting {date}: {df_date.dim_ua_profile_id.nunique()} profiles, '
            f'{len(df_date)} records into {s3_key}')

    t1 = time.time()
    logger.info(
        f'Done, {len(dates)} dates inserted, time spent {round(t1 - t0, 1)}s')
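# The lambda above rewrites GA's YYYYMMDD date strings into ISO YYYY-MM-DD.
# A minimal standalone sketch of an equivalent conversion, assuming well-formed
# 8-character strings; the production code keeps the string-slicing lambda.
import pandas as pd

ga_dates = pd.Series(['20240131', '20240201'])  # hypothetical example values
iso_dates = pd.to_datetime(ga_dates, format='%Y%m%d').dt.strftime('%Y-%m-%d')
print(iso_dates.tolist())  # ['2024-01-31', '2024-02-01']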
def main(env):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = '''
        select *
        from sandbox_ana.sovendus_commission'''

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.upload_dataframe(df=df, delimiter=';', s3_key=S3_DESTINATION_KEY)
    logger.info('Upload succeeded')
def cost_allocator_awin(df_fo_source, env, days):
    """
    Allocate cost for Awin and upload the results to S3.

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics
    env: string
        Target environment used to resolve the S3 bucket
    days: int
        Number of days of Awin data to read
    """
    # prepare dataset
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)
    prefix = 'tables/funnel/awin'
    logger.info(f'reading {prefix}')
    df_awin = download_and_read(prefix=prefix, days=days, s3=s3)

    # align mixed data types within the column
    df_awin.order_reference = df_awin.order_reference.apply(lambda x: str(x))

    # slicing for actual orders (numeric order references)
    df_cpa = df_awin.loc[df_awin.order_reference.str.isdigit()].copy()
    df_cpa.rename(columns={'transaction_commission_validation_date': 'cpa',
                           'date': 'awin_date'},
                  inplace=True)

    # the dwh date is sometimes 1 or 2 days later than the Awin date,
    # so switch to the dwh date from fact_order_metrics
    df_fo_date = df_fo_source[['date', 'referenceorder']].copy()
    df_fo_date.referenceorder = df_fo_date.referenceorder.apply(lambda x: str(x))
    df_cpa = df_cpa.merge(df_fo_date,
                          how='inner',
                          left_on='order_reference',
                          right_on='referenceorder')
    df_cpa = df_cpa[['date', 'referenceorder', 'cpa']]

    # slicing for bonus rows (non-numeric order references)
    df_cpa_extra = df_awin.loc[~df_awin.order_reference.str.isdigit()].copy()

    # count number of orders per day
    df_orders = df_cpa.groupby(['date'], as_index=False)[['referenceorder']].count()
    # sum bonus commission per day
    df_bonus = df_cpa_extra.groupby(['date'], as_index=False)[['transaction_commission_validation_date']].sum()

    # distributing bonus commission to all the orders within the same day
    df_merged = df_orders.merge(df_bonus, how='inner')
    df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']
    df_order_cost = df_cpa.merge(df_merged[['date', 'cpa_extra']], how='left')
    df_order_cost.fillna(0, inplace=True)

    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'awin'

    logger.info('uploading partitioned awin.csv')
    s3.partition_table_upload(df=df_order_cost,
                              prefix='tables/cost-allocation/order',
                              suffix='awin')
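# To make the bonus-distribution step concrete, a minimal self-contained sketch
# with made-up numbers (not real data): the daily bonus sum is divided by the
# daily order count, and every order on that day receives the same cpa_extra.
import pandas as pd

df_cpa_demo = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-01', '2024-01-02'],
    'referenceorder': ['1001', '1002', '1003'],
    'cpa': [5.0, 3.0, 4.0],
})
df_bonus_demo = pd.DataFrame({
    'date': ['2024-01-01'],
    'bonus': [10.0],  # one bonus commission row for the whole day
})

df_orders_demo = df_cpa_demo.groupby('date', as_index=False)[['referenceorder']].count()
df_merged_demo = df_orders_demo.merge(df_bonus_demo, how='inner')
df_merged_demo['cpa_extra'] = df_merged_demo['bonus'] / df_merged_demo['referenceorder']  # 10.0 / 2 = 5.0

df_out_demo = df_cpa_demo.merge(df_merged_demo[['date', 'cpa_extra']], how='left').fillna(0)
print(df_out_demo)  # orders on 2024-01-01 each get cpa_extra 5.0; 2024-01-02 gets 0.0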
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = read_sql(filename='fact_ua_categories.sql', days=days)

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.partition_table_upload(df=df, prefix=S3_DESTINATION_KEY)
    logger.info('Upload succeeded, adding table partition')
    update_table_partition(env, table_name='fact_ua_categories')
def cost_allocator_sovendus(df_fo_source, env):
    """
    Allocate cost for Sovendus and upload the results to S3.

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics, sliced to Sovendus orders
    env: string
        Target environment used to resolve the S3 bucket
    """
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading sovendus-commission')
    df_svd = s3.read_csv_from_s3(
        s3_key='tables/redshift/sovendus-commission/sovendus-commission.csv')

    # sovendus orders
    df_fo_svd = df_fo_source

    # if the commission's last effective date is null, fall back to today
    today = datetime.strftime(datetime.today(), "%Y-%m-%d")
    df_svd.last_date.fillna(today, inplace=True)

    # join the commission table onto the order table at brand/country level
    # and keep only the rows where the order date falls into the effective period
    logger.info('allocating cost')
    df_order_cost = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
    df_order_cost = df_order_cost.loc[(df_order_cost.date >= df_order_cost.first_date)
                                      & (df_order_cost.date <= df_order_cost.last_date)].copy()

    df_order_cost['cpa'] = df_order_cost.gsii * df_order_cost.commission_rate
    df_order_cost['cpa_extra'] = 0
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'sovendus'

    logger.info('uploading partitioned sovendus.csv')
    s3.partition_table_upload(df=df_order_cost,
                              prefix='tables/cost-allocation/order',
                              suffix='sovendus')
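# A minimal sketch of the effective-period join above, with made-up values:
# an order only picks up a commission rate when its date lies between the
# rate's first_date and last_date (ISO date strings compare correctly as text).
import pandas as pd

orders_demo = pd.DataFrame({
    'date': ['2024-01-10', '2024-03-10'],
    'referenceorder': ['1001', '1002'],
    'brandcode': ['XX', 'XX'],
    'countrycode': ['DE', 'DE'],
    'gsii': [100.0, 80.0],
})
commission_demo = pd.DataFrame({
    'brandcode': ['XX'],
    'countrycode': ['DE'],
    'commission_rate': [0.05],
    'first_date': ['2024-01-01'],
    'last_date': ['2024-02-29'],
})

merged_demo = orders_demo.merge(commission_demo, on=['brandcode', 'countrycode'], how='left')
in_period = merged_demo.loc[(merged_demo.date >= merged_demo.first_date)
                            & (merged_demo.date <= merged_demo.last_date)]
print(in_period[['referenceorder']])  # only order 1001 falls inside the effective period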
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # download/read files
    df_adwords_campaign = s3.read_csv_from_s3(
        s3_key='tables/funnel/dim-adwords-campaign/dim-adwords-campaign.csv')

    fo_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'referenceorder',
        'adwords_campaign_id', 'is_ga_tracked', 'gsiibc'
    ]
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics',
                              days=days,
                              cols=fo_cols,
                              s3=s3)

    adwords_cols = [
        'date', 'adwords_campaign_id', 'impressions', 'clicks', 'cost'
    ]
    df_adwords = download_and_read(prefix='tables/funnel/adwords',
                                   days=days,
                                   cols=adwords_cols,
                                   s3=s3)

    awin_cols = ['date', 'referenceorder', 'cpa_total']
    df_awin = download_and_read(prefix='tables/cost-allocation/order',
                                days=days,
                                cols=awin_cols,
                                s3=s3,
                                suffix='awin')

    bing_cols = [
        'date', 'brandcode', 'countrycode', 'campaign', 'impressions',
        'clicks', 'cost'
    ]
    df_bing = download_and_read(prefix='tables/funnel/bing',
                                days=days,
                                cols=bing_cols,
                                s3=s3)

    criteo_cols = [
        'date', 'brandcode', 'countrycode', 'impressions', 'clicks', 'cost'
    ]
    df_criteo = download_and_read(prefix='tables/funnel/criteo',
                                  days=days,
                                  cols=criteo_cols,
                                  s3=s3)

    facebook_cols = ['date', 'referenceorder', 'cpa_total']
    df_facebook = download_and_read(prefix='tables/cost-allocation/order',
                                    days=days,
                                    cols=facebook_cols,
                                    s3=s3,
                                    suffix='facebook')

    gsc_cols = ['date', 'brandcode', 'countrycode', 'impressions', 'clicks']
    df_gsc = download_and_read(prefix='tables/funnel/gsc',
                               days=days,
                               cols=gsc_cols,
                               s3=s3)

    ua_cat_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'adwords_campaign_id', 'sessions'
    ]
    df_ua_cat = download_and_read(prefix='tables/redshift/fact-ua-categories',
                                  days=days,
                                  cols=ua_cat_cols,
                                  s3=s3)

    # process data
    df_fo = df_fo.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_fo = df_fo.merge(df_awin[['referenceorder', 'cpa_total']], how='left')
    df_fo = process_dwh(df_fo, is_fo=True)

    df_ua_cat = df_ua_cat.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_ua_cat = process_dwh(df_ua_cat, is_fo=False)

    df_adwords = df_adwords.merge(df_adwords_campaign, how='left')
    df_adwords_g = process_adwords(df_adwords)

    df_awin = df_awin.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]], how='left')
    df_awin_g = process_awin(df_awin)

    df_bing_g = process_bing(df_bing)
    df_criteo_g = process_criteo(df_criteo)

    df_facebook = df_facebook.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]], how='left')
    df_facebook_g = process_facebook(df_facebook)

    df_gsc_g = process_gsc(df_gsc)

    df_source_g = pd.concat([
        df_adwords_g, df_awin_g, df_bing_g, df_criteo_g, df_facebook_g,
        df_gsc_g
    ])

    df_fo_g = df_fo.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False).agg({
            'referenceorder': 'count',
            'gsiibc': 'sum'
        })
    df_ua_cat_g = df_ua_cat.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False)[['sessions']].sum()

    df_final = df_source_g.merge(df_fo_g, how='outer').merge(df_ua_cat_g,
                                                             how='outer')

    s3.partition_table_upload(df=df_final,
                              prefix='tables/reports/channel-report')