Example no. 1
def main(env, days):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading fact_order_metrics')
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics', days=days, s3=s3)

    cost_allocator_ppc(df_fo_source=fo_bing_filter(df_fo),
                       source_name='bing',
                       levels=['date', 'brandcode', 'countrycode', 'device'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_facebook_filter(df_fo),
                       source_name='facebook',
                       levels=['date', 'brandcode'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_adwords_filter(df_fo),
                       source_name='adwords',
                       levels=['date', 'adwords_campaign_id', 'device'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_criteo_filter(df_fo),
                       source_name='criteo',
                       levels=['date', 'brandcode', 'countrycode'],
                       env=env, days=days)

    cost_allocator_sovendus(df_fo_source=fo_sovendus_filter(df_fo),
                            env=env)

    cost_allocator_awin(df_fo_source=df_fo, env=env, days=days)

    update_table_partition(env, table_name='order_cost')
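For reference, the fo_*_filter helpers called above are expected to return the slice of fact_order_metrics belonging to a single data source (see the docstring of cost_allocator_ppc in the next example). A minimal sketch of one such filter, assuming a hypothetical match on the existing 'source' column (not the repo's actual implementation):

import pandas as pd

def fo_bing_filter_sketch(df_fo: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical sketch: the real filter may match on other columns such as medium or channel_grouping
    mask = df_fo['source'].astype(str).str.lower().eq('bing')
    return df_fo.loc[mask].copy()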
Example no. 2
def cost_allocator_ppc(df_fo_source, source_name, levels, env, days):
    """
    Use to allocate cost for ppc sources and upload results to s3

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics after sliced for specific data source orders
    source_name: string
        Name of the data source
    levels: list
        detail level that cost will be allocated on


    """
    folder_source = f'tables/funnel/{source_name}'

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # reading cost table
    logger.info(f'reading {folder_source}')
    df_source = download_and_read(prefix=folder_source, days=days, s3=s3)

    # allocating cost
    logger.info('allocating cost')
    df_order_cost = cost_allocation_logic_ppc(df_fo_source, df_source, levels, source_name)

    # uploading to s3
    logger.info(f'uploading partitioned {source_name}.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix=source_name)
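As a rough illustration of what the allocation step might do (the real cost_allocation_logic_ppc lives in the repo and is not shown here), one could sum the source cost at the requested levels, count the matching orders, spread each group's cost evenly over its orders, and return the same columns the other allocators produce:

import pandas as pd

def cost_allocation_logic_ppc_sketch(df_fo_source, df_source, levels, source_name):
    # Hypothetical sketch only, assuming df_source has a 'cost' column and
    # df_fo_source has 'date' and 'referenceorder' columns
    cost = df_source.groupby(levels, as_index=False)['cost'].sum()
    orders = df_fo_source.groupby(levels, as_index=False)['referenceorder'].count()
    per_order = cost.merge(orders, on=levels, how='inner')
    per_order['cpa'] = per_order['cost'] / per_order['referenceorder']

    out = df_fo_source.merge(per_order[levels + ['cpa']], on=levels, how='left')
    out['cpa'] = out['cpa'].fillna(0)
    out['cpa_extra'] = 0
    out['cpa_total'] = out['cpa'] + out['cpa_extra']
    out['data_source'] = source_name
    return out[['date', 'referenceorder', 'cpa', 'cpa_extra', 'cpa_total', 'data_source']]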
Example no. 3
def main(env, **date_ranges):

    t0 = time.time()

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket=bucket)

    profile_list = get_ua_profile_list('App', bucket)

    dimensions = ['ga:date', 'ga:deviceCategory']
    metrics = ['ga:sessions']

    # initialize connection and pull data from individual GA views
    ga = GoogleAnalytics()
    frames = []
    for dim_ua_id, ua_id, name in profile_list:
        logger.info(f'Fetching {name} ({ua_id})')
        df = ga.fetch(view_id=ua_id,
                      dimensions=dimensions,
                      metrics=metrics,
                      **date_ranges)
        df['dim_ua_profile_id'] = dim_ua_id
        frames.append(df)
    # DataFrame.append was removed in pandas 2.0; collect frames and concat once
    df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if len(df_all) == 0:
        logger.error('Empty DataFrame')
        exit()

    # pre-process data
    df_all.sessions = df_all.sessions.astype('int')
    df_all.dim_ua_profile_id = df_all.dim_ua_profile_id.astype('int')
    df_all.date = df_all.date.apply(
        lambda x: '-'.join([x[:4], x[4:6], x[-2:]]))
    df_all['row_last_updated_ts'] = TIMESTAMP

    dates = set(df_all.date)

    # split data into a file per day and store them separately
    for date in dates:
        df_date = df_all.loc[df_all.date == date].copy()
        s3_key = f'tables/ga/app-sessions/app_sessions_{date}.csv'
        s3.upload_dataframe(df=df_date, s3_key=s3_key, delimiter=';')
        logger.info(
            f'Inserting {date} {df_date.dim_ua_profile_id.nunique()} profiles {len(df_date)} records into {s3_key}'
        )

    t1 = time.time()
    logger.info(
        f'Done, {len(dates)} dates inserted, time spent {round(t1-t0,1)}s')
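The date lambda above reformats Google Analytics ga:date strings (YYYYMMDD) into ISO dates, for example:

fmt = lambda x: '-'.join([x[:4], x[4:6], x[-2:]])
assert fmt('20240115') == '2024-01-15'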
Example no. 4
def main(env):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = ''' select * from sandbox_ana.sovendus_commission'''

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return  # nothing to upload if the fetch failed

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.upload_dataframe(df=df, delimiter=';', s3_key=S3_DESTINATION_KEY)
    logger.info('Upload succeeded')
Example no. 5
def cost_allocator_awin(df_fo_source, env, days):
    """
    Use to allocate cost for Awin  and upload results to s3

    """
    # prepare dataset
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    t = 'tables/funnel/awin'
    logger.info(f'reading {t}')
    df_awin = download_and_read(prefix=t, days=days, s3=s3)

    # align mixed data types within the column
    df_awin.order_reference = df_awin.order_reference.apply(lambda x: str(x))

    df_cpa = df_awin.loc[df_awin.order_reference.str.isdigit()].copy()  # slicing for actual orders
    df_cpa.rename(columns={'transaction_commission_validation_date': 'cpa', 'date': 'awin_date'}, inplace=True)

    # dwh date sometimes is 1 or 2 days later than awin date, this step is to change date to dwh date
    df_fo_date = df_fo_source[['date', 'referenceorder']].copy()
    df_fo_date.referenceorder = df_fo_date.referenceorder.apply(lambda x: str(x))

    df_cpa = df_cpa.merge(df_fo_date, how='inner', left_on='order_reference', right_on='referenceorder')
    df_cpa = df_cpa[['date', 'referenceorder', 'cpa']]
    df_cpa_extra = df_awin.loc[~df_awin.order_reference.str.isdigit()].copy()  # slicing for bonus rows
    df_orders = df_cpa.groupby(['date'], as_index=False)[['referenceorder']].count()  # count number of orders per day
    df_bonus = df_cpa_extra.groupby(['date'], as_index=False)[['transaction_commission_validation_date']].sum()  # sum bonus commission

    # distributing bonus commission to all the orders within the same day
    df_merged = df_orders.merge(df_bonus, how='inner')
    df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']

    df_order_cost = df_cpa.merge(df_merged[['date','cpa_extra']], how='left')
    df_order_cost.fillna(0, inplace=True)
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'awin'

    logger.info('uploading partitioned awin.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='awin')
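To make the bonus-distribution step above concrete (made-up numbers): if a day has 4 CPA orders and 20.0 of bonus commission, each of that day's orders receives cpa_extra = 20.0 / 4 = 5.0.

import pandas as pd

# Toy data reproducing only the merge/division above
df_orders = pd.DataFrame({'date': ['2024-01-01'], 'referenceorder': [4]})
df_bonus = pd.DataFrame({'date': ['2024-01-01'], 'transaction_commission_validation_date': [20.0]})
df_merged = df_orders.merge(df_bonus, how='inner')
df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']
print(df_merged['cpa_extra'].iloc[0])  # 5.0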
Example no. 6
def main(env, days):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = read_sql(filename='fact_ua_categories.sql', days=days)

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return  # nothing to upload if the fetch failed

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.partition_table_upload(df=df, prefix=S3_DESTINATION_KEY)
    logger.info('Upload succeeded, adding table partition')

    update_table_partition(env, table_name='fact_ua_categories')
Example no. 7
def cost_allocator_sovendus(df_fo_source, env):
    """
    Use to allocate cost for Sovendus and upload results to s3

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics after sliced for specific data source orders


    """
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading sovendus-commission')
    df_svd = s3.read_csv_from_s3(s3_key='tables/redshift/sovendus-commission/sovendus-commission.csv')

    # df_fo_source is already sliced to Sovendus orders by the caller
    df_fo_svd = df_fo_source

    # if last date is null, fill current date
    today = datetime.strftime(datetime.today(), "%Y-%m-%d")
    df_svd.last_date.fillna(today, inplace=True)

    # join commission table on order table on brand country level
    # keep only the rows where date falls into the effective period
    logger.info('allocating cost')
    df_order_cost = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
    df_order_cost = df_order_cost.loc[(df_order_cost.date >= df_order_cost.first_date) & (df_order_cost.date <= df_order_cost.last_date)].copy()
    df_order_cost['cpa'] = df_order_cost.gsii * df_order_cost.commission_rate
    df_order_cost['cpa_extra'] = 0
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'sovendus'

    logger.info('uploading partitioned sovendus.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='sovendus')
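As a worked illustration of the period filter and commission formula above (made-up numbers): an order with gsii = 100.0 dated 2024-03-10 that joins a commission row with first_date = 2024-01-01, last_date = 2024-12-31 and commission_rate = 0.03 gets cpa = 100.0 * 0.03 = 3.0, while an order dated outside its first_date/last_date window is dropped by the date filter.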
Example no. 8
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # download/read files
    df_adwords_campaign = s3.read_csv_from_s3(
        s3_key='tables/funnel/dim-adwords-campaign/dim-adwords-campaign.csv')

    fo_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'referenceorder',
        'adwords_campaign_id', 'is_ga_tracked', 'gsiibc'
    ]
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics',
                              days=days,
                              cols=fo_cols,
                              s3=s3)

    adwords_cols = [
        'date', 'adwords_campaign_id', 'impressions', 'clicks', 'cost'
    ]
    df_adwords = download_and_read(prefix='tables/funnel/adwords',
                                   days=days,
                                   cols=adwords_cols,
                                   s3=s3)

    awin_cols = ['date', 'referenceorder', 'cpa_total']
    df_awin = download_and_read(prefix='tables/cost-allocation/order',
                                days=days,
                                cols=awin_cols,
                                s3=s3,
                                suffix='awin')

    bing_cols = [
        'date', 'brandcode', 'countrycode', 'campaign', 'impressions',
        'clicks', 'cost'
    ]
    df_bing = download_and_read(prefix='tables/funnel/bing',
                                days=days,
                                cols=bing_cols,
                                s3=s3)

    criteo_cols = [
        'date', 'brandcode', 'countrycode', 'impressions', 'clicks', 'cost'
    ]
    df_criteo = download_and_read(prefix='tables/funnel/criteo',
                                  days=days,
                                  cols=criteo_cols,
                                  s3=s3)

    facebook_cols = ['date', 'referenceorder', 'cpa_total']
    df_facebook = download_and_read(prefix='tables/cost-allocation/order',
                                    days=days,
                                    cols=facebook_cols,
                                    s3=s3,
                                    suffix='facebook')

    gsc_cols = ['date', 'brandcode', 'countrycode', 'impressions', 'clicks']
    df_gsc = download_and_read(prefix='tables/funnel/gsc',
                               days=days,
                               cols=gsc_cols,
                               s3=s3)

    ua_cat_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'adwords_campaign_id', 'sessions'
    ]
    df_ua_cat = download_and_read(prefix='tables/redshift/fact-ua-categories',
                                  days=days,
                                  cols=ua_cat_cols,
                                  s3=s3)

    # process data
    df_fo = df_fo.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_fo = df_fo.merge(df_awin[['referenceorder', 'cpa_total']], how='left')
    df_fo = process_dwh(df_fo, is_fo=True)

    df_ua_cat = df_ua_cat.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_ua_cat = process_dwh(df_ua_cat, is_fo=False)

    df_adwords = df_adwords.merge(df_adwords_campaign, how='left')
    df_adwords_g = process_adwords(df_adwords)

    df_awin = df_awin.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]],
                            how='left')
    df_awin_g = process_awin(df_awin)

    df_bing_g = process_bing(df_bing)

    df_criteo_g = process_criteo(df_criteo)

    df_facebook = df_facebook.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]],
                                    how='left')
    df_facebook_g = process_facebook(df_facebook)

    df_gsc_g = process_gsc(df_gsc)

    df_source_g = pd.concat([
        df_adwords_g, df_awin_g, df_bing_g, df_criteo_g, df_facebook_g,
        df_gsc_g
    ])

    df_fo_g = df_fo.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False).agg({
            'referenceorder': 'count',
            'gsiibc': 'sum'
        })
    df_ua_cat_g = df_ua_cat.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False)[['sessions']].sum()

    df_final = df_source_g.merge(df_fo_g, how='outer').merge(df_ua_cat_g,
                                                             how='outer')

    s3.partition_table_upload(df=df_final,
                              prefix='tables/reports/channel-report')