Code example #1
File: faq_pageviews.py Project: miladmyled/GA_ETL
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for i in range((limit_date - ref_date).days - 1):  # day by day, stopping two days before today
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        data = faq.fetch_data(VIEW_ID, analytics, step_time, 'pageview')

        data.columns = ['date', 'pagePath', 'pageViews']
        data['pagePath'] = data['pagePath'].str.slice(0, 300)
        data['date'] = pd.to_datetime(data['date'])

        try:
            cursor.fast_executemany = True  # batch all parameter rows into one round trip
            sql_comm = '''INSERT INTO [{}].[dbo].[{}]
            ([date],[pagePath], [pageViews])
             VALUES (?,?,?)'''.format(DB_NAME, TABLE_NAME)
            cursor.executemany(sql_comm, data.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert', 'Ack', step_time, socket.gethostname(),
                                    'Successful Insert', server_len=len(data.index),
                                    database_len=len(data.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', step_time, socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)

        time.sleep(2)
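
Every example reports its status through two helpers, logger.create_log and es_engine.log_into_es, whose definitions the snippets don't include. A minimal sketch of what the call sites imply; the field names and the es.index call are inferred, not taken from the real GA_ETL modules:

import datetime

def create_log(action, status, step_time=None, hostname=None, text='',
               server_len=None, database_len=None):
    # Sketch: build the log document the examples ship to Elasticsearch.
    return {
        'action': action,              # e.g. 'Insert', 'DB Connection'
        'status': status,              # 'Ack' on success, 'Nack' on failure
        'step_time': step_time,        # the GA report date being processed
        'hostname': hostname,
        'text': text,
        'server_len': server_len,      # rows fetched from GA
        'database_len': database_len,  # rows written to the database
        'timestamp': datetime.datetime.utcnow().isoformat(),
    }

def log_into_es(es, index, doc):
    # Sketch: index the document with the elasticsearch-py client.
    es.index(index=index, body=doc)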
Code example #2
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for i in range((limit_date - ref_date).days - 1):
        step_time = ref_date + relativedelta(days=+i)
        year, month = jalali.Gregorian(step_time).persian_tuple()[0:2]  # Jalali (year, month)
        custom_start = jalali.Persian(year, month, 1).gregorian_datetime()  # Gregorian date of that Jalali month's first day
        df_part1 = active_users.fetch_data_daily(
            VIEW_ID, analytics, step_time.strftime('%Y-%m-%d'), 'web')
        df_part1.columns = ['date', 'category', 'sessions', 'dailyUsers']
        df_part2 = active_users.fetch_data_monthly(
            VIEW_ID, analytics,
            step_time.replace(day=1).strftime('%Y-%m-%d'),
            step_time.strftime('%Y-%m-%d'), 'web')
        df_part2.columns = ['category', 'month', 'monthlyUsers']
        df_part3 = active_users.fetch_data_custom_wrapper(
            VIEW_ID, analytics, custom_start, step_time, 'monthlyUsersJalali',
            'web')
        df_part4 = active_users.fetch_data_custom_wrapper(
            VIEW_ID, analytics, step_time + relativedelta(days=-29), step_time,
            '30DaysWindow', 'web')

        df_part1['date'] = pd.to_datetime(df_part1['date'])
        total_df = df_part1.join(df_part2.set_index('category'), on='category')
        total_df = total_df.join(df_part3.set_index('category'), on='category')
        total_df = total_df.join(df_part4.set_index('category'), on='category')
        total_df.drop(['month'], axis=1, inplace=True)

        print(total_df)

        try:
            cursor.fast_executemany = True
            sql_comm = '''INSERT INTO [{}].[dbo].[{}]
            ([date],[category],[sessions],[dailyUsers],[monthlyUsers],[monthlyUsersJalali],[30DaysWindow])
             VALUES (?,?,?,?,?,?,?)'''.format(DB_NAME, TABLE_NAME)
            cursor.executemany(sql_comm, total_df.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert',
                                    'Ack',
                                    step_time,
                                    socket.gethostname(),
                                    'Successful Insert',
                                    server_len=len(total_df.index),
                                    database_len=len(total_df.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', step_time,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)

        time.sleep(2)
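
Example #2 pins one of its windows to the Jalali (Persian) calendar: it converts step_time to a Jalali (year, month), rebuilds day 1 of that month, and maps it back to Gregorian. A small worked sketch, assuming the jalali helper behaves as the calls above suggest:

import datetime
import jalali  # the same Jalali-calendar helper the project imports (assumed API)

step_time = datetime.date(2019, 7, 10)  # 19 Tir 1398 in the Jalali calendar
year, month = jalali.Gregorian(step_time).persian_tuple()[0:2]      # (1398, 4)
custom_start = jalali.Persian(year, month, 1).gregorian_datetime()  # 2019-06-22, Tir 1
# fetch_data_custom_wrapper(VIEW_ID, analytics, custom_start, step_time, ...) thus
# covers Jalali month-to-date, while the '30DaysWindow' call covers a rolling
# 30-day window ending on step_time.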
Code example #3
def validation(analytics):
    sql_maxdate = 'SELECT MAX ([date]) AS "Max Date" FROM {}.dbo.{};'.format(
        DB_NAME, TABLE_NAME)
    last_insert = pd.read_sql(sql_maxdate, cnxn).iloc[0, 0]

    if last_insert is None:
        ref_date = datetime.datetime.strptime('2019-04-25', '%Y-%m-%d').date()
    else:
        ref_date = last_insert + relativedelta(days=1)
        sql_lastbatch = "SELECT PK FROM {}.dbo.{}" \
                        " WHERE [date] = '{}'".format(DB_NAME, TABLE_NAME, last_insert)
        last_len_DB = len(cnxn.execute(sql_lastbatch).fetchall())
        last_len_GA = len(
            rawdata.fetch_data(VIEW_ID, analytics,
                               last_insert.strftime('%Y-%m-%d'), 'trash'))
        if (last_len_GA - last_len_DB) > 0.001 * last_len_GA:  # DB is short by more than 0.1% of the GA rows
            doc = logger.create_log(
                'DB/GA Consistency',
                'Nack',
                hostname=socket.gethostname(),
                text='Corrupted Last Insert, truncate the last batch!',
                server_len=last_len_GA,
                database_len=last_len_DB)
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            sys.exit()
    return ref_date
Code example #4
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        data = category.fetch_data(VIEW_ID, analytics, step_time, 'events')
        data.columns = [
            'supply_category', 'date', 'page_view', 'unique_page_view'
        ]

        data['date'] = pd.to_datetime(data['date'])
        data['supply_category'] = data['supply_category'].str.slice(0, 300 - 5)
        data['supply_category'].replace('(not set)',
                                        sqlalchemy.sql.null(),
                                        inplace=True)

        data.rename(columns={
            'supply_category': 'supplyCategory',
            'page_view': 'pageView',
            'unique_page_view': 'uniquePageView'
        },
                    inplace=True)

        try:
            data.to_sql(TABLE_NAME,
                        cnxn,
                        method="multi",
                        if_exists='append',
                        index=False,
                        chunksize=10)
            doc = logger.create_log('Insert',
                                    'Ack',
                                    step_time,
                                    socket.gethostname(),
                                    'Successful Insert',
                                    server_len=len(data.index),
                                    database_len=len(data.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', step_time,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
Code example #5
File: carousel_data.py Project: miladmyled/GA_ETL
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    # ref_date = validation(analytics)
    # ref_date = datetime.datetime.strptime('2019-07-01', '%Y-%m-%d').date()
    ptrns = [
        'BRAND', 'CMP', 'HOME', 'LANDING', 'PDP', 'PLP', 'PROFILE', 'SEARCH',
        'INCREDIBLE', 'THANKYOU'
    ]

    for i in range(6):
        step_time = today_date + relativedelta(days=-i - 3)
        for ptrn in ptrns:
            total_df = carousel.fetch_data(view_id, analytics,
                                           step_time.strftime('%Y-%m-%d'),
                                           ptrn)
            if total_df.empty:
                time.sleep(2)
                continue
            else:
                total_df.columns = [
                    'date', 'pagepath', 'product_addtocarts',
                    'carousel_clicks', 'carousel_name', 'carousel_revenue',
                    'product_uniquepurchases'
                ]
                total_df['pagepath'] = total_df['pagepath'].map(
                    lambda x: x.replace('?', '/'))
                total_df['date'] = pd.to_datetime(total_df['date'])
                total_df['source'] = data_type
                total_df = total_df[[
                    'date', 'source', 'carousel_name', 'carousel_clicks',
                    'product_addtocarts', 'product_uniquepurchases',
                    'carousel_revenue'
                ]]
                total_df['carousel_name'] = total_df[
                    'carousel_name'].str.strip()
                total_df['carousel_name'] = total_df[
                    'carousel_name'].str.slice(0, 200 - 10)

            try:
                print(total_df)
                # cursor.fast_executemany = True
                # sql_comm = '''INSERT INTO [{}].[dbo].[{}]([date],[source],[carousel_name],[carousel_clicks],[product_addtocarts],[product_uniquepurchases],[carousel_revenue])
                #                 VALUES (?,?,?,?,?,?,?)'''.format(DB_NAME, TABLE_NAME)
                # cursor.executemany(sql_comm, total_df.values.tolist())
                # cursor.commit()
                # doc = logger.create_log('Insert', 'Ack', step_time, socket.gethostname(),
                #                  'Successful Insert', server_len=len(total_df.index),
                #                  database_len=len(total_df.index))
                # es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
                print('done' + ' ' + str(step_time) + '**' + str(ptrn) +
                      ' for ' + data_type)
                time.sleep(2)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(), str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
Code example #6
def validation(ref_len, _database_engine, _database_name, _database_table, _es_client, es_index):
    sql_maxdate = 'SELECT MAX ([date]) AS "Max Date" FROM {}.dbo.{};'.format(_database_name, _database_table)
    last_insert = pd.read_sql(sql_maxdate, _database_engine).iloc[0, 0]

    if last_insert is None:
        ref_date = datetime.datetime.strptime('2017-03-01', '%Y-%m-%d').date()
    else:
        ref_date = last_insert + relativedelta(days=1)
        sql_lastbatch = "SELECT PK FROM {}.dbo.{}" \
                        " WHERE [date] = '{}'".format(_database_name, _database_table, last_insert)
        last_len_DB = len(_database_engine.execute(sql_lastbatch).fetchall())
        last_len_GA = ref_len  # previously: len(fetch_data_daily(config, last_insert.strftime('%Y-%m-%d')))
        if (last_len_GA - last_len_DB) > 0.001 * last_len_GA:
            doc = logger.create_log('DB/GA Consistency', 'Nack', hostname=socket.gethostname(),
                             text='Corrupted Last Insert, truncate the last batch!',
                             server_len=last_len_GA, database_len=last_len_DB)
            es_engine.log_into_es(_es_client, 'textlogs-{}'.format(es_index), doc)
            sys.exit()

    return ref_date
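
This variant generalizes the validation of example #3: the GA row count and the connection and naming details arrive as arguments instead of module globals. A hypothetical call, with every name standing in for the caller's own objects (ref_len_GA echoes the commented-out fetch_data_daily hint above):

ref_len_GA = len(fetch_data_daily(config, last_date.strftime('%Y-%m-%d')))
ref_date = validation(ref_len_GA, cnxn, DB_NAME, TABLE_NAME, es, INDX)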
Code example #7
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)
    # ref_date = datetime.datetime.strptime('2019-06-05', '%Y-%m-%d').date()

    ptrns = [
        '/search/', '/promotion-page/', '/product-list/', '/cart/', '/brand/',
        '/dkp-', '/landing-page/', '/landings/', '/main/', '/profile/',
        'adro.co/', 'homepage', 'mobile-homepage', 'outsource'
    ]

    types = {
        '/search/': 'search',
        '/promotion-page/': 'promotion',
        '/product-list/': 'product-list',
        '/cart/': 'cart',
        '/brand/': 'brand',
        '/dkp-': 'product',
        '/landing-page/': 'landing-page',
        '/landings/': 'landings',
        '/main/': 'main',
        'homepage': 'homepage',
        'mobile-homepage': 'mobile-homepage',
        '/profile/': 'profile',
        'adro.co/': 'adro',
        'outsource': 'outsource'
    }

    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        for ptrn in ptrns[:-1]:  # 'outsource' is excluded here; it is only matched as a referrer below
            print(ptrn)
            if ptrn == 'homepage':
                data = cart.fetch_data(VIEW_ID, analytics, step_time,
                                       'https://www.digikala.com/')
            elif ptrn == 'mobile-homepage':
                data = cart.fetch_data(VIEW_ID, analytics, step_time,
                                       'https://mobile.digikala.com/')
            else:
                data = cart.fetch_data(VIEW_ID, analytics, step_time, ptrn)
            data.rename(columns={
                'ga:dimension5': 'total',
                'ga:date': 'date',
                'ga:hits': 'hits'
            },
                        inplace=True)

            data['total'] = data['total'].map(str_to_dict)  # parse the dimension5 payload (see the sketch after this example)
            data = data.dropna(subset=['total'])  # drop rows whose payload failed to parse
            attributes = data['total'].apply(pd.Series)
            data = data.join(attributes)
            data.drop(['total'], axis=1, inplace=True)
            data.rename(columns={
                'page-path': 'pagepath',
                'referrer-path': 'refpath'
            },
                        inplace=True)

            # eliminate hits due to the referrer data ...
            if ptrn == 'homepage':
                data = data.query(
                    'pagepath == "https://www.digikala.com/" or '
                    'pagepath == "https://www.digikala.com/?ref=nav_logo"')
            elif ptrn == 'mobile-homepage':
                data = data.query(
                    'pagepath == "https://mobile.digikala.com/" or '
                    'pagepath == "https://mobile.digikala.com/?ref=nav_logo"')
            else:
                data = data[data['pagepath'].str.contains(ptrn, na=False)]

            data[['pagepath',
                  'pagetype']] = path_parser.column_pattern_retriever(
                      data, 'pagepath', ptrn, types[ptrn])
            data['reftype'] = np.nan

            if data.empty:
                continue
            for p in ptrns:
                if p == 'homepage' or p == 'mobile-homepage':
                    sub_data = data.query(
                        'refpath == "https://www.digikala.com/" or '
                        'refpath == "https://www.digikala.com/?ref=nav_logo" or '
                        'refpath == "https://mobile.digikala.com/?ref=nav_logo" or '
                        'refpath == "https://mobile.digikala.com/"')
                else:
                    sub_data = data[data['refpath'].str.contains(p, na=False)]

                if sub_data.empty:
                    continue
                sub_data[['refpath',
                          'reftype']] = path_parser.column_pattern_retriever(
                              sub_data, 'refpath', p, types[p])
                data.update(sub_data)
            data['refpath'] = data['refpath'].map(
                lambda x: 'google' if x.startswith('https://www.google.') else
                ('bing' if x.startswith('https://www.bing.') else x))
            data['reftype'] = data.apply(
                lambda row: 'outsource' if row['refpath'] == 'google' or row[
                    'refpath'] == 'bing' else row['reftype'],
                axis=1)
            data['reftype'] = data.apply(
                lambda row: row['reftype']
                if pd.notnull(row['reftype']) else 'other',
                axis=1)
            data['refpath'] = data.apply(
                lambda row: np.nan
                if row['reftype'] == 'other' else row['refpath'],
                axis=1)

            data['cart-id'] = data['cart-id'].apply(lambda x: np.nan if
                                                    (x == 0 or x == '') else x)
            data['user-id'] = data['user-id'].apply(lambda x: np.nan if
                                                    (x == 0 or x == '') else x)
            data['variant-id'] = data['variant-id'].apply(
                lambda x: np.nan if (x == 0 or x == '') else x)
            data.rename(columns={
                'pagetype': 'pageType',
                'pagepath': 'pagePath',
                'reftype': 'referrerType',
                'refpath': 'referrer',
                'user-id': 'userID',
                'cart-id': 'cartID',
                'variant-id': 'variantID',
            },
                        inplace=True)

            data['pagePath'] = data['pagePath'].str.slice(0, 150 - 5)
            try:
                data['referrer'] = data['referrer'].str.slice(0, 150 - 5)
            except (KeyError, AttributeError):  # 'referrer' may be missing or all-NaN for this batch
                pass

            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            print(data.shape)
            try:
                data.to_sql(TABLE_NAME,
                            cnxn,
                            method="multi",
                            if_exists='append',
                            index=False,
                            chunksize=10)
                doc = logger.create_log('Insert',
                                        'Ack',
                                        step_time,
                                        socket.gethostname(),
                                        'Successful Insert of {}'.format(ptrn),
                                        server_len=len(data.index),
                                        database_len=len(data.index))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(),
                                        '{} ERROR: '.format(ptrn) + str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            print('{} ... {} is Done!'.format(step_time, ptrn))
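
Example #7 leans on a str_to_dict helper that the snippet doesn't show: it turns the ga:dimension5 payload into a dict, and the dropna(subset=['total']) that follows implies NaN for rows that fail to parse. A minimal sketch, assuming the payload is JSON-encoded:

import json
import numpy as np

def str_to_dict(raw):
    # Hypothetical sketch: parse the dimension5 hit payload into a dict;
    # return NaN on malformed input so dropna(subset=['total']) drops the row.
    try:
        value = json.loads(raw)
        return value if isinstance(value, dict) else np.nan
    except (TypeError, ValueError):
        return np.nan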
Code example #8
from utils import path_parser
from config import config
from config.config import elastic_configs

VIEW_ID = 'ga:26751439'

DB_NAME = 'DB_Marketing'
TABLE_NAME = 'GA_Add2Cart_PagePath'
INDX = 'ga_add2cart'

# DO NOT CHANGE IT !!!
BATCH_SIZE = 100000

es = es_engine.init_engine(elastic_configs['ES_ADDRESS'])
doc = logger.create_log('ES Connection',
                        'Ack',
                        hostname=socket.gethostname(),
                        text="Successful Connect to ES!")
es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)

# Database Connection
try:
    cnxn = db_engine.init_engine_alchemy(DB_NAME)
    # cursor = cnxn.cursor()
    doc = logger.create_log('DB Connection',
                            'Ack',
                            hostname=socket.gethostname(),
                            text="Successful Connect to DB!")
    es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
except Exception as e:
    doc = logger.create_log('DB Connection',
                            'Nack',
                            hostname=socket.gethostname(),
                            text=str(e))
    es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
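
The setup above calls db_engine.init_engine_alchemy, which the snippet doesn't include. A minimal sketch of such a factory, assuming a SQLAlchemy engine over the mssql+pyodbc dialect; the server, driver, and credentials are placeholders, not the project's values:

import sqlalchemy

def init_engine_alchemy(db_name):
    # Hypothetical sketch; swap in the real host and credentials.
    conn_str = ('mssql+pyodbc://user:password@db-server/{}'
                '?driver=ODBC+Driver+17+for+SQL+Server').format(db_name)
    # fast_executemany speeds up the batched inserts used throughout these examples
    return sqlalchemy.create_engine(conn_str, fast_executemany=True)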
Code example #9
def main():
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation()

    ptrns = [
        '/search/', '/promotion-page/', '/product-list/', '/cart/', '/brand/',
        '/dkp-', '/landing-page/', '/landings/', '/main/', 'homepage'
    ]
    types = {
        '/search/': 'search',
        '/promotion-page/': 'promotion',
        '/product-list/': 'product-list',
        '/cart/': 'cart',
        '/brand/': 'brand',
        '/dkp-': 'product',
        '/landing-page/': 'landing-page',
        '/landings/': 'landings',
        '/main/': 'main',
        'homepage': 'home-page'
    }
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        for ptrn in ptrns:
            data = pagepath.fetch_data(VIEW_ID, analytics, step_time, ptrn)

            if data.empty:
                continue
            data.columns = ['date', 'pagepath', 'pageview', 'unique_pageview']
            data['pagepath'] = data['pagepath'].map(
                lambda x: x.replace('?', '/'))
            data = data[~data['pagepath'].str.contains('/users/register/')]
            data = data[~data['pagepath'].str.contains('/users/login/')]

            # backup
            data['backup'] = data['pagepath']

            # distinguish compare & product
            if ptrn == '/dkp-':
                data['pagepath'] = data['pagepath'].map(
                    lambda x: 'compare' if x.startswith(
                        '/compare/dkp-') else path_parser.get_dkp(x))
            elif ptrn == 'homepage':
                # get logo data
                list_dfs = [data]
                list_dfs.append(
                    pagepath.fetch_data(VIEW_ID, analytics, step_time,
                                        'dk-logo'))
                if list_dfs[1].empty:
                    continue
                list_dfs[1].columns = [
                    'date', 'pagepath', 'pageview', 'unique_pageview'
                ]
                list_dfs[1]['pagepath'] = 'dk-logo'
                data = pd.concat(list_dfs)
            else:
                data['pagepath'] = data['pagepath'].map(
                    lambda x: ptrn[1:] + x.split(ptrn, 1)[-1])
                def special_subcats(x):
                    # category searches keep their sub-category; other known sections collapse to one label
                    if x.startswith('search/category-'):
                        return x.split('/', 2)[1]
                    if x.startswith('search/'):
                        return 'search'
                    if x.startswith('cart/'):
                        return 'cart'
                    if x.startswith('landing-page/'):
                        return 'landing-page'
                    return x.split('/', 2)[1]
                data['pagepath'] = data['pagepath'].map(special_subcats)
            data['pageType'] = types[ptrn]
            data['device'] = 'dk-desktop'
            if ptrn in ['/promotion-page/', '/product-list/']:
                data['pageType'] = data.apply(
                    lambda x: 'fresh-' + x['pageType']
                    if 'fresh=1' in x['backup'] else x['pageType'],
                    axis=1)

            data.rename(columns={
                'pageview': 'pageView',
                'unique_pageview': 'uniquePageView',
                'pagepath': 'pagePath'
            },
                        inplace=True)
            ordered_cols = [
                'date', 'pageType', 'pagePath', 'pageView', 'uniquePageView'
            ]
            data = data[ordered_cols]
            data['pagePath'] = data['pagePath'].str.slice(0, 200 - 5)
            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            data = data.groupby(['date', 'pageType',
                                 'pagePath']).sum().reset_index()

            try:
                data.to_sql(TABLE_NAME,
                            cnxn,
                            method="multi",
                            if_exists='append',
                            index=False,
                            chunksize=10)
                doc = logger.create_log('Insert',
                                        'Ack',
                                        step_time,
                                        socket.gethostname(),
                                        'Successful Insert of {}'.format(ptrn),
                                        server_len=len(data.index),
                                        database_len=len(data.index))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(),
                                        '{} ERROR: '.format(ptrn) + str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            print('{} ... {} is Done!'.format(step_time, ptrn))
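
Examples #9 and #11 route product URLs through path_parser.get_dkp, which is not shown; example #11 later applies pd.to_numeric to its output, so the helper evidently extracts the numeric product id from a '/dkp-<id>/...' path. A minimal sketch:

import re

_DKP_RE = re.compile(r'dkp-(\d+)')

def get_dkp(path):
    # Hypothetical sketch: return the product id as a string, or the original
    # path when it is not a product URL (pd.to_numeric later coerces such
    # rows to NaN and they are dropped).
    match = _DKP_RE.search(path)
    return match.group(1) if match else path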
Code example #10
def main():
    analytics = ga_engine.initialize_analyticsreporting('ds-web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        total_df = rawdata.fetch_data(VIEW_ID, analytics, step_time, 'trash')
        total_df['ga:adContent'].replace('(not set)', '', inplace=True)
        total_df['ga:campaign'].replace('(not set)', '', inplace=True)
        total_df['ga:keyword'].replace('(not set)', '', inplace=True)
        #total_df.columns = ['adContent', 'campaign', 'date', 'deviceCategory', 'goal12Completions',
        #                'keyword', 'medium', 'sessions', 'source', 'users']
        total_df = total_df.rename(
            columns={
                'ga:adContent': 'adContent',
                'ga:campaign': 'campaign',
                'ga:date': 'date',
                'ga:deviceCategory': 'deviceCategory',
                'ga:transactions': 'goal12Completions',
                'ga:keyword': 'keyword',
                'ga:medium': 'medium',
                'ga:sessions': 'sessions',
                'ga:source': 'source',
                'ga:users': 'users'
            })

        total_df['date'] = pd.to_datetime(total_df['date'])

        total_df['adContent'] = total_df['adContent'].str.strip()
        total_df['campaign'] = total_df['campaign'].str.strip()
        total_df['deviceCategory'] = total_df['deviceCategory'].str.strip()
        total_df['keyword'] = total_df['keyword'].str.strip()
        total_df['medium'] = total_df['medium'].str.strip()
        total_df['source'] = total_df['source'].str.strip()

        total_df['adContent'] = total_df['adContent'].str.slice(0, 500 - 10)
        total_df['campaign'] = total_df['campaign'].str.slice(0, 500 - 10)
        total_df['deviceCategory'] = total_df['deviceCategory'].str.slice(
            0, 100 - 10)
        total_df['keyword'] = total_df['keyword'].str.slice(0, 500 - 10)
        total_df['medium'] = total_df['medium'].str.slice(0, 100 - 10)
        total_df['source'] = total_df['source'].str.slice(0, 100 - 10)

        try:
            cursor.fast_executemany = True
            sql_comm = '''INSERT INTO [{}].[dbo].[{}]
            ([adContent],[campaign],[date],[deviceCategory],[goal12Completions],[keyword],
            [medium],[sessions],[source],[users]) VALUES (?,?,?,?,?,?,?,?,?,?)'''.format(
                DB_NAME, TABLE_NAME)
            cursor.executemany(sql_comm, total_df.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert',
                                    'Ack',
                                    step_time,
                                    socket.gethostname(),
                                    'Successful Insert',
                                    server_len=len(total_df.index),
                                    database_len=len(total_df.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except pyodbc.Error as e:
            doc = logger.create_log('Insert', 'Nack', step_time,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            sys.exit()
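
Examples #1, #2, and #10 write through a module-level pyodbc cursor with fast_executemany enabled, the raw-cursor counterpart of the SQLAlchemy setup in example #8. A minimal sketch of that connection, with placeholder DSN details:

import pyodbc

# Hypothetical connection string; server, database, and credentials are placeholders.
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'
                      'SERVER=db-server;DATABASE=DB_Marketing;'
                      'UID=user;PWD=password')
cursor = cnxn.cursor()
cursor.fast_executemany = True  # ship each executemany() batch in one round trip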
Code example #11
def main():
    fresh_suply = pd.DataFrame(mysql_queries.get_fresh_supply_cat(0))
    main_cats = pd.DataFrame(mysql_queries.get_main_cats(0))
    main_cats = main_cats.loc[main_cats['code'] == 'food-beverage']
    fresh_suply['code'] = fresh_suply['code'].map(lambda x: 'category-' + x)
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation()
    # ref_date = datetime.datetime.strptime('2019-07-06', '%Y-%m-%d').date()

    ptrns = ['/search/', '/promotion-page/', '/product-list/',
             '/dkp-', '/main/']
    types = {'/search/': 'search', '/promotion-page/': 'promotion',
             '/product-list/': 'product-list', '/cart/': 'cart',
             '/brand/': 'brand', '/dkp-': 'product', '/landing-page/': 'landing-page',
             '/landings/': 'landings', '/main/': 'main', 'homepage': 'home-page'}
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        for ptrn in ptrns:
            data = users_sources.fetch_data(VIEW_ID, analytics, step_time, ptrn)
            if data.empty:
                continue
            data.columns = ['date', 'landingpage', 'medium', 'newusers', 'source']
            data['landingpage'] = data['landingpage'].map(lambda x: x.replace('?', '/'))
            data = data[~data['landingpage'].str.contains('/users/register/')]
            data = data[~data['landingpage'].str.contains('/users/login/')]

            # #backup
            data['backup'] = data['landingpage']
            # distinguish compare & product
            if ptrn == '/dkp-':
                data['landingpage'] = data['landingpage'].map(lambda x: 'compare' if x.startswith('/compare/dkp-') else
                path_parser.get_dkp(x))
            elif ptrn == 'homepage':
                # get logo data
                list_dfs = [data]
                list_dfs.append(users_sources.fetch_data(VIEW_ID, analytics, step_time, 'dk-logo'))
                if list_dfs[1].empty:
                    continue
                list_dfs[1].columns = ['date', 'landingpage', 'medium', 'newusers', 'source']
                list_dfs[1]['landingpage'] = 'dk-logo'
                data = pd.concat(list_dfs)
            else:
                data['landingpage'] = data['landingpage'].map(lambda x: ptrn[1:] + x.split(ptrn,1)[-1])
                def special_subcats(x):
                    # category searches keep their sub-category; other known sections collapse to one label
                    if x.startswith('search/category-'):
                        return x.split('/', 2)[1]
                    if x.startswith('search/'):
                        return 'search'
                    if x.startswith('cart/'):
                        return 'cart'
                    if x.startswith('landing-page/'):
                        return 'landing-page'
                    return x.split('/', 2)[1]
                data['landingpage'] = data['landingpage'].map(special_subcats)

            data['pageType'] = types[ptrn]
            if ptrn in ['/promotion-page/', '/product-list/']:
                data['pageType'] = data.apply(lambda x: 'fresh-'+x['pageType'] if 'fresh=1' in x['backup']
                else x['pageType'], axis=1)

            data.rename(columns={'newusers': 'new_users',
                                 'pageType': 'page_type',
                                 'landingpage': 'landingPage'}, inplace=True)
            ordered_cols = ['date', 'page_type', 'source', 'medium', 'landingPage', 'new_users']
            data = data[ordered_cols]
            # data['source'].replace('(none)', sqlalchemy.sql.null(), inplace=True)
            # data['medium'].replace('(none)', sqlalchemy.sql.null(), inplace=True)
            data['landingPage'] = data['landingPage'].str.slice(0, 200 - 5)
            data['source'] = data['source'].str.slice(0, 200 - 5)
            data['medium'] = data['medium'].str.slice(0, 50 - 5)
            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            data = data.groupby(['date', 'page_type', 'landingPage', 'source', 'medium']).sum().reset_index()

            fresh_suply_tmp = fresh_suply.copy()
            if ptrn == '/dkp-':
                data['landingPage'] = pd.to_numeric(data['landingPage'], errors='coerce')
                data = data.dropna(subset=['landingPage'])
                data['landingPage'] = data['landingPage'].astype(int)
                data.rename(columns={'landingPage': 'product_id'}, inplace=True)
                outcome = data.merge(fresh_suply_tmp, how='inner', on=['product_id'])
                outcome.drop('code', axis=1, inplace=True)
                outcome.rename(columns={'product_id': 'code'}, inplace=True)
                outcome = outcome.drop_duplicates()
                outcome.drop('supply_cat', axis=1, inplace=True)
            elif ptrn == '/search/':
                fresh_suply_tmp.drop('product_id', axis=1, inplace=True)
                fresh_suply_tmp = fresh_suply_tmp.drop_duplicates()
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data.merge(fresh_suply_tmp, how='inner', on=['code'])
                outcome.drop('supply_cat', axis=1, inplace=True)
            elif ptrn == '/product-list/' or ptrn == '/promotion-page/':
                data = data[data['page_type'].str.startswith('fresh-')]
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data
            elif ptrn == '/main/':
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data.merge(main_cats, how='inner', on=['code'])

            try:
                with engine.connect() as conn, conn.begin():
                    outcome.to_sql(TABLE_NAME, conn, if_exists='append', index=False)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time, socket.gethostname(), '{} ERROR: '.format(ptrn)+str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)