def main():
    """FAQ page-view loader.

    Walks day by day from the last validated insert date up to (but not
    including) the most recent day, fetches that day's FAQ page views from
    Google Analytics, bulk-inserts them into SQL Server, and logs each
    batch outcome to Elasticsearch.
    """
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for offset in range((limit_date - ref_date).days - 1):
        day = (ref_date + relativedelta(days=+offset)).strftime('%Y-%m-%d')

        frame = faq.fetch_data(VIEW_ID, analytics, day, 'pageview')
        frame.columns = ['date', 'pagePath', 'pageViews']
        # Truncate paths to the DB column width.
        frame['pagePath'] = frame['pagePath'].str.slice(0, 300)
        frame['date'] = pd.to_datetime(frame['date'])

        try:
            cursor.fast_executemany = True
            insert_stmt = '''INSERT INTO [{}].[dbo].[{}] ([date],[pagePath], [pageViews]) VALUES (?,?,?)'''.format(DB_NAME, TABLE_NAME)
            cursor.executemany(insert_stmt, frame.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert', 'Ack', day, socket.gethostname(),
                                    'Successful Insert',
                                    server_len=len(frame.index),
                                    database_len=len(frame.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', day,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        # Throttle between daily batches.
        # NOTE(review): source formatting was collapsed; assumed this sleep
        # runs once per day-iteration — confirm.
        time.sleep(2)
def main():
    """Active-users loader.

    For each day between the last validated insert and yesterday, fetches
    four GA extracts (daily, Gregorian month-to-date, Jalali month-to-date,
    trailing 30-day window), joins them on 'category', and bulk-inserts the
    combined frame into SQL Server, logging outcomes to Elasticsearch.
    """
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)
    # NOTE(review): the "- 1" stops one day short of today, presumably to
    # avoid a partially collected day — confirm.
    for i in range((limit_date - ref_date).days - 1):
        step_time = ref_date + relativedelta(days=+i)
        # First day of the current Jalali (Persian) month, converted back
        # to Gregorian — start of the Jalali month-to-date window.
        year, month = jalali.Gregorian(step_time).persian_tuple()[0:2]
        custom_start = jalali.Persian(year, month, 1).gregorian_datetime()
        # Part 1: per-category sessions and daily users for this day.
        df_part1 = active_users.fetch_data_daily(
            VIEW_ID, analytics, step_time.strftime('%Y-%m-%d'), 'web')
        df_part1.columns = ['date', 'category', 'sessions', 'dailyUsers']
        # Part 2: Gregorian month-to-date users per category.
        df_part2 = active_users.fetch_data_monthly(
            VIEW_ID, analytics,
            step_time.replace(day=1).strftime('%Y-%m-%d'),
            step_time.strftime('%Y-%m-%d'), 'web')
        df_part2.columns = ['category', 'month', 'monthlyUsers']
        # Part 3: Jalali month-to-date users per category.
        df_part3 = active_users.fetch_data_custom_wrapper(
            VIEW_ID, analytics, custom_start, step_time,
            'monthlyUsersJalali', 'web')
        # Part 4: trailing 30-day window (day-29 .. day) per category.
        df_part4 = active_users.fetch_data_custom_wrapper(
            VIEW_ID, analytics, step_time + relativedelta(days=-29),
            step_time, '30DaysWindow', 'web')
        df_part1['date'] = pd.to_datetime(df_part1['date'])
        # Join the three aggregates onto the daily frame by category.
        total_df = df_part1.join(df_part2.set_index('category'), on='category')
        total_df = total_df.join(df_part3.set_index('category'), on='category')
        total_df = total_df.join(df_part4.set_index('category'), on='category')
        # 'month' came along with part 2 and is not persisted.
        total_df.drop(['month'], axis=1, inplace=True)
        print(total_df)
        try:
            cursor.fast_executemany = True
            sql_comm = '''INSERT INTO [{}].[dbo].[{}] ([date],[category],[sessions],[dailyUsers],[monthlyUsers],[monthlyUsersJalali],[30DaysWindow]) VALUES (?,?,?,?,?,?,?)'''.format(DB_NAME, TABLE_NAME)
            cursor.executemany(sql_comm, total_df.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert', 'Ack', step_time,
                                    socket.gethostname(), 'Successful Insert',
                                    server_len=len(total_df.index),
                                    database_len=len(total_df.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', step_time,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        # Throttle between daily batches.
        # NOTE(review): original formatting was collapsed; assumed the sleep
        # runs once per day-iteration — confirm.
        time.sleep(2)
def validation(analytics):
    """Return the next date to ingest, cross-checking the last batch.

    Reads MAX([date]) from the target table. If the table is empty, returns
    the fixed epoch 2019-04-25. Otherwise verifies the last inserted batch
    against GA row counts; on a >0.1% shortfall it logs a consistency alarm
    to Elasticsearch and terminates the process via sys.exit().

    BUG FIX: the original ran the consistency check unconditionally, so an
    empty table crashed on last_insert.strftime(...) (AttributeError on
    None) — the empty-table case now returns early.
    """
    sql_maxdate = 'SELECT MAX ([date]) AS "Max Date" FROM {}.dbo.{};'.format(
        DB_NAME, TABLE_NAME)
    last_insert = pd.read_sql(sql_maxdate, cnxn).iloc[0][0]
    if last_insert is None:
        # Empty table: nothing to cross-check, start at the fixed epoch.
        return datetime.datetime.strptime('2019-04-25', '%Y-%m-%d').date()
    ref_date = last_insert + relativedelta(days=1)
    sql_lastbatch = "SELECT PK FROM {}.dbo.{}" \
        " WHERE [date] = '{}'".format(DB_NAME, TABLE_NAME, last_insert)
    last_len_DB = len(cnxn.execute(sql_lastbatch).fetchall())
    last_len_GA = len(
        rawdata.fetch_data(VIEW_ID, analytics,
                           last_insert.strftime('%Y-%m-%d'), 'trash'))
    # Tolerate up to 0.1% missing rows before flagging corruption.
    if (last_len_GA - last_len_DB) > 0.001 * last_len_GA:
        doc = logger.create_log(
            'DB/GA Consistency', 'Nack', hostname=socket.gethostname(),
            text='Corrupted Last Insert, truncate the last batch!',
            server_len=last_len_GA, database_len=last_len_DB)
        es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        sys.exit()
    return ref_date
def main():
    """Supply-category events loader.

    Fetches one day of per-category page-view events from Google Analytics
    for each day since the last validated insert, cleans the frame, and
    appends it to SQL Server via SQLAlchemy, logging outcomes to ES.
    """
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)

    for day_offset in range((limit_date - ref_date).days - 1):
        day = (ref_date + relativedelta(days=+day_offset)).strftime('%Y-%m-%d')

        df = category.fetch_data(VIEW_ID, analytics, day, 'events')
        df.columns = [
            'supply_category', 'date', 'page_view', 'unique_page_view'
        ]
        df['date'] = pd.to_datetime(df['date'])
        # Truncate to the DB column width (5-char safety margin).
        df['supply_category'] = df['supply_category'].str.slice(0, 300 - 5)
        # GA's placeholder becomes a SQL NULL.
        df['supply_category'].replace('(not set)', sqlalchemy.sql.null(),
                                      inplace=True)
        df.rename(columns={'supply_category': 'supplyCategory',
                           'page_view': 'pageView',
                           'unique_page_view': 'uniquePageView'},
                  inplace=True)

        try:
            df.to_sql(TABLE_NAME, cnxn, method="multi", if_exists='append',
                      index=False, chunksize=10)
            doc = logger.create_log('Insert', 'Ack', day,
                                    socket.gethostname(), 'Successful Insert',
                                    server_len=len(df.index),
                                    database_len=len(df.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except Exception as e:
            doc = logger.create_log('Insert', 'Nack', day,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
def main(): analytics = ga_engine.initialize_analyticsreporting('web') # ref_date = validation(analytics) # ref_date = datetime.datetime.strptime('2019-07-01', '%Y-%m-%d').date() ptrns = [ 'BRAND', 'CMP', 'HOME', 'LANDING', 'PDP', 'PLP', 'PROFILE', 'SEARCH', 'INCREDIBLE', 'THANKYOU' ] for i in range(6): step_time = today_date + relativedelta(days=-i - 3) for ptrn in ptrns: total_df = carousel.fetch_data(view_id, analytics, step_time.strftime('%Y-%m-%d'), ptrn) if total_df.empty: time.sleep(2) continue else: total_df.columns = [ 'date', 'pagepath', 'product_addtocarts', 'carousel_clicks', 'carousel_name', 'carousel_revenue', 'product_uniquepurchases' ] total_df['pagepath'] = total_df['pagepath'].map( lambda x: x.replace('?', '/')) total_df['date'] = pd.to_datetime(total_df['date']) total_df['source'] = data_type total_df = total_df[[ 'date', 'source', 'carousel_name', 'carousel_clicks', 'product_addtocarts', 'product_uniquepurchases', 'carousel_revenue' ]] total_df['carousel_name'] = total_df[ 'carousel_name'].str.strip() total_df['carousel_name'] = total_df[ 'carousel_name'].str.slice(0, 200 - 10) try: print(total_df) # cursor.fast_executemany = True # sql_comm = '''INSERT INTO [{}].[dbo].[{}]([date],[source],[carousel_name],[carousel_clicks],[product_addtocarts],[product_uniquepurchases],[carousel_revenue]) # VALUES (?,?,?,?,?,?,?)'''.format(DB_NAME, TABLE_NAME) # cursor.executemany(sql_comm, total_df.values.tolist()) # cursor.commit() # doc = logger.create_log('Insert', 'Ack', step_time, socket.gethostname(), # 'Successful Insert', server_len=len(total_df.index), # database_len=len(total_df.index)) # es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc) print('done' + ' ' + str(step_time) + '**' + str(ptrn) + ' for ' + data_type) time.sleep(2) except Exception as e: doc = logger.create_log('Insert', 'Nack', step_time, socket.gethostname(), str(e)) es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
def validation(ref_len, _database_engine, _database_name, _database_table,
               es_engine, es_index):
    """Return the next date to ingest, cross-checking the last batch size.

    Args:
        ref_len: expected row count of the last batch (from GA).
        _database_engine: DB connection/engine used for the queries.
        _database_name / _database_table: fully-qualified target table parts.
        es_engine: Elasticsearch handle used for alarm logging.
        es_index: suffix of the 'textlogs-*' index to log into.

    If the table is empty, returns the fixed epoch 2017-03-01. Otherwise,
    if the stored last batch is more than 0.1% smaller than ref_len, logs a
    consistency alarm and terminates via sys.exit().

    BUG FIX: the original ran the consistency check even when the table was
    empty, formatting 'None' into the WHERE clause; the resulting 0-row
    count triggered a spurious corruption alarm and sys.exit(). The empty
    table now returns early.
    """
    sql_maxdate = 'SELECT MAX ([date]) AS "Max Date" FROM {}.dbo.{};'.format(
        _database_name, _database_table)
    last_insert = pd.read_sql(sql_maxdate, _database_engine).iloc[0][0]
    if last_insert is None:
        # Empty table: no previous batch to verify; start at the epoch.
        return datetime.datetime.strptime('2017-03-01', '%Y-%m-%d').date()
    ref_date = last_insert + relativedelta(days=1)
    sql_lastbatch = "SELECT PK FROM {}.dbo.{}" \
        " WHERE [date] = '{}'".format(_database_name, _database_table,
                                      last_insert)
    last_len_DB = len(_database_engine.execute(sql_lastbatch).fetchall())
    last_len_GA = ref_len  # len(fetch_data_daily(config, last_insert.strftime('%Y-%m-%d')))
    # Tolerate up to 0.1% missing rows before flagging corruption.
    if (last_len_GA - last_len_DB) > 0.001 * last_len_GA:
        doc = logger.create_log(
            'DB/GA Consistency', 'Nack', hostname=socket.gethostname(),
            text='Corrupted Last Insert, truncate the last batch!',
            server_len=last_len_GA, database_len=last_len_DB)
        # NOTE(review): es_engine here is the parameter, not the module;
        # it is passed to its own log_into_es — confirm this is intended.
        es_engine.log_into_es(es_engine, 'textlogs-{}'.format(es_index), doc)
        sys.exit()
    return ref_date
def main():
    """Add-to-cart page/referrer attribution loader.

    For each day since the last validated insert and for each page-type
    pattern, fetches GA custom-dimension hits, expands the packed 'total'
    dimension into columns, classifies both the page path and the referrer
    path against the pattern table, and appends the result to SQL Server.
    """
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)
    # ref_date = datetime.datetime.strptime('2019-06-05', '%Y-%m-%d').date()
    ptrns = [
        '/search/', '/promotion-page/', '/product-list/', '/cart/', '/brand/',
        '/dkp-', '/landing-page/', '/landings/', '/main/', '/profile/',
        'adro.co/', 'homepage', 'mobile-homepage', 'outsource'
    ]
    # Pattern -> canonical page/referrer type name.
    types = {
        '/search/': 'search', '/promotion-page/': 'promotion',
        '/product-list/': 'product-list', '/cart/': 'cart',
        '/brand/': 'brand', '/dkp-': 'product',
        '/landing-page/': 'landing-page', '/landings/': 'landings',
        '/main/': 'main', 'homepage': 'homepage',
        'mobile-homepage': 'mobile-homepage', '/profile/': 'profile',
        'adro.co/': 'adro', 'outsource': 'outsource'
    }
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        # 'outsource' (last pattern) is only a referrer type, not a page
        # pattern, hence ptrns[:-1] here.
        for ptrn in ptrns[:-1]:
            print(ptrn)
            if ptrn == 'homepage':
                data = cart.fetch_data(VIEW_ID, analytics, step_time,
                                       'https://www.digikala.com/')
            elif ptrn == 'mobile-homepage':
                data = cart.fetch_data(VIEW_ID, analytics, step_time,
                                       'https://mobile.digikala.com/')
            else:
                data = cart.fetch_data(VIEW_ID, analytics, step_time, ptrn)
            data.rename(columns={
                'ga:dimension5': 'total',
                'ga:date': 'date',
                'ga:hits': 'hits'
            }, inplace=True)
            # 'total' is a packed string; str_to_dict parses it (returns
            # NaN/None on failure, dropped below).
            data['total'] = data['total'].map(lambda x: str_to_dict(x))
            data = data.dropna(subset=['total'])
            # Expand the parsed dicts into one column per key.
            attributes = data['total'].apply(pd.Series)
            data = data.join(attributes)
            data.drop(['total'], axis=1, inplace=True)
            data.rename(columns={
                'page-path': 'pagepath',
                'referrer-path': 'refpath'
            }, inplace=True)
            # eliminate hits due to the referrer data ...
            if ptrn == 'homepage':
                data = data.query(
                    'pagepath == "https://www.digikala.com/" or '
                    'pagepath == "https://www.digikala.com/?ref=nav_logo"')
            elif ptrn == 'mobile-homepage':
                data = data.query(
                    'pagepath == "https://mobile.digikala.com/" or '
                    'pagepath == "https://mobile.digikala.com/?ref=nav_logo"')
            else:
                data = data[data['pagepath'].str.contains(ptrn) == True]
            data[['pagepath',
                  'pagetype']] = path_parser.column_pattern_retriever(
                      data, 'pagepath', ptrn, types[ptrn])
            data['reftype'] = np.nan
            if data.empty:
                continue
            # Classify the referrer path against every pattern; matching
            # rows are written back into 'data' via DataFrame.update.
            for p in ptrns:
                if p == 'homepage' or p == 'mobile-homepage':
                    sub_data = data.query(
                        'refpath == "https://www.digikala.com/" or '
                        'refpath == "https://www.digikala.com/?ref=nav_logo" or '
                        'refpath == "https://mobile.digikala.com/?ref=nav_logo" or '
                        'refpath == "https://mobile.digikala.com/"')
                else:
                    sub_data = data[data['refpath'].str.contains(p) == True]
                if sub_data.empty:
                    continue
                sub_data[['refpath',
                          'reftype']] = path_parser.column_pattern_retriever(
                              sub_data, 'refpath', p, types[p])
                data.update(sub_data)
            # Collapse search engines into 'google'/'bing', then mark them
            # as external ('outsource') traffic.
            data['refpath'] = data['refpath'].map(
                lambda x: 'google' if x.startswith('https://www.google.') else
                ('bing' if x.startswith('https://www.bing.') else x))
            data['reftype'] = data.apply(
                lambda row: 'outsource' if row['refpath'] == 'google' or row[
                    'refpath'] == 'bing' else row['reftype'],
                axis=1)
            # Anything still unclassified becomes 'other' with no referrer.
            data['reftype'] = data.apply(
                lambda row: row['reftype']
                if pd.notnull(row['reftype']) else 'other',
                axis=1)
            data['refpath'] = data.apply(
                lambda row: np.nan
                if row['reftype'] == 'other' else row['refpath'],
                axis=1)
            # Normalize empty/zero IDs to NaN before insert.
            data['cart-id'] = data['cart-id'].apply(lambda x: np.nan
                                                    if (x == 0 or x == '')
                                                    else x)
            data['user-id'] = data['user-id'].apply(lambda x: np.nan
                                                    if (x == 0 or x == '')
                                                    else x)
            data['variant-id'] = data['variant-id'].apply(
                lambda x: np.nan if (x == 0 or x == '') else x)
            data.rename(columns={
                'pagetype': 'pageType',
                'pagepath': 'pagePath',
                'reftype': 'referrerType',
                'refpath': 'referrer',
                'user-id': 'userID',
                'cart-id': 'cartID',
                'variant-id': 'variantID',
            }, inplace=True)
            # Truncate to the DB column widths (5-char margin). 'referrer'
            # may be all-NaN (non-string dtype), hence the try/except.
            data['pagePath'] = data['pagePath'].str.slice(0, 150 - 5)
            try:
                data['referrer'] = data['referrer'].str.slice(0, 150 - 5)
            except:
                pass
            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            print(data.shape)
            try:
                data.to_sql(TABLE_NAME, cnxn, method="multi",
                            if_exists='append', index=False, chunksize=10)
                doc = logger.create_log('Insert', 'Ack', step_time,
                                        socket.gethostname(),
                                        'Successful Insert of {}'.format(ptrn),
                                        server_len=len(data.index),
                                        database_len=len(data.index))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(),
                                        '{} ERROR: '.format(ptrn) + str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            print('{} ... {} is Done!'.format(step_time, ptrn))
from utils import path_parser from config import config from config.config import elastic_configs VIEW_ID = 'ga:26751439' DB_NAME = 'DB_Marketing' TABLE_NAME = 'GA_Add2Cart_PagePath' INDX = 'ga_add2cart' # DO NOT CHANGE IT !!! BATCH_SIZE = 100000 es = es_engine.init_engine(elastic_configs['ES_ADDRESS']) doc = logger.create_log('ES Connection', 'Ack', hostname=socket.gethostname(), text="Successful Connect to ES!") es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc) # Database Connection try: cnxn = db_engine.init_engine_alchemy(DB_NAME) # cursor = cnxn.cursor() doc = logger.create_log('DB Connection', 'Ack', hostname=socket.gethostname(), text="Successful Connect to DB!") es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc) except Exception as e: doc = logger.create_log('DB Connection', 'Nack',
def main():
    """Page-path page-view loader.

    For each day since the last validated insert and for each path pattern,
    fetches GA page views, normalizes the path into a canonical page type /
    sub-path, aggregates, and appends to SQL Server via SQLAlchemy.
    """
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation()
    ptrns = [
        '/search/', '/promotion-page/', '/product-list/', '/cart/', '/brand/',
        '/dkp-', '/landing-page/', '/landings/', '/main/', 'homepage'
    ]
    # Pattern -> canonical page type name.
    types = {
        '/search/': 'search', '/promotion-page/': 'promotion',
        '/product-list/': 'product-list', '/cart/': 'cart',
        '/brand/': 'brand', '/dkp-': 'product',
        '/landing-page/': 'landing-page', '/landings/': 'landings',
        '/main/': 'main', 'homepage': 'home-page'
    }
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        for ptrn in ptrns:
            data = pagepath.fetch_data(VIEW_ID, analytics, step_time, ptrn)
            if data.empty:
                continue
            data.columns = ['date', 'pagepath', 'pageview', 'unique_pageview']
            # Normalize query strings so path splitting is uniform.
            data['pagepath'] = data['pagepath'].map(
                lambda x: x.replace('?', '/'))
            data = data[~data['pagepath'].str.contains('/users/register/')]
            data = data[~data['pagepath'].str.contains('/users/login/')]
            # backup: keep the raw path for the 'fresh=1' check below.
            data['backup'] = data['pagepath']
            # distinguish compare & product
            if ptrn == '/dkp-':
                data['pagepath'] = data['pagepath'].map(
                    lambda x: 'compare' if x.startswith(
                        '/compare/dkp-') else path_parser.get_dkp(x))
            elif ptrn == 'homepage':
                # get logo data and append it as an extra 'dk-logo' row set.
                list_dfs = [data]
                list_dfs.append(
                    pagepath.fetch_data(VIEW_ID, analytics, step_time,
                                        'dk-logo'))
                if list_dfs[1].empty:
                    continue
                list_dfs[1].columns = [
                    'date', 'pagepath', 'pageview', 'unique_pageview'
                ]
                list_dfs[1]['pagepath'] = 'dk-logo'
                data = pd.concat(list_dfs)
            else:
                data['pagepath'] = data['pagepath'].map(
                    lambda x: ptrn[1:] + x.split(ptrn, 1)[-1])
                # Collapse the path to its first meaningful segment
                # (search category, 'search', 'cart', 'landing-page', ...).
                # NOTE(review): source indentation was collapsed; assumed
                # this mapping applies only in this else-branch (applying it
                # to 'dk-logo' rows would raise IndexError) — confirm.
                special_subcats = lambda x: x.split('/',2)[1] if x.startswith('search/category-') \
                    else ('search' if x.startswith('search/') \
                    else ('cart' if x.startswith('cart/') else ('landing-page' if x.startswith('landing-page/') else x.split('/', 2)[1])))
                data['pagepath'] = data['pagepath'].map(special_subcats)
            data['pageType'] = types[ptrn]
            # NOTE(review): 'device' is dropped by the ordered_cols selection
            # below, so this assignment has no effect on the output.
            data['device'] = 'dk-desktop'
            if ptrn in ['/promotion-page/', '/product-list/']:
                # Fresh (grocery) pages carry 'fresh=1' in the raw path.
                data['pageType'] = data.apply(
                    lambda x: 'fresh-' + x['pageType']
                    if 'fresh=1' in x['backup'] else x['pageType'],
                    axis=1)
            data.rename(columns={
                'pageview': 'pageView',
                'unique_pageview': 'uniquePageView',
                'pagepath': 'pagePath'
            }, inplace=True)
            ordered_cols = [
                'date', 'pageType', 'pagePath', 'pageView', 'uniquePageView'
            ]
            data = data[ordered_cols]
            # Truncate to the DB column width (5-char margin).
            data['pagePath'] = data['pagePath'].str.slice(0, 200 - 5)
            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            # Collapsing paths above can create duplicates; re-aggregate.
            data = data.groupby(['date', 'pageType',
                                 'pagePath']).sum().reset_index()
            try:
                data.to_sql(TABLE_NAME, cnxn, method="multi",
                            if_exists='append', index=False, chunksize=10)
                doc = logger.create_log('Insert', 'Ack', step_time,
                                        socket.gethostname(),
                                        'Successful Insert of {}'.format(ptrn),
                                        server_len=len(data.index),
                                        database_len=len(data.index))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(),
                                        '{} ERROR: '.format(ptrn) + str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            print('{} ... {} is Done!'.format(step_time, ptrn))
def main():
    """Acquisition raw-data loader (ds-web view).

    For each day since the last validated insert, fetches the GA acquisition
    report, cleans and truncates the text columns, and bulk-inserts the
    frame into SQL Server. A pyodbc error aborts the whole process.
    """
    analytics = ga_engine.initialize_analyticsreporting('ds-web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation(analytics)
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        total_df = rawdata.fetch_data(VIEW_ID, analytics, step_time, 'trash')
        # GA's '(not set)' placeholder becomes an empty string.
        total_df['ga:adContent'].replace('(not set)', '', inplace=True)
        total_df['ga:campaign'].replace('(not set)', '', inplace=True)
        total_df['ga:keyword'].replace('(not set)', '', inplace=True)
        #total_df.columns = ['adContent', 'campaign', 'date', 'deviceCategory', 'goal12Completions',
        #                    'keyword', 'medium', 'sessions', 'source', 'users']
        # NOTE(review): ga:transactions is mapped onto the legacy
        # goal12Completions column name — confirm this is intended.
        total_df = total_df.rename(
            columns={
                'ga:adContent': 'adContent',
                'ga:campaign': 'campaign',
                'ga:date': 'date',
                'ga:deviceCategory': 'deviceCategory',
                'ga:transactions': 'goal12Completions',
                'ga:keyword': 'keyword',
                'ga:medium': 'medium',
                'ga:sessions': 'sessions',
                'ga:source': 'source',
                'ga:users': 'users'
            })
        total_df['date'] = pd.to_datetime(total_df['date'])
        total_df['adContent'] = total_df['adContent'].str.strip()
        total_df['campaign'] = total_df['campaign'].str.strip()
        total_df['deviceCategory'] = total_df['deviceCategory'].str.strip()
        total_df['keyword'] = total_df['keyword'].str.strip()
        total_df['medium'] = total_df['medium'].str.strip()
        total_df['source'] = total_df['source'].str.strip()
        # Truncate to the DB column widths (10-char margin).
        total_df['adContent'] = total_df['adContent'].str.slice(0, 500 - 10)
        total_df['campaign'] = total_df['campaign'].str.slice(0, 500 - 10)
        total_df['deviceCategory'] = total_df['deviceCategory'].str.slice(
            0, 100 - 10)
        total_df['keyword'] = total_df['keyword'].str.slice(0, 500 - 10)
        total_df['medium'] = total_df['medium'].str.slice(0, 100 - 10)
        total_df['source'] = total_df['source'].str.slice(0, 100 - 10)
        try:
            cursor.fast_executemany = True
            sql_comm = '''INSERT INTO [{}].[dbo].[{}] ([adContent],[campaign],[date],[deviceCategory],[goal12Completions],[keyword], [medium],[sessions],[source],[users]) VALUES (?,?,?,?,?,?,?,?,?,?)'''.format(
                DB_NAME, TABLE_NAME)
            cursor.executemany(sql_comm, total_df.values.tolist())
            cursor.commit()
            doc = logger.create_log('Insert', 'Ack', step_time,
                                    socket.gethostname(), 'Successful Insert',
                                    server_len=len(total_df.index),
                                    database_len=len(total_df.index))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
        except pyodbc.Error as e:
            doc = logger.create_log('Insert', 'Nack', step_time,
                                    socket.gethostname(), str(e))
            es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)
            # NOTE(review): source indentation was collapsed; assumed the
            # process aborts after a failed insert — confirm.
            sys.exit()
def main():
    """Fresh-supply new-users/sources loader.

    For each day since the last validated insert and for each path pattern,
    fetches GA new-user acquisition rows, normalizes landing pages into
    category codes, filters to fresh/grocery categories (via MySQL lookup
    tables), and appends the result to SQL Server.

    BUG FIX: the medium-column truncation was written to a misspelled
    'meidum' column, which left 'medium' untruncated and pushed a stray
    'meidum' column into the groupby/insert; it now targets 'medium'.
    """
    fresh_suply = pd.DataFrame(mysql_queries.get_fresh_supply_cat(0))
    main_cats = pd.DataFrame(mysql_queries.get_main_cats(0))
    main_cats = main_cats.loc[main_cats['code'] == 'food-beverage']
    # Search landing pages carry a 'category-' prefix; align the lookup.
    fresh_suply['code'] = fresh_suply['code'].map(lambda x: 'category-' + x)
    analytics = ga_engine.initialize_analyticsreporting('web')
    limit_date = datetime.datetime.now().date()
    ref_date = validation()
    # ref_date = datetime.datetime.strptime('2019-07-06', '%Y-%m-%d').date()
    ptrns = ['/search/', '/promotion-page/', '/product-list/', '/dkp-',
             '/main/']
    # Pattern -> canonical page type name.
    types = {'/search/': 'search', '/promotion-page/': 'promotion',
             '/product-list/': 'product-list', '/cart/': 'cart',
             '/brand/': 'brand', '/dkp-': 'product',
             '/landing-page/': 'landing-page', '/landings/': 'landings',
             '/main/': 'main', 'homepage': 'home-page'}
    for i in range((limit_date - ref_date).days - 1):
        step_time = (ref_date + relativedelta(days=+i)).strftime('%Y-%m-%d')
        for ptrn in ptrns:
            data = users_sources.fetch_data(VIEW_ID, analytics, step_time,
                                            ptrn)
            if data.empty:
                continue
            data.columns = ['date', 'landingpage', 'medium', 'newusers',
                            'source']
            # Normalize query strings so path splitting is uniform.
            data['landingpage'] = data['landingpage'].map(
                lambda x: x.replace('?', '/'))
            data = data[~data['landingpage'].str.contains('/users/register/')]
            data = data[~data['landingpage'].str.contains('/users/login/')]
            # backup: keep the raw path for the 'fresh=1' check below.
            data['backup'] = data['landingpage']
            # distinguish compare & product
            if ptrn == '/dkp-':
                data['landingpage'] = data['landingpage'].map(
                    lambda x: 'compare' if x.startswith('/compare/dkp-')
                    else path_parser.get_dkp(x))
            elif ptrn == 'homepage':
                # NOTE(review): 'homepage' is not in ptrns above, so this
                # branch is currently dead; kept for parity with the
                # sibling pagepath loader.
                # get logo data
                list_dfs = [data]
                list_dfs.append(
                    users_sources.fetch_data(VIEW_ID, analytics, step_time,
                                             'dk-logo'))
                if list_dfs[1].empty:
                    continue
                list_dfs[1].columns = ['date', 'landingpage', 'medium',
                                       'newusers', 'source']
                list_dfs[1]['landingpage'] = 'dk-logo'
                data = pd.concat(list_dfs)
            else:
                data['landingpage'] = data['landingpage'].map(
                    lambda x: ptrn[1:] + x.split(ptrn, 1)[-1])
                # Collapse the path to its first meaningful segment.
                special_subcats = lambda x: x.split('/', 2)[1] if x.startswith('search/category-') \
                    else ('search' if x.startswith('search/') \
                    else ('cart' if x.startswith('cart/') else ('landing-page' if x.startswith('landing-page/') else x.split('/', 2)[1])))
                data['landingpage'] = data['landingpage'].map(special_subcats)
            data['pageType'] = types[ptrn]
            if ptrn in ['/promotion-page/', '/product-list/']:
                # Fresh (grocery) pages carry 'fresh=1' in the raw path.
                data['pageType'] = data.apply(
                    lambda x: 'fresh-' + x['pageType']
                    if 'fresh=1' in x['backup'] else x['pageType'], axis=1)
            data.rename(columns={'newusers': 'new_users',
                                 'pageType': 'page_type',
                                 'landingpage': 'landingPage'}, inplace=True)
            ordered_cols = ['date', 'page_type', 'source', 'medium',
                            'landingPage', 'new_users']
            data = data[ordered_cols]
            # data['source'].replace('(none)', sqlalchemy.sql.null(), inplace=True)
            # data['medium'].replace('(none)', sqlalchemy.sql.null(), inplace=True)
            # Truncate to the DB column widths (5-char margin).
            data['landingPage'] = data['landingPage'].str.slice(0, 200 - 5)
            data['source'] = data['source'].str.slice(0, 200 - 5)
            # BUG FIX: was data['meidum'] = ... (stray column, no truncation).
            data['medium'] = data['medium'].str.slice(0, 50 - 5)
            data.loc[:, 'date'] = pd.to_datetime(data['date'])
            # Collapsing paths above can create duplicates; re-aggregate.
            data = data.groupby(['date', 'page_type', 'landingPage', 'source',
                                 'medium']).sum().reset_index()
            fresh_suply_tmp = fresh_suply.copy()
            if ptrn == '/dkp-':
                # Keep only numeric product ids and restrict to fresh SKUs.
                data['landingPage'] = pd.to_numeric(data['landingPage'],
                                                    errors='coerce')
                data = data.dropna(subset=['landingPage'])
                data['landingPage'] = data['landingPage'].astype(int)
                data.rename(columns={'landingPage': 'product_id'},
                            inplace=True)
                outcome = data.merge(fresh_suply_tmp, how='inner',
                                     on=['product_id'])
                outcome.drop('code', axis=1, inplace=True)
                outcome.rename(columns={'product_id': 'code'}, inplace=True)
                outcome = outcome.drop_duplicates()
                outcome.drop('supply_cat', axis=1, inplace=True)
            elif ptrn == '/search/':
                # Match search category codes against the fresh lookup.
                fresh_suply_tmp.drop('product_id', axis=1, inplace=True)
                fresh_suply_tmp = fresh_suply_tmp.drop_duplicates()
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data.merge(fresh_suply_tmp, how='inner',
                                     on=['code'])
                outcome.drop('supply_cat', axis=1, inplace=True)
            elif ptrn == '/product-list/' or ptrn == '/promotion-page/':
                # Only the fresh-flagged listing pages are relevant here.
                data = data[data['page_type'].str.startswith('fresh-')]
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data
            elif ptrn == '/main/':
                data.rename(columns={'landingPage': 'code'}, inplace=True)
                outcome = data.merge(main_cats, how='inner', on=['code'])
            try:
                with engine.connect() as conn, conn.begin():
                    outcome.to_sql(TABLE_NAME, conn, if_exists='append',
                                   index=False)
            except Exception as e:
                doc = logger.create_log('Insert', 'Nack', step_time,
                                        socket.gethostname(),
                                        '{} ERROR: '.format(ptrn) + str(e))
                es_engine.log_into_es(es, 'textlogs-{}'.format(INDX), doc)