import urllib.request

from bs4 import BeautifulSoup

import constants
from utils import toCSV  # assumed location of the shared CSV writer


def scrape_weekly_data():
    resp = urllib.request.urlopen(constants.WEEKLY_SERIES_URI)
    resp_data = resp.read()
    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4]
    all_years_months = data_table.find_all('td', {"class": "B6"})
    all_weekend_dates = data_table.find_all('td', {"class": "B5"})
    all_prices = data_table.find_all('td', {"class": "B3"})
    records = []
    for year_month in all_years_months:
        week = 1
        year_month_string = year_month.string.strip()
        # Each year-month row holds up to five weekend-date/price cells;
        # empty date cells are skipped but still advance the week counter.
        for weekend_date in all_weekend_dates:
            weekend_date_string = weekend_date.string.strip()
            if len(weekend_date_string) != 0:
                date = (year_month_string[:4] + '-' +
                        weekend_date_string.replace('/', '-'))
                records.append([date, all_prices[week - 1].string.strip()])
            week = week + 1
            if week > 5:
                # Drop the five cells consumed by this row and move on.
                all_weekend_dates = all_weekend_dates[5:]
                all_prices = all_prices[5:]
                break
    toCSV(records, csv_file_weekly)
def scrape_daily_data():
    resp = urllib.request.urlopen(constants.DAILY_SERIES_URI)
    resp_data = resp.read()
    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]
    all_weeks = data_table.find_all('td', {"class": "B6"})
    all_prices = data_table.find_all('td', {"class": "B3"})
    records = []
    for week in all_weeks:
        day = 0
        for price in all_prices:
            week_string = week.string.strip()
            date = (week_string[:4] + '-' +
                    f'{month_num(week_string[5:8]):02}' + '-' +
                    f'{(int(week_string[9:11]) + day):02}')
            day = day + 1
            records.append([date, price.string])
            if day >= 5:
                all_prices = all_prices[5:]
                break
    toCSV(records, csv_file_daily)
def scrape_monthly_data():
    resp = urllib.request.urlopen(constants.MONTHLY_SERIES_URI)
    resp_data = resp.read()
    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4].find('table')
    all_years = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})
    records = []
    for year in all_years:
        month = 1
        for price in all_prices:
            date = year.string.strip() + '-' + f'{month:02}' + '-' + '01'
            month = month + 1
            records.append([date, price.string])
            if month > 12:
                all_prices = all_prices[12:]
                break
    toCSV(records, csv_file_monthly)
def scrape_annual_data():
    resp = urllib.request.urlopen(constants.ANNUAL_SERIES_URI)
    resp_data = resp.read()
    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]
    all_decades = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})
    records = []
    for decade in all_decades:
        year = 0
        for price in all_prices:
            decade_prefix = int(decade.string.strip()[:3])
            date = str(decade_prefix) + str(year)
            year = year + 1
            records.append([date, price.string])
            if year > 9:
                all_prices = all_prices[10:]
                break
    toCSV(records, csv_file_annual)
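# All four scrapers write through a shared toCSV helper, and the daily
# scraper also calls month_num; neither is defined in this section. A
# minimal sketch of what they might look like, assuming toCSV accepts
# either a DataFrame or a plain list of rows:
import csv

import pandas as pd


def toCSV(data, csv_file, index=False):
    # Hypothetical sketch of the shared writer: DataFrames go through
    # pandas, plain row lists through csv.writer.
    if isinstance(data, pd.DataFrame):
        data.to_csv(csv_file, index=index)
    else:
        with open(csv_file, 'w', newline='') as f:
            csv.writer(f).writerows(data)


def month_num(abbrev):
    # Hypothetical sketch: map a three-letter month abbreviation to 1..12.
    months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    return months.index(abbrev.upper()) + 1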
def combineMonth():
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(data_dir + '/' + str(content_type))):
        if f.endswith(".csv"):
            file = os.path.join(data_dir + '/' + str(content_type), f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    cols = list(new_df.columns)  # list, not Index, so colname can be appended below
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    # Binarize the daily columns: '1' if the user was active that day, else '0'.
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    # Denominator bitstring covers the days since the user's first occurrence.
    new_df['total'] = new_df['first_occurence'].apply(
        lambda x: '1' * (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total']].apply(
        lambda x: int(x[0], 2) / int(x[1], 2), axis=1)
    print(new_df[colname])
    print(cols)
    cols.append(colname)
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
def getQuantiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quanti/' + month):
        df = readCSV(os.path.join('../../data/quanti/' + month, f), dtype=str)
        all_df.append(df)
    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        if col == "contentswatched":
            # print(getContentsUnique(all_df, col))
            contents = getContentsUnique(all_df, col)
            new_df = pd.merge(new_df, contents,
                              left_index=True, right_on='gigyaid')
            new_df.drop("contentswatched_x", axis=1, inplace=True)
            new_df.rename({"contentswatched_y": "contentswatched"},
                          axis=1, inplace=True)
        else:
            all_df[col] = all_df[col].astype(float)
            new_df[col] = getSum(all_df, col)[col].values
    toCSV(new_df, "../../data/aggregated/quanti" + month + ".csv")
def main(cursor, query, outfile):
    s = time.time()
    # Stream the result set in 5M-row chunks, then stitch into one frame.
    df = pd.read_sql(query, con=cursor, chunksize=5000000)
    df = pd.concat(df)
    print(df.head())
    toCSV(df, outfile, index=False)
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total query time: ", total_time)
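# A sketch of how this export might be driven, assuming a MySQL source
# reachable through pymysql; the host, credentials, query, and output
# path below are illustrative, not taken from the source.
import pymysql

conn = pymysql.connect(host='localhost', user='user',
                       password='password', database='analytics')
main(conn, "SELECT USERID, SESSIONID FROM sessions", "sessions.csv")
conn.close()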
def cleanData(data_dir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]
            # Strip commas from free-text columns so they don't break the CSV.
            for col in remove_comma:
                df[col] = df[col].astype(str)
                df[col] = df[col].apply(lambda x: x.replace(",", " "))
            toCSV(df, file, index=False)
def extractUser(users, df, outdir):
    for user in df.USERID.unique():
        if user in users:
            print(user)
            temp = df.loc[df.USERID == user]
            # 24-hour x 182-day grid of content types for this user.
            new_df = pd.DataFrame(index=list(range(0, 24)),
                                  columns=list(range(1, 183)))
            for i in temp.index.unique():
                for j in range(temp.loc[i]['STARTHOUR'],
                               temp.loc[i]['ENDHOUR'] + 1):
                    # Single-step .loc assignment; chained indexing here
                    # would silently write to a copy.
                    new_df.loc[j, temp.loc[i]['DAY']] = temp.loc[i]['CONTENT_TYPE']
            new_df.index.name = 'HOUR'
            new_df.fillna(0, inplace=True)
            toCSV(new_df, outdir + user + ".csv")
def extractColumns(data_dir, outdir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.csv'):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.dropna(subset=['USERID'], inplace=True)
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeLurkers(df)
            outfile = os.path.join(outdir, f[-12:])
            toCSV(df, outfile, index=False)
def transactionDates():
    print('getting first and last transaction dates of the customers..')
    file = "results/first_and_last_transaction_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: 'USERID',
        1: 'FIRST_TRANSACTION',
        2: 'LAST_TRANSACTION'
    }, inplace=True)
    file2 = 'results/average_regularity.csv'
    df2 = readChunk(file2)
    df2 = df2.merge(df, how='left', on='USERID')
    df2.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(df2, 'results/transaction_dates.csv', index=False)
def getQualiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quali/' + month):
        df = readCSV(os.path.join('../../data/quali/' + month, f),
                     converters=converters)
        all_df.append(df)
    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        all_df[col] = all_df[col].apply(lambda x: [i.upper() for i in x])
        new_df[col] = getUnique(all_df, col)[col].values
    new_df.index.name = "gigyaid"
    toCSV(new_df, "../../data/aggregated/quali" + month + ".csv")
def main(data_dir, out_dir):
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df = df[[
                'USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
                'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
                'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT'
            ]]
            s = time.time()
            df['SESSION_STARTDT'] = pd.to_datetime(df['SESSION_STARTDT'])
            df['STARTHOUR'] = df.SESSION_STARTDT.dt.hour
            df['SESSION_ENDDT'] = pd.to_datetime(df['SESSION_ENDDT'])
            df['ENDHOUR'] = df.SESSION_ENDDT.dt.hour
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finish getting hour in {}".format(total_time))
            toCSV(df, os.path.join(out_dir, f), index=False)
def customerRegularity(file, regularity_type='mean'):
    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
def calculateTenure():
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    s = time.time()
    tenure = []
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
    for i in range(len(df)):
        if df.iloc[i]['CUSTOMERTYPE'] == 'ACTIVE':
            tenure.append((pd.to_datetime('2019-09-01') -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
        else:
            tenure.append((df.iloc[i]['LAST_TRANSACTION'] -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
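# The row loop above is slow on large frames. A vectorized sketch of the
# same tenure computation, assuming the column names used above and that
# the transaction columns are already datetimes:
import pandas as pd


def calculateTenureVectorized(df, cutoff='2019-09-01'):
    # Active customers are measured up to the observation cutoff;
    # everyone else up to their last transaction.
    end = df['LAST_TRANSACTION'].where(df['CUSTOMERTYPE'] != 'ACTIVE',
                                       pd.Timestamp(cutoff))
    df['TENURE'] = (end - df['FIRST_TRANSACTION']).dt.days
    return df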
def generateMonth():
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
            df.DAY = df.DAY.astype(int)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]
            new_df = pd.DataFrame(index=df.USERID.unique())
            new_df.index.name = 'USERID'
            temp = df.loc[df.CONTENT_TYPE == content_type]
            # One column per day: '1' if the user watched this content type.
            for i in range(df.DAY.min(), df.DAY.max() + 1):
                temp2 = temp.loc[temp.DAY == i]
                group = temp2.groupby(['USERID'])['DAY'].count().to_frame()
                group.DAY = group.DAY.apply(
                    lambda x: np.nan if np.isnan(x) else '1')
                group.rename(columns={'DAY': str(i)}, inplace=True)
                new_df = new_df.merge(group, how='left', on='USERID')
            toCSV(new_df, 'results/' + str(content_type) + '/' + f)
def getCustomerType():
    print('getting customer types...')
    transact = readChunk('results/transaction_dates.csv')
    aver = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    transact = transact.merge(aver, how='left', on='USERID')
    transact = transact.merge(intersession, how='right', on='USERID')
    transact['LAST_TRANSACTION'] = pd.to_datetime(transact['LAST_TRANSACTION'])
    print(transact.head())
    transact['RWEEK'] = transact['RWEEK'].astype(float)
    s = time.time()
    # Days of inactivity relative to the 2019-09-01 observation cutoff.
    transact['INACTIVITY_DAYS'] = transact['LAST_TRANSACTION'].apply(
        lambda x: (pd.to_datetime('2019-09-01') - x).days)
    transact['INACTIVITY_DAYS'] = transact['INACTIVITY_DAYS'].apply(
        lambda x: 0 if x == -1 else x).astype(float)
    transact = customerType2(transact, how='new')
    print(transact.head(10))
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(transact, 'results/customer_type.csv', index=False)
def combineMonth(data_dir, outfile, check_login=False):
    all_df = []
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df.dropna(subset=['USERID'], inplace=True)
            if check_login:
                df.USERID = df.USERID.astype(str)
                df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
                df = removeNotLoggedIn(df)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]
            df['DAY'] = df[[
                'SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY'
            ]].apply(lambda x: getCustomerDay(x[0], x[1]), axis=1)
            df = df.loc[df.CONTENT_TYPE != 'nan']
            df.replace({'CONTENT_TYPE': content_type}, inplace=True)
            all_df.append(df)
    all_df = pd.concat(all_df)
    all_df = all_df[keepcols]
    toCSV(all_df, outfile, index=False)
def main(file, f):
    s = time.time()
    quali_out = os.path.join("../../data/quali/" + month, f[-12:])
    quanti_out = os.path.join("../../data/quanti/" + month, f[-12:])
    month_out = os.path.join("../../data/month/" + month, f[-12:])
    # Note: Pool.apply blocks until each task returns, so these three
    # extractions run sequentially, each in a worker process.
    pool = Pool()
    df = pool.apply(getQualitative, args=(file, [
        "gigyaid", "devicetype", "deviceos", "browsertype",
        "connectivitytype", "devicename", "mobiledevice", "screensize",
        "videoquality", "ipaddress"
    ]))
    toCSV(df, quali_out)
    df = pool.apply(getQuantitative, args=(file, [
        "gigyaid", "viewpageduration", "pagedepth", "actiontaken",
        "videotitle", "bigdatasessionid"
    ]))
    toCSV(df, quanti_out)
    df = pool.apply(getDate, args=(file, [
        "gigyaid", "bigdatasessionid", "sessionstarttimestamp",
        "sessionendtimestamp", "viewpageduration"
    ]))
    toCSV(df, month_out)
    pool.close()
    pool.join()
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time: ", total_time)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'ADPLAY_COUNT',
    3: 'PLAY_COUNT',
    4: 'PAUSE_COUNT',
    5: 'RESUME_COUNT',
    6: 'SEEK_COUNT'
}, inplace=True)
# Seek counts come from a separate extract (seek2.csv) below.
df.drop(columns=['SEEK_COUNT'], inplace=True)
cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for i in cols:
    df[i] = df[i].astype(int)
new_df = pd.DataFrame(index=df.USERID.unique())
new_df.index.name = "USERID"
new_df.reset_index(inplace=True)
for i in cols:
    new_df = new_df.merge(df.groupby('USERID')[i].sum().to_frame(),
                          how='left', on='USERID')
df = readChunk('seek2.csv', header=None)
df.rename(columns={0: 'USERID', 1: 'SESSIONID', 2: 'SEEK_COUNT'},
          inplace=True)
df.SEEK_COUNT = df.SEEK_COUNT.astype(int)
new_df = new_df.merge(df.groupby('USERID')['SEEK_COUNT'].sum().to_frame(),
                      how='left', on='USERID')
print(new_df.head())
toCSV(new_df, 'CLICK.csv')
5: "RESUME", 6: "SEEK" }, inplace=True) df.drop(columns=['SEEK'], inplace=True) cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME"] for i in cols: df[i] = pd.to_numeric(df[i], errors="coerce") new_df = new_df.merge(df.groupby("USERID")[i].sum().to_frame(), how='left', on='USERID') print(new_df.head()) df = readChunk("../characterization/seek2.csv") df.rename(columns={0: "USERID", 1: "SESSIONID", 3: "SEEK"}, inplace=True) df.SEEK = pd.to_numeric(df.SEEK, errors="coerce") new_df = new_df.merge(df.groupby("USERID")[i].sum().to_frame(), how='left', on="USERID") cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME", "SEEK"] new_df["TOTAL"] = 0 for i in cols: new_df["TOTAL"] = new_df["TOTAL"] + new_df[i] for i in cols: new_df[i] = new_df[i] / new_df[:"TOTAL"] print(new_df.head()) toCSV(new_df, "engagement_attributes.csv", index=False)
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time

import pandas as pd
import numpy as np
from utils import readChunk, toCSV

from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.style as style
sns.set()
style.use('seaborn-poster')
style.use('bmh')

df = readChunk("tv_completion.csv", header=None)
df.rename(columns={
    0: 'USERID',
    1: 'TITLE',
    2: 'VIDEO_DURATION',
    3: 'WATCHING_DURATION'
}, inplace=True)
# The rename must precede the numeric cast: the raw frame has integer
# column labels, so VIDEO_DURATION does not exist until after the rename.
df.VIDEO_DURATION = pd.to_numeric(df.VIDEO_DURATION, errors='coerce')
df['CONTENT_COMPLETION'] = (df.WATCHING_DURATION / df.VIDEO_DURATION) * 100
toCSV(df[['CONTENT_COMPLETION']], 'content_completion.csv', index=False)
convert = {'12': '0'}
df.replace({'SESSION_STARTDT_MONTH': convert}, inplace=True)
toint = ['SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY', 'STARTHOUR']
for i in toint:
    df[i] = df[i].astype(int)
# Sortable key: month, then day, then hour (December is remapped to 0 so
# it sorts before January). The multipliers keep the fields from colliding.
df['ORDER'] = ((df.SESSION_STARTDT_MONTH * 10000) +
               (df.SESSION_STARTDT_DAY * 100) + df.STARTHOUR)
df.sort_values('ORDER', inplace=True)
print(len(df))
# Keep each user's earliest session only.
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))
labels = pd.read_csv('clustering_6.csv')
labels.columns = labels.columns.str.upper()
df = df.merge(labels, how='left', on='USERID')
for i in df.LABEL.unique():
    print(i)
    temp = df.loc[df.LABEL == i]
    new_df = pd.DataFrame(index=list(range(1, 11)), columns=['COUNT'])
    for j in df.CONTENT_TYPE.unique():
        temp2 = temp.loc[temp.CONTENT_TYPE == j]
        new_df.loc[int(j), 'COUNT'] = len(temp2)
    print(new_df)
    new_df.index.name = 'CONTENT_TYPE'
    toCSV(new_df, 'results/firstwatched/' + str(i) + '.csv')
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time

import pandas as pd
import numpy as np
from utils import readChunk, toCSV

file = "../data/year_week.csv"
df = readChunk(file, sep="\t")
print(df.head())
df.WEEK = df.WEEK.astype(int)
df.sort_values('WEEK', inplace=True)
print(len(df))
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))
df.rename(columns={'WEEK': 'INCEPTION_WEEK'}, inplace=True)
toCSV(df[['USERID', 'INCEPTION_WEEK']], "inception_week.csv", index=False)
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

df = readChunk("../sql/query_results/plateu_month.csv")
df.rename(columns={'COUNT(SESSIONID)': 'FREQUENCY'}, inplace=True)
df.FREQUENCY = df.FREQUENCY.astype(int)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]
total_df = pd.DataFrame(index=df.index.unique(), columns=['frequency'])
total_df = df.groupby('USERID')['FREQUENCY'].sum().to_frame()
# for i in df.MONTH.unique():
#     print(i)
#     temp = df.loc[df.MONTH == i]
#     users = temp.index.unique()
#     for j in users:
#         total_df.loc[j]['frequency'] = total_df.loc[j]['frequency'] + temp.loc[j]['FREQUENCY']
print(total_df.head())
total_df.index.name = 'USERID'
toCSV(total_df, 'results/overall_frequency.csv')
elif col == 'WEEK':
    # Users active in the first week (201848) seed the returning-user set.
    init = list(df.loc[df.WEEK == 201848.0].USERID.unique())
    df = df.loc[df.WEEK >= 201848.0]
new_df = pd.DataFrame(index=df.USERID.unique(), columns=[colname], data=0)
new_df.index.name = 'USERID'
for i in df[col].unique():
    print(i)
    if col == 'MONTH':
        if i == 201812.0:
            continue
    elif col == 'WEEK':
        if i == 201848.0:
            continue
    temp = df.loc[df[col] == i]
    users = list(temp.USERID.unique())
    # Users seen both in this period and in any earlier one.
    common = list(set(init).intersection(users))
    for j in range(len(common)):
        new_df.loc[common[j], colname] = new_df.loc[common[j], colname] + 1
    # new_df[colname] = new_df.apply(lambda x: addvalue(x[colname]) if str(x.index) in common else x[colname], axis = 1)
    init.extend(users)
    init = list(set(init))
print(new_df.head())
print(new_df[colname].unique())
toCSV(new_df, out)
e = time.time()
total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
print("Total time: ", total_time)
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)
df.rename(columns={
    0: 'MONTH',
    1: 'USERID',
    2: 'SESSIONID',
    3: 'STARTHOUR',
    4: 'ENDHOUR',
    5: 'engagement'
}, inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]
total_df = df.groupby('USERID')['engagement'].sum().to_frame()
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
diverse = pd.read_csv("../../data/customer_feature_matrix.csv")
diverse.columns = diverse.columns.str.upper()
diverse = diverse.loc[diverse.FREQUENCY != 1]
diverse = diverse.loc[diverse.LABEL != 'NEW']
diverse = diverse[['USERID', 'LABEL']]
uniquecust = diverse.USERID.unique()
print(len(uniquecust))
# np.savetxt('results/diversecustomers.txt', uniquecust, delimiter = ',', fmt = "%s")
file = "../../data/regularity_cleaned_ordered.csv"
df = readChunk(file, header=None)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'MONTH',
    3: 'WEEK',
    4: 'DATE',
    5: 'DAY_OF_WEEK'
}, inplace=True)
print(len(df))
# Split the sessions into batches of 50,000 users each.
old = 0
for i in range(1, 63):
    new = 50000 * i
    print(old, new)
    temp = uniquecust[old:new]
    temp2 = df[df['USERID'].isin(temp)]
    print(len(temp2))
    toCSV(temp2, 'results/all/' + str(i) + '.csv', index=False)
    # np.savetxt('results/all/'+str(i)+'.txt', temp, delimiter = ',', fmt = "%s")
    old = new  # slice end is exclusive, so the next batch starts at new
def prepare_all_data():
    for series_id, csv_file_name in series_dict.items():
        resp_data = retrieve_uri_data(
            f'http://api.eia.gov/series/?api_key={constants.EIA_API_KEY}'
            f'&series_id={series_id}')
        data = resp_data['series'][0]['data']
        toCSV(data, csv_file_name)
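# series_dict and retrieve_uri_data are defined elsewhere in the module.
# A minimal sketch under the assumption that the helper fetches a URL and
# decodes its JSON body; the series IDs and paths below are illustrative,
# not the ones the project actually uses.
import json
import urllib.request

series_dict = {
    'PET.RWTC.W': 'data/weekly.csv',
    'PET.RWTC.M': 'data/monthly.csv',
}


def retrieve_uri_data(uri):
    # Fetch the URI and parse the JSON response body.
    with urllib.request.urlopen(uri) as resp:
        return json.loads(resp.read().decode('utf-8'))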
df.DATE = pd.to_datetime(df.DATE)
df.sort_values("DATE", inplace=True)
# Keep each user's most recent transaction to compute recency.
df.drop_duplicates(subset=['USERID'], keep='last', inplace=True)
recency = []
for i in range(len(df)):
    recency.append((pd.to_datetime('2019-09-01') - df.iloc[i]["DATE"]).days)
df["RECENCY"] = recency
print(df.head(10))
print(len(df))
df = df[['USERID', 'RECENCY']]
print('getting engagement')
file2 = "../data/eng_current.csv"
df2 = readChunk(file2, sep='\t')
df2.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "TIMESTAMPDIFF(MINUTE, MIN(SESSION_STARTDT), MAX(SESSION_ENDDT))": 'ENGAGEMENT'
}, inplace=True)
file3 = "../data/eng_old.csv"
df3 = readChunk(file3, sep='\t')
df3.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "COUNT(DISTINCT SESSIONID)": 'FREQUENCY'
}, inplace=True)
df2 = pd.concat([df2, df3])
df2.ENGAGEMENT = df2.ENGAGEMENT.astype(int)
eng = df2.groupby('USERID')['ENGAGEMENT'].sum().to_frame()
print(eng.head(10))
print(len(eng))
df = df.merge(freq, how='left', on='USERID')
df = df.merge(eng, how='left', on='USERID')
print(len(df))
toCSV(df, 'results/channel2.csv', index=False)