def combineMonth():
    """Merge the per-file monthly presence CSVs for one content type into a
    single per-user matrix and derive a regularity ratio column.

    Reads every CSV under ``data_dir/<content_type>``, left-merges them on
    USERID, converts each day column to a '0'/'1' presence flag, then divides
    the presence bitstring (read as base 2) by the all-ones bitstring covering
    the days since the user's first occurrence.

    Relies on module-level globals: data_dir, content_type, func, colname,
    outfile, readChunk, toCSV.
    """
    month_dir = os.path.join(data_dir, str(content_type))
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(month_dir)):
        if f.endswith(".csv"):
            file = os.path.join(month_dir, f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    # Materialize as a list: pandas Index.append() is not in-place and cannot
    # take a plain string, so the original `cols.append(colname)` either
    # raised or silently discarded its result and colname was never included
    # in the output selection below.
    cols = new_df.columns.tolist()
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    # All-ones reference string: one '1' per day from first occurrence to 31.
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' * (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total']].apply(
        lambda x: int(x[0], 2) / int(x[1], 2), axis=1)
    print(new_df[colname])
    print(cols)
    cols.append(colname)
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
def plotWeeklyRegularity3(file, file2=None, ylim=None):
    """Bar-plot the average number of customers per weekly regularity value.

    Loads one or two headerless regularity exports, averages the weekly
    customer counts per regularity value, and saves the annotated bar chart
    to weekly_average_regularity.png.  (ylim is accepted but unused.)
    """
    frames = [readChunk(file, header=None)]
    if file2:
        frames.append(readChunk(file2, header=None))
    for frame in frames:
        frame.rename(columns={0: 'WEEK', 8: 'RWEEK', 9: 'USERID'}, inplace=True)
    df = frames[0] if len(frames) == 1 else pd.concat(frames)
    print(df.head())
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df['WEEK'].astype(int)
    df.sort_values('WEEK', inplace=True)
    # 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    weekly = df.groupby(['RWEEK', 'WEEK'])['USERID'].count().to_frame().reset_index()
    print(weekly.head(20))
    weekly = weekly.groupby('RWEEK')['USERID'].mean().to_frame()
    weekly['USERID'] = round(weekly['USERID'])
    weekly['USERID'] = weekly['USERID'].astype(int)
    print(weekly.head(20))
    ax = weekly.plot(kind='bar', legend=False, rot=0)
    for pos in range(len(weekly)):
        value = weekly.iloc[pos]['USERID']
        ax.text(pos, value, value, horizontalalignment='center')
    ax.set_xlabel('REGULARITY')
    plt.savefig("weekly_average_regularity.png", dpi=300)
def plotRegularityFreq():
    """Plot the frequency of monthly regularity values (1-30) over all rows,
    and the distribution of per-customer mean regularity, from the combined
    monthly regularity file.  Uses the project-level barPlot helper."""
    file = "../status/results/regularity_combined_monthly.csv"
    df = readChunk(file)
    print('Number of customers: ', len(df.USERID.unique()))
    print(df.head())
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df['MONTH'].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    new_df = pd.DataFrame(index=list(range(1, 31)), columns=['COUNT'])
    new_df.index.name = 'REGULARITY'
    for i in range(1, 31):
        # .loc[i, 'COUNT'] instead of the original chained .loc[i]['COUNT'],
        # which assigns through an intermediate object and is not guaranteed
        # to write back into the frame.
        new_df.loc[i, 'COUNT'] = len(df.loc[df.RMONTH == i])
    print(new_df.head())
    barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png',
            print_number=True, savefig=True)
    # Per-customer mean regularity, rounded to the nearest integer bucket.
    new_df = df.groupby('USERID')['RMONTH'].mean().to_frame()
    new_df['RMONTH'] = round(new_df['RMONTH'])
    print(new_df.head())
    new_df2 = pd.DataFrame(index=list(range(1, 31)), columns=['COUNT'])
    new_df2.index.name = 'REGULARITY'
    for i in range(1, 31):
        new_df2.loc[i, 'COUNT'] = len(new_df.loc[new_df.RMONTH == i])
    barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS',
            'customerregfreq_many.png', print_number=True, savefig=True)
def getWeekPresent():
    """Append, for each user, a 0/1 row over weeks 5..34 marking the weeks in
    which the user had at least one session, to customer_present.csv.

    Reads the headerless yearweek export, re-bases 2019 ISO weeks
    (201905 -> 5, ...), drops the partial week 4, and de-duplicates to one
    row per (user, week) before building each presence vector.
    """
    file = "../data/yearweek_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={0: "USERID", 1: "SESSIONID", 2: "YEARWEEK"},
              inplace=True)
    df.YEARWEEK = df.YEARWEEK.astype(int)
    df.YEARWEEK = df.YEARWEEK - 201900
    df = df.loc[df.YEARWEEK != 4]
    df.drop_duplicates(subset=["USERID", "YEARWEEK"], keep="first",
                       inplace=True)
    print(df.head(10))
    with open("customer_present.csv", "a") as f:
        writer = csv.writer(f, delimiter=',')
        for i in df.USERID.unique():
            temp = df.loc[df.USERID == i]
            new_df = pd.DataFrame(index=[i], data=0,
                                  columns=list(range(5, 35)))
            for j in range(len(temp)):
                week = temp.iloc[j]['YEARWEEK']
                # .loc[row, col] instead of the original chained
                # .loc[i][col] = 1, which writes through an intermediate
                # Series and may not propagate back to new_df.
                new_df.loc[i, int(week)] = 1
            writer.writerow(new_df.reset_index().iloc[0])
def getQuantitative(file, usecols):
    """Build per-customer quantitative features: total view-page duration,
    distinct-session count, one count column per action type, number of
    distinct videos watched, and the list of distinct titles.

    Returns an empty DataFrame when the file has no logged-in rows
    (null gigyaid everywhere).
    """
    s = time.time()
    print("Getting the quantitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    if len(transact) == 0:
        # Log before bailing out — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    transact = transact.loc[transact.viewpageduration.notnull()]
    transact["viewpageduration"] = transact["viewpageduration"].astype(int)
    totalviewpageduration = transact.groupby("gigyaid")["viewpageduration"].sum().to_frame()
    totalnumbersession = transact.groupby("gigyaid")["bigdatasessionid"].nunique().to_frame()
    quanti = pd.concat([totalviewpageduration, totalnumbersession], axis=1)
    quanti = quanti.loc[:, ~quanti.columns.duplicated()]
    # One count column per distinct action type.
    actions = list(set(transact["actiontaken"].unique().tolist()))
    for action in actions:
        temp = transact.loc[transact["actiontaken"] == action]
        quanti[action] = temp.groupby("gigyaid")["actiontaken"].count()
    temp = transact.loc[transact["actiontaken"].notnull()]
    quanti["watched"] = temp.groupby("gigyaid")["videotitle"].nunique()
    # NOTE(review): .tolist() discards the groupby index, so this assignment
    # relies on positional alignment with quanti's index — confirm ordering.
    quanti["contentswatched"] = temp.groupby("gigyaid")["videotitle"].unique().tolist()
    quanti.fillna(0, inplace=True)
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting quantitative features: ", total_time)
    return quanti
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mode', mode_type = None):
    """Plot, separately for ACTIVE and LOST customers, the per-week
    distribution of weekly regularity values (1-7) on an 8x4 subplot grid,
    dumping each week's counts to CSV and saving one figure per type.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; the outfile parameter is shadowed by a locally
    computed filename before saving.

    NOTE(review): when regularity_type == 'mode' (the default), the groupby
    collapses the frame to one row per USERID and drops WEEK, yet the plot
    loop below reads df_2.WEEK — looks inconsistent; confirm the intended
    input shape before relying on this path.
    """
    cust_type = pd.read_csv("results/customer_type.csv", usecols = ['USERID', 'CUSTOMERTYPE'])
    df = readChunk("status/results/regularity_combined.csv")
    print(len(df))
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    print(df.columns)
    df.dropna(subset = ['RWEEK'], inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df["WEEK"].astype(int)
    # Week 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    if regularity_type == 'mode':
        # Per-customer modal regularity; mode_type picks the tie-break rule.
        if mode_type == 'min':
            df = df.groupby('USERID')['RWEEK'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
        elif mode_type == 'max':
            df = df.groupby('USERID')['RWEEK'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
        else:
            df = df.groupby(['USERID'])['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
        df.reset_index(inplace = True)
    print(df.head())
    df = df.merge(cust_type, how = 'left', on = 'USERID')
    for z in ['ACTIVE', 'LOST']:
        df_2 = df.loc[df.CUSTOMERTYPE == z]
        fig, axes = plt.subplots(8,4, sharey = 'row', constrained_layout = True)
        # (x, y) walk the 8x4 grid row-major, one subplot per week.
        x = 0
        y = 0
        for i in sorted(df_2.WEEK.unique()):
            temp = df_2.loc[df_2.WEEK == i]
            new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
            new_df.index.name = 'REGULARITY'
            print(len(temp))
            for j in range(1, 8):
                temp2 = temp.loc[temp.RWEEK == j]
                # NOTE(review): chained .loc[j]['COUNT'] assignment may not
                # write back on all pandas versions.
                new_df.loc[j]['COUNT'] = len(temp2)
            print(new_df)
            plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
            # plot.set_ylabel('NUMBER OF CUSTOMERS')
            # plot.set_xlabel('REGULARITY')
            plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
            plot.set_title(i, size = 6, pad = 2)
            x_axis = plot.axes.get_xaxis()
            x_label = x_axis.get_label()
            x_label.set_visible(False)
            if ylim:
                plot.set_ylim(0,ylim)
            y = y + 1
            if y == 4:
                y = 0
                x = x + 1
            new_df.to_csv('results/customerregfreq/week_'+z+str(i)+'.csv')
        # Remove the two unused cells of the 8x4 grid, then save the figure;
        # the filename embeds the last plotted week number.
        fig.delaxes(axes[7,3])
        fig.delaxes(axes[7,2])
        outfile = "results/customerregfreq"+z+str(i)+'.png'
        if outfile:
            plt.savefig(outfile, dpi = 600)
def transactionDates():
    """Join each customer's first/last transaction dates onto the average
    regularity table (dropping the RWEEK column) and persist the result to
    results/transaction_dates.csv."""
    print('getting first and last transaction dates of the customers..')
    dates = readChunk("results/first_and_last_transaction_correct.csv",
                      header=None)
    dates.rename(columns={0: 'USERID',
                          1: 'FIRST_TRANSACTION',
                          2: 'LAST_TRANSACTION'}, inplace=True)
    merged = readChunk('results/average_regularity.csv')
    merged = merged.merge(dates, how='left', on='USERID')
    merged.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(merged, 'results/transaction_dates.csv', index=False)
def plotWeeklyRegularity2(weekno=None, custids=None, ylim=None, outfile=None,
                          regularity_type='mean', mode_type=None):
    """Plot, per month, the distribution of monthly regularity values (1-31)
    on a 4x2 subplot grid, dumping each month's counts to CSV.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; outfile, when given, receives the figure.
    regularity_type selects per-customer aggregation ('mean' or 'mode');
    mode_type picks the tie-break rule for 'mode'.
    """
    df = readChunk("../status/results/regularity_combined_monthly.csv")
    print(len(df))
    print(df.head())
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    df.dropna(subset=['RMONTH'], inplace=True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df['MONTH'].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    if regularity_type == 'mode':
        # Fixed: this file carries RMONTH — the original aggregated a
        # non-existent RWEEK column (copied from the weekly variant) and
        # grouped by USERID only, dropping the MONTH the plot loop needs.
        if mode_type == 'min':
            agg = lambda x: min(pd.Series.mode(x))
        elif mode_type == 'max':
            agg = lambda x: max(pd.Series.mode(x))
        else:
            agg = lambda x: pd.Series.mode(x)[0]
        df = df.groupby(['USERID', 'MONTH'])['RMONTH'].agg(agg).to_frame()
        df.reset_index(inplace=True)
    elif regularity_type == 'mean':
        # Fixed: groupby('USERID', 'MONTH') passed MONTH as the *axis*
        # argument and raised; group on both keys and restore them as
        # columns so the per-month loop below keeps working.
        df = df.groupby(['USERID', 'MONTH'])['RMONTH'].mean().to_frame()
        df.reset_index(inplace=True)
        df['RMONTH'] = round(df.RMONTH)
    else:
        print('What regularity type?')
    fig, axes = plt.subplots(4, 2, sharey='row', constrained_layout=True)
    x = 0
    y = 0
    print(df.head())
    for i in sorted(df.MONTH.unique()):
        temp = df.loc[df.MONTH == i]
        new_df = pd.DataFrame(index=list(range(1, 32)), columns=['COUNT'])
        new_df.index.name = 'REGULARITY'
        for j in range(1, 32):
            # .loc[j, 'COUNT'] avoids the original's unreliable chained
            # .loc[j]['COUNT'] assignment.
            new_df.loc[j, 'COUNT'] = len(temp.loc[temp.RMONTH == j])
        print(new_df)
        plot = new_df.plot(kind='bar', legend=False, ax=axes[x, y], rot=0)
        plot.tick_params(axis='both', which='major', labelsize=6, pad=2)
        plot.set_title(i, size=6, pad=2)
        plot.axes.get_xaxis().get_label().set_visible(False)
        if ylim:
            plot.set_ylim(0, ylim)
        y = y + 1
        if y == 2:
            y = 0
            x = x + 1
        # Fixed: the original concatenated an undefined name `z` here,
        # which raised NameError on the first iteration.
        new_df.to_csv('results/customerregfreq/week_' + str(i) + '.csv')
    # Hide the unused eighth cell of the 4x2 grid.
    fig.delaxes(axes[3, 1])
    if outfile:
        plt.savefig(outfile, dpi=600)
def getFile(file1, file2=None):
    """Load one or two tab-separated activity exports, normalize the derived
    date columns, de-duplicate per (user, date), and left-join the ACTIVE
    label from the customer feature matrix.  Returns the merged frame."""
    renames = {"DATE(MODIFIEDDATE)": 'DATE',
               "DAYOFWEEK(MIN(MODIFIEDDATE))": 'DAYOFWEEK'}
    df = readChunk(file1, sep='\t')
    df.rename(columns=renames, inplace=True)
    if file2:
        second = readChunk(file2, sep='\t')
        second.rename(columns=renames, inplace=True)
        df = pd.concat([df, second])
    print(df.head())
    df.drop_duplicates(subset=['USERID', 'DATE'], inplace=True)
    label = pd.read_csv("../data/customer_feature_matrix.csv",
                        usecols=["userid", "label"])
    label.columns = label.columns.str.upper()
    label = label.loc[label.LABEL == 'ACTIVE']
    return df.merge(label, how='left', on='USERID')
def cleanData(data_dir):
    """Rewrite every CSV in data_dir in place, keeping only the module-level
    `cols` and replacing commas with spaces in each `remove_comma` column
    (so the values cannot break CSV parsing downstream).

    Relies on module globals: cols, remove_comma, readChunk, toCSV.
    """
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]
            for col in remove_comma:
                # The original guarded on `if x.replace(",", " ")`, which is
                # only falsy for the empty string — where both branches yield
                # the same value — so a plain vectorized replace is
                # equivalent and far cheaper than a per-row lambda.
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
def getCustomerType():
    """Derive each customer's type from transaction recency, average weekly
    regularity, and intersession statistics, writing the labeled table to
    results/customer_type.csv (classification done by customerType2)."""
    print('getting customer types...')
    base = readChunk('results/transaction_dates.csv')
    regularity = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    base = base.merge(regularity, how='left', on='USERID')
    base = base.merge(intersession, how='right', on='USERID')
    base['LAST_TRANSACTION'] = pd.to_datetime(base['LAST_TRANSACTION'])
    print(base.head())
    base['RWEEK'] = base['RWEEK'].astype(float)
    started = time.time()
    # Days of inactivity up to the 2019-09-01 cutoff; a -1 (transaction on
    # the cutoff day itself) is clamped to 0.
    base['INACTIVITY_DAYS'] = base['LAST_TRANSACTION'].apply(
        lambda last: (pd.to_datetime('2019-09-01') - last).days)
    base['INACTIVITY_DAYS'] = base['INACTIVITY_DAYS'].apply(
        lambda d: 0 if d == -1 else d).astype(float)
    base = customerType2(base, how='new')
    print(base.head(10))
    elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - started))
    print("Total process time is {}".format(elapsed))
    toCSV(base, 'results/customer_type.csv', index=False)
def extractColumns(data_dir, outdir):
    """For every CSV in data_dir: drop rows without a USERID, normalize the
    id columns to strings, strip lurkers via removeLurkers, and write the
    result to outdir named by the last 12 characters of the source file."""
    print(data_dir)
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith('.csv'):
            continue
        frame = readChunk(os.path.join(data_dir, name))
        frame.dropna(subset=['USERID'], inplace=True)
        frame.USERID = frame.USERID.astype(str)
        frame.PRIMARY_FINGERPRINT = frame.PRIMARY_FINGERPRINT.astype(str)
        frame = removeLurkers(frame)
        toCSV(frame, os.path.join(outdir, name[-12:]), index=False)
def addJoinedWeek():
    """Append a 'joinedweek' column (computed per row by getJoinedWeek) to
    the customer presence matrix and save it as
    week_present_and_joined.csv."""
    presence = readChunk("customer_present.csv", header=None)
    presence.rename(columns={0: "USERID"}, inplace=True)
    presence.set_index('USERID', inplace=True)
    presence['joinedweek'] = [getJoinedWeek(row)
                              for _, row in presence.iterrows()]
    print(presence.joinedweek)
    presence.to_csv('week_present_and_joined.csv')
def plotRegularityTenure():
    """Save, for each distinct weekly regularity value, a histogram of
    customer tenure in days (one PNG per regularity value)."""
    tenure = readChunk('results/tenure.csv')
    tenure['RWEEK'] = tenure['RWEEK'].astype(float)
    tenure['TENURE'] = tenure['TENURE'].astype(float)
    for reg in tenure.RWEEK.unique():
        subset = tenure.loc[tenure.RWEEK == reg]
        ax = sns.distplot(a=subset['TENURE'].values, kde=False)
        ax.set_ylim(0, 4000)
        # str(reg)[0] shows only the integer digit, e.g. '3' from 3.0.
        plt.title('Regularity = {}'.format(str(reg)[0]))
        ax.set_xlabel('TENURE (days)')
        ax.set_ylabel('NUMBER OF CUSTOMERS')
        plt.savefig(str(reg) + '.png', dpi=600)
        plt.clf()
def plotWeeklyRegularity(weekno = None, custids = None, ylim = None, outfile = None):
    """Plot, per month (February-August), the distribution of monthly
    regularity values (1-30) on a 4x2 subplot grid, dumping each month's
    counts to CSV and optionally saving the figure.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; outfile, when given, receives the figure.
    """
    df = readChunk("../status/results/regularity_combined_monthly.csv")
    print(len(df))
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    df.dropna(subset = ['RMONTH'], inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df["MONTH"].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    df.sort_values('MONTH', inplace = True)
    fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
    # (x, y) walk the 4x2 grid row-major, one subplot per month.
    x = 0
    y = 0
    # NOTE(review): assumes at most 7 distinct months remain after the
    # filter; an eighth month would overflow this list and the grid.
    months = ['FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST']
    count = 0
    for i in df.MONTH.unique():
        temp = df.loc[df.MONTH == i]
        new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
        new_df.index.name = 'REGULARITY'
        for j in range(1,31):
            temp2 = temp.loc[temp.RMONTH == j]
            # NOTE(review): chained .loc[j]['COUNT'] assignment may not
            # write back on all pandas versions.
            new_df.loc[j]['COUNT'] = len(temp2)
        plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
        plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
        plot.set_title(months[count], size = 6, pad = 2)
        x_axis = plot.axes.get_xaxis()
        x_label = x_axis.get_label()
        x_label.set_visible(False)
        if ylim:
            plot.set_ylim(0,ylim)
        y = y + 1
        if y == 2:
            y = 0
            x = x + 1
        new_df.to_csv('results/reqfreq/week_'+str(i)+'.csv')
        count = count + 1
    # fig.delaxes(axes[7,3])
    # Hide the unused eighth cell of the 4x2 grid.
    fig.delaxes(axes[3,1])
    # outfile = "results/regfreq"+z+str(i)+'.png'
    if outfile:
        plt.savefig(outfile, dpi = 600)
def customerRegularity(file, regularity_type='mean'):
    """Aggregate each customer's weekly regularity (RWEEK) into a single
    value — arithmetic mean or mode — round it to an integer score, and
    write the result to results/average_regularity.csv.

    NOTE(review): for an unrecognized regularity_type, new_df is never bound
    and the round() line raises NameError — confirm callers only ever pass
    'mean' or 'mode'.
    """
    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        # First modal value wins ties.
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    # Round so the stored regularity is integer-valued in both branches
    # (a no-op for 'mode', which already yields integers).
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
def getDate(file, usecols):
    """Total view-page duration per (customer, session, session start,
    session end).  Returns an empty DataFrame when the file has no
    logged-in rows (null gigyaid everywhere)."""
    s = time.time()
    print("Getting the time features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    if len(transact) == 0:
        # Log before returning — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    transact = transact.loc[transact.viewpageduration.notnull()]
    transact["viewpageduration"] = transact["viewpageduration"].astype(int)
    group = transact.groupby(
        ["gigyaid", "bigdatasessionid", "sessionstarttimestamp",
         "sessionendtimestamp"])["viewpageduration"].sum().to_frame()
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting date features: ", total_time)
    return group
def main(data_dir, out_dir):
    """Copy each CSV from data_dir to out_dir, keeping the session columns
    and adding STARTHOUR/ENDHOUR derived from the session timestamps."""
    keep = ['USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
            'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
            'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT']
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        frame = readChunk(os.path.join(data_dir, name))
        frame = frame[keep]
        started = time.time()
        frame['SESSION_STARTDT'] = pd.to_datetime(frame['SESSION_STARTDT'])
        frame['STARTHOUR'] = frame.SESSION_STARTDT.dt.hour
        frame['SESSION_ENDDT'] = pd.to_datetime(frame['SESSION_ENDDT'])
        frame['ENDHOUR'] = frame.SESSION_ENDDT.dt.hour
        elapsed = time.strftime("%H:%M:%S",
                                time.gmtime(time.time() - started))
        print("Finish getting hour in {}".format(elapsed))
        toCSV(frame, os.path.join(out_dir, name), index=False)
def getQualitative(file, usecols):
    """Collect per-customer lists of unique device/session attributes
    (device type/OS/name, IP, browser, connectivity, screen size, video
    quality, mobile device), indexed by gigyaid.

    Returns an empty DataFrame when the file has no logged-in rows.
    """
    s = time.time()
    print("Getting the qualitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    # Rows that carry a browser type are web sessions; the rest are mobile.
    transact.loc[transact.browsertype.notnull(), "browsertype"] = "WEB APPLICATION"
    transact.browsertype.replace(np.nan, "MOBILE APPLICATION", inplace=True)
    if len(transact) == 0:
        # Log before returning — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    group = transact.groupby("gigyaid")
    # One unique-value list per attribute; collapses nine copy-pasted
    # group.apply blocks from the original into a single loop.
    attributes = ["devicetype", "deviceos", "ipaddress", "browsertype",
                  "connectivitytype", "screensize", "videoquality",
                  "devicename", "mobiledevice"]
    frames = [
        group.apply(lambda x, c=col: x[c].unique().tolist())
             .reset_index(name=col)
        for col in attributes
    ]
    df = pd.concat(frames, axis=1)
    # concat repeats the gigyaid column once per frame; keep the first.
    df = df.loc[:, ~df.columns.duplicated()]
    df = df.set_index('gigyaid')
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting qualitative features: ", total_time)
    return df
def calculateTenure():
    """Compute each customer's tenure in days — up to the 2019-09-01 cutoff
    for ACTIVE customers, up to their last transaction otherwise — and write
    the augmented table to results/tenure.csv."""
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    started = time.time()
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
    cutoff = pd.to_datetime('2019-09-01')
    tenure = []
    for _, row in df.iterrows():
        end = cutoff if row['CUSTOMERTYPE'] == 'ACTIVE' else row['LAST_TRANSACTION']
        tenure.append((end - row['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    elapsed = time.strftime("%H:%M:%S",
                            time.gmtime(time.time() - started))
    print("Total process time is {}".format(elapsed))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
def plotDayofWeek(ylim=None):
    """Plot, for each weekly regularity value, the total number of visits on
    each day of the week, on a 3x3 subplot grid saved to
    results/dayofweek.png.

    Fixes: removed a dead `pd.read_csv("status/rweek.csv")` whose result was
    immediately overwritten, and made `ylim` a keyword parameter — the
    original referenced an undefined name and raised NameError (unless a
    module-level ylim happened to exist — TODO confirm).
    """
    df = readChunk("status/results/regularity_combined.csv")
    df.columns = ['WEEK', 'SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY',
                  'THURSDAY', 'FRIDAY', 'SATURDAY', 'RWEEK', 'USERID']
    df.dropna(subset=['RWEEK'], inplace=True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df['WEEK'].astype(int)
    df.sort_values('RWEEK', inplace=True)
    # Week 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    dayofweek = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
                 'FRIDAY', 'SATURDAY']
    for i in dayofweek:
        df[i] = df[i].astype(int)
    fig, axes = plt.subplots(3, 3, sharey='row', constrained_layout=True)
    x = 0
    y = 0
    for i in df.RWEEK.unique():
        new_df = pd.DataFrame(index=dayofweek, columns=['COUNT'])
        temp = df.loc[df.RWEEK == i]
        for j in dayofweek:
            # .loc[j, 'COUNT'] avoids the original's unreliable chained
            # .loc[j]['COUNT'] assignment.
            new_df.loc[j, 'COUNT'] = temp[j].sum()
        plot = new_df.plot(kind='bar', legend=False, ax=axes[x, y], rot=0)
        plot.tick_params(axis='both', which='major', labelsize=6, pad=2)
        plot.set_title("Regularity = {}".format(i), size=6, pad=2)
        plot.axes.get_xaxis().get_label().set_visible(False)
        if ylim:
            plot.set_ylim(0, ylim)
        y = y + 1
        if y == 3:
            y = 0
            x = x + 1
    # Hide the two unused cells of the 3x3 grid.
    fig.delaxes(axes[2, 1])
    fig.delaxes(axes[2, 2])
    outfile = 'results/dayofweek.png'
    if outfile:
        plt.savefig(outfile, dpi=600)
def generateMonth():
    """Build a per-user day-presence matrix ('1' where the user had at least
    one session that day) for the module-level content_type, writing one
    output CSV per input file under results/<content_type>/.

    Relies on module globals: data_dir, content_type, readChunk, toCSV.
    """
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        df = readChunk(os.path.join(data_dir, name))
        df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
        df.DAY = df.DAY.astype(int)
        df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
        # November rows are out of scope for this matrix.
        df = df.loc[df.SESSION_STARTDT_MONTH != 11]
        presence = pd.DataFrame(index=df.USERID.unique())
        presence.index.name = 'USERID'
        content = df.loc[df.CONTENT_TYPE == content_type]
        for day in range(df.DAY.min(), df.DAY.max() + 1):
            daily = content.loc[content.DAY == day]
            counts = daily.groupby(['USERID'])['DAY'].count().to_frame()
            counts.DAY = counts.DAY.apply(
                lambda c: np.nan if np.isnan(c) else '1')
            counts.rename(columns={'DAY': str(day)}, inplace=True)
            presence = presence.merge(counts, how='left', on='USERID')
        toCSV(presence, 'results/' + str(content_type) + '/' + name)
def plotRegularityFreq():
    """Plot the frequency of weekly regularity values (1-7) over all rows,
    and the distribution of per-customer modal regularity, using the
    project-level barPlot helper."""
    file = "status/results/regularity_combined.csv"
    df = readChunk(file)
    print('Number of customers: ', len(df.USERID.unique()))
    print(df.head())
    df['RWEEK'] = df['RWEEK'].astype(int)
    new_df = pd.DataFrame(index=[1, 2, 3, 4, 5, 6, 7], columns=['COUNT'])
    new_df.index.name = 'REGULARITY'
    for i in range(1, 8):
        # .loc[i, 'COUNT'] instead of the original chained .loc[i]['COUNT'],
        # which assigns through an intermediate object and is not guaranteed
        # to write back into the frame.
        new_df.loc[i, 'COUNT'] = len(df.loc[df.RWEEK == i])
    print(new_df.head())
    barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png',
            print_number=True, savefig=True)
    # Per-customer modal regularity; ties broken by the first modal value.
    new_df = df.groupby('USERID')['RWEEK'].agg(
        lambda x: pd.Series.mode(x)[0]).to_frame()
    print(new_df.head())
    new_df2 = pd.DataFrame(index=[1, 2, 3, 4, 5, 6, 7], columns=['COUNT'])
    new_df2.index.name = 'REGULARITY'
    for i in range(1, 8):
        new_df2.loc[i, 'COUNT'] = len(new_df.loc[new_df.RWEEK == i])
    barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS',
            'customerregfreq_many.png', print_number=True, savefig=True)
def combineMonth(data_dir, outfile, check_login=False):
    """Concatenate every monthly CSV in data_dir into one frame with a
    customer-relative DAY column, optionally removing sessions that were not
    logged in, and write the keepcols subset to outfile.

    Relies on module globals: content_type, keepcols, readChunk, toCSV,
    removeNotLoggedIn, getCustomerDay.
    """
    frames = []
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        df = readChunk(os.path.join(data_dir, name))
        df.dropna(subset=['USERID'], inplace=True)
        if check_login:
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeNotLoggedIn(df)
        df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
        df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
        df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
        # November rows are out of scope.
        df = df.loc[df.SESSION_STARTDT_MONTH != 11]
        df['DAY'] = df[['SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY']].apply(
            lambda row: getCustomerDay(row[0], row[1]), axis=1)
        # Drop rows whose content type was missing (stringified NaN).
        df = df.loc[df.CONTENT_TYPE != 'nan']
        df.replace({'CONTENT_TYPE': content_type}, inplace=True)
        frames.append(df)
    combined = pd.concat(frames)
    combined = combined[keepcols]
    toCSV(combined, outfile, index=False)
# Load the per-session click counts exported by the SQL step, name the
# columns, drop the unused seek counter, and coerce the remaining counters
# to integers.  (A dead, fully commented-out completion-percentage
# computation that preceded this block was removed; see version control.)
df = readChunk('click.csv', header=None)
df.rename(columns={0: 'USERID',
                   1: 'SESSIONID',
                   2: 'ADPLAY_COUNT',
                   3: 'PLAY_COUNT',
                   4: 'PAUSE_COUNT',
                   5: 'RESUME_COUNT',
                   6: 'SEEK_COUNT'}, inplace=True)
df.drop(columns=['SEEK_COUNT'], axis=1, inplace=True)
cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for count_col in cols:
    df[count_col] = df[count_col].astype(int)
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

# Session-level export: one row per session with timing and duration fields.
df = readChunk("../characterization/session_information.csv", header=None)
column_names = ["USERID", "SESSIONID", "MONTH", "WEEK", "DATE", "START_HOUR",
                "END_HOUR", "SESSION_DURATION", "WATCHING_DURATION",
                "VIDEO_DURATION"]
# Map positional column indices 0..9 onto their names.
df.rename(columns=dict(enumerate(column_names)), inplace=True)
# Duration columns consumed by the analysis below.
cols = ["SESSION_DURATION", "WATCHING_DURATION"]
import time
import pandas as pd
import numpy as np
from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.style as style
from statsmodels.tsa.stattools import adfuller

sns.set()
style.use('seaborn-poster')

# Session categories referenced downstream.
type_sess = ['total', 'less', '70']

# Daily session counts split by completion: at/above 70% vs below.
below = readChunk("../sql/query_results/date_count_50_less.csv")
above = readChunk("../sql/query_results/date_count_50.csv")
# Merge order fixes the _x/_y suffixes: x = above, y = below.
df = above.merge(below, on='DATE')
print(df.columns)
df.rename(columns={'NUMSESSIONS_x': 'COMPLETION_70',
                   'NUMSESSIONS_y': 'COMPLETION_LESS_THAN_70'},
          inplace=True)
print(df.DATE.unique())
df.COMPLETION_70 = df.COMPLETION_70.astype(float)
df.COMPLETION_LESS_THAN_70 = df.COMPLETION_LESS_THAN_70.astype(float)
df['DATE'] = pd.to_datetime(df['DATE'])
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

# Per-session engagement export (headerless); name the positional columns.
df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)
df.rename(columns=dict(enumerate(
    ['MONTH', 'USERID', 'SESSIONID', 'STARTHOUR', 'ENDHOUR', 'engagement'])),
    inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
# Keep December 2018 onward only.
df = df.loc[df.MONTH >= 201812]
total_df = df.groupby('USERID')['engagement'].sum().to_frame()
# Divide by 60 — presumably converting seconds to minutes; confirm units.
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
# Fixed: sys was used on the first line without being imported (NameError
# if this is the top of the file — harmless if an earlier chunk imported it).
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.style as style

sns.set()
style.use('seaborn-poster')

# Histogram of how many customers share each WEEK_RETURN_VALUE.
df = readChunk('results/50_WEEK_RETURN_VALUE.csv')
df.WEEK_RETURN_VALUE = df.WEEK_RETURN_VALUE.astype(int)
print(df.head())
print(len(df))
tohist = pd.DataFrame(index=df.WEEK_RETURN_VALUE.unique(),
                      columns=['NUMCUST'])
tohist.index.name = 'WEEK_RETURN_VALUE'
for i in df.WEEK_RETURN_VALUE.unique():
    print(i)
    # .loc[i, 'NUMCUST'] instead of the original chained .loc[i]['NUMCUST'],
    # which assigns through an intermediate object and may not write back.
    tohist.loc[i, 'NUMCUST'] = len(df.loc[df.WEEK_RETURN_VALUE == i])
tohist.sort_index(axis=0, inplace=True)
plot = tohist.plot(kind='bar', colormap='Pastel2')
plt.show()
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk

# Completion-rate analysis over the full "for_completion_total" query export.
file = "query_results/for_completion_total.csv"
df = readChunk(file)


def countCompletion70(df):
    """Return only the rows whose COMPLETION_70 is at least 70, printing the
    number of distinct customers that reach that threshold."""
    df.COMPLETION_70 = df.COMPLETION_70.astype(float)
    # print("Total Number of Customers: {}".format(len(df.USERID.unique())))
    df = df.loc[df.COMPLETION_70 >= 70]
    print("Total Number of Customers with 70% Completion: {}".format(
        len(df.USERID.unique())))
    print("\n")
    return df


def timeCompletion(df, col):
    # Iterate the distinct values of `col`, slicing the frame per value.
    # NOTE(review): this definition appears truncated at the edge of the
    # visible chunk — the loop body may continue past this point; do not
    # assume `temp` is the final statement.
    for time_comp in df[col].unique():
        print(time_comp)
        temp = df.loc[df[col] == time_comp]