Example #1
def combineMonth():
    # data_dir, content_type, colname, outfile and func are assumed to be
    # defined at module level, alongside the readChunk/toCSV helpers.
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(data_dir + '/' + str(content_type))):
        if f.endswith(".csv"):
            file = os.path.join(data_dir + '/' + str(content_type), f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    cols = list(new_df.columns)  # list() so that colname can be appended below
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    # 'total' is an all-ones bit string covering the days from the first
    # occurrence onward; the ratio of 'all' to 'total' (both read as base-2
    # integers) is computed below as the regularity measure.
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' *
                                                      (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total'
                              ]].apply(lambda x: int(x[0], 2) / int(x[1], 2),
                                       axis=1)
    print(new_df[colname])
    print(cols)
    cols.append(colname)
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
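The examples on this page import readChunk (and usually toCSV) from a project-local utils module that is not shown here. Their exact implementation is unknown; a minimal sketch consistent with the call sites on this page (readChunk(file), readChunk(file, usecols), readChunk(file, header=None), readChunk(file, sep='\t'), toCSV(df, outfile, index=False)) could look like this:

import pandas as pd

def readChunk(path, usecols=None, header='infer', sep=',', chunksize=100000):
    # Hypothetical helper: read a large CSV in chunks and return one DataFrame.
    chunks = pd.read_csv(path, usecols=usecols, header=header, sep=sep,
                         chunksize=chunksize)
    return pd.concat(chunks, ignore_index=True)

def toCSV(df, path, index=True):
    # Hypothetical helper: thin wrapper around DataFrame.to_csv.
    df.to_csv(path, index=index)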
Example #2
def plotWeeklyRegularity3(file, file2 = None, ylim = None):
	df = readChunk(file, header = None)
	df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
	if file2:
		df2 = readChunk(file2, header = None)
		df2.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
		df = pd.concat([df, df2])
	print(df.head())
	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df.sort_values('WEEK', inplace = True)
	df = df.loc[df.WEEK != 201904]

	new_df = df.groupby(['RWEEK', 'WEEK'])['USERID'].count().to_frame().reset_index()
	print(new_df.head(20))

	new_df = new_df.groupby('RWEEK')['USERID'].mean().to_frame()
	new_df['USERID'] = round(new_df['USERID'])
	new_df['USERID'] = new_df['USERID'].astype(int)
	print(new_df.head(20))
	plot = new_df.plot(kind = 'bar', legend = False, rot = 0)
	for i in range(len(new_df)):
		plot.text(i, new_df.iloc[i]['USERID'], new_df.iloc[i]['USERID'], horizontalalignment = 'center')
	plot.set_xlabel('REGULARITY')
	plt.savefig("weekly_average_regularity.png", dpi = 300)
Example #3
def plotRegularityFreq():
	file = "../status/results/regularity_combined_monthly.csv"
	df = readChunk(file)
	print('Number of customers: ', len(df.USERID.unique()))
	print(df.head())
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df['MONTH'].astype(int)
	df = df.loc[df.MONTH != 1]
	new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
	new_df.index.name = 'REGULARITY'
	for i in range(1, 31):
		temp = df.loc[df.RMONTH == i]
		new_df.loc[i, 'COUNT'] = len(temp)
	print(new_df.head())
	barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png', print_number = True, savefig = True)

	new_df = df.groupby('USERID')['RMONTH'].mean().to_frame()
	new_df['RMONTH'] = round(new_df['RMONTH'])
	print(new_df.head())
	new_df2 = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
	new_df2.index.name = 'REGULARITY'
	for i in range(1, 31):
		temp = new_df.loc[new_df.RMONTH == i]
		new_df2.loc[i, 'COUNT'] = len(temp)
	barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS', 'customerregfreq_many.png', print_number = True, savefig = True)
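barPlot is another project helper that never appears on this page. Judging from the two calls above (a one-column frame indexed by REGULARITY, axis labels, an output file name, and print_number/savefig flags), a plausible matplotlib-based sketch, offered only as an assumption, is:

from matplotlib import pyplot as plt

def barPlot(df, xlabel, ylabel, outfile, print_number=False, savefig=False):
    # Hypothetical helper: bar chart of a one-column DataFrame.
    plot = df.plot(kind='bar', legend=False, rot=0)
    plot.set_xlabel(xlabel)
    plot.set_ylabel(ylabel)
    if print_number:
        # annotate each bar with its value, as plotWeeklyRegularity3 does
        col = df.columns[0]
        for i in range(len(df)):
            plot.text(i, df.iloc[i][col], df.iloc[i][col],
                      horizontalalignment='center')
    if savefig:
        plt.savefig(outfile, dpi=300)
    plt.clf()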
Example #4
def getWeekPresent():
    file = "../data/yearweek_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: "USERID",
        1: "SESSIONID",
        2: "YEARWEEK"
    },
              inplace=True)
    df.YEARWEEK = df.YEARWEEK.astype(int)
    df.YEARWEEK = df.YEARWEEK - 201900
    df = df.loc[df.YEARWEEK != 4]
    df.drop_duplicates(subset=["USERID", "YEARWEEK"],
                       keep="first",
                       inplace=True)
    print(df.head(10))

    with open("customer_present.csv", "a") as f:
        writer = csv.writer(f, delimiter=',')
        for i in df.USERID.unique():
            temp = df.loc[df.USERID == i]
            new_df = pd.DataFrame(index=[i],
                                  data=0,
                                  columns=list(range(5, 35)))

            for j in range(len(temp)):
                week = temp.iloc[j]['YEARWEEK']
                new_df.loc[i, int(week)] = 1

            writer.writerow(new_df.reset_index().iloc[0])
Example #5
def getQuantitative(file, usecols):
	s = time.time()
	print("Getting the quantitative features: ", file)
	transact = readChunk(file, usecols)
	transact = transact.loc[transact.gigyaid.notnull()]
	if len(transact) == 0:
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("No unique customer")
		return pd.DataFrame()
	else:
		transact = transact.loc[transact.viewpageduration.notnull()]
		transact["viewpageduration"] = transact["viewpageduration"].astype(int)
		totalviewpageduration = transact.groupby("gigyaid")["viewpageduration"].sum().to_frame()
		totalnumbersession = transact.groupby("gigyaid")["bigdatasessionid"].nunique().to_frame()
		quanti = pd.concat([totalviewpageduration, totalnumbersession], axis = 1)
		quanti = quanti.loc[:, ~quanti.columns.duplicated()]

		actions = list(set(transact["actiontaken"].unique().tolist()))
		for action in actions:
			temp = transact.loc[transact["actiontaken"] == action]
			quanti[action] = temp.groupby("gigyaid")["actiontaken"].count()

		temp = transact.loc[transact["actiontaken"].notnull()]
		quanti["watched"] = temp.groupby("gigyaid")["videotitle"].nunique()
		quanti["contentswatched"] = temp.groupby("gigyaid")["videotitle"].unique().tolist()
		quanti.fillna(0, inplace=True)
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("Finish getting quantitative features: ", total_time)
		return quanti
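getQuantitative expects a transaction CSV containing gigyaid, bigdatasessionid, viewpageduration, actiontaken and videotitle columns. A hedged usage sketch that loops over a directory of monthly files the way the other examples do (the directory path is an assumption, not part of the original code):

import os
import pandas as pd

usecols = ['gigyaid', 'bigdatasessionid', 'viewpageduration',
           'actiontaken', 'videotitle']
transactions_dir = '../data/transactions'   # assumed location
frames = []
for f in sorted(os.listdir(transactions_dir)):
    if f.endswith('.csv'):
        quanti = getQuantitative(os.path.join(transactions_dir, f), usecols)
        if len(quanti) > 0:
            frames.append(quanti)
all_quanti = pd.concat(frames)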
Example #6
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mode', mode_type = None):
	cust_type = pd.read_csv("results/customer_type.csv", usecols = ['USERID', 'CUSTOMERTYPE'])
	df = readChunk("status/results/regularity_combined.csv")
	print(len(df))
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]

	print('Number of customers: ', len(df.USERID.unique()))
	
	print(df.columns)
	df.dropna(subset = ['RWEEK'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df = df.loc[df.WEEK != 201904]
	if regularity_type == 'mode':
		if mode_type == 'min':
			df = df.groupby('USERID')['RWEEK'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
		elif mode_type == 'max':
			df = df.groupby('USERID')['RWEEK'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
		else:
			df = df.groupby(['USERID'])['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	df.reset_index(inplace = True)
	print(df.head())
	df = df.merge(cust_type, how = 'left', on = 'USERID')
	for z in ['ACTIVE', 'LOST']:
		df_2 = df.loc[df.CUSTOMERTYPE == z]
		fig, axes = plt.subplots(8,4, sharey = 'row', constrained_layout = True)
		x = 0
		y = 0
		for i in sorted(df_2.WEEK.unique()):
			temp = df_2.loc[df_2.WEEK == i]
			new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
			new_df.index.name = 'REGULARITY'
			print(len(temp))
			for j in range(1, 8):
				temp2 = temp.loc[temp.RWEEK == j]
				new_df.loc[j, 'COUNT'] = len(temp2)
				print(new_df)
			plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
			# plot.set_ylabel('NUMBER OF CUSTOMERS')
			# plot.set_xlabel('REGULARITY')
			plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
			plot.set_title(i, size = 6, pad = 2)
			x_axis = plot.axes.get_xaxis()
			x_label = x_axis.get_label()
			x_label.set_visible(False)
			if ylim:
				plot.set_ylim(0,ylim)
			y = y + 1
			if y == 4:
				y = 0
				x = x + 1
			new_df.to_csv('results/customerregfreq/week_'+z+str(i)+'.csv')
		fig.delaxes(axes[7,3])
		fig.delaxes(axes[7,2])
		outfile = "results/customerregfreq"+z+str(i)+'.png'
		if outfile:
			plt.savefig(outfile, dpi = 600)
Example #7
def transactionDates():
    print('getting first and last transaction dates of the customers..')
    file = "results/first_and_last_transaction_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: 'USERID',
        1: 'FIRST_TRANSACTION',
        2: 'LAST_TRANSACTION'
    },
              inplace=True)

    file2 = 'results/average_regularity.csv'
    df2 = readChunk(file2)

    df2 = df2.merge(df, how='left', on='USERID')
    df2.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(df2, 'results/transaction_dates.csv', index=False)
Example #8
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mean', mode_type = None):
	df = readChunk("../status/results/regularity_combined_monthly.csv")
	print(len(df))
	print(df.head())
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]

	print('Number of customers: ', len(df.USERID.unique()))
	

	df.dropna(subset = ['RMONTH'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df["MONTH"].astype(int)
	df = df.loc[df.MONTH != 1]
	if regularity_type == 'mode':
		# the monthly file has an RMONTH column, not RWEEK
		if mode_type == 'min':
			df = df.groupby('USERID')['RMONTH'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
		elif mode_type == 'max':
			df = df.groupby('USERID')['RMONTH'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
		else:
			df = df.groupby(['USERID'])['RMONTH'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	elif regularity_type == 'mean':
		# keep MONTH as a column so the plotting loop below can still split by month
		df = df.groupby(['USERID', 'MONTH'])['RMONTH'].mean().to_frame().reset_index()
		df['RMONTH'] = round(df.RMONTH)
	else:
		print('What regularity type?')
	fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
	x = 0
	y = 0
	print(df.head())
	for i in sorted(df.MONTH.unique()):
		temp = df.loc[df.MONTH == i]
		new_df = pd.DataFrame(index = list(range(1, 32)), columns = ['COUNT'])
		new_df.index.name = 'REGULARITY'
		for j in range(1, 32):
			temp2 = temp.loc[temp.RMONTH == j]
			new_df.loc[j, 'COUNT'] = len(temp2)
			print(new_df)
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)

		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title(i, size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 2:
			y = 0
			x = x + 1
		new_df.to_csv('results/customerregfreq/week_'+str(i)+'.csv')  # the undefined 'z' from the weekly variant has been dropped
	# fig.delaxes(axes[7,3])
	fig.delaxes(axes[3,1])
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #9
def getFile(file1, file2 = None):
	df = readChunk(file1, sep = '\t')
	df.rename(columns = {"DATE(MODIFIEDDATE)":'DATE', "DAYOFWEEK(MIN(MODIFIEDDATE))":'DAYOFWEEK'}, inplace = True)

	if file2:
		df2 = readChunk(file2, sep = '\t')
		df2.rename(columns = {"DATE(MODIFIEDDATE)":'DATE', "DAYOFWEEK(MIN(MODIFIEDDATE))":'DAYOFWEEK'}, inplace = True)

		df = pd.concat([df, df2])
	
	print(df.head())
	df.drop_duplicates(subset = ['USERID', 'DATE'], inplace = True)

	label = pd.read_csv("../data/customer_feature_matrix.csv", usecols = ["userid", "label"])
	label.columns = label.columns.str.upper()

	label = label.loc[label.LABEL == 'ACTIVE']
	df = df.merge(label, how = 'left', on = 'USERID')
	return df
Example #10
def cleanData(data_dir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]  # 'cols' and 'remove_comma' are assumed module-level lists
            for col in remove_comma:
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
Example #11
def getCustomerType():
    print('getting customer types...')
    transact = readChunk('results/transaction_dates.csv')
    aver = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    transact = transact.merge(aver, how='left', on='USERID')
    transact = transact.merge(intersession, how='right', on='USERID')
    transact['LAST_TRANSACTION'] = pd.to_datetime(transact['LAST_TRANSACTION'])
    print(transact.head())
    transact['RWEEK'] = transact['RWEEK'].astype(float)
    s = time.time()
    transact['INACTIVITY_DAYS'] = transact['LAST_TRANSACTION'].apply(
        lambda x: (pd.to_datetime('2019-09-01') - x).days)
    transact['INACTIVITY_DAYS'] = transact['INACTIVITY_DAYS'].apply(
        lambda x: 0 if x == -1 else x).astype(float)
    transact = customerType2(transact, how='new')
    print(transact.head(10))
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(transact, 'results/customer_type.csv', index=False)
Example #12
def extractColumns(data_dir, outdir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.csv'):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.dropna(subset=['USERID'], inplace=True)
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeLurkers(df)
            outfile = os.path.join(outdir, f[-12:])

            toCSV(df, outfile, index=False)
Example #13
def addJoinedWeek():
    file = "customer_present.csv"
    df = readChunk(file, header=None)
    df.rename(columns={0: "USERID"}, inplace=True)

    df.set_index('USERID', inplace=True)

    joined = []
    for i in range(len(df)):
        joined.append(getJoinedWeek(df.iloc[i]))
    df['joinedweek'] = joined
    print(df.joinedweek)

    df.to_csv('week_present_and_joined.csv')
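getJoinedWeek is not defined on this page. Since customer_present.csv (written by getWeekPresent above) holds one 0/1 flag per week, one plausible reading is that the "joined" week is simply the first week flagged 1; a minimal sketch under that assumption (note that with header=None the column labels are positional numbers, not calendar week numbers):

def getJoinedWeek(row):
    # Hypothetical helper: return the label of the first week flagged 1.
    present = row[row == 1]
    return present.index[0] if len(present) else None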
Example #14
def plotRegularityTenure():
	file = 'results/tenure.csv'
	df = readChunk(file)
	df['RWEEK'] = df['RWEEK'].astype(float)
	df['TENURE'] = df['TENURE'].astype(float)

	for i in df.RWEEK.unique():
		temp = df.loc[df.RWEEK == i]
		plot = sns.distplot(a = temp['TENURE'].values, kde = False)
		
		plot.set_ylim(0,4000)
		plt.title('Regularity = {}'.format(str(i)[0]))
		plot.set_xlabel('TENURE (days)')
		plot.set_ylabel('NUMBER OF CUSTOMERS')
		plt.savefig(str(i)+'.png', dpi = 600)
		plt.clf()
Example #15
def plotWeeklyRegularity(weekno = None, custids = None, ylim = None, outfile = None):
	df = readChunk("../status/results/regularity_combined_monthly.csv")
	print(len(df))
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]
		print('Number of customers: ', len(df.USERID.unique()))
	
	df.dropna(subset = ['RMONTH'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df["MONTH"].astype(int)
	df = df.loc[df.MONTH != 1]
	df.sort_values('MONTH', inplace = True)
	fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
	x = 0
	y = 0

	months = ['FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST']
	count = 0

	for i in df.MONTH.unique():
		temp = df.loc[df.MONTH == i]
		new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
		new_df.index.name = 'REGULARITY'
		for j in range(1,31):
			temp2 = temp.loc[temp.RMONTH == j]
			new_df.loc[j, 'COUNT'] = len(temp2)
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title(months[count], size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 2:
			y = 0
			x = x + 1
		new_df.to_csv('results/reqfreq/week_'+str(i)+'.csv')
		count = count + 1
	# fig.delaxes(axes[7,3])
	fig.delaxes(axes[3,1])
	# outfile = "results/regfreq"+z+str(i)+'.png'
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #16
def customerRegularity(file, regularity_type='mean'):

    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
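A short usage sketch for customerRegularity; the input path mirrors the regularity_combined.csv file used by the other examples and should be treated as an assumption:

# mean weekly regularity per customer, written to results/average_regularity.csv
customerRegularity('status/results/regularity_combined.csv', regularity_type='mean')

# or keep the most frequent (modal) weekly regularity instead
customerRegularity('status/results/regularity_combined.csv', regularity_type='mode')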
Example #17
def getDate(file, usecols):
	s = time.time()
	print("Getting the time features: ", file)
	transact = readChunk(file, usecols)
	transact = transact.loc[transact.gigyaid.notnull()]
	if len(transact) == 0:
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("No unique customer")
		return pd.DataFrame()
	else:
		transact = transact.loc[transact.viewpageduration.notnull()]
		transact["viewpageduration"] = transact["viewpageduration"].astype(int)
		group = transact.groupby(["gigyaid", "bigdatasessionid", "sessionstarttimestamp", "sessionendtimestamp"])["viewpageduration"].sum().to_frame()
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("Finish getting date features: ", total_time)
		return group
Example #18
def main(data_dir, out_dir):
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df = df[[
                'USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
                'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
                'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT'
            ]]
            s = time.time()
            df['SESSION_STARTDT'] = pd.to_datetime(df['SESSION_STARTDT'])
            df['STARTHOUR'] = df.SESSION_STARTDT.dt.hour
            df['SESSION_ENDDT'] = pd.to_datetime(df['SESSION_ENDDT'])
            df['ENDHOUR'] = df.SESSION_ENDDT.dt.hour
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finish getting hour in {}".format(total_time))
            toCSV(df, os.path.join(out_dir, f), index=False)
Example #19
def getQualitative(file, usecols):
    s = time.time()
    print("Getting the qualitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    transact.loc[transact.browsertype.notnull(),
                 "browsertype"] = "WEB APPLICATION"
    transact.browsertype.replace(np.nan, "MOBILE APPLICATION", inplace=True)
    if len(transact) == 0:
        e = time.time()
        total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
        print("No unique customer")
        return pd.DataFrame()
    else:
        group = transact.groupby("gigyaid")
        devicetype = group.apply(lambda x: x["devicetype"].unique().tolist()
                                 ).reset_index(name="devicetype")
        deviceos = group.apply(lambda x: x["deviceos"].unique().tolist()
                               ).reset_index(name="deviceos")
        ipaddress = group.apply(lambda x: x["ipaddress"].unique().tolist()
                                ).reset_index(name="ipaddress")
        browsertype = group.apply(lambda x: x["browsertype"].unique().tolist()
                                  ).reset_index(name="browsertype")
        connectivitytype = group.apply(lambda x: x["connectivitytype"].unique(
        ).tolist()).reset_index(name="connectivitytype")
        screensize = group.apply(lambda x: x["screensize"].unique().tolist()
                                 ).reset_index(name="screensize")
        videoquality = group.apply(lambda x: x["videoquality"].unique().tolist(
        )).reset_index(name="videoquality")
        devicename = group.apply(lambda x: x["devicename"].unique().tolist()
                                 ).reset_index(name="devicename")
        mobiledevice = group.apply(lambda x: x["mobiledevice"].unique().tolist(
        )).reset_index(name="mobiledevice")
        df = pd.concat([
            devicetype, deviceos, ipaddress, browsertype, connectivitytype,
            screensize, videoquality, devicename, mobiledevice
        ],
                       axis=1)
        df = df.loc[:, ~df.columns.duplicated()]
        df = df.set_index('gigyaid')
        e = time.time()
        total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
        print("Finish getting qualitative features: ", total_time)
        return (df)
Example #20
def calculateTenure():
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    s = time.time()
    tenure = []
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
    for i in range(len(df)):
        if df.iloc[i]['CUSTOMERTYPE'] == 'ACTIVE':
            tenure.append((pd.to_datetime('2019-09-01') -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
        else:
            tenure.append((df.iloc[i]['LAST_TRANSACTION'] -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
Example #21
def plotDayofWeek(ylim = None):
	# df = pd.read_csv("status/rweek.csv")  # unused: immediately overwritten below
	df = readChunk("status/results/regularity_combined.csv")

	df.columns = ['WEEK', 'SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'RWEEK', 'USERID']
	df.dropna(subset = ['RWEEK'], inplace = True)
	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df.sort_values('RWEEK', inplace = True)
	df = df.loc[df.WEEK != 201904]

	dayofweek = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY']
	for i in dayofweek:
		df[i] = df[i].astype(int)

	fig, axes = plt.subplots(3,3, sharey = 'row', constrained_layout = True)	
	x = 0
	y = 0
	for i in df.RWEEK.unique():
		new_df = pd.DataFrame(index = dayofweek, columns = ['COUNT'])
		temp = df.loc[df.RWEEK == i]
		for j in dayofweek:
			new_df.loc[j, 'COUNT'] = temp[j].sum()
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title("Regularity = {}".format(i), size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 3:
			y = 0
			x = x + 1
	fig.delaxes(axes[2,1])
	fig.delaxes(axes[2,2])
	outfile = 'results/dayofweek.png'
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #22
def generateMonth():
    # data_dir and content_type are assumed module-level globals
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
            df.DAY = df.DAY.astype(int)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]

            new_df = pd.DataFrame(index=df.USERID.unique())
            new_df.index.name = 'USERID'
            temp = df.loc[df.CONTENT_TYPE == content_type]
            for i in range(df.DAY.min(), df.DAY.max() + 1):
                temp2 = temp.loc[temp.DAY == i]
                group = temp2.groupby(['USERID'])['DAY'].count().to_frame()
                group.DAY = group.DAY.apply(lambda x: np.nan
                                            if np.isnan(x) else '1')
                group.rename(columns={'DAY': str(i)}, inplace=True)
                new_df = new_df.merge(group, how='left', on='USERID')
            toCSV(new_df, 'results/' + str(content_type) + '/' + f)
Example #23
def plotRegularityFreq():
	file = "status/results/regularity_combined.csv"
	df = readChunk(file)
	print('Number of customers: ', len(df.USERID.unique()))
	print(df.head())
	df['RWEEK'] = df['RWEEK'].astype(int)
	new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
	new_df.index.name = 'REGULARITY'
	for i in range(1, 8):
		temp = df.loc[df.RWEEK == i]
		new_df.loc[i, 'COUNT'] = len(temp)
	print(new_df.head())
	barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png', print_number = True, savefig = True)

	new_df = df.groupby('USERID')['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	print(new_df.head())
	new_df2 = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
	new_df2.index.name = 'REGULARITY'
	for i in range(1, 8):
		temp = new_df.loc[new_df.RWEEK == i]
		new_df2.loc[i, 'COUNT'] = len(temp)
	barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS', 'customerregfreq_many.png', print_number = True, savefig = True)
Example #24
def combineMonth(data_dir, outfile, check_login=False):
    # content_type and keepcols are assumed module-level globals;
    # removeNotLoggedIn and getCustomerDay are project helpers not shown here.
    all_df = []
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df.dropna(subset=['USERID'], inplace=True)
            if check_login:
                df.USERID = df.USERID.astype(str)
                df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
                df = removeNotLoggedIn(df)
                df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
                df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
                df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
                df = df.loc[df.SESSION_STARTDT_MONTH != 11]
                df['DAY'] = df[[
                    'SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY'
                ]].apply(lambda x: getCustomerDay(x[0], x[1]), axis=1)
                df = df.loc[df.CONTENT_TYPE != 'nan']

                df.replace({'CONTENT_TYPE': content_type}, inplace=True)
            all_df.append(df)
    all_df = pd.concat(all_df)
    all_df = all_df[keepcols]
    toCSV(all_df, outfile, index=False)
Example #25
# df.WATCHING_DUR = pd.to_numeric(df.WATCHING_DUR, errors = "coerce")
# df.VID_DUR = pd.to_numeric(df.VID_DUR, errors = "coerce")
# df.dropna(subset = ['VID_DUR'], inplace = True)

# watching = df.groupby('USERID')['WATCHING_DUR'].sum().to_frame()
# video = df.groupby('USERID')['VID_DUR'].sum().to_frame()

# watching = watching.merge(vide, how = 'left', on = 'USERID')
# watching['COMPLETION'] = (watching['WATCHING_DUR']/watching['VID_DUR'])*100
# print(watching.head())
# print(watching.COMPLETION.min())
# print(watching.COMPLETION.max())
# toCSV(watching, 'completion.csv')

df = readChunk('click.csv', header=None)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'ADPLAY_COUNT',
    3: 'PLAY_COUNT',
    4: 'PAUSE_COUNT',
    5: 'RESUME_COUNT',
    6: 'SEEK_COUNT'
},
          inplace=True)
df.drop(columns=['SEEK_COUNT'], inplace=True)

cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for i in cols:
    df[i] = df[i].astype(int)
Example #26
import warnings

warnings.filterwarnings("ignore")

import sys

sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk("../characterization/session_information.csv", header=None)
df.rename(columns={
    0: "USERID",
    1: "SESSIONID",
    2: "MONTH",
    3: "WEEK",
    4: "DATE",
    5: "START_HOUR",
    6: "END_HOUR",
    7: "SESSION_DURATION",
    8: "WATCHING_DURATION",
    9: "VIDEO_DURATION"
},
          inplace=True)

cols = ["SESSION_DURATION", "WATCHING_DURATION"]
Example #27
import time
import pandas as pd
import numpy as np

from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style
from statsmodels.tsa.stattools import adfuller

sns.set()
style.use('seaborn-poster')

type_sess = ['total', 'less', '70']
less70 = readChunk("../sql/query_results/date_count_50_less.csv")
more70 = readChunk("../sql/query_results/date_count_50.csv")

df = more70.merge(less70, on='DATE')
print(df.columns)

df.rename(columns={
    'NUMSESSIONS_x': 'COMPLETION_70',
    'NUMSESSIONS_y': 'COMPLETION_LESS_THAN_70'
},
          inplace=True)

print(df.DATE.unique())
df.COMPLETION_70 = df.COMPLETION_70.astype(float)
df.COMPLETION_LESS_THAN_70 = df.COMPLETION_LESS_THAN_70.astype(float)
df['DATE'] = pd.to_datetime(df['DATE'])
Example #28
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)

df.rename(columns={
    0: 'MONTH',
    1: 'USERID',
    2: 'SESSIONID',
    3: 'STARTHOUR',
    4: 'ENDHOUR',
    5: 'engagement'
},
          inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]

total_df = df.groupby('USERID')['engagement'].sum().to_frame()
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
Example #29
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style

sns.set()
style.use('seaborn-poster')

df = readChunk('results/50_WEEK_RETURN_VALUE.csv')
df.WEEK_RETURN_VALUE = df.WEEK_RETURN_VALUE.astype(int)
print(df.head())
print(len(df))

tohist = pd.DataFrame(index=df.WEEK_RETURN_VALUE.unique(), columns=['NUMCUST'])
tohist.index.name = 'WEEK_RETURN_VALUE'
for i in df.WEEK_RETURN_VALUE.unique():
    print(i)
    temp = df.loc[df.WEEK_RETURN_VALUE == i]
    tohist.loc[i, 'NUMCUST'] = len(temp)

tohist.sort_index(axis=0, inplace=True)
plot = tohist.plot(kind='bar', colormap='Pastel2')
plt.show()
Example #30
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk

file = "query_results/for_completion_total.csv"
df = readChunk(file)


def countCompletion70(df):
    df.COMPLETION_70 = df.COMPLETION_70.astype(float)
    # print("Total Number of Customers: {}".format(len(df.USERID.unique())))
    df = df.loc[df.COMPLETION_70 >= 70]
    print("Total Number of Customers with 70% Completion: {}".format(
        len(df.USERID.unique())))
    print("\n")
    return df


def timeCompletion(df, col):
    for time_comp in df[col].unique():
        print(time_comp)
        temp = df.loc[df[col] == time_comp]