Example #1
def combineMonth():
    # data_dir, content_type, colname, outfile and func are assumed to be
    # defined at module level, alongside the readChunk/toCSV helpers.
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(data_dir + '/' + str(content_type))):
        if f.endswith(".csv"):
            file = os.path.join(data_dir + '/' + str(content_type), f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    cols = list(new_df.columns)  # list() so that colname can be appended below
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    # 'total' is an all-ones bit string covering the days from the first
    # occurrence onward; the ratio of 'all' to 'total' (both read as base-2
    # integers) is computed below as the regularity measure.
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' *
                                                      (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total'
                              ]].apply(lambda x: int(x[0], 2) / int(x[1], 2),
                                       axis=1)
    print(new_df[colname])
    print(cols)
    cols.append(colname)
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
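The examples on this page import readChunk (and usually toCSV) from a project-local utils module that is not shown here. Their exact implementation is unknown; a minimal sketch consistent with the call sites on this page (readChunk(file), readChunk(file, usecols), readChunk(file, header=None), readChunk(file, sep='\t'), toCSV(df, outfile, index=False)) could look like this:

import pandas as pd

def readChunk(path, usecols=None, header='infer', sep=',', chunksize=100000):
    # Hypothetical helper: read a large CSV in chunks and return one DataFrame.
    chunks = pd.read_csv(path, usecols=usecols, header=header, sep=sep,
                         chunksize=chunksize)
    return pd.concat(chunks, ignore_index=True)

def toCSV(df, path, index=True):
    # Hypothetical helper: thin wrapper around DataFrame.to_csv.
    df.to_csv(path, index=index)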
Example #2
def plotWeeklyRegularity3(file, file2 = None, ylim = None):
	df = readChunk(file, header = None)
	df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
	if file2:
		df2 = readChunk(file2, header = None)
		df2.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
		df = pd.concat([df, df2])
	print(df.head())
	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df.sort_values('WEEK', inplace = True)
	df = df.loc[df.WEEK != 201904]

	new_df = df.groupby(['RWEEK', 'WEEK'])['USERID'].count().to_frame().reset_index()
	print(new_df.head(20))

	new_df = new_df.groupby('RWEEK')['USERID'].mean().to_frame()
	new_df['USERID'] = round(new_df['USERID'])
	new_df['USERID'] = new_df['USERID'].astype(int)
	print(new_df.head(20))
	plot = new_df.plot(kind = 'bar', legend = False, rot = 0)
	for i in range(len(new_df)):
		plot.text(i, new_df.iloc[i]['USERID'], new_df.iloc[i]['USERID'], horizontalalignment = 'center')
	plot.set_xlabel('REGULARITY')
	plt.savefig("weekly_average_regularity.png", dpi = 300)
Example #3
def plotRegularityFreq():
	file = "../status/results/regularity_combined_monthly.csv"
	df = readChunk(file)
	print('Number of customers: ', len(df.USERID.unique()))
	print(df.head())
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df['MONTH'].astype(int)
	df = df.loc[df.MONTH != 1]
	new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
	new_df.index.name = 'REGULARITY'
	for i in range(1, 31):
		temp = df.loc[df.RMONTH == i]
		new_df.loc[i, 'COUNT'] = len(temp)
	print(new_df.head())
	barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png', print_number = True, savefig = True)

	new_df = df.groupby('USERID')['RMONTH'].mean().to_frame()
	new_df['RMONTH'] = round(new_df['RMONTH'])
	print(new_df.head())
	new_df2 = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
	new_df2.index.name = 'REGULARITY'
	for i in range(1, 31):
		temp = new_df.loc[new_df.RMONTH == i]
		new_df2.loc[i, 'COUNT'] = len(temp)
	barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS', 'customerregfreq_many.png', print_number = True, savefig = True)
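barPlot is another project helper that never appears on this page. Judging from the two calls above (a one-column frame indexed by REGULARITY, axis labels, an output file name, and print_number/savefig flags), a plausible matplotlib-based sketch, offered only as an assumption, is:

from matplotlib import pyplot as plt

def barPlot(df, xlabel, ylabel, outfile, print_number=False, savefig=False):
    # Hypothetical helper: bar chart of a one-column DataFrame.
    plot = df.plot(kind='bar', legend=False, rot=0)
    plot.set_xlabel(xlabel)
    plot.set_ylabel(ylabel)
    if print_number:
        # annotate each bar with its value, as plotWeeklyRegularity3 does
        col = df.columns[0]
        for i in range(len(df)):
            plot.text(i, df.iloc[i][col], df.iloc[i][col],
                      horizontalalignment='center')
    if savefig:
        plt.savefig(outfile, dpi=300)
    plt.clf()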
Example #4
def getWeekPresent():
    file = "../data/yearweek_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: "USERID",
        1: "SESSIONID",
        2: "YEARWEEK"
    },
              inplace=True)
    df.YEARWEEK = df.YEARWEEK.astype(int)
    df.YEARWEEK = df.YEARWEEK - 201900
    df = df.loc[df.YEARWEEK != 4]
    df.drop_duplicates(subset=["USERID", "YEARWEEK"],
                       keep="first",
                       inplace=True)
    print(df.head(10))

    with open("customer_present.csv", "a") as f:
        writer = csv.writer(f, delimiter=',')
        for i in df.USERID.unique():
            temp = df.loc[df.USERID == i]
            new_df = pd.DataFrame(index=[i],
                                  data=0,
                                  columns=list(range(5, 35)))

            for j in range(len(temp)):
                week = temp.iloc[j]['YEARWEEK']
                new_df.loc[i, int(week)] = 1

            writer.writerow(new_df.reset_index().iloc[0])
Example #5
def getQuantitative(file, usecols):
	s = time.time()
	print("Getting the quantitative features: ", file)
	transact = readChunk(file, usecols)
	transact = transact.loc[transact.gigyaid.notnull()]
	if len(transact) == 0:
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("No unique customer")
		return pd.DataFrame()
	else:
		transact = transact.loc[transact.viewpageduration.notnull()]
		transact["viewpageduration"] = transact["viewpageduration"].astype(int)
		totalviewpageduration = transact.groupby("gigyaid")["viewpageduration"].sum().to_frame()
		totalnumbersession = transact.groupby("gigyaid")["bigdatasessionid"].nunique().to_frame()
		quanti = pd.concat([totalviewpageduration, totalnumbersession], axis = 1)
		quanti = quanti.loc[:, ~quanti.columns.duplicated()]

		actions = list(set(transact["actiontaken"].unique().tolist()))
		for action in actions:
			temp = transact.loc[transact["actiontaken"] == action]
			quanti[action] = temp.groupby("gigyaid")["actiontaken"].count()

		temp = transact.loc[transact["actiontaken"].notnull()]
		quanti["watched"] = temp.groupby("gigyaid")["videotitle"].nunique()
		quanti["contentswatched"] = temp.groupby("gigyaid")["videotitle"].unique().tolist()
		quanti.fillna(0, inplace=True)
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("Finish getting quantitative features: ", total_time)
		return quanti
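getQuantitative expects a transaction CSV containing gigyaid, bigdatasessionid, viewpageduration, actiontaken and videotitle columns. A hedged usage sketch that loops over a directory of monthly files the way the other examples do (the directory path is an assumption, not part of the original code):

import os
import pandas as pd

usecols = ['gigyaid', 'bigdatasessionid', 'viewpageduration',
           'actiontaken', 'videotitle']
transactions_dir = '../data/transactions'   # assumed location
frames = []
for f in sorted(os.listdir(transactions_dir)):
    if f.endswith('.csv'):
        quanti = getQuantitative(os.path.join(transactions_dir, f), usecols)
        if len(quanti) > 0:
            frames.append(quanti)
all_quanti = pd.concat(frames)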
Example #6
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mode', mode_type = None):
	cust_type = pd.read_csv("results/customer_type.csv", usecols = ['USERID', 'CUSTOMERTYPE'])
	df = readChunk("status/results/regularity_combined.csv")
	print(len(df))
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]

	print('Number of customers: ', len(df.USERID.unique()))
	
	print(df.columns)
	df.dropna(subset = ['RWEEK'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df = df.loc[df.WEEK != 201904]
	if regularity_type == 'mode':
		if mode_type == 'min':
			df = df.groupby('USERID')['RWEEK'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
		elif mode_type == 'max':
			df = df.groupby('USERID')['RWEEK'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
		else:
			df = df.groupby(['USERID'])['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	df.reset_index(inplace = True)
	print(df.head())
	df = df.merge(cust_type, how = 'left', on = 'USERID')
	for z in ['ACTIVE', 'LOST']:
		df_2 = df.loc[df.CUSTOMERTYPE == z]
		fig, axes = plt.subplots(8,4, sharey = 'row', constrained_layout = True)
		x = 0
		y = 0
		for i in sorted(df_2.WEEK.unique()):
			temp = df_2.loc[df_2.WEEK == i]
			new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
			new_df.index.name = 'REGULARITY'
			print(len(temp))
			for j in range(1, 8):
				temp2 = temp.loc[temp.RWEEK == j]
				new_df.loc[j, 'COUNT'] = len(temp2)
				print(new_df)
			plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
			# plot.set_ylabel('NUMBER OF CUSTOMERS')
			# plot.set_xlabel('REGULARITY')
			plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
			plot.set_title(i, size = 6, pad = 2)
			x_axis = plot.axes.get_xaxis()
			x_label = x_axis.get_label()
			x_label.set_visible(False)
			if ylim:
				plot.set_ylim(0,ylim)
			y = y + 1
			if y == 4:
				y = 0
				x = x + 1
			new_df.to_csv('results/customerregfreq/week_'+z+str(i)+'.csv')
		fig.delaxes(axes[7,3])
		fig.delaxes(axes[7,2])
		outfile = "results/customerregfreq"+z+str(i)+'.png'
		if outfile:
			plt.savefig(outfile, dpi = 600)
Example #7
def transactionDates():
    print('getting first and last transaction dates of the customers..')
    file = "results/first_and_last_transaction_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: 'USERID',
        1: 'FIRST_TRANSACTION',
        2: 'LAST_TRANSACTION'
    },
              inplace=True)

    file2 = 'results/average_regularity.csv'
    df2 = readChunk(file2)

    df2 = df2.merge(df, how='left', on='USERID')
    df2.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(df2, 'results/transaction_dates.csv', index=False)
Example #8
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mean', mode_type = None):
	df = readChunk("../status/results/regularity_combined_monthly.csv")
	print(len(df))
	print(df.head())
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]

	print('Number of customers: ', len(df.USERID.unique()))
	

	df.dropna(subset = ['RMONTH'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df["MONTH"].astype(int)
	df = df.loc[df.MONTH != 1]
	if regularity_type == 'mode':
		# the monthly file has an RMONTH column, not RWEEK
		if mode_type == 'min':
			df = df.groupby('USERID')['RMONTH'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
		elif mode_type == 'max':
			df = df.groupby('USERID')['RMONTH'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
		else:
			df = df.groupby(['USERID'])['RMONTH'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	elif regularity_type == 'mean':
		# keep MONTH as a column so the plotting loop below can still split by month
		df = df.groupby(['USERID', 'MONTH'])['RMONTH'].mean().to_frame().reset_index()
		df['RMONTH'] = round(df.RMONTH)
	else:
		print('What regularity type?')
	fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
	x = 0
	y = 0
	print(df.head())
	for i in sorted(df.MONTH.unique()):
		temp = df.loc[df.MONTH == i]
		new_df = pd.DataFrame(index = list(range(1, 32)), columns = ['COUNT'])
		new_df.index.name = 'REGULARITY'
		for j in range(1, 32):
			temp2 = temp.loc[temp.RMONTH == j]
			new_df.loc[j, 'COUNT'] = len(temp2)
			print(new_df)
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)

		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title(i, size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 2:
			y = 0
			x = x + 1
		new_df.to_csv('results/customerregfreq/week_'+str(i)+'.csv')  # the undefined 'z' from the weekly variant has been dropped
	# fig.delaxes(axes[7,3])
	fig.delaxes(axes[3,1])
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #9
def getFile(file1, file2 = None):
	df = readChunk(file1, sep = '\t')
	df.rename(columns = {"DATE(MODIFIEDDATE)":'DATE', "DAYOFWEEK(MIN(MODIFIEDDATE))":'DAYOFWEEK'}, inplace = True)

	if file2:
		df2 = readChunk(file2, sep = '\t')
		df2.rename(columns = {"DATE(MODIFIEDDATE)":'DATE', "DAYOFWEEK(MIN(MODIFIEDDATE))":'DAYOFWEEK'}, inplace = True)

		df = pd.concat([df, df2])
	
	print(df.head())
	df.drop_duplicates(subset = ['USERID', 'DATE'], inplace = True)

	label = pd.read_csv("../data/customer_feature_matrix.csv", usecols = ["userid", "label"])
	label.columns = label.columns.str.upper()

	label = label.loc[label.LABEL == 'ACTIVE']
	df = df.merge(label, how = 'left', on = 'USERID')
	return df
Example #10
def cleanData(data_dir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]  # 'cols' and 'remove_comma' are assumed module-level lists
            for col in remove_comma:
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
Example #11
def getCustomerType():
    print('getting customer types...')
    transact = readChunk('results/transaction_dates.csv')
    aver = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    transact = transact.merge(aver, how='left', on='USERID')
    transact = transact.merge(intersession, how='right', on='USERID')
    transact['LAST_TRANSACTION'] = pd.to_datetime(transact['LAST_TRANSACTION'])
    print(transact.head())
    transact['RWEEK'] = transact['RWEEK'].astype(float)
    s = time.time()
    transact['INACTIVITY_DAYS'] = transact['LAST_TRANSACTION'].apply(
        lambda x: (pd.to_datetime('2019-09-01') - x).days)
    transact['INACTIVITY_DAYS'] = transact['INACTIVITY_DAYS'].apply(
        lambda x: 0 if x == -1 else x).astype(float)
    transact = customerType2(transact, how='new')
    print(transact.head(10))
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(transact, 'results/customer_type.csv', index=False)
Example #12
def extractColumns(data_dir, outdir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.csv'):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.dropna(subset=['USERID'], inplace=True)
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeLurkers(df)
            outfile = os.path.join(outdir, f[-12:])

            toCSV(df, outfile, index=False)
Example #13
def addJoinedWeek():
    file = "customer_present.csv"
    df = readChunk(file, header=None)
    df.rename(columns={0: "USERID"}, inplace=True)

    df.set_index('USERID', inplace=True)

    joined = []
    for i in range(len(df)):
        joined.append(getJoinedWeek(df.iloc[i]))
    df['joinedweek'] = joined
    print(df.joinedweek)

    df.to_csv('week_present_and_joined.csv')
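getJoinedWeek is not defined on this page. Since customer_present.csv (written by getWeekPresent above) holds one 0/1 flag per week, one plausible reading is that the "joined" week is simply the first week flagged 1; a minimal sketch under that assumption (note that with header=None the column labels are positional numbers, not calendar week numbers):

def getJoinedWeek(row):
    # Hypothetical helper: return the label of the first week flagged 1.
    present = row[row == 1]
    return present.index[0] if len(present) else None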
Example #14
def plotRegularityTenure():
	file = 'results/tenure.csv'
	df = readChunk(file)
	df['RWEEK'] = df['RWEEK'].astype(float)
	df['TENURE'] = df['TENURE'].astype(float)

	for i in df.RWEEK.unique():
		temp = df.loc[df.RWEEK == i]
		plot = sns.distplot(a = temp['TENURE'].values, kde = False)
		
		plot.set_ylim(0,4000)
		plt.title('Regularity = {}'.format(str(i)[0]))
		plot.set_xlabel('TENURE (days)')
		plot.set_ylabel('NUMBER OF CUSTOMERS')
		plt.savefig(str(i)+'.png', dpi = 600)
		plt.clf()
Example #15
def plotWeeklyRegularity(weekno = None, custids = None, ylim = None, outfile = None):
	df = readChunk("../status/results/regularity_combined_monthly.csv")
	print(len(df))
	if type(custids) is list:
		df = df[df['USERID'].isin(custids)]
		print('Number of customers: ', len(df.USERID.unique()))
	
	df.dropna(subset = ['RMONTH'], inplace = True)

	print('Number of customers: ', len(df.USERID.unique()))
	df['RMONTH'] = df['RMONTH'].astype(int)
	df['MONTH'] = df["MONTH"].astype(int)
	df = df.loc[df.MONTH != 1]
	df.sort_values('MONTH', inplace = True)
	fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
	x = 0
	y = 0

	months = ['FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST']
	count = 0

	for i in df.MONTH.unique():
		temp = df.loc[df.MONTH == i]
		new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
		new_df.index.name = 'REGULARITY'
		for j in range(1,31):
			temp2 = temp.loc[temp.RMONTH == j]
			new_df.loc[j, 'COUNT'] = len(temp2)
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title(months[count], size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 2:
			y = 0
			x = x + 1
		new_df.to_csv('results/reqfreq/week_'+str(i)+'.csv')
		count = count + 1
	# fig.delaxes(axes[7,3])
	fig.delaxes(axes[3,1])
	# outfile = "results/regfreq"+z+str(i)+'.png'
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #16
def customerRegularity(file, regularity_type='mean'):

    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
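A short usage sketch for customerRegularity; the input path mirrors the regularity_combined.csv file used by the other examples and should be treated as an assumption:

# mean weekly regularity per customer, written to results/average_regularity.csv
customerRegularity('status/results/regularity_combined.csv', regularity_type='mean')

# or keep the most frequent (modal) weekly regularity instead
customerRegularity('status/results/regularity_combined.csv', regularity_type='mode')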
Example #17
def getDate(file, usecols):
	s = time.time()
	print("Getting the time features: ", file)
	transact = readChunk(file, usecols)
	transact = transact.loc[transact.gigyaid.notnull()]
	if len(transact) == 0:
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("No unique customer")
		return pd.DataFrame()
	else:
		transact = transact.loc[transact.viewpageduration.notnull()]
		transact["viewpageduration"] = transact["viewpageduration"].astype(int)
		group = transact.groupby(["gigyaid", "bigdatasessionid", "sessionstarttimestamp", "sessionendtimestamp"])["viewpageduration"].sum().to_frame()
		e = time.time()
		total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
		print("Finish getting date features: ", total_time)
		return group
Example #18
def main(data_dir, out_dir):
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df = df[[
                'USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
                'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
                'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT'
            ]]
            s = time.time()
            df['SESSION_STARTDT'] = pd.to_datetime(df['SESSION_STARTDT'])
            df['STARTHOUR'] = df.SESSION_STARTDT.dt.hour
            df['SESSION_ENDDT'] = pd.to_datetime(df['SESSION_ENDDT'])
            df['ENDHOUR'] = df.SESSION_ENDDT.dt.hour
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finish getting hour in {}".format(total_time))
            toCSV(df, os.path.join(out_dir, f), index=False)
Example #19
def getQualitative(file, usecols):
    s = time.time()
    print("Getting the qualitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    transact.loc[transact.browsertype.notnull(),
                 "browsertype"] = "WEB APPLICATION"
    transact.browsertype.replace(np.nan, "MOBILE APPLICATION", inplace=True)
    if len(transact) == 0:
        e = time.time()
        total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
        print("No unique customer")
        return pd.DataFrame()
    else:
        group = transact.groupby("gigyaid")
        devicetype = group.apply(lambda x: x["devicetype"].unique().tolist()
                                 ).reset_index(name="devicetype")
        deviceos = group.apply(lambda x: x["deviceos"].unique().tolist()
                               ).reset_index(name="deviceos")
        ipaddress = group.apply(lambda x: x["ipaddress"].unique().tolist()
                                ).reset_index(name="ipaddress")
        browsertype = group.apply(lambda x: x["browsertype"].unique().tolist()
                                  ).reset_index(name="browsertype")
        connectivitytype = group.apply(lambda x: x["connectivitytype"].unique(
        ).tolist()).reset_index(name="connectivitytype")
        screensize = group.apply(lambda x: x["screensize"].unique().tolist()
                                 ).reset_index(name="screensize")
        videoquality = group.apply(lambda x: x["videoquality"].unique().tolist(
        )).reset_index(name="videoquality")
        devicename = group.apply(lambda x: x["devicename"].unique().tolist()
                                 ).reset_index(name="devicename")
        mobiledevice = group.apply(lambda x: x["mobiledevice"].unique().tolist(
        )).reset_index(name="mobiledevice")
        df = pd.concat([
            devicetype, deviceos, ipaddress, browsertype, connectivitytype,
            screensize, videoquality, devicename, mobiledevice
        ],
                       axis=1)
        df = df.loc[:, ~df.columns.duplicated()]
        df = df.set_index('gigyaid')
        e = time.time()
        total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
        print("Finish getting qualitative features: ", total_time)
        return (df)
Example #20
def calculateTenure():
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    s = time.time()
    tenure = []
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
    for i in range(len(df)):
        if df.iloc[i]['CUSTOMERTYPE'] == 'ACTIVE':
            tenure.append((pd.to_datetime('2019-09-01') -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
        else:
            tenure.append((df.iloc[i]['LAST_TRANSACTION'] -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
Example #21
def plotDayofWeek(ylim = None):
	# df = pd.read_csv("status/rweek.csv")  # unused: immediately overwritten below
	df = readChunk("status/results/regularity_combined.csv")

	df.columns = ['WEEK', 'SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'RWEEK', 'USERID']
	df.dropna(subset = ['RWEEK'], inplace = True)
	print('Number of customers: ', len(df.USERID.unique()))
	df['RWEEK'] = df['RWEEK'].astype(int)
	df['WEEK'] = df["WEEK"].astype(int)
	df.sort_values('RWEEK', inplace = True)
	df = df.loc[df.WEEK != 201904]

	dayofweek = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY']
	for i in dayofweek:
		df[i] = df[i].astype(int)

	fig, axes = plt.subplots(3,3, sharey = 'row', constrained_layout = True)	
	x = 0
	y = 0
	for i in df.RWEEK.unique():
		new_df = pd.DataFrame(index = dayofweek, columns = ['COUNT'])
		temp = df.loc[df.RWEEK == i]
		for j in dayofweek:
			new_df.loc[j, 'COUNT'] = temp[j].sum()
		plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
		plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
		plot.set_title("Regularity = {}".format(i), size = 6, pad = 2)
		x_axis = plot.axes.get_xaxis()
		x_label = x_axis.get_label()
		x_label.set_visible(False)
		if ylim:
			plot.set_ylim(0,ylim)
		y = y + 1
		if y == 3:
			y = 0
			x = x + 1
	fig.delaxes(axes[2,1])
	fig.delaxes(axes[2,2])
	outfile = 'results/dayofweek.png'
	if outfile:
		plt.savefig(outfile, dpi = 600)
Example #22
def generateMonth():
    # data_dir and content_type are assumed module-level globals
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
            df.DAY = df.DAY.astype(int)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]

            new_df = pd.DataFrame(index=df.USERID.unique())
            new_df.index.name = 'USERID'
            temp = df.loc[df.CONTENT_TYPE == content_type]
            for i in range(df.DAY.min(), df.DAY.max() + 1):
                temp2 = temp.loc[temp.DAY == i]
                group = temp2.groupby(['USERID'])['DAY'].count().to_frame()
                group.DAY = group.DAY.apply(lambda x: np.nan
                                            if np.isnan(x) else '1')
                group.rename(columns={'DAY': str(i)}, inplace=True)
                new_df = new_df.merge(group, how='left', on='USERID')
            toCSV(new_df, 'results/' + str(content_type) + '/' + f)
Example #23
def plotRegularityFreq():
	file = "status/results/regularity_combined.csv"
	df = readChunk(file)
	print('Number of customers: ', len(df.USERID.unique()))
	print(df.head())
	df['RWEEK'] = df['RWEEK'].astype(int)
	new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
	new_df.index.name = 'REGULARITY'
	for i in range(1, 8):
		temp = df.loc[df.RWEEK == i]
		new_df.loc[i, 'COUNT'] = len(temp)
	print(new_df.head())
	barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png', print_number = True, savefig = True)

	new_df = df.groupby('USERID')['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
	print(new_df.head())
	new_df2 = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
	new_df2.index.name = 'REGULARITY'
	for i in range(1, 8):
		temp = new_df.loc[new_df.RWEEK == i]
		new_df2.loc[i, 'COUNT'] = len(temp)
	barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS', 'customerregfreq_many.png', print_number = True, savefig = True)
Example #24
def combineMonth(data_dir, outfile, check_login=False):
    # content_type and keepcols are assumed module-level globals;
    # removeNotLoggedIn and getCustomerDay are project helpers not shown here.
    all_df = []
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df.dropna(subset=['USERID'], inplace=True)
            if check_login:
                df.USERID = df.USERID.astype(str)
                df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
                df = removeNotLoggedIn(df)
                df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
                df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
                df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
                df = df.loc[df.SESSION_STARTDT_MONTH != 11]
                df['DAY'] = df[[
                    'SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY'
                ]].apply(lambda x: getCustomerDay(x[0], x[1]), axis=1)
                df = df.loc[df.CONTENT_TYPE != 'nan']

                df.replace({'CONTENT_TYPE': content_type}, inplace=True)
            all_df.append(df)
    all_df = pd.concat(all_df)
    all_df = all_df[keepcols]
    toCSV(all_df, outfile, index=False)
Example #25
# df.WATCHING_DUR = pd.to_numeric(df.WATCHING_DUR, errors = "coerce")
# df.VID_DUR = pd.to_numeric(df.VID_DUR, errors = "coerce")
# df.dropna(subset = ['VID_DUR'], inplace = True)

# watching = df.groupby('USERID')['WATCHING_DUR'].sum().to_frame()
# video = df.groupby('USERID')['VID_DUR'].sum().to_frame()

# watching = watching.merge(vide, how = 'left', on = 'USERID')
# watching['COMPLETION'] = (watching['WATCHING_DUR']/watching['VID_DUR'])*100
# print(watching.head())
# print(watching.COMPLETION.min())
# print(watching.COMPLETION.max())
# toCSV(watching, 'completion.csv')

df = readChunk('click.csv', header=None)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'ADPLAY_COUNT',
    3: 'PLAY_COUNT',
    4: 'PAUSE_COUNT',
    5: 'RESUME_COUNT',
    6: 'SEEK_COUNT'
},
          inplace=True)
df.drop(columns=['SEEK_COUNT'], inplace=True)

cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for i in cols:
    df[i] = df[i].astype(int)
Example #26
import warnings

warnings.filterwarnings("ignore")

import sys

sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk("../characterization/session_information.csv", header=None)
df.rename(columns={
    0: "USERID",
    1: "SESSIONID",
    2: "MONTH",
    3: "WEEK",
    4: "DATE",
    5: "START_HOUR",
    6: "END_HOUR",
    7: "SESSION_DURATION",
    8: "WATCHING_DURATION",
    9: "VIDEO_DURATION"
},
          inplace=True)

cols = ["SESSION_DURATION", "WATCHING_DURATION"]
Example #27
import time
import pandas as pd
import numpy as np

from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style
from statsmodels.tsa.stattools import adfuller

sns.set()
style.use('seaborn-poster')

type_sess = ['total', 'less', '70']
less70 = readChunk("../sql/query_results/date_count_50_less.csv")
more70 = readChunk("../sql/query_results/date_count_50.csv")

df = more70.merge(less70, on='DATE')
print(df.columns)

df.rename(columns={
    'NUMSESSIONS_x': 'COMPLETION_70',
    'NUMSESSIONS_y': 'COMPLETION_LESS_THAN_70'
},
          inplace=True)

print(df.DATE.unique())
df.COMPLETION_70 = df.COMPLETION_70.astype(float)
df.COMPLETION_LESS_THAN_70 = df.COMPLETION_LESS_THAN_70.astype(float)
df['DATE'] = pd.to_datetime(df['DATE'])
Example #28
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)

df.rename(columns={
    0: 'MONTH',
    1: 'USERID',
    2: 'SESSIONID',
    3: 'STARTHOUR',
    4: 'ENDHOUR',
    5: 'engagement'
},
          inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]

total_df = df.groupby('USERID')['engagement'].sum().to_frame()
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
Example #29
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style

sns.set()
style.use('seaborn-poster')

df = readChunk('results/50_WEEK_RETURN_VALUE.csv')
df.WEEK_RETURN_VALUE = df.WEEK_RETURN_VALUE.astype(int)
print(df.head())
print(len(df))

tohist = pd.DataFrame(index=df.WEEK_RETURN_VALUE.unique(), columns=['NUMCUST'])
tohist.index.name = 'WEEK_RETURN_VALUE'
for i in df.WEEK_RETURN_VALUE.unique():
    print(i)
    temp = df.loc[df.WEEK_RETURN_VALUE == i]
    tohist.loc[i, 'NUMCUST'] = len(temp)

tohist.sort_index(axis=0, inplace=True)
plot = tohist.plot(kind='bar', colormap='Pastel2')
plt.show()
Example #30
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk

file = "query_results/for_completion_total.csv"
df = readChunk(file)


def countCompletion70(df):
    df.COMPLETION_70 = df.COMPLETION_70.astype(float)
    # print("Total Number of Customers: {}".format(len(df.USERID.unique())))
    df = df.loc[df.COMPLETION_70 >= 70]
    print("Total Number of Customers with 70% Completion: {}".format(
        len(df.USERID.unique())))
    print("\n")
    return df


def timeCompletion(df, col):
    for time_comp in df[col].unique():
        print(time_comp)
        temp = df.loc[df[col] == time_comp]