def combineMonth():
    """Merge the per-file monthly presence CSVs for one content type into a
    single per-user matrix and derive a regularity ratio column.

    Reads every CSV under ``data_dir/<content_type>``, left-merges them on
    USERID, converts each day column to a '0'/'1' presence flag, then divides
    the presence bitstring (read as base 2) by the all-ones bitstring covering
    the days since the user's first occurrence.

    Relies on module-level globals: data_dir, content_type, func, colname,
    outfile, readChunk, toCSV.
    """
    month_dir = os.path.join(data_dir, str(content_type))
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(month_dir)):
        if f.endswith(".csv"):
            file = os.path.join(month_dir, f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    # Materialize as a list: pandas Index.append() is not in-place and cannot
    # take a plain string, so the original `cols.append(colname)` either
    # raised or silently discarded its result and colname was never included
    # in the output selection below.
    cols = new_df.columns.tolist()
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    # All-ones reference string: one '1' per day from first occurrence to 31.
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' * (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total']].apply(
        lambda x: int(x[0], 2) / int(x[1], 2), axis=1)
    print(new_df[colname])
    print(cols)
    cols.append(colname)
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
def plotWeeklyRegularity3(file, file2=None, ylim=None):
    """Bar-plot the average number of customers per weekly regularity value.

    Loads one or two headerless regularity exports, averages the weekly
    customer counts per regularity value, and saves the annotated bar chart
    to weekly_average_regularity.png.  (ylim is accepted but unused.)
    """
    frames = [readChunk(file, header=None)]
    if file2:
        frames.append(readChunk(file2, header=None))
    for frame in frames:
        frame.rename(columns={0: 'WEEK', 8: 'RWEEK', 9: 'USERID'}, inplace=True)
    df = frames[0] if len(frames) == 1 else pd.concat(frames)
    print(df.head())
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df['WEEK'].astype(int)
    df.sort_values('WEEK', inplace=True)
    # 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    weekly = df.groupby(['RWEEK', 'WEEK'])['USERID'].count().to_frame().reset_index()
    print(weekly.head(20))
    weekly = weekly.groupby('RWEEK')['USERID'].mean().to_frame()
    weekly['USERID'] = round(weekly['USERID'])
    weekly['USERID'] = weekly['USERID'].astype(int)
    print(weekly.head(20))
    ax = weekly.plot(kind='bar', legend=False, rot=0)
    for pos in range(len(weekly)):
        value = weekly.iloc[pos]['USERID']
        ax.text(pos, value, value, horizontalalignment='center')
    ax.set_xlabel('REGULARITY')
    plt.savefig("weekly_average_regularity.png", dpi=300)
def plotRegularityFreq():
    """Plot the frequency of monthly regularity values (1-30) over all rows,
    and the distribution of per-customer mean regularity, from the combined
    monthly regularity file.  Uses the project-level barPlot helper."""
    file = "../status/results/regularity_combined_monthly.csv"
    df = readChunk(file)
    print('Number of customers: ', len(df.USERID.unique()))
    print(df.head())
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df['MONTH'].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    new_df = pd.DataFrame(index=list(range(1, 31)), columns=['COUNT'])
    new_df.index.name = 'REGULARITY'
    for i in range(1, 31):
        # .loc[i, 'COUNT'] instead of the original chained .loc[i]['COUNT'],
        # which assigns through an intermediate object and is not guaranteed
        # to write back into the frame.
        new_df.loc[i, 'COUNT'] = len(df.loc[df.RMONTH == i])
    print(new_df.head())
    barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png',
            print_number=True, savefig=True)
    # Per-customer mean regularity, rounded to the nearest integer bucket.
    new_df = df.groupby('USERID')['RMONTH'].mean().to_frame()
    new_df['RMONTH'] = round(new_df['RMONTH'])
    print(new_df.head())
    new_df2 = pd.DataFrame(index=list(range(1, 31)), columns=['COUNT'])
    new_df2.index.name = 'REGULARITY'
    for i in range(1, 31):
        new_df2.loc[i, 'COUNT'] = len(new_df.loc[new_df.RMONTH == i])
    barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS',
            'customerregfreq_many.png', print_number=True, savefig=True)
def getWeekPresent():
    """Append, for each user, a 0/1 row over weeks 5..34 marking the weeks in
    which the user had at least one session, to customer_present.csv.

    Reads the headerless yearweek export, re-bases 2019 ISO weeks
    (201905 -> 5, ...), drops the partial week 4, and de-duplicates to one
    row per (user, week) before building each presence vector.
    """
    file = "../data/yearweek_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={0: "USERID", 1: "SESSIONID", 2: "YEARWEEK"},
              inplace=True)
    df.YEARWEEK = df.YEARWEEK.astype(int)
    df.YEARWEEK = df.YEARWEEK - 201900
    df = df.loc[df.YEARWEEK != 4]
    df.drop_duplicates(subset=["USERID", "YEARWEEK"], keep="first",
                       inplace=True)
    print(df.head(10))
    with open("customer_present.csv", "a") as f:
        writer = csv.writer(f, delimiter=',')
        for i in df.USERID.unique():
            temp = df.loc[df.USERID == i]
            new_df = pd.DataFrame(index=[i], data=0,
                                  columns=list(range(5, 35)))
            for j in range(len(temp)):
                week = temp.iloc[j]['YEARWEEK']
                # .loc[row, col] instead of the original chained
                # .loc[i][col] = 1, which writes through an intermediate
                # Series and may not propagate back to new_df.
                new_df.loc[i, int(week)] = 1
            writer.writerow(new_df.reset_index().iloc[0])
def getQuantitative(file, usecols):
    """Build per-customer quantitative features: total view-page duration,
    distinct-session count, one count column per action type, number of
    distinct videos watched, and the list of distinct titles.

    Returns an empty DataFrame when the file has no logged-in rows
    (null gigyaid everywhere).
    """
    s = time.time()
    print("Getting the quantitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    if len(transact) == 0:
        # Log before bailing out — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    transact = transact.loc[transact.viewpageduration.notnull()]
    transact["viewpageduration"] = transact["viewpageduration"].astype(int)
    totalviewpageduration = transact.groupby("gigyaid")["viewpageduration"].sum().to_frame()
    totalnumbersession = transact.groupby("gigyaid")["bigdatasessionid"].nunique().to_frame()
    quanti = pd.concat([totalviewpageduration, totalnumbersession], axis=1)
    quanti = quanti.loc[:, ~quanti.columns.duplicated()]
    # One count column per distinct action type.
    actions = list(set(transact["actiontaken"].unique().tolist()))
    for action in actions:
        temp = transact.loc[transact["actiontaken"] == action]
        quanti[action] = temp.groupby("gigyaid")["actiontaken"].count()
    temp = transact.loc[transact["actiontaken"].notnull()]
    quanti["watched"] = temp.groupby("gigyaid")["videotitle"].nunique()
    # NOTE(review): .tolist() discards the groupby index, so this assignment
    # relies on positional alignment with quanti's index — confirm ordering.
    quanti["contentswatched"] = temp.groupby("gigyaid")["videotitle"].unique().tolist()
    quanti.fillna(0, inplace=True)
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting quantitative features: ", total_time)
    return quanti
def plotWeeklyRegularity2(weekno = None, custids = None, ylim = None, outfile = None, regularity_type = 'mode', mode_type = None):
    """Plot, separately for ACTIVE and LOST customers, the per-week
    distribution of weekly regularity values (1-7) on an 8x4 subplot grid,
    dumping each week's counts to CSV and saving one figure per type.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; the outfile parameter is shadowed by a locally
    computed filename before saving.

    NOTE(review): when regularity_type == 'mode' (the default), the groupby
    collapses the frame to one row per USERID and drops WEEK, yet the plot
    loop below reads df_2.WEEK — looks inconsistent; confirm the intended
    input shape before relying on this path.
    """
    cust_type = pd.read_csv("results/customer_type.csv", usecols = ['USERID', 'CUSTOMERTYPE'])
    df = readChunk("status/results/regularity_combined.csv")
    print(len(df))
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    print(df.columns)
    df.dropna(subset = ['RWEEK'], inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df["WEEK"].astype(int)
    # Week 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    if regularity_type == 'mode':
        # Per-customer modal regularity; mode_type picks the tie-break rule.
        if mode_type == 'min':
            df = df.groupby('USERID')['RWEEK'].agg(lambda x: min(pd.Series.mode(x))).to_frame()
        elif mode_type == 'max':
            df = df.groupby('USERID')['RWEEK'].agg(lambda x: max(pd.Series.mode(x))).to_frame()
        else:
            df = df.groupby(['USERID'])['RWEEK'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()
        df.reset_index(inplace = True)
    print(df.head())
    df = df.merge(cust_type, how = 'left', on = 'USERID')
    for z in ['ACTIVE', 'LOST']:
        df_2 = df.loc[df.CUSTOMERTYPE == z]
        fig, axes = plt.subplots(8,4, sharey = 'row', constrained_layout = True)
        # (x, y) walk the 8x4 grid row-major, one subplot per week.
        x = 0
        y = 0
        for i in sorted(df_2.WEEK.unique()):
            temp = df_2.loc[df_2.WEEK == i]
            new_df = pd.DataFrame(index = [1,2,3,4,5,6,7], columns = ['COUNT'])
            new_df.index.name = 'REGULARITY'
            print(len(temp))
            for j in range(1, 8):
                temp2 = temp.loc[temp.RWEEK == j]
                # NOTE(review): chained .loc[j]['COUNT'] assignment may not
                # write back on all pandas versions.
                new_df.loc[j]['COUNT'] = len(temp2)
            print(new_df)
            plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
            # plot.set_ylabel('NUMBER OF CUSTOMERS')
            # plot.set_xlabel('REGULARITY')
            plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
            plot.set_title(i, size = 6, pad = 2)
            x_axis = plot.axes.get_xaxis()
            x_label = x_axis.get_label()
            x_label.set_visible(False)
            if ylim:
                plot.set_ylim(0,ylim)
            y = y + 1
            if y == 4:
                y = 0
                x = x + 1
            new_df.to_csv('results/customerregfreq/week_'+z+str(i)+'.csv')
        # Remove the two unused cells of the 8x4 grid, then save the figure;
        # the filename embeds the last plotted week number.
        fig.delaxes(axes[7,3])
        fig.delaxes(axes[7,2])
        outfile = "results/customerregfreq"+z+str(i)+'.png'
        if outfile:
            plt.savefig(outfile, dpi = 600)
def transactionDates():
    """Join each customer's first/last transaction dates onto the average
    regularity table (dropping the RWEEK column) and persist the result to
    results/transaction_dates.csv."""
    print('getting first and last transaction dates of the customers..')
    dates = readChunk("results/first_and_last_transaction_correct.csv",
                      header=None)
    dates.rename(columns={0: 'USERID',
                          1: 'FIRST_TRANSACTION',
                          2: 'LAST_TRANSACTION'}, inplace=True)
    merged = readChunk('results/average_regularity.csv')
    merged = merged.merge(dates, how='left', on='USERID')
    merged.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(merged, 'results/transaction_dates.csv', index=False)
def plotWeeklyRegularity2(weekno=None, custids=None, ylim=None, outfile=None,
                          regularity_type='mean', mode_type=None):
    """Plot, per month, the distribution of monthly regularity values (1-31)
    on a 4x2 subplot grid, dumping each month's counts to CSV.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; outfile, when given, receives the figure.
    regularity_type selects per-customer aggregation ('mean' or 'mode');
    mode_type picks the tie-break rule for 'mode'.
    """
    df = readChunk("../status/results/regularity_combined_monthly.csv")
    print(len(df))
    print(df.head())
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    df.dropna(subset=['RMONTH'], inplace=True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df['MONTH'].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    if regularity_type == 'mode':
        # Fixed: this file carries RMONTH — the original aggregated a
        # non-existent RWEEK column (copied from the weekly variant) and
        # grouped by USERID only, dropping the MONTH the plot loop needs.
        if mode_type == 'min':
            agg = lambda x: min(pd.Series.mode(x))
        elif mode_type == 'max':
            agg = lambda x: max(pd.Series.mode(x))
        else:
            agg = lambda x: pd.Series.mode(x)[0]
        df = df.groupby(['USERID', 'MONTH'])['RMONTH'].agg(agg).to_frame()
        df.reset_index(inplace=True)
    elif regularity_type == 'mean':
        # Fixed: groupby('USERID', 'MONTH') passed MONTH as the *axis*
        # argument and raised; group on both keys and restore them as
        # columns so the per-month loop below keeps working.
        df = df.groupby(['USERID', 'MONTH'])['RMONTH'].mean().to_frame()
        df.reset_index(inplace=True)
        df['RMONTH'] = round(df.RMONTH)
    else:
        print('What regularity type?')
    fig, axes = plt.subplots(4, 2, sharey='row', constrained_layout=True)
    x = 0
    y = 0
    print(df.head())
    for i in sorted(df.MONTH.unique()):
        temp = df.loc[df.MONTH == i]
        new_df = pd.DataFrame(index=list(range(1, 32)), columns=['COUNT'])
        new_df.index.name = 'REGULARITY'
        for j in range(1, 32):
            # .loc[j, 'COUNT'] avoids the original's unreliable chained
            # .loc[j]['COUNT'] assignment.
            new_df.loc[j, 'COUNT'] = len(temp.loc[temp.RMONTH == j])
        print(new_df)
        plot = new_df.plot(kind='bar', legend=False, ax=axes[x, y], rot=0)
        plot.tick_params(axis='both', which='major', labelsize=6, pad=2)
        plot.set_title(i, size=6, pad=2)
        plot.axes.get_xaxis().get_label().set_visible(False)
        if ylim:
            plot.set_ylim(0, ylim)
        y = y + 1
        if y == 2:
            y = 0
            x = x + 1
        # Fixed: the original concatenated an undefined name `z` here,
        # which raised NameError on the first iteration.
        new_df.to_csv('results/customerregfreq/week_' + str(i) + '.csv')
    # Hide the unused eighth cell of the 4x2 grid.
    fig.delaxes(axes[3, 1])
    if outfile:
        plt.savefig(outfile, dpi=600)
def getFile(file1, file2=None):
    """Load one or two tab-separated activity exports, normalize the derived
    date columns, de-duplicate per (user, date), and left-join the ACTIVE
    label from the customer feature matrix.  Returns the merged frame."""
    renames = {"DATE(MODIFIEDDATE)": 'DATE',
               "DAYOFWEEK(MIN(MODIFIEDDATE))": 'DAYOFWEEK'}
    df = readChunk(file1, sep='\t')
    df.rename(columns=renames, inplace=True)
    if file2:
        second = readChunk(file2, sep='\t')
        second.rename(columns=renames, inplace=True)
        df = pd.concat([df, second])
    print(df.head())
    df.drop_duplicates(subset=['USERID', 'DATE'], inplace=True)
    label = pd.read_csv("../data/customer_feature_matrix.csv",
                        usecols=["userid", "label"])
    label.columns = label.columns.str.upper()
    label = label.loc[label.LABEL == 'ACTIVE']
    return df.merge(label, how='left', on='USERID')
def cleanData(data_dir):
    """Rewrite every CSV in data_dir in place, keeping only the module-level
    `cols` and replacing commas with spaces in each `remove_comma` column
    (so the values cannot break CSV parsing downstream).

    Relies on module globals: cols, remove_comma, readChunk, toCSV.
    """
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]
            for col in remove_comma:
                # The original guarded on `if x.replace(",", " ")`, which is
                # only falsy for the empty string — where both branches yield
                # the same value — so a plain vectorized replace is
                # equivalent and far cheaper than a per-row lambda.
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
def getCustomerType():
    """Derive each customer's type from transaction recency, average weekly
    regularity, and intersession statistics, writing the labeled table to
    results/customer_type.csv (classification done by customerType2)."""
    print('getting customer types...')
    base = readChunk('results/transaction_dates.csv')
    regularity = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    base = base.merge(regularity, how='left', on='USERID')
    base = base.merge(intersession, how='right', on='USERID')
    base['LAST_TRANSACTION'] = pd.to_datetime(base['LAST_TRANSACTION'])
    print(base.head())
    base['RWEEK'] = base['RWEEK'].astype(float)
    started = time.time()
    # Days of inactivity up to the 2019-09-01 cutoff; a -1 (transaction on
    # the cutoff day itself) is clamped to 0.
    base['INACTIVITY_DAYS'] = base['LAST_TRANSACTION'].apply(
        lambda last: (pd.to_datetime('2019-09-01') - last).days)
    base['INACTIVITY_DAYS'] = base['INACTIVITY_DAYS'].apply(
        lambda d: 0 if d == -1 else d).astype(float)
    base = customerType2(base, how='new')
    print(base.head(10))
    elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - started))
    print("Total process time is {}".format(elapsed))
    toCSV(base, 'results/customer_type.csv', index=False)
def extractColumns(data_dir, outdir):
    """For every CSV in data_dir: drop rows without a USERID, normalize the
    id columns to strings, strip lurkers via removeLurkers, and write the
    result to outdir named by the last 12 characters of the source file."""
    print(data_dir)
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith('.csv'):
            continue
        frame = readChunk(os.path.join(data_dir, name))
        frame.dropna(subset=['USERID'], inplace=True)
        frame.USERID = frame.USERID.astype(str)
        frame.PRIMARY_FINGERPRINT = frame.PRIMARY_FINGERPRINT.astype(str)
        frame = removeLurkers(frame)
        toCSV(frame, os.path.join(outdir, name[-12:]), index=False)
def addJoinedWeek():
    """Append a 'joinedweek' column (computed per row by getJoinedWeek) to
    the customer presence matrix and save it as
    week_present_and_joined.csv."""
    presence = readChunk("customer_present.csv", header=None)
    presence.rename(columns={0: "USERID"}, inplace=True)
    presence.set_index('USERID', inplace=True)
    presence['joinedweek'] = [getJoinedWeek(row)
                              for _, row in presence.iterrows()]
    print(presence.joinedweek)
    presence.to_csv('week_present_and_joined.csv')
def plotRegularityTenure():
    """Save, for each distinct weekly regularity value, a histogram of
    customer tenure in days (one PNG per regularity value)."""
    tenure = readChunk('results/tenure.csv')
    tenure['RWEEK'] = tenure['RWEEK'].astype(float)
    tenure['TENURE'] = tenure['TENURE'].astype(float)
    for reg in tenure.RWEEK.unique():
        subset = tenure.loc[tenure.RWEEK == reg]
        ax = sns.distplot(a=subset['TENURE'].values, kde=False)
        ax.set_ylim(0, 4000)
        # str(reg)[0] shows only the integer digit, e.g. '3' from 3.0.
        plt.title('Regularity = {}'.format(str(reg)[0]))
        ax.set_xlabel('TENURE (days)')
        ax.set_ylabel('NUMBER OF CUSTOMERS')
        plt.savefig(str(reg) + '.png', dpi=600)
        plt.clf()
def plotWeeklyRegularity(weekno = None, custids = None, ylim = None, outfile = None):
    """Plot, per month (February-August), the distribution of monthly
    regularity values (1-30) on a 4x2 subplot grid, dumping each month's
    counts to CSV and optionally saving the figure.

    weekno is unused; custids optionally restricts to a list of user ids;
    ylim fixes the y-axis; outfile, when given, receives the figure.
    """
    df = readChunk("../status/results/regularity_combined_monthly.csv")
    print(len(df))
    if type(custids) is list:
        df = df[df['USERID'].isin(custids)]
        print('Number of customers: ', len(df.USERID.unique()))
    df.dropna(subset = ['RMONTH'], inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RMONTH'] = df['RMONTH'].astype(int)
    df['MONTH'] = df["MONTH"].astype(int)
    # Month 1 is excluded (partial month in the data).
    df = df.loc[df.MONTH != 1]
    df.sort_values('MONTH', inplace = True)
    fig, axes = plt.subplots(4,2, sharey = 'row', constrained_layout = True)
    # (x, y) walk the 4x2 grid row-major, one subplot per month.
    x = 0
    y = 0
    # NOTE(review): assumes at most 7 distinct months remain after the
    # filter; an eighth month would overflow this list and the grid.
    months = ['FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST']
    count = 0
    for i in df.MONTH.unique():
        temp = df.loc[df.MONTH == i]
        new_df = pd.DataFrame(index = list(range(1,31)), columns = ['COUNT'])
        new_df.index.name = 'REGULARITY'
        for j in range(1,31):
            temp2 = temp.loc[temp.RMONTH == j]
            # NOTE(review): chained .loc[j]['COUNT'] assignment may not
            # write back on all pandas versions.
            new_df.loc[j]['COUNT'] = len(temp2)
        plot = new_df.plot(kind = 'bar', legend = False, ax = axes[x, y], rot = 0)
        plot.tick_params(axis = 'both', which = 'major', labelsize = 6, pad = 2)
        plot.set_title(months[count], size = 6, pad = 2)
        x_axis = plot.axes.get_xaxis()
        x_label = x_axis.get_label()
        x_label.set_visible(False)
        if ylim:
            plot.set_ylim(0,ylim)
        y = y + 1
        if y == 2:
            y = 0
            x = x + 1
        new_df.to_csv('results/reqfreq/week_'+str(i)+'.csv')
        count = count + 1
    # fig.delaxes(axes[7,3])
    # Hide the unused eighth cell of the 4x2 grid.
    fig.delaxes(axes[3,1])
    # outfile = "results/regfreq"+z+str(i)+'.png'
    if outfile:
        plt.savefig(outfile, dpi = 600)
def customerRegularity(file, regularity_type='mean'):
    """Aggregate each customer's weekly regularity (RWEEK) into a single
    value — arithmetic mean or mode — round it to an integer score, and
    write the result to results/average_regularity.csv.

    NOTE(review): for an unrecognized regularity_type, new_df is never bound
    and the round() line raises NameError — confirm callers only ever pass
    'mean' or 'mode'.
    """
    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        # First modal value wins ties.
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    # Round so the stored regularity is integer-valued in both branches
    # (a no-op for 'mode', which already yields integers).
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
def getDate(file, usecols):
    """Total view-page duration per (customer, session, session start,
    session end).  Returns an empty DataFrame when the file has no
    logged-in rows (null gigyaid everywhere)."""
    s = time.time()
    print("Getting the time features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    if len(transact) == 0:
        # Log before returning — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    transact = transact.loc[transact.viewpageduration.notnull()]
    transact["viewpageduration"] = transact["viewpageduration"].astype(int)
    group = transact.groupby(
        ["gigyaid", "bigdatasessionid", "sessionstarttimestamp",
         "sessionendtimestamp"])["viewpageduration"].sum().to_frame()
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting date features: ", total_time)
    return group
def main(data_dir, out_dir):
    """Copy each CSV from data_dir to out_dir, keeping the session columns
    and adding STARTHOUR/ENDHOUR derived from the session timestamps."""
    keep = ['USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
            'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
            'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT']
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        frame = readChunk(os.path.join(data_dir, name))
        frame = frame[keep]
        started = time.time()
        frame['SESSION_STARTDT'] = pd.to_datetime(frame['SESSION_STARTDT'])
        frame['STARTHOUR'] = frame.SESSION_STARTDT.dt.hour
        frame['SESSION_ENDDT'] = pd.to_datetime(frame['SESSION_ENDDT'])
        frame['ENDHOUR'] = frame.SESSION_ENDDT.dt.hour
        elapsed = time.strftime("%H:%M:%S",
                                time.gmtime(time.time() - started))
        print("Finish getting hour in {}".format(elapsed))
        toCSV(frame, os.path.join(out_dir, name), index=False)
def getQualitative(file, usecols):
    """Collect per-customer lists of unique device/session attributes
    (device type/OS/name, IP, browser, connectivity, screen size, video
    quality, mobile device), indexed by gigyaid.

    Returns an empty DataFrame when the file has no logged-in rows.
    """
    s = time.time()
    print("Getting the qualitative features: ", file)
    transact = readChunk(file, usecols)
    transact = transact.loc[transact.gigyaid.notnull()]
    # Rows that carry a browser type are web sessions; the rest are mobile.
    transact.loc[transact.browsertype.notnull(), "browsertype"] = "WEB APPLICATION"
    transact.browsertype.replace(np.nan, "MOBILE APPLICATION", inplace=True)
    if len(transact) == 0:
        # Log before returning — the original printed after `return`,
        # making the message (and its timing code) unreachable.
        print("No unique customer")
        return pd.DataFrame()
    group = transact.groupby("gigyaid")
    # One unique-value list per attribute; collapses nine copy-pasted
    # group.apply blocks from the original into a single loop.
    attributes = ["devicetype", "deviceos", "ipaddress", "browsertype",
                  "connectivitytype", "screensize", "videoquality",
                  "devicename", "mobiledevice"]
    frames = [
        group.apply(lambda x, c=col: x[c].unique().tolist())
             .reset_index(name=col)
        for col in attributes
    ]
    df = pd.concat(frames, axis=1)
    # concat repeats the gigyaid column once per frame; keep the first.
    df = df.loc[:, ~df.columns.duplicated()]
    df = df.set_index('gigyaid')
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Finish getting qualitative features: ", total_time)
    return df
def calculateTenure():
    """Compute each customer's tenure in days — up to the 2019-09-01 cutoff
    for ACTIVE customers, up to their last transaction otherwise — and write
    the augmented table to results/tenure.csv."""
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    started = time.time()
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
    cutoff = pd.to_datetime('2019-09-01')
    tenure = []
    for _, row in df.iterrows():
        end = cutoff if row['CUSTOMERTYPE'] == 'ACTIVE' else row['LAST_TRANSACTION']
        tenure.append((end - row['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    elapsed = time.strftime("%H:%M:%S",
                            time.gmtime(time.time() - started))
    print("Total process time is {}".format(elapsed))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
def plotDayofWeek(ylim=None):
    """Plot, for each weekly regularity value, the total number of visits on
    each day of the week, on a 3x3 subplot grid saved to
    results/dayofweek.png.

    Fixes: removed a dead `pd.read_csv("status/rweek.csv")` whose result was
    immediately overwritten, and made `ylim` a keyword parameter — the
    original referenced an undefined name and raised NameError (unless a
    module-level ylim happened to exist — TODO confirm).
    """
    df = readChunk("status/results/regularity_combined.csv")
    df.columns = ['WEEK', 'SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY',
                  'THURSDAY', 'FRIDAY', 'SATURDAY', 'RWEEK', 'USERID']
    df.dropna(subset=['RWEEK'], inplace=True)
    print('Number of customers: ', len(df.USERID.unique()))
    df['RWEEK'] = df['RWEEK'].astype(int)
    df['WEEK'] = df['WEEK'].astype(int)
    df.sort_values('RWEEK', inplace=True)
    # Week 201904 is excluded (partial week in the data).
    df = df.loc[df.WEEK != 201904]
    dayofweek = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
                 'FRIDAY', 'SATURDAY']
    for i in dayofweek:
        df[i] = df[i].astype(int)
    fig, axes = plt.subplots(3, 3, sharey='row', constrained_layout=True)
    x = 0
    y = 0
    for i in df.RWEEK.unique():
        new_df = pd.DataFrame(index=dayofweek, columns=['COUNT'])
        temp = df.loc[df.RWEEK == i]
        for j in dayofweek:
            # .loc[j, 'COUNT'] avoids the original's unreliable chained
            # .loc[j]['COUNT'] assignment.
            new_df.loc[j, 'COUNT'] = temp[j].sum()
        plot = new_df.plot(kind='bar', legend=False, ax=axes[x, y], rot=0)
        plot.tick_params(axis='both', which='major', labelsize=6, pad=2)
        plot.set_title("Regularity = {}".format(i), size=6, pad=2)
        plot.axes.get_xaxis().get_label().set_visible(False)
        if ylim:
            plot.set_ylim(0, ylim)
        y = y + 1
        if y == 3:
            y = 0
            x = x + 1
    # Hide the two unused cells of the 3x3 grid.
    fig.delaxes(axes[2, 1])
    fig.delaxes(axes[2, 2])
    outfile = 'results/dayofweek.png'
    if outfile:
        plt.savefig(outfile, dpi=600)
def generateMonth():
    """Build a per-user day-presence matrix ('1' where the user had at least
    one session that day) for the module-level content_type, writing one
    output CSV per input file under results/<content_type>/.

    Relies on module globals: data_dir, content_type, readChunk, toCSV.
    """
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        df = readChunk(os.path.join(data_dir, name))
        df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
        df.DAY = df.DAY.astype(int)
        df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
        # November rows are out of scope for this matrix.
        df = df.loc[df.SESSION_STARTDT_MONTH != 11]
        presence = pd.DataFrame(index=df.USERID.unique())
        presence.index.name = 'USERID'
        content = df.loc[df.CONTENT_TYPE == content_type]
        for day in range(df.DAY.min(), df.DAY.max() + 1):
            daily = content.loc[content.DAY == day]
            counts = daily.groupby(['USERID'])['DAY'].count().to_frame()
            counts.DAY = counts.DAY.apply(
                lambda c: np.nan if np.isnan(c) else '1')
            counts.rename(columns={'DAY': str(day)}, inplace=True)
            presence = presence.merge(counts, how='left', on='USERID')
        toCSV(presence, 'results/' + str(content_type) + '/' + name)
def plotRegularityFreq():
    """Plot the frequency of weekly regularity values (1-7) over all rows,
    and the distribution of per-customer modal regularity, using the
    project-level barPlot helper."""
    file = "status/results/regularity_combined.csv"
    df = readChunk(file)
    print('Number of customers: ', len(df.USERID.unique()))
    print(df.head())
    df['RWEEK'] = df['RWEEK'].astype(int)
    new_df = pd.DataFrame(index=[1, 2, 3, 4, 5, 6, 7], columns=['COUNT'])
    new_df.index.name = 'REGULARITY'
    for i in range(1, 8):
        # .loc[i, 'COUNT'] instead of the original chained .loc[i]['COUNT'],
        # which assigns through an intermediate object and is not guaranteed
        # to write back into the frame.
        new_df.loc[i, 'COUNT'] = len(df.loc[df.RWEEK == i])
    print(new_df.head())
    barPlot(new_df, 'REGULARITY', 'COUNT', 'regfreq_many.png',
            print_number=True, savefig=True)
    # Per-customer modal regularity; ties broken by the first modal value.
    new_df = df.groupby('USERID')['RWEEK'].agg(
        lambda x: pd.Series.mode(x)[0]).to_frame()
    print(new_df.head())
    new_df2 = pd.DataFrame(index=[1, 2, 3, 4, 5, 6, 7], columns=['COUNT'])
    new_df2.index.name = 'REGULARITY'
    for i in range(1, 8):
        new_df2.loc[i, 'COUNT'] = len(new_df.loc[new_df.RWEEK == i])
    barPlot(new_df2, 'REGULARITY', 'NUMBER OF CUSTOMERS',
            'customerregfreq_many.png', print_number=True, savefig=True)
def combineMonth(data_dir, outfile, check_login=False):
    """Concatenate every monthly CSV in data_dir into one frame with a
    customer-relative DAY column, optionally removing sessions that were not
    logged in, and write the keepcols subset to outfile.

    Relies on module globals: content_type, keepcols, readChunk, toCSV,
    removeNotLoggedIn, getCustomerDay.
    """
    frames = []
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith(".csv"):
            continue
        df = readChunk(os.path.join(data_dir, name))
        df.dropna(subset=['USERID'], inplace=True)
        if check_login:
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeNotLoggedIn(df)
        df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
        df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
        df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
        # November rows are out of scope.
        df = df.loc[df.SESSION_STARTDT_MONTH != 11]
        df['DAY'] = df[['SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY']].apply(
            lambda row: getCustomerDay(row[0], row[1]), axis=1)
        # Drop rows whose content type was missing (stringified NaN).
        df = df.loc[df.CONTENT_TYPE != 'nan']
        df.replace({'CONTENT_TYPE': content_type}, inplace=True)
        frames.append(df)
    combined = pd.concat(frames)
    combined = combined[keepcols]
    toCSV(combined, outfile, index=False)
# Load the per-session click counts exported by the SQL step, name the
# columns, drop the unused seek counter, and coerce the remaining counters
# to integers.  (A dead, fully commented-out completion-percentage
# computation that preceded this block was removed; see version control.)
df = readChunk('click.csv', header=None)
df.rename(columns={0: 'USERID',
                   1: 'SESSIONID',
                   2: 'ADPLAY_COUNT',
                   3: 'PLAY_COUNT',
                   4: 'PAUSE_COUNT',
                   5: 'RESUME_COUNT',
                   6: 'SEEK_COUNT'}, inplace=True)
df.drop(columns=['SEEK_COUNT'], axis=1, inplace=True)
cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for count_col in cols:
    df[count_col] = df[count_col].astype(int)
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

# Session-level export: one row per session with timing and duration fields.
df = readChunk("../characterization/session_information.csv", header=None)
column_names = ["USERID", "SESSIONID", "MONTH", "WEEK", "DATE", "START_HOUR",
                "END_HOUR", "SESSION_DURATION", "WATCHING_DURATION",
                "VIDEO_DURATION"]
# Map positional column indices 0..9 onto their names.
df.rename(columns=dict(enumerate(column_names)), inplace=True)
# Duration columns consumed by the analysis below.
cols = ["SESSION_DURATION", "WATCHING_DURATION"]
import time
import pandas as pd
import numpy as np
from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.style as style
from statsmodels.tsa.stattools import adfuller

sns.set()
style.use('seaborn-poster')

# Session categories referenced downstream.
type_sess = ['total', 'less', '70']

# Daily session counts split by completion: at/above 70% vs below.
below = readChunk("../sql/query_results/date_count_50_less.csv")
above = readChunk("../sql/query_results/date_count_50.csv")
# Merge order fixes the _x/_y suffixes: x = above, y = below.
df = above.merge(below, on='DATE')
print(df.columns)
df.rename(columns={'NUMSESSIONS_x': 'COMPLETION_70',
                   'NUMSESSIONS_y': 'COMPLETION_LESS_THAN_70'},
          inplace=True)
print(df.DATE.unique())
df.COMPLETION_70 = df.COMPLETION_70.astype(float)
df.COMPLETION_LESS_THAN_70 = df.COMPLETION_LESS_THAN_70.astype(float)
df['DATE'] = pd.to_datetime(df['DATE'])
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from utils import readChunk, toCSV

# Per-session engagement export (headerless); name the positional columns.
df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)
df.rename(columns=dict(enumerate(
    ['MONTH', 'USERID', 'SESSIONID', 'STARTHOUR', 'ENDHOUR', 'engagement'])),
    inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
# Keep December 2018 onward only.
df = df.loc[df.MONTH >= 201812]
total_df = df.groupby('USERID')['engagement'].sum().to_frame()
# Divide by 60 — presumably converting seconds to minutes; confirm units.
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
# Fixed: sys was used on the first line without being imported (NameError
# if this is the top of the file — harmless if an earlier chunk imported it).
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.style as style

sns.set()
style.use('seaborn-poster')

# Histogram of how many customers share each WEEK_RETURN_VALUE.
df = readChunk('results/50_WEEK_RETURN_VALUE.csv')
df.WEEK_RETURN_VALUE = df.WEEK_RETURN_VALUE.astype(int)
print(df.head())
print(len(df))
tohist = pd.DataFrame(index=df.WEEK_RETURN_VALUE.unique(),
                      columns=['NUMCUST'])
tohist.index.name = 'WEEK_RETURN_VALUE'
for i in df.WEEK_RETURN_VALUE.unique():
    print(i)
    # .loc[i, 'NUMCUST'] instead of the original chained .loc[i]['NUMCUST'],
    # which assigns through an intermediate object and may not write back.
    tohist.loc[i, 'NUMCUST'] = len(df.loc[df.WEEK_RETURN_VALUE == i])
tohist.sort_index(axis=0, inplace=True)
plot = tohist.plot(kind='bar', colormap='Pastel2')
plt.show()
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../")
import os
import time
import pandas as pd
import numpy as np
from utils import readChunk

# Completion-rate analysis over the full "for_completion_total" query export.
file = "query_results/for_completion_total.csv"
df = readChunk(file)


def countCompletion70(df):
    """Return only the rows whose COMPLETION_70 is at least 70, printing the
    number of distinct customers that reach that threshold."""
    df.COMPLETION_70 = df.COMPLETION_70.astype(float)
    # print("Total Number of Customers: {}".format(len(df.USERID.unique())))
    df = df.loc[df.COMPLETION_70 >= 70]
    print("Total Number of Customers with 70% Completion: {}".format(
        len(df.USERID.unique())))
    print("\n")
    return df


def timeCompletion(df, col):
    # Iterate the distinct values of `col`, slicing the frame per value.
    # NOTE(review): this definition appears truncated at the edge of the
    # visible chunk — the loop body may continue past this point; do not
    # assume `temp` is the final statement.
    for time_comp in df[col].unique():
        print(time_comp)
        temp = df.loc[df[col] == time_comp]