import urllib.request

from bs4 import BeautifulSoup

# constants, toCSV and the csv_file_* targets come from the surrounding
# project module and are not reproduced in this snippet.


def scrape_weekly_data():

    resp = urllib.request.urlopen(constants.WEEKLY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4]

    all_years_months = data_table.find_all('td', {"class": "B6"})
    all_weekend_dates = data_table.find_all('td', {"class": "B5"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
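    # The table repeats in blocks: one year-month cell (class B6) per row,
    # followed by five weekend-date cells (B5) and five price cells (B3).
    # Empty date cells are skipped; after five cells both lists are sliced
    # forward and the inner loop restarts on the remainder.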
    for year_month in all_years_months:
        week = 1
        year_month_string = year_month.string.strip()

        for weekend_date in all_weekend_dates:
            weekend_date_string = weekend_date.string.strip()

            if weekend_date_string:
                date = (year_month_string[:4] + '-' +
                        weekend_date_string.replace('/', '-'))
                records.append([date, all_prices[week - 1].string.strip()])

            week = week + 1

            if week > 5:
                all_weekend_dates = all_weekend_dates[5:]
                all_prices = all_prices[5:]
                break

    toCSV(records, csv_file_weekly)


def scrape_daily_data():
    resp = urllib.request.urlopen(constants.DAILY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]

    all_weeks = data_table.find_all('td', {"class": "B6"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
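    # Each B6 cell holds a week label ('YYYY Mon-DD ...'); the five B3 cells
    # after it hold the weekday prices. Adding the day offset to the start
    # date assumes all five weekdays fall in the same calendar month.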
    for week in all_weeks:
        day = 0

        for price in all_prices:
            week_string = week.string.strip()
            date = (week_string[:4] + '-' +
                    f'{month_num(week_string[5:8]):02}' + '-' +
                    f'{(int(week_string[9:11]) + day):02}')

            day = day + 1
            records.append([date, price.string])

            if day >= 5:
                all_prices = all_prices[5:]
                break

    toCSV(records, csv_file_daily)


def scrape_monthly_data():

    resp = urllib.request.urlopen(constants.MONTHLY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4].find('table')

    all_years = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
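    # One B4 year cell per row, followed by twelve B3 price cells; each price
    # is stamped with the first day of its month.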
    for year in all_years:
        month = 1
        for price in all_prices:
            date = year.string.strip() + '-' + f'{month:02}' + '-' + '01'

            month = month + 1
            records.append([date, price.string])

            if month > 12:
                all_prices = all_prices[12:]
                break

    toCSV(records, csv_file_monthly)


def scrape_annual_data():

    resp = urllib.request.urlopen(constants.ANNUAL_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]

    all_decades = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
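    # One B4 decade cell per row, followed by ten B3 price cells; the first
    # three digits of the decade label plus the 0-9 offset rebuild each year.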
    for decade in all_decades:
        year = 0
        for price in all_prices:
            decade_prefix = int(decade.string.strip()[:3])
            date = str(decade_prefix) + str(year)

            year = year + 1
            records.append([date, price.string])

            if year > 9:
                all_prices = all_prices[10:]
                break

    toCSV(records, csv_file_annual)
Example 5
def combineMonth():
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(data_dir + '/' + str(content_type))):
        if f.endswith(".csv"):
            file = os.path.join(data_dir + '/' + str(content_type), f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    cols = new_df.columns
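    # func (defined elsewhere in the project) presumably returns the position
    # of the user's first active day; 'total' below becomes an all-ones
    # bitstring covering every day since that first occurrence.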
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' *
                                                      (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total']].apply(
        lambda x: int(x[0], 2) / int(x[1], 2), axis=1)
    print(new_df[colname])
    print(cols)
    cols = list(cols) + [colname]  # pd.Index has no in-place append
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
Example 6
def getQuantiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quanti/' + month):
        df = readCSV(os.path.join('../../data/quanti/' + month, f), dtype=str)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        if col == "contentswatched":
            # print(getContentsUnique(all_df, col))
            contents = getContentsUnique(all_df, col)
            new_df = pd.merge(new_df,
                              contents,
                              left_index=True,
                              right_on='gigyaid')
            new_df.drop("contentswatched_x", axis=1, inplace=True)
            new_df.rename({"contentswatched_y": "contentswatched"},
                          axis=1,
                          inplace=True)
        else:
            all_df[col] = all_df[col].astype(float)
            new_df[col] = getSum(all_df, col)[col].values
    toCSV(new_df, "../../data/aggregated/quanti" + month + ".csv")
Example 7
def main(cursor, query, outfile):
    s = time.time()
    df = pd.read_sql(query, con=cursor, chunksize=5000000)
    df = pd.concat(df)
    print(df.head())
    toCSV(df, outfile, index=False)
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total query time: ", total_time)
Example 8
def cleanData(data_dir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]
            for col in remove_comma:
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
Example 9
def extractUser(users, df, outdir):
	for user in df.USERID.unique():
		if user in users:
			print(user)
			temp = df.loc[df.USERID == user]
			new_df = pd.DataFrame(index = list(range(0, 24)), columns = list(range(1, 183)))
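			# For each session row, mark every hour from STARTHOUR to ENDHOUR
			# in that DAY's column with the session's CONTENT_TYPE.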
			for i in temp.index.unique():
				for j in range(temp.loc[i]['STARTHOUR'], temp.loc[i]['ENDHOUR']+1):
					new_df.loc[j, temp.loc[i]['DAY']] = temp.loc[i]['CONTENT_TYPE']
			new_df.index.name = 'HOUR'
			new_df.fillna(0, inplace = True)
			toCSV(new_df, outdir+user+".csv")
Example 10
def extractColumns(data_dir, outdir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.csv'):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.dropna(subset=['USERID'], inplace=True)
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeLurkers(df)
            outfile = os.path.join(outdir, f[-12:])

            toCSV(df, outfile, index=False)
Example 11
def transactionDates():
    print('getting first and last transaction dates of the customers..')
    file = "results/first_and_last_transaction_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: 'USERID',
        1: 'FIRST_TRANSACTION',
        2: 'LAST_TRANSACTION'
    },
              inplace=True)

    file2 = 'results/average_regularity.csv'
    df2 = readChunk(file2)

    df2 = df2.merge(df, how='left', on='USERID')
    df2.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(df2, 'results/transaction_dates.csv', index=False)
Example 12
def getQualiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quali/' + month):
        df = readCSV(os.path.join('../../data/quali/' + month, f),
                     converters=converters)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        all_df[col] = all_df[col].apply(lambda x: [i.upper() for i in x])
        new_df[col] = getUnique(all_df, col)[col].values
    new_df.index.name = "gigyaid"

    toCSV(new_df, "../../data/aggregated/quali" + month + ".csv")
Example 13
def main(data_dir, out_dir):
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df = df[[
                'USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
                'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
                'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT'
            ]]
            s = time.time()
            df['SESSION_STARTDT'] = pd.to_datetime(df['SESSION_STARTDT'])
            df['STARTHOUR'] = df.SESSION_STARTDT.dt.hour
            df['SESSION_ENDDT'] = pd.to_datetime(df['SESSION_ENDDT'])
            df['ENDHOUR'] = df.SESSION_ENDDT.dt.hour
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finish getting hour in {}".format(total_time))
            toCSV(df, os.path.join(out_dir, f), index=False)
Example 14
def customerRegularity(file, regularity_type='mean'):

    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
Example 15
def calculateTenure():
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    s = time.time()
    tenure = []
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
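    # Tenure: active customers are measured from first transaction to the
    # 2019-09-01 cutoff; lost customers from first to last transaction.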
    for i in range(len(df)):
        if df.iloc[i]['CUSTOMERTYPE'] == 'ACTIVE':
            tenure.append((pd.to_datetime('2019-09-01') -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
        else:
            tenure.append((df.iloc[i]['LAST_TRANSACTION'] -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
Example 16
def generateMonth():
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
            df.DAY = df.DAY.astype(int)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]

            new_df = pd.DataFrame(index=df.USERID.unique())
            new_df.index.name = 'USERID'
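            # Flag with '1', day by day, the users who watched this
            # content_type on that day (November data is excluded above).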
            temp = df.loc[df.CONTENT_TYPE == content_type]
            for i in range(df.DAY.min(), df.DAY.max() + 1):
                temp2 = temp.loc[temp.DAY == i]
                group = temp2.groupby(['USERID'])['DAY'].count().to_frame()
                group.DAY = group.DAY.apply(lambda x: np.nan
                                            if np.isnan(x) else '1')
                group.rename(columns={'DAY': str(i)}, inplace=True)
                new_df = new_df.merge(group, how='left', on='USERID')
            toCSV(new_df, 'results/' + str(content_type) + '/' + f)
Example 17
def getCustomerType():
    print('getting customer types...')
    transact = readChunk('results/transaction_dates.csv')
    aver = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    transact = transact.merge(aver, how='left', on='USERID')
    transact = transact.merge(intersession, how='right', on='USERID')
    transact['LAST_TRANSACTION'] = pd.to_datetime(transact['LAST_TRANSACTION'])
    print(transact.head())
    transact['RWEEK'] = transact['RWEEK'].astype(float)
    s = time.time()
    transact['INACTIVITY_DAYS'] = transact['LAST_TRANSACTION'].apply(
        lambda x: (pd.to_datetime('2019-09-01') - x).days)
    transact['INACTIVITY_DAYS'] = transact['INACTIVITY_DAYS'].apply(
        lambda x: 0 if x == -1 else x).astype(float)
    transact = customerType2(transact, how='new')
    print(transact.head(10))
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(transact, 'results/customer_type.csv', index=False)
Example 18
def combineMonth(data_dir, outfile, check_login=False):
    all_df = []
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df.dropna(subset=['USERID'], inplace=True)
            if check_login:
                df.USERID = df.USERID.astype(str)
                df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
                df = removeNotLoggedIn(df)
                df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
                df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
                df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
                df = df.loc[df.SESSION_STARTDT_MONTH != 11]
                df['DAY'] = df[[
                    'SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY'
                ]].apply(lambda x: getCustomerDay(x[0], x[1]), axis=1)
                df = df.loc[df.CONTENT_TYPE != 'nan']

                df.replace({'CONTENT_TYPE': content_type}, inplace=True)
            all_df.append(df)
    all_df = pd.concat(all_df)
    all_df = all_df[keepcols]
    toCSV(all_df, outfile, index=False)
Example 19
def main(file, f):
	s = time.time()
	quali_out = os.path.join("../../data/quali/"+month, f[-12:])
	quanti_out = os.path.join("../../data/quanti/"+month, f[-12:])
	month_out = os.path.join("../../data/month/"+month, f[-12:])
	pool = Pool()
	df = pool.apply(getQualitative, args=(file, [
		"gigyaid", "devicetype", "deviceos", "browsertype", "connectivitytype",
		"devicename", "mobiledevice", "screensize", "videoquality", "ipaddress"
	]))
	toCSV(df, quali_out)
	df = pool.apply(getQuantitative, args=(file, [
		"gigyaid", "viewpageduration", "pagedepth", "actiontaken", "videotitle",
		"bigdatasessionid"
	]))
	toCSV(df, quanti_out)
	df = pool.apply(getDate, args=(file, [
		"gigyaid", "bigdatasessionid", "sessionstarttimestamp",
		"sessionendtimestamp", "viewpageduration"
	]))
	toCSV(df, month_out)
	pool.close()
	pool.join()
	e = time.time()
	total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
	print("Total process time: ", total_time)
Example 20
df.rename(columns={
    0: 'USERID',  # assumed; USERID is used as the merge key below
    1: 'SESSIONID',
    2: 'ADPLAY_COUNT',
    3: 'PLAY_COUNT',
    4: 'PAUSE_COUNT',
    5: 'RESUME_COUNT',
    6: 'SEEK_COUNT'
},
          inplace=True)
df.drop(columns=['SEEK_COUNT'], axis=1, inplace=True)

cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for i in cols:
    df[i] = df[i].astype(int)

new_df = pd.DataFrame(index=df.USERID.unique())
new_df.index.name = "USERID"
new_df.reset_index(inplace=True)
for i in cols:
    new_df = new_df.merge(df.groupby('USERID')[i].sum().to_frame(),
                          how='left',
                          on='USERID')

df = readChunk('seek2.csv', header=None)
df.rename(columns={0: 'USERID', 1: 'SESSIONID', 2: 'SEEK_COUNT'}, inplace=True)
df.SEEK_COUNT = df.SEEK_COUNT.astype(int)
new_df = new_df.merge(df.groupby('USERID')['SEEK_COUNT'].sum().to_frame(),
                      how='left',
                      on='USERID')
print(new_df.head())
toCSV(new_df, 'CLICK.csv')
Example 21
df.rename(columns={
    0: "USERID",
    1: "SESSIONID",
    2: "ADPLAY",  # assumed ordering, mirroring the *_COUNT example above
    3: "PLAY",
    4: "PAUSE",
    5: "RESUME",
    6: "SEEK"
},
          inplace=True)
df.drop(columns=['SEEK'], inplace=True)
cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME"]

# Assumed setup, mirroring the previous example: one output row per user.
new_df = pd.DataFrame(index=df.USERID.unique())
new_df.index.name = "USERID"
new_df.reset_index(inplace=True)

for i in cols:
    df[i] = pd.to_numeric(df[i], errors="coerce")
    new_df = new_df.merge(df.groupby("USERID")[i].sum().to_frame(),
                          how='left',
                          on='USERID')

print(new_df.head())
df = readChunk("../characterization/seek2.csv", header=None)
df.rename(columns={0: "USERID", 1: "SESSIONID", 3: "SEEK"}, inplace=True)
df.SEEK = pd.to_numeric(df.SEEK, errors="coerce")
new_df = new_df.merge(df.groupby("USERID")["SEEK"].sum().to_frame(),
                      how='left',
                      on="USERID")

cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME", "SEEK"]
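# Turn the raw action counts into per-user proportions of total actions.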
new_df["TOTAL"] = 0
for i in cols:
    new_df["TOTAL"] = new_df["TOTAL"] + new_df[i]

for i in cols:
    new_df[i] = new_df[i] / new_df["TOTAL"]
print(new_df.head())
toCSV(new_df, "engagement_attributes.csv", index=False)
Example 22
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style

sns.set()
style.use('seaborn-poster')
style.use('bmh')

df = readChunk("tv_completion.csv", header=None)
df.rename(columns={0: 'USERID', 1: 'TITLE', 2: 'VIDEO_DURATION',
                   3: 'WATCHING_DURATION'}, inplace=True)
df.VIDEO_DURATION = pd.to_numeric(df.VIDEO_DURATION, errors='coerce')

df['CONTENT_COMPLETION'] = (df.WATCHING_DURATION / df.VIDEO_DURATION) * 100
toCSV(df[['CONTENT_COMPLETION']], 'content_completion.csv', index=False)
Example 23
convert = {'12': '0'}
df.replace({'SESSION_STARTDT_MONTH': convert}, inplace=True)
toint = ['SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY', 'STARTHOUR']
for i in toint:
    df[i] = df[i].astype(int)
# Pack month, day and hour into a single sortable key (two digits each for
# day and hour, so values cannot collide).
df['ORDER'] = (df.SESSION_STARTDT_MONTH * 10000 +
               df.SESSION_STARTDT_DAY * 100 + df.STARTHOUR)

df.sort_values('ORDER', inplace=True)

print(len(df))
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))

labels = pd.read_csv('clustering_6.csv')
labels.columns = labels.columns.str.upper()

df = df.merge(labels, how='left', on='USERID')

# For each cluster label, count which CONTENT_TYPE each customer watched
# first (only the earliest session per user survives the de-duplication
# above).
for i in df.LABEL.unique():
    print(i)
    temp = df.loc[df.LABEL == i]
    new_df = pd.DataFrame(index=list(range(1, 11)), columns=['COUNT'])
    for j in df.CONTENT_TYPE.unique():
        temp2 = temp.loc[temp.CONTENT_TYPE == j]
        new_df.loc[int(j), 'COUNT'] = len(temp2)

    print(new_df)
    new_df.index.name = 'CONTENT_TYPE'
    toCSV(new_df, 'results/firstwatched/' + str(i) + '.csv')
Example 24
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV

file = "../data/year_week.csv"
df = readChunk(file, sep="\t")

print(df.head())
df.WEEK = df.WEEK.astype(int)
df.sort_values('WEEK', inplace=True)
print(len(df))
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))

df.rename(columns={'WEEK': 'INCEPTION_WEEK'}, inplace=True)
toCSV(df[['USERID', 'INCEPTION_WEEK']], "inception_week.csv", index=False)
Example 25
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk("../sql/query_results/plateu_month.csv")

df.rename(columns={'COUNT(SESSIONID)': 'FREQUENCY'}, inplace=True)
df.FREQUENCY = df.FREQUENCY.astype(int)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]
total_df = pd.DataFrame(index=df.index.unique(), columns=['frequency'])

total_df = df.groupby('USERID')['FREQUENCY'].sum().to_frame()

# for i in df.MONTH.unique():
# 	print(i)
# 	temp = df.loc[df.MONTH == i]
# 	users = temp.index.unique()
# 	for j in users:
# 		total_df.loc[j]['frequency'] = total_df.loc[j]['frequency'] + temp.loc[j]['FREQUENCY']

print(total_df.head())
total_df.index.name = 'USERID'
toCSV(total_df, 'results/overall_frequency.csv')
Example 26
if col == 'MONTH':  # reconstructed branch, symmetric with the WEEK case
    init = list(df.loc[df.MONTH == 201812.0].USERID.unique())
    df = df.loc[df.MONTH >= 201812.0]
elif col == 'WEEK':
    init = list(df.loc[df.WEEK == 201848.0].USERID.unique())
    df = df.loc[df.WEEK >= 201848.0]

new_df = pd.DataFrame(index=df.USERID.unique(), columns=[colname], data=0)
new_df.index.name = 'USERID'
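
# Retention: for each later period, count the users who had already appeared
# in an earlier period and show up again in that one.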

for i in df[col].unique():
    print(i)
    if col == 'MONTH':
        if i == 201812.0: continue
    elif col == 'WEEK':
        if i == 201848.0: continue
    temp = df.loc[df[col] == i]
    users = list(temp.USERID.unique())
    common = list(set(init).intersection(users))
    for j in range(len(common)):
        new_df.loc[common[j], colname] = new_df.loc[common[j], colname] + 1

    # new_df[colname] = new_df.apply(lambda x: addvalue(x[colname]) if str(x.index) in common else x[colname], axis = 1)
    init.extend(users)
    init = list(set(init))

print(new_df.head())
print(new_df[colname].unique())
toCSV(new_df, out)

e = time.time()
total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
print("Total time: ", total_time)
Example 27
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)

df.rename(columns={
    0: 'MONTH',
    1: 'USERID',
    2: 'SESSIONID',
    3: 'STARTHOUR',
    4: 'ENDHOUR',
    5: 'engagement'
},
          inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]

total_df = df.groupby('USERID')['engagement'].sum().to_frame()
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
Example 28
diverse = pd.read_csv("../../data/customer_feature_matrix.csv")
diverse.columns = diverse.columns.str.upper()
diverse = diverse.loc[diverse.FREQUENCY != 1]
diverse = diverse.loc[diverse.LABEL != 'NEW']
diverse = diverse[['USERID', 'LABEL']]
uniquecust = diverse.USERID.unique()
print(len(uniquecust))
# np.savetxt('results/diversecustomers.txt', uniquecust, delimiter = ',', fmt = "%s")
file = "../../data/regularity_cleaned_ordered.csv"
df = readChunk(file, header=None)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'MONTH',
    3: 'WEEK',
    4: 'DATE',
    5: 'DAY_OF_WEEK'
},
          inplace=True)
print(len(df))

old = 0
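# Write the selected customers' sessions out in blocks of 50,000 users each.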
for i in range(1, 63):
    new = 50000 * i
    print(old, new)
    temp = uniquecust[old:new]
    temp2 = df[df['USERID'].isin(temp)]
    print(len(temp2))
    toCSV(temp2, 'results/all/' + str(i) + '.csv', index=False)
    # np.savetxt('results/all/'+str(i)+'.txt', temp, delimiter = ',', fmt = "%s")
    old = new  # slice end is exclusive, so the next block starts at new
Example 29
def prepare_all_data():
    for series_id, csv_file_name in series_dict.items():
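        # The EIA v1 series endpoint returns JSON whose series[0]['data'] is
        # a list of [period, value] pairs, ready to write straight to CSV.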
        resp_data = retrieve_uri_data(
            f'http://api.eia.gov/series/'
            f'?api_key={constants.EIA_API_KEY}&series_id={series_id}')
        data = resp_data['series'][0]['data']
        toCSV(data, csv_file_name)
Example 30
File: rfe.py Project: ririgi/seele
df.DATE = pd.to_datetime(df.DATE)
df.sort_values("DATE", inplace = True)
df.drop_duplicates(subset = ['USERID'], keep = 'last', inplace = True)
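# Recency: days from each user's last session to the 2019-09-01 cutoff used
# throughout these scripts.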
recency = []
for i in range(len(df)):
	recency.append((pd.to_datetime('2019-09-01') - df.iloc[i]["DATE"]).days)
df["RECENCY"] = recency
print(df.head(10))
print(len(df))
df = df[['USERID', 'RECENCY']]

print('getting engagement')
file2 = "../data/eng_current.csv"
df2 = readChunk(file2, sep = '\t')
df2.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "TIMESTAMPDIFF(MINUTE, MIN(SESSION_STARTDT), MAX(SESSION_ENDDT))": 'ENGAGEMENT'
}, inplace=True)
file3 = "../data/eng_old.csv"
df3 = readChunk(file3, sep = '\t')
df3.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "COUNT(DISTINCT SESSIONID)": 'FREQUENCY'
}, inplace=True)

df2 = pd.concat([df2, df3])
# df3 has no ENGAGEMENT column, so the concat above leaves NaNs; coerce
# instead of astype(int), which raises on non-finite values.
df2.ENGAGEMENT = pd.to_numeric(df2.ENGAGEMENT, errors='coerce')
eng = df2.groupby('USERID')['ENGAGEMENT'].sum().to_frame()
print(eng.head(10))
print(len(eng))

df = df.merge(freq, how = 'left', on = 'USERID')
df = df.merge(eng, how = 'left', on = 'USERID')
print(len(df))

toCSV(df, 'results/channel2.csv', index = False)