import urllib.request

from bs4 import BeautifulSoup

# constants, toCSV and the csv_file_* targets come from the surrounding
# project module and are not reproduced in this snippet.


def scrape_weekly_data():

    resp = urllib.request.urlopen(constants.WEEKLY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4]

    all_years_months = data_table.find_all('td', {"class": "B6"})
    all_weekend_dates = data_table.find_all('td', {"class": "B5"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
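    # The table repeats in blocks: one year-month cell (class B6) per row,
    # followed by five weekend-date cells (B5) and five price cells (B3).
    # Empty date cells are skipped; after five cells both lists are sliced
    # forward and the inner loop restarts on the remainder.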
    for year_month in all_years_months:
        week = 1
        year_month_string = year_month.string.strip()

        for weekend_date in all_weekend_dates:
            weekend_date_string = weekend_date.string.strip()

            if weekend_date_string:
                date = (year_month_string[:4] + '-' +
                        weekend_date_string.replace('/', '-'))
                records.append([date, all_prices[week - 1].string.strip()])

            week = week + 1

            if week > 5:
                all_weekend_dates = all_weekend_dates[5:]
                all_prices = all_prices[5:]
                break

    toCSV(records, csv_file_weekly)


def scrape_daily_data():
    resp = urllib.request.urlopen(constants.DAILY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]

    all_weeks = data_table.find_all('td', {"class": "B6"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
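    # Each B6 cell holds a week label ('YYYY Mon-DD ...'); the five B3 cells
    # after it hold the weekday prices. Adding the day offset to the start
    # date assumes all five weekdays fall in the same calendar month.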
    for week in all_weeks:
        day = 0

        for price in all_prices:
            week_string = week.string.strip()
            date = (week_string[:4] + '-' +
                    f'{month_num(week_string[5:8]):02}' + '-' +
                    f'{(int(week_string[9:11]) + day):02}')

            day = day + 1
            records.append([date, price.string])

            if day >= 5:
                all_prices = all_prices[5:]
                break

    toCSV(records, csv_file_daily)


def scrape_monthly_data():

    resp = urllib.request.urlopen(constants.MONTHLY_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[4].find('table')

    all_years = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
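    # One B4 year cell per row, followed by twelve B3 price cells; each price
    # is stamped with the first day of its month.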
    for year in all_years:
        month = 1
        for price in all_prices:
            date = year.string.strip() + '-' + f'{month:02}' + '-' + '01'

            month = month + 1
            records.append([date, price.string])

            if month > 12:
                all_prices = all_prices[12:]
                break

    toCSV(records, csv_file_monthly)


def scrape_annual_data():

    resp = urllib.request.urlopen(constants.ANNUAL_SERIES_URI)
    resp_data = resp.read()

    soup = BeautifulSoup(resp_data, 'html.parser')
    data_table = soup.find_all('table')[5]

    all_decades = data_table.find_all('td', {"class": "B4"})
    all_prices = data_table.find_all('td', {"class": "B3"})

    records = []
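    # One B4 decade cell per row, followed by ten B3 price cells; the first
    # three digits of the decade label plus the 0-9 offset rebuild each year.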
    for decade in all_decades:
        year = 0
        for price in all_prices:
            decade_prefix = int(decade.string.strip()[:3])
            date = str(decade_prefix) + str(year)

            year = year + 1
            records.append([date, price.string])

            if year > 9:
                all_prices = all_prices[10:]
                break

    toCSV(records, csv_file_annual)
Example 5
def combineMonth():
    new_df = pd.DataFrame()
    for f in sorted(os.listdir(data_dir + '/' + str(content_type))):
        if f.endswith(".csv"):
            file = os.path.join(data_dir + '/' + str(content_type), f)
            if len(new_df) == 0:
                new_df = readChunk(file)
            else:
                df = readChunk(file)
                new_df = new_df.merge(df, how='left', on='USERID')
    new_df.set_index('USERID', inplace=True)
    cols = new_df.columns
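    # func (defined elsewhere in the project) presumably returns the position
    # of the user's first active day; 'total' below becomes an all-ones
    # bitstring covering every day since that first occurrence.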
    new_df['first_occurence'] = new_df.apply(func, axis=1)
    for i in cols:
        new_df[i] = new_df[i].apply(lambda x: '0' if np.isnan(x) else '1')
    new_df['total'] = new_df['first_occurence'].apply(lambda x: '1' *
                                                      (32 - int(x)))
    new_df[cols] = new_df[cols].astype(str)
    new_df['all'] = new_df[cols].apply(''.join, axis=1)
    print(new_df[['all', 'total']])
    new_df[colname] = new_df[['all', 'total']].apply(
        lambda x: int(x[0], 2) / int(x[1], 2), axis=1)
    print(new_df[colname])
    print(cols)
    cols = list(cols) + [colname]  # pd.Index has no in-place append
    print(cols)
    toCSV(new_df[cols], outfile, index=True)
Example 6
def getQuantiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quanti/' + month):
        df = readCSV(os.path.join('../../data/quanti/' + month, f), dtype=str)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        if col == "contentswatched":
            # print(getContentsUnique(all_df, col))
            contents = getContentsUnique(all_df, col)
            new_df = pd.merge(new_df,
                              contents,
                              left_index=True,
                              right_on='gigyaid')
            new_df.drop("contentswatched_x", axis=1, inplace=True)
            new_df.rename({"contentswatched_y": "contentswatched"},
                          axis=1,
                          inplace=True)
        else:
            all_df[col] = all_df[col].astype(float)
            new_df[col] = getSum(all_df, col)[col].values
    toCSV(new_df, "../../data/aggregated/quanti" + month + ".csv")
Example 7
def main(cursor, query, outfile):
    s = time.time()
    df = pd.read_sql(query, con=cursor, chunksize=5000000)
    df = pd.concat(df)
    print(df.head())
    toCSV(df, outfile, index=False)
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total query time: ", total_time)
Example 8
def cleanData(data_dir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df = df[cols]
            for col in remove_comma:
                df[col] = df[col].astype(str).str.replace(",", " ")
            toCSV(df, file, index=False)
Example 9
def extractUser(users, df, outdir):
	for user in df.USERID.unique():
		if user in users:
			print(user)
			temp = df.loc[df.USERID == user]
			new_df = pd.DataFrame(index = list(range(0, 24)), columns = list(range(1, 183)))
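			# For each session row, mark every hour from STARTHOUR to ENDHOUR
			# in that DAY's column with the session's CONTENT_TYPE.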
			for i in temp.index.unique():
				for j in range(temp.loc[i]['STARTHOUR'], temp.loc[i]['ENDHOUR']+1):
					new_df.loc[j, temp.loc[i]['DAY']] = temp.loc[i]['CONTENT_TYPE']
			new_df.index.name = 'HOUR'
			new_df.fillna(0, inplace = True)
			toCSV(new_df, outdir+user+".csv")
Example 10
def extractColumns(data_dir, outdir):
    print(data_dir)
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.csv'):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.dropna(subset=['USERID'], inplace=True)
            df.USERID = df.USERID.astype(str)
            df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
            df = removeLurkers(df)
            outfile = os.path.join(outdir, f[-12:])

            toCSV(df, outfile, index=False)
Example 11
def transactionDates():
    print('getting first and last transaction dates of the customers..')
    file = "results/first_and_last_transaction_correct.csv"
    df = readChunk(file, header=None)
    df.rename(columns={
        0: 'USERID',
        1: 'FIRST_TRANSACTION',
        2: 'LAST_TRANSACTION'
    },
              inplace=True)

    file2 = 'results/average_regularity.csv'
    df2 = readChunk(file2)

    df2 = df2.merge(df, how='left', on='USERID')
    df2.drop(['RWEEK'], axis=1, inplace=True)
    toCSV(df2, 'results/transaction_dates.csv', index=False)
Example 12
def getQualiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quali/' + month):
        df = readCSV(os.path.join('../../data/quali/' + month, f),
                     converters=converters)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        all_df[col] = all_df[col].apply(lambda x: [i.upper() for i in x])
        new_df[col] = getUnique(all_df, col)[col].values
    new_df.index.name = "gigyaid"

    toCSV(new_df, "../../data/aggregated/quali" + month + ".csv")
Example 13
def main(data_dir, out_dir):
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df = df[[
                'USERID', 'SESSIONID', 'PRIMARY_FINGERPRINT', 'CONTENT_TYPE',
                'VIDEO_CATEGORY_TITLE', 'SESSION_STARTDT_MONTH',
                'SESSION_STARTDT_DAY', 'SESSION_STARTDT', 'SESSION_ENDDT'
            ]]
            s = time.time()
            df['SESSION_STARTDT'] = pd.to_datetime(df['SESSION_STARTDT'])
            df['STARTHOUR'] = df.SESSION_STARTDT.dt.hour
            df['SESSION_ENDDT'] = pd.to_datetime(df['SESSION_ENDDT'])
            df['ENDHOUR'] = df.SESSION_ENDDT.dt.hour
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finish getting hour in {}".format(total_time))
            toCSV(df, os.path.join(out_dir, f), index=False)
Example 14
def customerRegularity(file, regularity_type='mean'):

    print('calculating regularity of type: ', regularity_type)
    df = readChunk(file)
    # df.rename(columns = {0:'WEEK', 8:'RWEEK', 9:'USERID'}, inplace = True)
    print('Number of customers: ', len(df.USERID.unique()))
    s = time.time()
    df['RWEEK'] = df['RWEEK'].astype(int)
    if regularity_type == 'mean':
        new_df = df.groupby('USERID')['RWEEK'].mean().to_frame()
    elif regularity_type == 'mode':
        new_df = df.groupby('USERID')['RWEEK'].agg(
            lambda x: pd.Series.mode(x)[0]).to_frame()
    new_df['RWEEK'] = round(new_df['RWEEK'])
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(new_df, 'results/average_regularity.csv')
Example 15
def calculateTenure():
    print('calculating tenure of the active and lost customers..')
    df = readChunk('results/customer_type.csv')
    s = time.time()
    tenure = []
    df['FIRST_TRANSACTION'] = pd.to_datetime(df['FIRST_TRANSACTION'])
    df['LAST_TRANSACTION'] = pd.to_datetime(df['LAST_TRANSACTION'])
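    # Tenure: active customers are measured from first transaction to the
    # 2019-09-01 cutoff; lost customers from first to last transaction.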
    for i in range(len(df)):
        if df.iloc[i]['CUSTOMERTYPE'] == 'ACTIVE':
            tenure.append((pd.to_datetime('2019-09-01') -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
        else:
            tenure.append((df.iloc[i]['LAST_TRANSACTION'] -
                           df.iloc[i]['FIRST_TRANSACTION']).days)
    df['TENURE'] = tenure
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    print(df.head(10))
    toCSV(df, 'results/tenure.csv', index=False)
Example 16
def generateMonth():
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            file = os.path.join(data_dir, f)
            df = readChunk(file)
            df.CONTENT_TYPE = df.CONTENT_TYPE.astype(int)
            df.DAY = df.DAY.astype(int)
            df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
            df = df.loc[df.SESSION_STARTDT_MONTH != 11]

            new_df = pd.DataFrame(index=df.USERID.unique())
            new_df.index.name = 'USERID'
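            # Flag with '1', day by day, the users who watched this
            # content_type on that day (November data is excluded above).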
            temp = df.loc[df.CONTENT_TYPE == content_type]
            for i in range(df.DAY.min(), df.DAY.max() + 1):
                temp2 = temp.loc[temp.DAY == i]
                group = temp2.groupby(['USERID'])['DAY'].count().to_frame()
                group.DAY = group.DAY.apply(lambda x: np.nan
                                            if np.isnan(x) else '1')
                group.rename(columns={'DAY': str(i)}, inplace=True)
                new_df = new_df.merge(group, how='left', on='USERID')
            toCSV(new_df, 'results/' + str(content_type) + '/' + f)
Example 17
def getCustomerType():
    print('getting customer types...')
    transact = readChunk('results/transaction_dates.csv')
    aver = readChunk('results/average_regularity.csv')
    intersession = pd.read_csv('results/intersession.csv')
    intersession.columns = intersession.columns.str.upper()
    transact = transact.merge(aver, how='left', on='USERID')
    transact = transact.merge(intersession, how='right', on='USERID')
    transact['LAST_TRANSACTION'] = pd.to_datetime(transact['LAST_TRANSACTION'])
    print(transact.head())
    transact['RWEEK'] = transact['RWEEK'].astype(float)
    s = time.time()
    transact['INACTIVITY_DAYS'] = transact['LAST_TRANSACTION'].apply(
        lambda x: (pd.to_datetime('2019-09-01') - x).days)
    transact['INACTIVITY_DAYS'] = transact['INACTIVITY_DAYS'].apply(
        lambda x: 0 if x == -1 else x).astype(float)
    transact = customerType2(transact, how='new')
    print(transact.head(10))
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time is {}".format(total_time))
    toCSV(transact, 'results/customer_type.csv', index=False)
Example 18
def combineMonth(data_dir, outfile, check_login=False):
    all_df = []
    for f in sorted(os.listdir(data_dir)):
        if f.endswith(".csv"):
            df = readChunk(os.path.join(data_dir, f))
            df.dropna(subset=['USERID'], inplace=True)
            if check_login:
                df.USERID = df.USERID.astype(str)
                df.PRIMARY_FINGERPRINT = df.PRIMARY_FINGERPRINT.astype(str)
                df = removeNotLoggedIn(df)
                df.CONTENT_TYPE = df.CONTENT_TYPE.astype(str)
                df.SESSION_STARTDT_MONTH = df.SESSION_STARTDT_MONTH.astype(int)
                df.SESSION_STARTDT_DAY = df.SESSION_STARTDT_DAY.astype(int)
                df = df.loc[df.SESSION_STARTDT_MONTH != 11]
                df['DAY'] = df[[
                    'SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY'
                ]].apply(lambda x: getCustomerDay(x[0], x[1]), axis=1)
                df = df.loc[df.CONTENT_TYPE != 'nan']

                df.replace({'CONTENT_TYPE': content_type}, inplace=True)
            all_df.append(df)
    all_df = pd.concat(all_df)
    all_df = all_df[keepcols]
    toCSV(all_df, outfile, index=False)
Example 19
def main(file, f):
	s = time.time()
	quali_out = os.path.join("../../data/quali/"+month, f[-12:])
	quanti_out = os.path.join("../../data/quanti/"+month, f[-12:])
	month_out = os.path.join("../../data/month/"+month, f[-12:])
	pool = Pool()
	df = pool.apply(getQualitative, args=(file, [
		"gigyaid", "devicetype", "deviceos", "browsertype", "connectivitytype",
		"devicename", "mobiledevice", "screensize", "videoquality", "ipaddress"
	]))
	toCSV(df, quali_out)
	df = pool.apply(getQuantitative, args=(file, [
		"gigyaid", "viewpageduration", "pagedepth", "actiontaken", "videotitle",
		"bigdatasessionid"
	]))
	toCSV(df, quanti_out)
	df = pool.apply(getDate, args=(file, [
		"gigyaid", "bigdatasessionid", "sessionstarttimestamp",
		"sessionendtimestamp", "viewpageduration"
	]))
	toCSV(df, month_out)
	pool.close()
	pool.join()
	e = time.time()
	total_time = time.strftime("%H:%M:%S", time.gmtime(e-s))
	print("Total process time: ", total_time)
Example 20
df.rename(columns={
    0: 'USERID',  # assumed; USERID is used as the merge key below
    1: 'SESSIONID',
    2: 'ADPLAY_COUNT',
    3: 'PLAY_COUNT',
    4: 'PAUSE_COUNT',
    5: 'RESUME_COUNT',
    6: 'SEEK_COUNT'
},
          inplace=True)
df.drop(columns=['SEEK_COUNT'], axis=1, inplace=True)

cols = ['ADPLAY_COUNT', 'PLAY_COUNT', 'PAUSE_COUNT', 'RESUME_COUNT']
for i in cols:
    df[i] = df[i].astype(int)

new_df = pd.DataFrame(index=df.USERID.unique())
new_df.index.name = "USERID"
new_df.reset_index(inplace=True)
for i in cols:
    new_df = new_df.merge(df.groupby('USERID')[i].sum().to_frame(),
                          how='left',
                          on='USERID')

df = readChunk('seek2.csv', header=None)
df.rename(columns={0: 'USERID', 1: 'SESSIONID', 2: 'SEEK_COUNT'}, inplace=True)
df.SEEK_COUNT = df.SEEK_COUNT.astype(int)
new_df = new_df.merge(df.groupby('USERID')['SEEK_COUNT'].sum().to_frame(),
                      how='left',
                      on='USERID')
print(new_df.head())
toCSV(new_df, 'CLICK.csv')
Example 21
df.rename(columns={
    0: "USERID",
    1: "SESSIONID",
    2: "ADPLAY",  # assumed ordering, mirroring the *_COUNT example above
    3: "PLAY",
    4: "PAUSE",
    5: "RESUME",
    6: "SEEK"
},
          inplace=True)
df.drop(columns=['SEEK'], inplace=True)
cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME"]

# Assumed setup, mirroring the previous example: one output row per user.
new_df = pd.DataFrame(index=df.USERID.unique())
new_df.index.name = "USERID"
new_df.reset_index(inplace=True)

for i in cols:
    df[i] = pd.to_numeric(df[i], errors="coerce")
    new_df = new_df.merge(df.groupby("USERID")[i].sum().to_frame(),
                          how='left',
                          on='USERID')

print(new_df.head())
df = readChunk("../characterization/seek2.csv", header=None)
df.rename(columns={0: "USERID", 1: "SESSIONID", 3: "SEEK"}, inplace=True)
df.SEEK = pd.to_numeric(df.SEEK, errors="coerce")
new_df = new_df.merge(df.groupby("USERID")["SEEK"].sum().to_frame(),
                      how='left',
                      on="USERID")

cols = ["ADPLAY", "PLAY", "PAUSE", "RESUME", "SEEK"]
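# Turn the raw action counts into per-user proportions of total actions.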
new_df["TOTAL"] = 0
for i in cols:
    new_df["TOTAL"] = new_df["TOTAL"] + new_df[i]

for i in cols:
    new_df[i] = new_df[i] / new_df["TOTAL"]
print(new_df.head())
toCSV(new_df, "engagement_attributes.csv", index=False)
Example 22
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.style as style

sns.set()
style.use('seaborn-poster')
style.use('bmh')

df = readChunk("tv_completion.csv", header=None)
df.rename(columns={0: 'USERID', 1: 'TITLE', 2: 'VIDEO_DURATION',
                   3: 'WATCHING_DURATION'}, inplace=True)
df.VIDEO_DURATION = pd.to_numeric(df.VIDEO_DURATION, errors='coerce')

df['CONTENT_COMPLETION'] = (df.WATCHING_DURATION / df.VIDEO_DURATION) * 100
toCSV(df[['CONTENT_COMPLETION']], 'content_completion.csv', index=False)
Example 23
convert = {'12': '0'}
df.replace({'SESSION_STARTDT_MONTH': convert}, inplace=True)
toint = ['SESSION_STARTDT_MONTH', 'SESSION_STARTDT_DAY', 'STARTHOUR']
for i in toint:
    df[i] = df[i].astype(int)
# Pack month, day and hour into a single sortable key (two digits each for
# day and hour, so values cannot collide).
df['ORDER'] = (df.SESSION_STARTDT_MONTH * 10000 +
               df.SESSION_STARTDT_DAY * 100 + df.STARTHOUR)

df.sort_values('ORDER', inplace=True)

print(len(df))
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))

labels = pd.read_csv('clustering_6.csv')
labels.columns = labels.columns.str.upper()

df = df.merge(labels, how='left', on='USERID')

# For each cluster label, count which CONTENT_TYPE each customer watched
# first (only the earliest session per user survives the de-duplication
# above).
for i in df.LABEL.unique():
    print(i)
    temp = df.loc[df.LABEL == i]
    new_df = pd.DataFrame(index=list(range(1, 11)), columns=['COUNT'])
    for j in df.CONTENT_TYPE.unique():
        temp2 = temp.loc[temp.CONTENT_TYPE == j]
        new_df.loc[int(j), 'COUNT'] = len(temp2)

    print(new_df)
    new_df.index.name = 'CONTENT_TYPE'
    toCSV(new_df, 'results/firstwatched/' + str(i) + '.csv')
Example 24
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")

import os
import time
import pandas as pd
import numpy as np

from utils import readChunk, toCSV

file = "../data/year_week.csv"
df = readChunk(file, sep="\t")

print(df.head())
df.WEEK = df.WEEK.astype(int)
df.sort_values('WEEK', inplace=True)
print(len(df))
df.drop_duplicates(subset=['USERID'], keep='first', inplace=True)
print(len(df))

df.rename(columns={'WEEK': 'INCEPTION_WEEK'}, inplace=True)
toCSV(df[['USERID', 'INCEPTION_WEEK']], "inception_week.csv", index=False)
Example 25
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk("../sql/query_results/plateu_month.csv")

df.rename(columns={'COUNT(SESSIONID)': 'FREQUENCY'}, inplace=True)
df.FREQUENCY = df.FREQUENCY.astype(int)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]
total_df = pd.DataFrame(index=df.index.unique(), columns=['frequency'])

total_df = df.groupby('USERID')['FREQUENCY'].sum().to_frame()

# for i in df.MONTH.unique():
# 	print(i)
# 	temp = df.loc[df.MONTH == i]
# 	users = temp.index.unique()
# 	for j in users:
# 		total_df.loc[j]['frequency'] = total_df.loc[j]['frequency'] + temp.loc[j]['FREQUENCY']

print(total_df.head())
total_df.index.name = 'USERID'
toCSV(total_df, 'results/overall_frequency.csv')
Example 26
if col == 'MONTH':  # reconstructed branch, symmetric with the WEEK case
    init = list(df.loc[df.MONTH == 201812.0].USERID.unique())
    df = df.loc[df.MONTH >= 201812.0]
elif col == 'WEEK':
    init = list(df.loc[df.WEEK == 201848.0].USERID.unique())
    df = df.loc[df.WEEK >= 201848.0]

new_df = pd.DataFrame(index=df.USERID.unique(), columns=[colname], data=0)
new_df.index.name = 'USERID'
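
# Retention: for each later period, count the users who had already appeared
# in an earlier period and show up again in that one.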

for i in df[col].unique():
    print(i)
    if col == 'MONTH':
        if i == 201812.0: continue
    elif col == 'WEEK':
        if i == 201848.0: continue
    temp = df.loc[df[col] == i]
    users = list(temp.USERID.unique())
    common = list(set(init).intersection(users))
    for j in range(len(common)):
        new_df.loc[common[j], colname] = new_df.loc[common[j], colname] + 1

    # new_df[colname] = new_df.apply(lambda x: addvalue(x[colname]) if str(x.index) in common else x[colname], axis = 1)
    init.extend(users)
    init = list(set(init))

print(new_df.head())
print(new_df[colname].unique())
toCSV(new_df, out)

e = time.time()
total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
print("Total time: ", total_time)
Example 27
import sys
sys.path.append("../")

import pandas as pd
import numpy as np

from utils import readChunk, toCSV

df = readChunk(
    "../../events/MONTH_SESSION_TIME_CATEGORY_WITH_TIME_DURATION.csv",
    header=None)

df.rename(columns={
    0: 'MONTH',
    1: 'USERID',
    2: 'SESSIONID',
    3: 'STARTHOUR',
    4: 'ENDHOUR',
    5: 'engagement'
},
          inplace=True)
print(df.head())
df.engagement = df.engagement.astype(float)
df.MONTH = df.MONTH.astype(int)
df = df.loc[df.MONTH >= 201812]

total_df = df.groupby('USERID')['engagement'].sum().to_frame()
total_df.engagement = total_df.engagement / 60.0
print(total_df.head())
toCSV(total_df, 'results/overall_engagement.csv')
Example 28
diverse = pd.read_csv("../../data/customer_feature_matrix.csv")
diverse.columns = diverse.columns.str.upper()
diverse = diverse.loc[diverse.FREQUENCY != 1]
diverse = diverse.loc[diverse.LABEL != 'NEW']
diverse = diverse[['USERID', 'LABEL']]
uniquecust = diverse.USERID.unique()
print(len(uniquecust))
# np.savetxt('results/diversecustomers.txt', uniquecust, delimiter = ',', fmt = "%s")
file = "../../data/regularity_cleaned_ordered.csv"
df = readChunk(file, header=None)
df.rename(columns={
    0: 'USERID',
    1: 'SESSIONID',
    2: 'MONTH',
    3: 'WEEK',
    4: 'DATE',
    5: 'DAY_OF_WEEK'
},
          inplace=True)
print(len(df))

old = 0
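# Write the selected customers' sessions out in blocks of 50,000 users each.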
for i in range(1, 63):
    new = 50000 * i
    print(old, new)
    temp = uniquecust[old:new]
    temp2 = df[df['USERID'].isin(temp)]
    print(len(temp2))
    toCSV(temp2, 'results/all/' + str(i) + '.csv', index=False)
    # np.savetxt('results/all/'+str(i)+'.txt', temp, delimiter = ',', fmt = "%s")
    old = new  # slice end is exclusive, so the next block starts at new
Example 29
def prepare_all_data():
    for series_id, csv_file_name in series_dict.items():
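        # The EIA v1 series endpoint returns JSON whose series[0]['data'] is
        # a list of [period, value] pairs, ready to write straight to CSV.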
        resp_data = retrieve_uri_data(
            f'http://api.eia.gov/series/'
            f'?api_key={constants.EIA_API_KEY}&series_id={series_id}')
        data = resp_data['series'][0]['data']
        toCSV(data, csv_file_name)
Example 30
File: rfe.py Project: ririgi/seele
df.DATE = pd.to_datetime(df.DATE)
df.sort_values("DATE", inplace = True)
df.drop_duplicates(subset = ['USERID'], keep = 'last', inplace = True)
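# Recency: days from each user's last session to the 2019-09-01 cutoff used
# throughout these scripts.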
recency = []
for i in range(len(df)):
	recency.append((pd.to_datetime('2019-09-01') - df.iloc[i]["DATE"]).days)
df["RECENCY"] = recency
print(df.head(10))
print(len(df))
df = df[['USERID', 'RECENCY']]

print('getting engagement')
file2 = "../data/eng_current.csv"
df2 = readChunk(file2, sep = '\t')
df2.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "TIMESTAMPDIFF(MINUTE, MIN(SESSION_STARTDT), MAX(SESSION_ENDDT))": 'ENGAGEMENT'
}, inplace=True)
file3 = "../data/eng_old.csv"
df3 = readChunk(file3, sep = '\t')
df3.rename(columns={
    "DATE(MODIFIEDDATE)": 'DATE',
    "EXTRACT(YEAR_MONTH FROM MIN(MODIFIEDDATE))": 'MONTH',
    "YEARWEEK(MIN(MODIFIEDDATE))": 'WEEK',
    "COUNT(DISTINCT SESSIONID)": 'FREQUENCY'
}, inplace=True)

df2 = pd.concat([df2, df3])
# df3 has no ENGAGEMENT column, so the concat above leaves NaNs; coerce
# instead of astype(int), which raises on non-finite values.
df2.ENGAGEMENT = pd.to_numeric(df2.ENGAGEMENT, errors='coerce')
eng = df2.groupby('USERID')['ENGAGEMENT'].sum().to_frame()
print(eng.head(10))
print(len(eng))

df = df.merge(freq, how = 'left', on = 'USERID')
df = df.merge(eng, how = 'left', on = 'USERID')
print(len(df))

toCSV(df, 'results/channel2.csv', index = False)