Code Example #1
def to_redshift():
    pr.connect_to_redshift(
        dbname='pricing',
        host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
        port='5432',
        user='******',
        password='******')
    print('Connected to Redshift')
    return pr.redshift_to_pandas('select * from kroton_pricing.bd_ccr')
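Unlike the later examples, this one opens a connection but never closes it. A minimal usage sketch (assuming pandas_redshift is imported as pr, as in the other examples on this page):

df = to_redshift()
pr.close_up_shop()  # close the Redshift connection once the data is loaded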
Code Example #2
def get_raw(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])

    if sample_flag:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring limit 1000""")
        df.to_csv('data/vehicle_monitoring_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring""")
        df.to_csv('data/vehicle_monitoring.csv', index=False)
    pr.close_up_shop()
    return df
Code Example #3
def rs_data_select(query):
    pr.connect_to_redshift(dbname=DBNAME,
                           host=HOST,
                           port=PORT,
                           user=RS_ID,
                           password=RS_PW)
    df = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    df = df.round(2)
    return df
Code Example #4
def db_pandas_query(query):
    """
    Read Redshift table into a pandas data frame
    """
    pr.connect_to_redshift(dbname=DB_NAME,
                           host=DB_HOST,
                           port=DB_PORT,
                           user=DB_USER,
                           password=DB_PASSWORD)
    data = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    return data
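A usage sketch for this wrapper; the table name here is hypothetical:

df = db_pandas_query('select * from some_schema.some_table limit 10')  # hypothetical table
print(df.shape)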
Code Example #5
File: data_processing_planB.py  Project: bmander/muni
def get_distributions(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(
        dbname='muni',
        host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
        port='5439',
        user=credentials['user'],
        password=credentials['password'])

    if sample_flag:
        df = pr.redshift_to_pandas(
            """select departure_time_hour, departure_stop_id, arrival_stop_id, shape, scale, shape*scale as mean
               from distributions_gamma limit 1000""")
        df.to_csv('data/distributions_gamma_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas(
            """select departure_time_hour, departure_stop_id, arrival_stop_id, shape, scale, shape*scale as mean
               from distributions_gamma""")
        df.to_csv('data/distributions_gamma.csv', index=False)
    pr.close_up_shop()
    return df
Code Example #6
def get_distributions():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])

    df = pr.redshift_to_pandas("""select *,
                                  convert_timezone('US/Pacific', departure_time_hour) as local_departure_time_hour
                                  from distributions_gamma""")
    pr.close_up_shop()

    return df
Code Example #7
def stops_to_durations():

    connect_to_redshift()

    # Find (day, departure stop) pairs present in stop_events but not yet in trip_durations
    df = pr.redshift_to_pandas("""select a.* from
        (select data_frame_ref, stop_id from stop_events group by data_frame_ref, stop_id) a
        left join
        (select data_frame_ref, departure_stop_id from trip_durations group by data_frame_ref, departure_stop_id) b
        on a.data_frame_ref = b.data_frame_ref
            and a.stop_id = b.departure_stop_id
        where b.data_frame_ref is null
            and b.departure_stop_id is null
            and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        order by a.data_frame_ref, a.stop_id;""")

    n_days_dep_stops = df.shape[0]

    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        dep_stop_id = row['stop_id']
        print("Processing data_frame_ref {}, departure_stop_id {} ({} of {})".
              format(data_frame_ref, dep_stop_id, (i + 1), n_days_dep_stops))

        # NB: values are interpolated into the SQL with str.format, so this assumes trusted inputs
        pr.exec_commit("""insert into trip_durations
            select a.data_frame_ref,
                a.trip_id,
                a.stop_id as departure_stop_id,
                a.stop_time as departure_time,
                a.stop_time_unix as departure_time_unix,
                s.stop_id as arrival_stop_id,
                s.stop_time as arrival_time,
                s.stop_time_unix as arrival_time_unix,
                s.stop_time_unix - a.stop_time_unix as trip_duration,
                date_trunc('hour', a.stop_time) as departure_time_hour
            from
            (select * from stop_events
            where data_frame_ref = '{}'
            and stop_id = {}) a
            join stop_events s
            on a.data_frame_ref = s.data_frame_ref
            and a.trip_id = s.trip_id
            and s.stop_time_unix > a.stop_time_unix""".format(
            data_frame_ref, dep_stop_id))

    pr.close_up_shop()
Code Example #8
import datetime as dt
import pandas as pd
import pandas_redshift as pr

db_name = "info7374dbassignment2"  # Redshift: database name for gaming data
master_username = "******"         # Redshift: admin username
master_password = "******"         # Redshift: admin password
hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com"  # Redshift: database hostname
port_number = 5439                 # Redshift: database port number

pr.connect_to_redshift(dbname=db_name,
                       host=hostname,
                       port=port_number,
                       user=master_username,
                       password=master_password)

online = pr.redshift_to_pandas('select * from sales')

online.head(5)

# drop rows missing customerid
online = online[online.customerid.notnull()]

# extract year, month and day 
online['invoiceday'] = online.invoicedate.apply(lambda x: dt.datetime(x.year, x.month, x.day))
online.head()

monthly_unique_customers_df = online.set_index('invoiceday')['customerid'].resample('M').nunique()

monthly_unique_customers_df

monthly_df = pd.DataFrame(monthly_unique_customers_df)
monthly_df['invoicedate'] = monthly_df.index  # the original built two separate temporary frames, so its assignment was discarded
Code Example #9
from flask import Flask, jsonify, json
import pandas as pd
import pandas_redshift as pr

pr.connect_to_redshift(dbname='habladb',
                       host='habla-ai.csvoexx0fghm.us-west-2.redshift.amazonaws.com',
                       port=5439,
                       user='******',
                       password='******')


# MEAN UPTIME AND DOWNTIME ACROSS MULTIPLE PLANTS
LAMB_WESTON_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_uptime t')
LAMB_WESTON_DOWNTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_downtime t')


# PASCO UPTIME BY DAY BY LINE
PASCO_L1_S6_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s6_up_groupby t')
PASCO_L1_S7_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s7_up_groupby t')
PASCO_L1_S8_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s8_up_groupby t')
PASCO_L1_S9_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s9_up_groupby t')
PASCO_L1_S10_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s10_up_groupby t')
PASCO_L2_S1_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s1_up t')
PASCO_L2_S2_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s2_up t')
PASCO_L2_S3_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s3_up t')
PASCO_L2_S4_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s4_up t')
PASCO_L2_S5_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s5_up t')

# PASCO REASON LEVEL 1
PASCO_L1_S6_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s6 t')
PASCO_L1_S7_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s7 t')
Code Example #10
def connect_to_redshift():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])

def connect_to_s3():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_s3(aws_access_key_id=credentials['aws_access_key_id'],
                     aws_secret_access_key=credentials['aws_secret_access_key'],
                     bucket='jonobate-bucket')

if __name__ == '__main__':
    #Get raw data from processing
    connect_to_redshift()
    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select * from vehicle_monitoring
                                where data_frame_ref not in (select distinct data_frame_ref from stop_events)
                                and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")
    pr.close_up_shop()

    #Parse into stop events
    df = raw_to_stops(df)

    #Write results to stop_events
    connect_to_s3()
    connect_to_redshift()
    print('Writing stop_events data to Redshift...')
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name='stop_events',
                          append=True)

    #Get stop events for processing
Code Example #11
pr.pandas_to_redshift(data_frame=df,
                      redshift_table_name='analytics.trip_fact')

dfroutes = (df.groupby(['"start station id"', '"end station id"'])
              .size()
              .sort_values(ascending=False)
              .reset_index(name='count'))

dfroutes.columns = ['start_station_id', 'end_station_id', 'count']
#print(type(dfroutes))

pr.pandas_to_redshift(data_frame = dfroutes,
                      redshift_table_name = 'analytics.most_used_routes', append=True)



dataframecount = pr.redshift_to_pandas("select * from analytics.most_used_routes")

newdataframecount = pd.DataFrame(columns=('start_station_id', 'end_station_id', 'num_trips'))
print(dataframecount)

for index, row in df.iterrows():
    for routeindex, routerow in dataframecount.iterrows():
        if int(row['"start station id"']) == int(routerow['start_station_id']) and int(row['"end station id"']) == int(routerow['end_station_id']):
            # write back with .at: mutating routerow only changes a copy;
            # 'count' is the column name written to most_used_routes above
            dataframecount.at[routeindex, 'count'] += 1
            break
    else:
        # no existing route matched this trip, so record it once with a single trip
        newdataframecount = pd.concat(
            [newdataframecount,
             pd.DataFrame([{'start_station_id': row['"start station id"'],
                            'end_station_id': row['"end station id"'],
                            'num_trips': 1}])],
            ignore_index=True)
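The loop above recomputes what the groupby at the top of this snippet already produced; a vectorized equivalent under the same column names:

# one-pass route counts, equivalent to the nested iterrows loop above
newdataframecount = (df.groupby(['"start station id"', '"end station id"'])
                       .size()
                       .reset_index(name='num_trips')
                       .rename(columns={'"start station id"': 'start_station_id',
                                        '"end station id"': 'end_station_id'}))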


Code Example #12
from flask import Flask, jsonify, json
import pandas as pd
import pandas_redshift as pr

pr.connect_to_redshift(
    dbname='habladb',
    host='habla-ai.csvoexx0fghm.us-west-2.redshift.amazonaws.com',
    port=5439,
    user='******',
    password='******')

pasco_L1_S6_down = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s6_down t')
pasco_L1_S6_up = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s6_up t')
pasco_L1_S7_down = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s7_down t')
pasco_L1_S7_up = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s7_up t')
pasco_L1_S8_down = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s8_down t')
pasco_L1_S8_up = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s8_up t')
pasco_L1_S9_down = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s9_down t')
pasco_L1_S9_up = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s9_up t')
pasco_L1_S10_down = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s10_down t')
pasco_L1_S10_up = pr.redshift_to_pandas(
    'SELECT t.* FROM public.pasco_l1_s10_up t')
Code Example #13
from os import listdir
from os.path import join
import datetime as dt

import pandas as pd
import pandas_redshift as pr
from flask import render_template


def business_loss():
    images = [
        join("customer_lifetime_value/", f)
        for f in listdir("./static/customer_lifetime_value")
    ]

    db_name = "info7374dbassignment2"  # Redshift: database name for gaming data

    master_username = "******"  # Redshift: admin username
    master_password = "******"  # Redshift: admin password

    hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com"  # Redshift: database hostname
    port_number = 5439  # Redshift: database port number

    pr.connect_to_redshift(dbname=db_name,
                           host=hostname,
                           port=port_number,
                           user=master_username,
                           password=master_password)

    data = pr.redshift_to_pandas('select * from sales')

    data = data.drop_duplicates()

    data = data[pd.notnull(data['customerid'])]

    data = data[(data['quantity'] > 0)]

    #most bought product
    data['description'].value_counts()[:10]

    #which customer bought the most items?
    cust_data = pd.DataFrame()
    cust_data['customerid'] = list(set(data['customerid']))
    cust_data = cust_data.set_index('customerid')
    for cust_id in cust_data.index:
        cust_data.at[cust_id, 'Number_of_items'] = (len(
            data[data['customerid'] == cust_id]['description']))
    cust_data = cust_data.sort_values('Number_of_items', ascending=False)

    # keep only the columns needed for the cohort metrics (drops stockcode, description, etc.)
    data = data[[
        'customerid', 'invoicedate', 'invoiceno', 'quantity', 'unitprice'
    ]]
    # Calculate total purchase value per line item
    data['TotalPurchase'] = data['quantity'] * data['unitprice']

    data_group = data.groupby('customerid').agg({
        'invoicedate':
        lambda date: (date.max() - date.min()).days,
        'invoiceno':
        lambda num: len(num),
        'quantity':
        lambda quant: quant.sum(),
        'TotalPurchase':
        lambda price: price.sum()
    })

    # Change the name of columns
    data_group.columns = [
        'num_days', 'num_transactions', 'num_units', 'spent_money'
    ]
    data_group.head()

    # Average Order Value
    data_group['avg_order_value'] = data_group['spent_money'] / data_group[
        'num_transactions']

    purchase_frequency = sum(
        data_group['num_transactions']) / data_group.shape[0]

    # Repeat Rate
    repeat_rate = data_group[
        data_group.num_transactions > 1].shape[0] / data_group.shape[0]
    #Churn Rate
    churn_rate = 1 - repeat_rate

    purchase_frequency, repeat_rate, churn_rate

    # Profit Margin
    data_group['profit_margin'] = data_group['spent_money'].astype(
        'float') * 0.05

    # Customer Value
    data_group['CLV'] = (data_group['avg_order_value'].astype('float') *
                         purchase_frequency) / churn_rate
    #Customer Lifetime Value
    data_group['cust_lifetime_value'] = data_group['CLV'].astype(
        'float') * data_group['profit_margin'].astype('float')
    data_group.head()

    clv = data_group.loc[:, "cust_lifetime_value"].mean() / 1000000

    # drop rows missing customerid
    data = data[data.customerid.notnull()]

    # extract year, month and day
    data['invoiceday'] = data.invoicedate.apply(
        lambda x: dt.datetime(x.year, x.month, x.day))
    data.head()

    monthly_unique_customers_df = data.set_index(
        'invoiceday')['customerid'].resample('M').nunique()

    pd.DataFrame(monthly_unique_customers_df)['invoicedate'] = pd.DataFrame(
        monthly_unique_customers_df).index

    df = pd.DataFrame(monthly_unique_customers_df).reset_index()

    Customer_count = df.loc[:, "customerid"].mean()

    df["CustomerIDshift"] = [0] + list(df["customerid"][:-1])

    df["ChurnRate"] = (df["CustomerIDshift"] -
                       df["customerid"]) / df["CustomerIDshift"]

    df.rename(columns={'invoiceday': 'Month'}, inplace=True)

    df.loc[0, 'ChurnRate'] = 1  # avoid chained assignment; set the first month's churn explicitly

    data = df.drop(columns=['customerid', 'CustomerIDshift'])

    table1 = data

    table1

    from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
    from statsmodels.tsa.arima_model import ARIMA

    data = data.set_index('Month')
    data.index

    model = ARIMA(data, order=(2, 1, 0))
    model_fit = model.fit(disp=0)
    print(model_fit.summary())
    # plot residual errors
    # residuals = pd.DataFrame(model_fit.resid)
    # residuals.plot()
    # plt.show()
    # residuals.plot(kind='kde')
    # plt.show()

    X = data.values
    history = [x for x in X]

    test = [
        '2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30', '2019-05-31',
        '2019-06-30'
    ]
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=(2, 1, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        history.append(yhat)
        predictions.append(yhat)
        print('predicted=%f' % (yhat))

    print(predictions)

    yes_array = []
    for value in predictions:
        print(value)
        yes_array.append(value)

    df_toplot = pd.DataFrame({"ChurnRate": yes_array, "Month": test})

    df_toplot["Business_Loss"] = df["ChurnRate"] * clv * Customer_count

    x = df_toplot["Business_Loss"].astype(int)

    df_toplot['Business Loss'] = x

    final_df = df_toplot

    del final_df['Business_Loss']

    table2 = final_df
    table1 = table1.to_html(classes="data")
    table2 = table2.to_html(classes="data")

    return render_template('business_loss.html',
                           tables=[table1, table2],
                           titles=["Blah", "Churn Rate", "Future Churn Rate"],
                           images=images,
                           clv=clv)
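To make the CLV arithmetic above concrete, a toy walk-through using the same formulas (all numbers invented for illustration):

# Toy numbers, purely illustrative, following the formulas in business_loss()
spent_money = 200.0                 # total spend for one customer
num_transactions = 4
avg_order_value = spent_money / num_transactions            # 50.0
purchase_frequency = 4.0            # mean transactions per customer
repeat_rate = 0.6                   # share of customers with >1 transaction
churn_rate = 1 - repeat_rate        # 0.4
profit_margin = spent_money * 0.05  # 10.0
CLV = (avg_order_value * purchase_frequency) / churn_rate   # 500.0
cust_lifetime_value = CLV * profit_margin                   # 5000.0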
Code Example #14
import pandas_redshift as pr

pr.connect_to_redshift(
    dbname='dev',
    host='redshift-cluster-1.cajhj66uu5bu.ap-northeast-1.redshift.amazonaws.com',
    port='5439',
    user='******',
    password='******')

df = pr.redshift_to_pandas('select * from test')
Code Example #15
    print(dir(repos))

    connection = engine.connect()
    connection.execute(repos.execution_options(autocommit=True))
    connection.close()
    return file_json

# connect_to_redshift configures a module-level connection; it does not return a handle
pr.connect_to_redshift(
    dbname='dev',
    host='redshift-cluster-1.cug5ajtfsvsw.us-west-2.redshift.amazonaws.com',
    port=5439,
    user='******',
    password='******')

for table in redshift_tables:
    # interpolate the table name; the original literally queried a table named "table"
    data = pr.redshift_to_pandas('select * from {}'.format(table))
    data_ = data['data']


def get_modules_and_for_position(file):
    imports = []

    #Get all imported Modules

    result = re.findall(r"(?<!from)import (\w+)[\n.]|from\s+(\w+)\s+import",
                        file)
    #        imports=[i for imp in result for i in imp if len(i)and i not in imports]
    for imp in result:
        for i in imp:
            if len(i) and i not in imports:
                imports.append(i)
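A quick check of what that regex captures on a toy snippet. Each findall tuple fills only one group (the other alternative's group stays empty), which is why the loop skips empty strings; note that aliased imports such as "import numpy as np" are missed, since the module name must be followed by a newline or a dot:

import re

source = "import os\nfrom sys import path\n"
result = re.findall(r"(?<!from)import (\w+)[\n.]|from\s+(\w+)\s+import", source)
print(result)   # [('os', ''), ('', 'sys')]

imports = []
for imp in result:
    for i in imp:
        if len(i) and i not in imports:
            imports.append(i)
print(imports)  # ['os', 'sys']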
Code Example #16
str_s3bucket = secrets.str_s3bucket
str_s3subdirectory = secrets.str_s3subdirectory

# drop the reference to the secrets module so credentials don't linger in the namespace
del secrets

# create pandas-redshift connection
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)

# create dataframe from redshift query
sql_query = "SELECT * FROM <database>.<schema>.<table>;"
df = pr.redshift_to_pandas(sql_query)

print("Shape of dataframe: ", df.shape)

# create sample dataframe for upload
df_upload = pd.DataFrame({
    'a_col': ['red', 'green', 'blue'],
    'b_col': [1, 2, 3],
    'c_col': [True, False, True],
    'd_col': ['2020-01-01', '2020-02-04', '2020-03-06'],
})

# =============================================================
# Write a pandas DataFrame to Redshift. Requires access to an S3 bucket and a prior pr.connect_to_redshift call.
# If the table currently exists IT WILL BE DROPPED and then the pandas DataFrame will be put in its place.
# If you set append = True the table will be appended to (if it exists).
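The snippet cuts off before the upload itself; a minimal sketch of how the call could look for the df_upload frame built above (target table hypothetical, and pr.connect_to_s3 must already have been called with a writable bucket):

pr.pandas_to_redshift(data_frame=df_upload,
                      redshift_table_name='<schema>.pr_upload_test',  # hypothetical table
                      append=False)  # default behavior: drop and recreate the table
pr.close_up_shop()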
Code Example #17
def raw_to_stops():
    connect_to_redshift()
    connect_to_s3()

    #Load stop data
    df_stop_times = pd.read_csv('gtfs/stop_times.txt')

    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select data_frame_ref
                                from vehicle_monitoring
                                where data_frame_ref not in (select distinct data_frame_ref from stop_events)
                                and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
                                group by data_frame_ref""")

    n_days = df.shape[0]

    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        print("Processing data_frame_ref {} ({} of {})".format(
            data_frame_ref, (i + 1), n_days))

        df_cur = pr.redshift_to_pandas("""select * from vehicle_monitoring
                                where data_frame_ref = '{}';""".format(
            data_frame_ref))

        #Only bother with this if we actually have data...
        if df_cur.shape[0] == 0:
            print("No data for {}, skipping...".format(data_frame_ref))
        else:
            #Convert datetimes
            df_cur['recorded_time'] = pd.to_datetime(df_cur['recorded_time'])
            df_cur['valid_until_time'] = pd.to_datetime(
                df_cur['valid_until_time'])
            df_cur['data_frame_ref'] = pd.to_datetime(df_cur['data_frame_ref'])
            df_cur['expected_arrival_time'] = pd.to_datetime(
                df_cur['expected_arrival_time'])
            df_cur['expected_departure_time'] = pd.to_datetime(
                df_cur['expected_departure_time'])

            #Sort values, reset index
            df_cur = df_cur.sort_values(
                ['data_frame_ref', 'journey_ref', 'recorded_time'])
            df_cur = df_cur.reset_index(drop=True)
            df_cur['join_index'] = df_cur.index.astype(int)

            #Create offset dataframe
            df_next = df_cur[[
                'data_frame_ref', 'journey_ref', 'recorded_time',
                'stop_point_ref', 'stop_point_name'
            ]]
            df_next = df_next.add_suffix('_next')
            df_next['join_index'] = df_next.index
            df_next['join_index'] = df_next['join_index'].astype(int) - 1

            #Join data to offset data
            df_stops = df_cur.merge(df_next, on='join_index')

            #Filter to stop events
            df_stops = df_stops[
                (df_stops['data_frame_ref'] == df_stops['data_frame_ref_next'])
                & (df_stops['journey_ref'] == df_stops['journey_ref_next'])
                & (df_stops['stop_point_ref'] !=
                   df_stops['stop_point_ref_next'])]

            #Add in stop time column
            df_stops['stop_time'] = df_stops['recorded_time'] + (
                df_stops['recorded_time_next'] - df_stops['recorded_time']) / 2

            #Drop unneeded columns
            df_stops = df_stops[[
                'data_frame_ref', 'journey_ref', 'stop_point_ref', 'stop_time'
            ]]

            #Create output dataframe
            df_final = pd.DataFrame(columns=[
                'data_frame_ref', 'trip_id', 'stop_id', 'stop_time',
                'stop_time_unix'
            ])

            n_trips = len(df_stops['journey_ref'].unique())

            #For each trip on that day...
            for j, trip_id in enumerate(df_stops['journey_ref'].unique()):
                print(" Processing trip_id {} ({} of {})".format(
                    trip_id, (j + 1), n_trips))

                #Get actual data for this trip. Rename columns to match stop data.
                df_stops_actual = df_stops[df_stops['journey_ref'] ==
                                           trip_id].rename(
                                               index=str,
                                               columns={
                                                   "journey_ref": "trip_id",
                                                   "stop_point_ref": "stop_id"
                                               })

                #Get stop data for this trip (copy, since we modify it below)
                df_stops_all = df_stop_times[df_stop_times['trip_id'] ==
                                             trip_id].copy()

                #Fix to deal with the fact that stop_ids are in a slightly different format
                df_stops_all['stop_id'] = (
                    '1' + df_stops_all['stop_id'].astype(str)).astype(int)

                #Merge dataframes together
                df_merged = df_stops_all.merge(df_stops_actual,
                                               on=['trip_id', 'stop_id'],
                                               how='left')

                #Create unix time column
                df_merged['stop_time_unix'] = (
                    df_merged['stop_time'] -
                    pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                #Interpolate timestamps for missing stop events
                df_merged['stop_time_unix'] = df_merged[
                    'stop_time_unix'].interpolate(limit_area='inside')

                #Convert back to actual timestamps
                df_merged['stop_time'] = pd.to_datetime(
                    df_merged['stop_time_unix'], origin='unix', unit='s')

                #Fill missing data_frame_refs
                df_merged['data_frame_ref'] = df_merged[
                    'data_frame_ref'].fillna(data_frame_ref)

                #Drop unneeded columns
                df_merged = df_merged[[
                    'data_frame_ref', 'trip_id', 'stop_id', 'stop_time',
                    'stop_time_unix'
                ]]

                #Remove NaNs (occurs if we are missing data at the start or end of a journey)
                df_merged = df_merged.dropna(subset=['stop_time'])

                #Add to final data frame
                df_final = pd.concat([df_final, df_merged])

            #Only bother with this if we actually have stop events...
            if df_final.shape[0] == 0:
                print("No stop events for {}, skipping...".format(
                    data_frame_ref))
            else:
                pr.pandas_to_redshift(data_frame=df_final,
                                      redshift_table_name='stop_events',
                                      append=True)

    pr.close_up_shop()
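The heart of this example is the offset self-join in the "Create offset dataframe" block: shift a copy of the frame by one row and merge, so each reading is paired with the following one. A stripped-down sketch of the idiom on made-up data:

import pandas as pd

# three consecutive vehicle readings (made-up data)
df = pd.DataFrame({'vehicle': ['a', 'a', 'a'],
                   'stop':    [101, 101, 102],
                   'time':    pd.to_datetime(['10:00', '10:01', '10:03'])})

cur = df.reset_index(drop=True)
cur['join_index'] = cur.index

nxt = cur[['vehicle', 'stop', 'time']].add_suffix('_next')
nxt['join_index'] = nxt.index - 1          # row i of nxt describes reading i+1

pairs = cur.merge(nxt, on='join_index')    # each reading beside its successor

# a "stop event" is where consecutive readings disagree on the stop;
# estimate its time as the midpoint of the two readings, as above
events = pairs[(pairs['vehicle'] == pairs['vehicle_next']) &
               (pairs['stop'] != pairs['stop_next'])].copy()
events['stop_time'] = events['time'] + (events['time_next'] - events['time']) / 2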
Code Example #18
def durs_to_dists():

    connect_to_redshift()
    connect_to_s3()

    #Note: this processes data not already in distributions. Assumes we do one hour at a time, no subdividing of hours.
    df = pr.redshift_to_pandas("""select a.* from
        (select data_frame_ref, departure_time_hour from trip_durations group by data_frame_ref, departure_time_hour) a
        left join
        (select data_frame_ref, departure_time_hour from distributions_gamma group by data_frame_ref, departure_time_hour) b
        on a.data_frame_ref = b.data_frame_ref
        	and a.departure_time_hour = b.departure_time_hour
        where b.data_frame_ref is null
        	and b.departure_time_hour is null
            and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
            order by a.data_frame_ref, a.departure_time_hour;""")

    #Randomize order, so we can get some samples from everywhere...
    df = df.sample(frac=1).reset_index(drop=True)

    n_days_hours = df.shape[0]

    #For each day and departure hour:
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        departure_time_hour = row['departure_time_hour']
        print(
            "Processing data_frame_ref {}, departure_time_hour {} ({} of {})".
            format(data_frame_ref, departure_time_hour, (i + 1), n_days_hours))

        #Calculate base timestamps for this day
        minutes = pd.DataFrame(np.arange(0, 60), columns=['minute'])
        minutes['key'] = 0

        df_hour = pr.redshift_to_pandas("""select *,
                                            date_trunc('min', departure_time) as departure_time_minute
                                            from trip_durations
                                            where data_frame_ref = '{}'
                                            and departure_time_hour = '{}' """.
                                        format(data_frame_ref,
                                               departure_time_hour))

        results = []

        n_dep_stops = len(df_hour['departure_stop_id'].unique())

        #For each departure stop:
        for j, departure_stop_id in enumerate(
                df_hour['departure_stop_id'].unique()):
            print("Processing departure_stop_id {} ({} of {})".format(
                departure_stop_id, (j + 1), n_dep_stops))

            #For each arrival stop:
            for k, arrival_stop_id in enumerate(
                    df_hour[df_hour['departure_stop_id'] ==
                            departure_stop_id]['arrival_stop_id'].unique()):

                #Select data
                df_dist = df_hour[
                    (df_hour['departure_stop_id'] == departure_stop_id)
                    & (df_hour['arrival_stop_id'] == arrival_stop_id)]

                #Create date array
                date = pd.DataFrame([departure_time_hour],
                                    columns=['departure_time_hour'])
                date['key'] = 0

                #Create base array
                base = date.merge(minutes)
                base['departure_time_minute'] = base[
                    'departure_time_hour'] + pd.to_timedelta(base.minute,
                                                             unit='m')
                base = base[['departure_time_minute']]
                base['departure_time_minute_unix'] = (
                    base['departure_time_minute'] -
                    pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                df_dist = base.merge(df_dist,
                                     on='departure_time_minute',
                                     how='left')
                df_dist = df_dist.fillna(method='bfill')
                df_dist['total_journey_time'] = df_dist[
                    'arrival_time_unix'] - df_dist['departure_time_minute_unix']
                df_dist = df_dist.dropna(subset=['total_journey_time'])

                data = df_dist['total_journey_time']

                try:
                    # fit a gamma dist to the journey times; floc=True fixes the
                    # location at 1 (True == 1), not 0 -- kept as in the original
                    params = st.gamma.fit(data, floc=True)

                    y, x = np.histogram(data)
                    x = (x + np.roll(x, -1))[:-1] / 2.0

                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = st.gamma.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    results.append([
                        data_frame_ref, departure_time_hour, departure_stop_id,
                        arrival_stop_id, arg[0], scale, sse
                    ])
                except Exception as e:
                    print(e)
                    continue
        #Only bother with this if we actually have distributions...
        if len(results) == 0:
            print(
                "No distributions for data_frame_ref {}, departure_time_hour {}, skipping..."
                .format(data_frame_ref, departure_time_hour))
        else:
            print("Writing distributions to Redshift...")
            df_results = pd.DataFrame(results,
                                      columns=[
                                          'data_frame_ref',
                                          'departure_time_hour',
                                          'departure_stop_id',
                                          'arrival_stop_id', 'shape', 'scale',
                                          'sse'
                                      ])
            pr.pandas_to_redshift(data_frame=df_results,
                                  redshift_table_name='distributions_gamma',
                                  append=True)

    pr.close_up_shop()
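The fit-and-score pattern at the core of durs_to_dists is easy to sanity-check on synthetic data. A minimal sketch, using the conventional floc=0 and a density histogram so that histogram and PDF are on the same scale (the original compares raw counts against the PDF):

import numpy as np
import scipy.stats as st

# synthetic journey times drawn from a known gamma distribution
data = st.gamma.rvs(a=2.0, scale=3.0, size=500, random_state=0)

# fix the location at zero and recover shape/scale
shape, loc, scale = st.gamma.fit(data, floc=0)

# score the fit: SSE between the (density) histogram and the fitted PDF
y, x = np.histogram(data, density=True)
x = (x + np.roll(x, -1))[:-1] / 2.0        # bin edges -> bin centers
pdf = st.gamma.pdf(x, shape, loc=loc, scale=scale)
sse = np.sum((y - pdf) ** 2)

print(shape, scale, sse)                   # expect shape near 2 and scale near 3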