Example #1
def to_redshift(data_frame):
    #date_str =data_base.strftime('%Y/%m/%d')
    print('connect_to_redshift')
    pr.connect_to_redshift(
        dbname='pricing',
        host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
        port='5432',
        user='******',
        password='******')
    print('connect_to_s3')
    pr.connect_to_s3(
        aws_access_key_id="AKIAILQVO2DJQHRLFLQQ",
        aws_secret_access_key="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",
        bucket="kroton-analytics",
        subdirectory="raw/uploads/esteira")
    print('pandas_to_redshift')
    pr.pandas_to_redshift(
        data_frame=data_frame,
        # Optionally pass explicit column types, e.g.:
        # column_data_types=['VARCHAR(250)','DATE','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','FLOAT','FLOAT','FLOAT','FLOAT'],
        index=False,
        redshift_table_name='kroton_pricing.tb_dev_bd_ccr',
        append=True)
    print('end save_to_redshift')
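
A minimal usage sketch for the function above (the sample DataFrame is hypothetical; pandas and pandas_redshift imports are assumed throughout these examples):

import pandas as pd
import pandas_redshift as pr

# hypothetical columns matching the commented-out type list above
df = pd.DataFrame({'real_mes': [1.0], 'real_ano_anterior': [0.9]})
to_redshift(df)
pr.close_up_shop()  # close the Redshift connection when done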
def pandas_df_to_redshift(bucket_name, s3_file_name, table_name):
    df = s3_to_pandas_with_processing(bucket_name, s3_file_name)
    # Write the DataFrame to S3 and then to Redshift.
    # KEY_ID and KEY_SECRET are assumed to be module-level credentials.
    pr.connect_to_s3(aws_access_key_id=KEY_ID,
                     aws_secret_access_key=KEY_SECRET,
                     bucket=bucket_name,
                     subdirectory=s3_file_name)
    # to Redshift (assumes pr.connect_to_redshift was called beforehand)
    pr.pandas_to_redshift(data_frame=df, redshift_table_name=table_name)
def dbconnect(df):
    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='TODO')
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name='weather_data',
                          append=True)
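
A sketch of calling dbconnect; it assumes the REDSHIFT_* and *ACCESS_KEY* environment variables are set and the 'TODO' bucket name has been filled in, and the sample row is hypothetical:

import pandas as pd
import pandas_redshift as pr

df = pd.DataFrame({'city': ['Austin'], 'temp_c': [21.5]})  # hypothetical weather row
dbconnect(df)
pr.close_up_shop()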
Example #4
    connect_to_redshift()
    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select * from vehicle_monitoring
                                where data_frame_ref not in (select distinct data_frame_ref from stop_events)
                                and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")
    pr.close_up_shop()

    #Parse into stop events
    df = raw_to_stops(df)

    #Write results to stop_events
    connect_to_s3()
    connect_to_redshift()
    print('Writing stop_events data to Redshift...')
    pr.pandas_to_redshift(data_frame = df,
                        redshift_table_name = 'stop_events',
                        append = True)

    #Get stop events for processing
    print('Getting stop_events data from Redshift...')
    df = pr.redshift_to_pandas("""select * from stop_events
                                where data_frame_ref not in (select distinct data_frame_ref from trip_durations)
                                and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")

    pr.close_up_shop()

    #Parse into journey durations
    df = stops_to_durations(df)

    #Write results to trip_durations
    connect_to_s3()
Example #5
##difference
#for i in tqdm(reversed(range(len(li)))):
#    if i != 0:
#        li[i][9] = li[i][9] - li[i-1][9]
#        li[i][10] = li[i][10] - li[i-1][10]

df3 = pd.concat(li) #concat li. df3 is final dataframe
df3 = df3[[0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17]]
df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd', 'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2', 'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari']

dbname = os.getenv('REDSHIFT_DB')
host = os.getenv('REDSHIFT_HOST')
port = os.getenv('REDSHIFT_PORT')
user = os.getenv('REDSHIFT_USER')
password = os.getenv('REDSHIFT_PASS')

pr.connect_to_redshift(dbname = dbname,
                        host = host,
                        port = port,
                        user = user,
                        password = password)

pr.connect_to_s3(aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY'),
                bucket = 'nichiji-tmp'
                )

pr.pandas_to_redshift(data_frame = df3,
                        redshift_table_name = 'jisseki_nichiji',
                        append = True)

Example #6

# Connect to Redshift
pr.connect_to_redshift(dbname='<dbname>',
                       host='<host>',
                       port='<port>',
                       user='<user>',
                       password='******')

# Connect to S3
pr.connect_to_s3(aws_access_key_id = 'sadadasfaftew',
                aws_secret_access_key = 'ewwet4tsdvsrvrvrervwef',
                bucket = 'data-science',
                subdirectory = 'shwangdir'
                )

#upload a copy to S3 and redshift

#for i in range(5):
    #url = 'https://s3.amazonaws.com/tripdata/20180{}-citibike-tripdata.csv.zip'.format(i+1)
df = pd.read_csv('/Users/ankitkumar/Downloads/201801-citibike-tripdata.csv')
print(df)
pr.pandas_to_redshift(data_frame = df,
                      redshift_table_name = 'analytics.trip_fact')

dfroutes = (df.groupby(['start station id', 'end station id']).size()
            .sort_values(ascending=False)
            .reset_index(name='count'))

dfroutes.columns = ['start_station_id','end_station_id','count']
#print(type(dfroutes))

pr.pandas_to_redshift(data_frame = dfroutes,
                      redshift_table_name = 'analytics.most_used_routes', append=True)



dataframecount=pr.redshift_to_pandas("select * from analytics.most_used_routes")
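
A short follow-up sketch to confirm the read and close the connections:

print(dataframecount.shape)  # rows/columns of analytics.most_used_routes
pr.close_up_shop()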
Example #7
"""
 sku_cd    | character varying(256) |           |          |
 dpt_cd    | character varying(256) |           |          |
 line_cd   | character varying(256) |           |          |
 class_cd  | character varying(256) |           |          |
 sku_name  | character varying(256) |           |          |
 urisu     | numeric(10,2)          |           |          |
 urikin    | numeric(10,2)          |           |          |
 gsagsu1   | numeric(10,2)          |           |          |
 gsaggk1   | numeric(10,2)          |           |          |
 gsagsu2   | numeric(10,2)          |           |          |
 gsaggk2   | numeric(10,2)          |           |          |
 gsagsu3   | numeric(10,2)          |           |          |
 gsaggk3   | numeric(10,2)          |           |          |
 garari    | numeric(10,2)          |           |          |
"""

pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                       host=os.getenv('REDSHIFT_HOST'),
                       port=os.getenv('REDSHIFT_PORT'),
                       user=os.getenv('REDSHIFT_USER'),
                       password=os.getenv('REDSHIFT_PASS'))

pr.connect_to_s3(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                 aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
                 bucket='jisseki-nichiji')

data = {"date_ymdh": ["2018063012"], "ten_cd": ["0001"]}
f = pd.DataFrame(data)

pr.pandas_to_redshift(f, 'jisseki_nichiji')
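
To confirm the write, the table can be read straight back (a sketch; assumes the connection opened above is still live):

df_check = pr.redshift_to_pandas("select * from jisseki_nichiji limit 10")
print(df_check)
pr.close_up_shop()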
def cleansing_format_data(date):
    colss_li = [i for i in range(0, 211)]
    del colss_li[11:21]
    colss_li.remove(2)
    colss_li.remove(7)

    response = s3client.list_objects(Bucket='ld-rawdata',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')

    if 'Contents' in response:
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # the 23:00 data (the last file of the day)

    bucket_name = 'ld-rawdata'
    file_name = key
    day = file_name[37:45]  #day string
    reader = pd.read_csv('s3://' + bucket_name + '/' + file_name,
                         encoding="cp932",
                         header=None,
                         iterator=True,
                         chunksize=1000,
                         usecols=colss_li)
    df = pd.concat((r for r in reader), ignore_index=True)

    li = []
    df = df[df[0].isin([day])]
    hour = 7
    base = df.loc[:, 0:10]

    #Make hours list (datetime/timedelta come from the standard datetime
    #module; the pd.datetime alias used originally was removed in pandas 2.0)
    for i in range(19):
        if hour < 24:
            base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                      int(day[6:8]), hour)
        else:
            base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                      int(day[6:8]), hour - 24) + timedelta(days=1)

        hour += 1
        li.append(pd.concat([base, df.loc[:, 21 + i * 10:30 + i * 10]],
                            axis=1))

    #set columns
    for i in range(len(li)):
        li[i].columns = [j for j in range(19)]

    df3 = pd.concat(li)  #concat li. df3 is final dataframe
    df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
    df3.columns = [
        'date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
        'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
        'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari'
    ]

    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='nichiji-tmp')

    pr.pandas_to_redshift(data_frame=df3,
                          redshift_table_name='jisseki_nichiji',
                          append=True)
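
A sketch of invoking the function above; the 'YYYYMMDD' argument format is inferred from the S3 prefix and the file_name[37:45] slice, so treat the value as hypothetical:

cleansing_format_data('20180630')  # loads that day's 23:00 file and appends to jisseki_nichiji
pr.close_up_shop()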
# If you set append = True the table will be appended to (if it exists).
# =============================================================

# Connect to S3
pr.connect_to_s3(
    aws_access_key_id=str_accesskeyid,
    aws_secret_access_key=str_secretaccesskey,
    bucket=str_s3bucket,
    subdirectory=str_s3subdirectory
    # As of release 1.1.1 you are able to specify an aws_session_token (if necessary):
    # aws_session_token = <aws_session_token>
)

# Write the DataFrame to S3 and then to redshift
str_schema_table = '<schema>.<table>'
pr.pandas_to_redshift(data_frame=df_upload,
                      redshift_table_name=str_schema_table)

# confirm that the table has been uploaded to Redshift by reading
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)
sql_confirm = "SELECT * FROM <database>.<schema>.<table>;"
df_confirm = pr.redshift_to_pandas(sql_confirm)

print("Shape of dataframe: ", df_confirm.shape)

# close pandas_redshift connection
pr.close_up_shop()
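
The upload above used the default append=False, which replaces the table if it exists; to add rows to an existing table instead, pass append=True (a sketch):

pr.pandas_to_redshift(data_frame=df_upload,
                      redshift_table_name=str_schema_table,
                      append=True)  # append to the existing table instead of replacing it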
Example #10
def raw_to_stops():
    connect_to_redshift()
    connect_to_s3()

    #Load stop data
    df_stop_times = pd.read_csv('gtfs/stop_times.txt')

    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select data_frame_ref
                                from vehicle_monitoring
                                where data_frame_ref not in (select distinct data_frame_ref from stop_events)
                                and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
                                group by data_frame_ref""")

    n_days = df.shape[0]

    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        print("Processing data_frame_ref {} ({} of {})".format(
            data_frame_ref, (i + 1), n_days))

        df_cur = pr.redshift_to_pandas("""select * from vehicle_monitoring
                                where data_frame_ref = '{}';""".format(
            data_frame_ref))

        #Only bother with this if we actually have data...
        if df_cur.shape[0] == 0:
            print("No data for {}, skipping...".format(data_frame_ref))
        else:
            #Convert datetimes
            df_cur['recorded_time'] = pd.to_datetime(df_cur['recorded_time'])
            df_cur['valid_until_time'] = pd.to_datetime(
                df_cur['valid_until_time'])
            df_cur['data_frame_ref'] = pd.to_datetime(df_cur['data_frame_ref'])
            df_cur['expected_arrival_time'] = pd.to_datetime(
                df_cur['expected_arrival_time'])
            df_cur['expected_departure_time'] = pd.to_datetime(
                df_cur['expected_departure_time'])

            #Sort values, reset index
            df_cur = df_cur.sort_values(
                ['data_frame_ref', 'journey_ref', 'recorded_time'])
            df_cur = df_cur.reset_index(drop=True)
            df_cur['join_index'] = df_cur.index.astype(int)

            #Create offset dataframe
            df_next = df_cur[[
                'data_frame_ref', 'journey_ref', 'recorded_time',
                'stop_point_ref', 'stop_point_name'
            ]]
            df_next = df_next.add_suffix('_next')
            df_next['join_index'] = df_next.index
            df_next['join_index'] = df_next['join_index'].astype(int) - 1

            #Join data to offset data
            df_stops = df_cur.merge(df_next, on='join_index')

            #Filter to stop events
            df_stops = df_stops[
                (df_stops['data_frame_ref'] == df_stops['data_frame_ref_next'])
                & (df_stops['journey_ref'] == df_stops['journey_ref_next'])
                & (df_stops['stop_point_ref'] !=
                   df_stops['stop_point_ref_next'])]

            #Add in stop time column
            df_stops['stop_time'] = df_stops['recorded_time'] + (
                df_stops['recorded_time_next'] - df_stops['recorded_time']) / 2

            #Drop unneeded columns
            df_stops = df_stops[[
                'data_frame_ref', 'journey_ref', 'stop_point_ref', 'stop_time'
            ]]

            #Create output dataframe
            df_final = pd.DataFrame(columns=[
                'data_frame_ref', 'trip_id', 'stop_id', 'stop_time',
                'stop_time_unix'
            ])

            n_trips = len(df_stops['journey_ref'].unique())

            #For each trip on that day...
            for j, trip_id in enumerate(df_stops['journey_ref'].unique()):
                print(" Processing trip_id {} ({} of {})".format(
                    trip_id, (j + 1), n_trips))

                #Get actual data for this trip. Rename columns to match stop data.
                df_stops_actual = df_stops[df_stops['journey_ref'] ==
                                           trip_id].rename(
                                               index=str,
                                               columns={
                                                   "journey_ref": "trip_id",
                                                   "stop_point_ref": "stop_id"
                                               })

                #Get stop data for this trip (copy so the column fix below
                #does not raise SettingWithCopyWarning)
                df_stops_all = df_stop_times[df_stop_times['trip_id'] ==
                                             trip_id].copy()

                #Fix to deal with the fact that stop_ids are in a slightly different format
                df_stops_all['stop_id'] = (
                    '1' + df_stops_all['stop_id'].astype(str)).astype(int)

                #Merge dataframes together
                df_merged = df_stops_all.merge(df_stops_actual,
                                               on=['trip_id', 'stop_id'],
                                               how='left')

                #Create unix time column
                df_merged['stop_time_unix'] = (
                    df_merged['stop_time'] -
                    pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                #Interpolate timestamps for missing stop events
                df_merged['stop_time_unix'] = df_merged[
                    'stop_time_unix'].interpolate(limit_area='inside')

                #Convert back to actual timestamps
                df_merged['stop_time'] = pd.to_datetime(
                    df_merged['stop_time_unix'], origin='unix', unit='s')

                #Fill missing data_frame_refs
                df_merged['data_frame_ref'] = df_merged[
                    'data_frame_ref'].fillna(data_frame_ref)

                #Drop unneeded columns
                df_merged = df_merged[[
                    'data_frame_ref', 'trip_id', 'stop_id', 'stop_time',
                    'stop_time_unix'
                ]]

                #Remove NaNs (occurs if we are missing data at the start or end of a journey)
                df_merged = df_merged.dropna(subset=['stop_time'])

                #Add to final data frame
                df_final = pd.concat([df_final, df_merged])

            #Only bother with this if we actually have stop events...
            if df_final.shape[0] == 0:
                print("No stop events for {}, skipping...".format(
                    data_frame_ref))
            else:
                pr.pandas_to_redshift(data_frame=df_final,
                                      redshift_table_name='stop_events',
                                      append=True)

    pr.close_up_shop()
Example #11
def durs_to_dists():

    connect_to_redshift()
    connect_to_s3()

    #Note: this processes data not already in distributions. Assumes we do one hour at a time, no subdividing of hours.
    df = pr.redshift_to_pandas("""select a.* from
        (select data_frame_ref, departure_time_hour from trip_durations group by data_frame_ref, departure_time_hour) a
        left join
        (select data_frame_ref, departure_time_hour from distributions_gamma group by data_frame_ref, departure_time_hour) b
        on a.data_frame_ref = b.data_frame_ref
        	and a.departure_time_hour = b.departure_time_hour
        where b.data_frame_ref is null
        	and b.departure_time_hour is null
            and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
            order by a.data_frame_ref, a.departure_time_hour;""")

    #Randomize order, so we can get some samples from everywhere...
    df = df.sample(frac=1).reset_index(drop=True)

    n_days_hours = df.shape[0]

    #For each day and departure stop:
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        departure_time_hour = row['departure_time_hour']
        print(
            "Processing data_frame_ref {}, departure_time_hour {} ({} of {})".
            format(data_frame_ref, departure_time_hour, (i + 1), n_days_hours))

        #Calculate base timestamps for this day
        minutes = pd.DataFrame(np.arange(0, 60), columns=['minute'])
        minutes['key'] = 0

        df_hour = pr.redshift_to_pandas("""select *,
                                            date_trunc('min', departure_time) as departure_time_minute
                                            from trip_durations
                                            where data_frame_ref = '{}'
                                            and departure_time_hour = '{}' """.
                                        format(data_frame_ref,
                                               departure_time_hour))

        results = []

        n_dep_stops = len(df_hour['departure_stop_id'].unique())

        #For each departure stop:
        for j, departure_stop_id in enumerate(
                df_hour['departure_stop_id'].unique()):
            print("Processing departure_stop_id {} ({} of {})".format(
                departure_stop_id, (j + 1), n_dep_stops))

            #For each arrival stop:
            for k, arrival_stop_id in enumerate(
                    df_hour[df_hour['departure_stop_id'] ==
                            departure_stop_id]['arrival_stop_id'].unique()):

                #Select data
                df_dist = df_hour[
                    (df_hour['departure_stop_id'] == departure_stop_id)
                    & (df_hour['arrival_stop_id'] == arrival_stop_id)]

                #Create date array
                date = pd.DataFrame([departure_time_hour],
                                    columns=['departure_time_hour'])
                date['key'] = 0

                #Create base array
                base = date.merge(minutes)
                base['departure_time_minute'] = base[
                    'departure_time_hour'] + pd.to_timedelta(base.minute,
                                                             unit='m')
                base = base[['departure_time_minute']]
                base['departure_time_minute_unix'] = (
                    base['departure_time_minute'] -
                    pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                df_dist = base.merge(df_dist,
                                     on='departure_time_minute',
                                     how='left')
                df_dist = df_dist.bfill()  # fillna(method='bfill') was deprecated
                df_dist['total_journey_time'] = df_dist[
                    'arrival_time_unix'] - df_dist['departure_time_minute_unix']
                df_dist = df_dist.dropna(subset=['total_journey_time'])

                data = df_dist['total_journey_time']

                try:
                    # fit a gamma distribution to the data
                    # (note: floc=True fixes the location parameter at 1,
                    # since True == 1; floc=0 may have been the intent)
                    params = st.gamma.fit(data, floc=True)

                    y, x = np.histogram(data)
                    x = (x + np.roll(x, -1))[:-1] / 2.0

                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = st.gamma.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    results.append([
                        data_frame_ref, departure_time_hour, departure_stop_id,
                        arrival_stop_id, arg[0], scale, sse
                    ])
                except Exception as e:
                    print(e)
                    continue
        #Only bother with this if we actually have stop events...
        if len(results) == 0:
            print(
                "No distributions for data_frame_ref {}, departure_time_hour {}, skipping..."
                .format(data_frame_ref, departure_time_hour))
        else:
            print("Writing distributions to Redshift...")
            df_results = pd.DataFrame(results,
                                      columns=[
                                          'data_frame_ref',
                                          'departure_time_hour',
                                          'departure_stop_id',
                                          'arrival_stop_id', 'shape', 'scale',
                                          'sse'
                                      ])
            pr.pandas_to_redshift(data_frame=df_results,
                                  redshift_table_name='distributions_gamma',
                                  append=True)

    pr.close_up_shop()
Example #12
    def enviar_df_para_redshift(self, df):
        pr.pandas_to_redshift(data_frame=df,
                              redshift_table_name=self.redshift_table_name)
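
The method above is shown out of its class; a possible enclosing class plus a usage line, with hypothetical names (enviar_df_para_redshift is Portuguese for "send df to Redshift"):

import pandas_redshift as pr

class RedshiftUploader:  # hypothetical class name
    def __init__(self, redshift_table_name):
        self.redshift_table_name = redshift_table_name

    def enviar_df_para_redshift(self, df):
        pr.pandas_to_redshift(data_frame=df,
                              redshift_table_name=self.redshift_table_name)

uploader = RedshiftUploader('analytics.my_table')  # hypothetical schema.table
uploader.enviar_df_para_redshift(df)  # df: any pandas DataFrame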