def to_redshift(data_frame):
    #date_str = data_base.strftime('%Y/%m/%d')
    print('connect_to_redshift')
    pr.connect_to_redshift(dbname='pricing',
                           host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
                           port='5432',
                           user='******',
                           password='******')
    print('connect_to_s3')
    pr.connect_to_s3(aws_access_key_id='******',
                     aws_secret_access_key='******',
                     bucket='kroton-analytics',
                     subdirectory='raw/uploads/esteira')
    print('pandas_to_redshift')
    # Numeric columns: real_mes, real_ano_anterior, acumulado_ano, meta_ano (float64)
    pr.pandas_to_redshift(data_frame=data_frame,
                          #column_data_types=['VARCHAR(250)', 'DATE', 'VARCHAR(250)', 'VARCHAR(250)',
                          #                   'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)',
                          #                   'VARCHAR(250)', 'VARCHAR(250)', 'VARCHAR(250)',
                          #                   'FLOAT', 'FLOAT', 'FLOAT', 'FLOAT'],
                          index=False,
                          redshift_table_name='kroton_pricing.tb_dev_bd_ccr',
                          append=True)
    print('end of save_to_redshift')
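# The commented-out column_data_types above maps one Redshift type string per
# DataFrame column, in column order. A minimal sketch of passing it explicitly;
# the frame and table name here are hypothetical, and connections are assumed
# to have been opened as in to_redshift above:
import pandas as pd

df_exemplo = pd.DataFrame({'curso': ['ADM'], 'meta_ano': [1234.5]})  # hypothetical data
pr.pandas_to_redshift(data_frame=df_exemplo,
                      redshift_table_name='kroton_pricing.tb_exemplo',  # hypothetical table
                      column_data_types=['VARCHAR(250)', 'FLOAT'],
                      index=False,
                      append=False)  # types only apply when the table is created, not appended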
def pandas_df_to_redshift(bucket_name, s3_file_name):
    # Read the file from S3 into a DataFrame
    df = s3_to_pandas_with_processing(bucket_name, s3_file_name)
    # Write the DataFrame to S3 and then to Redshift
    pr.connect_to_s3(aws_access_key_id=KEY_ID,
                     aws_secret_access_key=KEY_SECRET,
                     bucket=bucket_name,
                     subdirectory=s3_file_name)
    pr.pandas_to_redshift(data_frame=df, redshift_table_name=table_name)
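# s3_to_pandas_with_processing is called without the pr. prefix, so it appears
# to be a local helper rather than part of pandas_redshift. A plausible sketch,
# assuming the object is a CSV and boto3 credentials are configured; the
# column-name normalization is an illustrative stand-in for the "processing":
import boto3
import pandas as pd

def s3_to_pandas_with_processing(bucket_name, s3_file_name):
    # Stream the object from S3 straight into a DataFrame
    obj = boto3.client('s3').get_object(Bucket=bucket_name, Key=s3_file_name)
    df = pd.read_csv(obj['Body'])
    # Assumed processing step: normalize column names for Redshift
    df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
    return df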
def dbconnect(df):
    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')
    pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                           user=user, password=password)
    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='TODO')
    pr.pandas_to_redshift(data_frame=df, redshift_table_name='weather_data',
                          append=True)
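# A hedged usage sketch for dbconnect: the caller only has to ensure the
# environment variables are set before calling it. The values and the sample
# frame below are illustrative, not from the original code.
import os
import pandas as pd

os.environ.setdefault('REDSHIFT_DB', 'weather')                           # illustrative
os.environ.setdefault('REDSHIFT_HOST', 'example.redshift.amazonaws.com')  # illustrative
os.environ.setdefault('REDSHIFT_PORT', '5439')

df_sample = pd.DataFrame({'city': ['Tokyo'], 'temp_c': [21.4]})
dbconnect(df_sample)  # appends one row to weather_data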
connect_to_redshift()
print('Getting vehicle_monitoring data from Redshift...')
df = pr.redshift_to_pandas("""select * from vehicle_monitoring
    where data_frame_ref not in (select distinct data_frame_ref from stop_events)
    and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")
pr.close_up_shop()

# Parse into stop events
df = raw_to_stops(df)

# Write results to stop_events
connect_to_s3()
connect_to_redshift()
print('Writing stop_events data to Redshift...')
pr.pandas_to_redshift(data_frame=df, redshift_table_name='stop_events', append=True)

# Get stop events for processing
print('Getting stop_events data from Redshift...')
df = pr.redshift_to_pandas("""select * from stop_events
    where data_frame_ref not in (select distinct data_frame_ref from trip_durations)
    and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")
pr.close_up_shop()

# Parse into journey durations
df = stops_to_durations(df)

# Write results to trip_durations
connect_to_s3()
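# connect_to_redshift() and connect_to_s3() are called with no arguments here,
# so they are presumably thin local wrappers around the pandas_redshift
# connectors. A plausible sketch, assuming credentials in environment
# variables; the bucket name is hypothetical:
import os
import pandas_redshift as pr

def connect_to_redshift():
    pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                           host=os.getenv('REDSHIFT_HOST'),
                           port=os.getenv('REDSHIFT_PORT'),
                           user=os.getenv('REDSHIFT_USER'),
                           password=os.getenv('REDSHIFT_PASS'))

def connect_to_s3():
    pr.connect_to_s3(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                     bucket='vehicle-monitoring-staging')  # hypothetical bucket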
## difference
#for i in tqdm(reversed(range(len(li)))):
#    if i != 0:
#        li[i][9] = li[i][9] - li[i-1][9]
#        li[i][10] = li[i][10] - li[i-1][10]

df3 = pd.concat(li)  # concat li; df3 is the final dataframe
df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
               'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
               'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari']

dbname = os.getenv('REDSHIFT_DB')
host = os.getenv('REDSHIFT_HOST')
port = os.getenv('REDSHIFT_PORT')
user = os.getenv('REDSHIFT_USER')
password = os.getenv('REDSHIFT_PASS')

pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                       user=user, password=password)
pr.connect_to_s3(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                 aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                 bucket='nichiji-tmp')
pr.pandas_to_redshift(data_frame=df3, redshift_table_name='jisseki_nichiji', append=True)
                       password='******')

# Connect to S3
pr.connect_to_s3(aws_access_key_id='sadadasfaftew',
                 aws_secret_access_key='ewwet4tsdvsrvrvrervwef',
                 bucket='data-science',
                 subdirectory='shwangdir')

# Upload a copy to S3 and Redshift
#for i in range(5):
#    url = 'https://s3.amazonaws.com/tripdata/20180{}-citibike-tripdata.csv.zip'.format(i+1)
df = pd.read_csv('/Users/ankitkumar/Downloads/201801-citibike-tripdata.csv')
print(df)
pr.pandas_to_redshift(data_frame=df, redshift_table_name='analytics.trip_fact')

dfroutes = (df.groupby(['start station id', 'end station id']).size()
            .sort_values(ascending=False)
            .reset_index(name='count'))
dfroutes.columns = ['start_station_id', 'end_station_id', 'count']
#print(type(dfroutes))
pr.pandas_to_redshift(data_frame=dfroutes,
                      redshift_table_name='analytics.most_used_routes',
                      append=True)
dataframecount = pr.redshift_to_pandas("select * from analytics.most_used_routes")
 sku_cd    | character varying(256) |           |          |
 dpt_cd    | character varying(256) |           |          |
 line_cd   | character varying(256) |           |          |
 class_cd  | character varying(256) |           |          |
 sku_name  | character varying(256) |           |          |
 urisu     | numeric(10,2)          |           |          |
 urikin    | numeric(10,2)          |           |          |
 gsagsu1   | numeric(10,2)          |           |          |
 gsaggk1   | numeric(10,2)          |           |          |
 gsagsu2   | numeric(10,2)          |           |          |
 gsaggk2   | numeric(10,2)          |           |          |
 gsagsu3   | numeric(10,2)          |           |          |
 gsaggk3   | numeric(10,2)          |           |          |
 garari    | numeric(10,2)          |           |          |
"""
pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                       host=os.getenv('REDSHIFT_HOST'),
                       port=os.getenv('REDSHIFT_PORT'),
                       user=os.getenv('REDSHIFT_USER'),
                       password=os.getenv('REDSHIFT_PASS'))
pr.connect_to_s3(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                 aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
                 bucket='jisseki-nichiji')

# Minimal one-row test frame
data = {"date_ymdh": ["2018063012"], "ten_cd": ["0001"]}
f = pd.DataFrame(data)
pr.pandas_to_redshift(f, 'jisseki_nichiji')
def cleansing_format_data(date):
    colss_li = [i for i in range(0, 211)]
    del colss_li[11:21]
    colss_li.remove(2)
    colss_li.remove(7)
    response = s3client.list_objects(Bucket='ld-rawdata',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')
    if 'Contents' in response:
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # the 23:00 file (last upload of the day)
        bucket_name = 'ld-rawdata'
        file_name = key
        day = file_name[37:45]  # day as a string, e.g. '20180630'
        reader = pd.read_csv('s3n://' + bucket_name + '/' + file_name,
                             encoding="cp932", header=None, iterator=True,
                             chunksize=1000, usecols=colss_li)
        df = pd.concat((r for r in reader), ignore_index=True)
        li = []
        df = df[df[0].isin([day])]
        hour = 7
        base = df.loc[:, 0:10]
        # Build the hourly slices (hours 7 through 25, wrapping past midnight)
        for i in range(19):
            if hour < 24:
                base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                          int(day[6:8]), hour)
            else:
                base.loc[:, 0] = datetime(int(day[0:4]), int(day[4:6]),
                                          int(day[6:8]), hour - 24) + timedelta(days=1)
            hour += 1
            li.append(pd.concat([base, df.loc[:, 21 + i * 10:30 + i * 10]], axis=1))
        # Normalize column labels across the slices
        for i in range(len(li)):
            li[i].columns = [j for j in range(19)]
        df3 = pd.concat(li)  # df3 is the final dataframe
        df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
        df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd',
                       'class_cd', 'sku_name', 'urisu', 'urikin', 'gsagsu1',
                       'gsaggk1', 'gsagsu2', 'gsaggk2', 'gsagsu3', 'gsaggk3',
                       'garari']
        dbname = os.getenv('REDSHIFT_DB')
        host = os.getenv('REDSHIFT_HOST')
        port = os.getenv('REDSHIFT_PORT')
        user = os.getenv('REDSHIFT_USER')
        password = os.getenv('REDSHIFT_PASS')
        pr.connect_to_redshift(dbname=dbname, host=host, port=port,
                               user=user, password=password)
        pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                         aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                         bucket='nichiji-tmp')
        pr.pandas_to_redshift(data_frame=df3,
                              redshift_table_name='jisseki_nichiji',
                              append=True)
# If you set append = True the table will be appended to (if it exists).
# =============================================================

# Connect to S3
pr.connect_to_s3(aws_access_key_id=str_accesskeyid,
                 aws_secret_access_key=str_secretaccesskey,
                 bucket=str_s3bucket,
                 subdirectory=str_s3subdirectory
                 # As of release 1.1.1 you are able to specify an aws_session_token (if necessary):
                 # aws_session_token=<aws_session_token>
                 )

# Write the DataFrame to S3 and then to Redshift
str_schema_table = '<schema>.<table>'
pr.pandas_to_redshift(data_frame=df_upload, redshift_table_name=str_schema_table)

# Confirm that the table has been uploaded to Redshift by reading it back
pr.connect_to_redshift(dbname=str_dbname, host=str_host, port=str_port,
                       user=str_user, password=str_pw)
sql_confirm = "SELECT * FROM <database>.<schema>.<table>;"
df_confirm = pr.redshift_to_pandas(sql_confirm)
print("Shape of dataframe: ", df_confirm.shape)

# Close the pandas_redshift connection
pr.close_up_shop()
def raw_to_stops():
    connect_to_redshift()
    connect_to_s3()

    # Load stop data
    df_stop_times = pd.read_csv('gtfs/stop_times.txt')

    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select data_frame_ref from vehicle_monitoring
        where data_frame_ref not in (select distinct data_frame_ref from stop_events)
        and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        group by data_frame_ref""")
    n_days = df.shape[0]

    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        print("Processing data_frame_ref {} ({} of {})".format(data_frame_ref, (i + 1), n_days))
        df_cur = pr.redshift_to_pandas("""select * from vehicle_monitoring
            where data_frame_ref = '{}';""".format(data_frame_ref))

        # Only bother with this if we actually have data...
        if df_cur.shape[0] == 0:
            print("No data for {}, skipping...".format(data_frame_ref))
        else:
            # Convert datetimes
            df_cur['recorded_time'] = pd.to_datetime(df_cur['recorded_time'])
            df_cur['valid_until_time'] = pd.to_datetime(df_cur['valid_until_time'])
            df_cur['data_frame_ref'] = pd.to_datetime(df_cur['data_frame_ref'])
            df_cur['expected_arrival_time'] = pd.to_datetime(df_cur['expected_arrival_time'])
            df_cur['expected_departure_time'] = pd.to_datetime(df_cur['expected_departure_time'])

            # Sort values, reset index
            df_cur = df_cur.sort_values(['data_frame_ref', 'journey_ref', 'recorded_time'])
            df_cur = df_cur.reset_index(drop=True)
            df_cur['join_index'] = df_cur.index.astype(int)

            # Create offset dataframe
            df_next = df_cur[['data_frame_ref', 'journey_ref', 'recorded_time',
                              'stop_point_ref', 'stop_point_name']]
            df_next = df_next.add_suffix('_next')
            df_next['join_index'] = df_next.index
            df_next['join_index'] = df_next['join_index'].astype(int) - 1

            # Join data to offset data
            df_stops = df_cur.merge(df_next, on='join_index')

            # Filter to stop events
            df_stops = df_stops[
                (df_stops['data_frame_ref'] == df_stops['data_frame_ref_next'])
                & (df_stops['journey_ref'] == df_stops['journey_ref_next'])
                & (df_stops['stop_point_ref'] != df_stops['stop_point_ref_next'])]

            # Add in stop time column
            df_stops['stop_time'] = df_stops['recorded_time'] + (
                df_stops['recorded_time_next'] - df_stops['recorded_time']) / 2

            # Drop unneeded columns
            df_stops = df_stops[['data_frame_ref', 'journey_ref', 'stop_point_ref', 'stop_time']]

            # Create output dataframe
            df_final = pd.DataFrame(columns=['data_frame_ref', 'trip_id', 'stop_id',
                                             'stop_time', 'stop_time_unix'])
            n_trips = len(df_stops['journey_ref'].unique())

            # For each trip on that day...
            for j, trip_id in enumerate(df_stops['journey_ref'].unique()):
                print("  Processing trip_id {} ({} of {})".format(trip_id, (j + 1), n_trips))

                # Get actual data for this trip. Rename columns to match stop data.
                df_stops_actual = df_stops[df_stops['journey_ref'] == trip_id].rename(
                    index=str, columns={"journey_ref": "trip_id", "stop_point_ref": "stop_id"})

                # Get stop data for this trip
                df_stops_all = df_stop_times[df_stop_times['trip_id'] == trip_id]

                # Fix to deal with the fact that stop_ids are in a slightly different format
                df_stops_all['stop_id'] = ('1' + df_stops_all['stop_id'].astype(str)).astype(int)

                # Merge dataframes together
                df_merged = df_stops_all.merge(df_stops_actual,
                                               on=['trip_id', 'stop_id'], how='left')

                # Create unix time column
                df_merged['stop_time_unix'] = (df_merged['stop_time']
                                               - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                # Interpolate timestamps for missing stop events
                df_merged['stop_time_unix'] = df_merged['stop_time_unix'].interpolate(limit_area='inside')

                # Convert back to actual timestamps
                df_merged['stop_time'] = pd.to_datetime(df_merged['stop_time_unix'],
                                                        origin='unix', unit='s')

                # Fill missing data_frame_refs
                df_merged['data_frame_ref'] = df_merged['data_frame_ref'].fillna(data_frame_ref)

                # Drop unneeded columns
                df_merged = df_merged[['data_frame_ref', 'trip_id', 'stop_id',
                                       'stop_time', 'stop_time_unix']]

                # Remove NaNs (occurs if we are missing data at the start or end of a journey)
                df_merged = df_merged.dropna(subset=['stop_time'])

                # Add to final data frame
                df_final = pd.concat([df_final, df_merged])

            # Only bother with this if we actually have stop events...
            if df_final.shape[0] == 0:
                print("No stop events for {}, skipping...".format(data_frame_ref))
            else:
                pr.pandas_to_redshift(data_frame=df_final,
                                      redshift_table_name='stop_events',
                                      append=True)

    pr.close_up_shop()
def durs_to_dists():
    connect_to_redshift()
    connect_to_s3()

    # Note: this processes data not already in distributions. Assumes we do one
    # hour at a time, no subdividing of hours.
    df = pr.redshift_to_pandas("""select a.*
        from (select data_frame_ref, departure_time_hour
              from trip_durations
              group by data_frame_ref, departure_time_hour) a
        left join (select data_frame_ref, departure_time_hour
                   from distributions_gamma
                   group by data_frame_ref, departure_time_hour) b
        on a.data_frame_ref = b.data_frame_ref
        and a.departure_time_hour = b.departure_time_hour
        where b.data_frame_ref is null
        and b.departure_time_hour is null
        and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        order by a.data_frame_ref, a.departure_time_hour;""")

    # Randomize order, so we can get some samples from everywhere...
    df = df.sample(frac=1).reset_index(drop=True)
    n_days_hours = df.shape[0]

    # For each day and hour:
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        departure_time_hour = row['departure_time_hour']
        print("Processing data_frame_ref {}, departure_time_hour {} ({} of {})".format(
            data_frame_ref, departure_time_hour, (i + 1), n_days_hours))

        # Calculate base timestamps for this day
        minutes = pd.DataFrame(np.arange(0, 60), columns=['minute'])
        minutes['key'] = 0
        df_hour = pr.redshift_to_pandas("""select *,
            date_trunc('min', departure_time) as departure_time_minute
            from trip_durations
            where data_frame_ref = '{}'
            and departure_time_hour = '{}'""".format(data_frame_ref, departure_time_hour))

        results = []
        n_dep_stops = len(df_hour['departure_stop_id'].unique())

        # For each departure stop:
        for j, departure_stop_id in enumerate(df_hour['departure_stop_id'].unique()):
            print("Processing departure_stop_id {} ({} of {})".format(
                departure_stop_id, (j + 1), n_dep_stops))

            # For each arrival stop:
            for k, arrival_stop_id in enumerate(
                    df_hour[df_hour['departure_stop_id'] == departure_stop_id]['arrival_stop_id'].unique()):
                # Select data
                df_dist = df_hour[(df_hour['departure_stop_id'] == departure_stop_id)
                                  & (df_hour['arrival_stop_id'] == arrival_stop_id)]

                # Create date array
                date = pd.DataFrame([departure_time_hour], columns=['departure_time_hour'])
                date['key'] = 0

                # Create base array
                base = date.merge(minutes)
                base['departure_time_minute'] = base['departure_time_hour'] \
                    + pd.to_timedelta(base.minute, unit='m')
                base = base[['departure_time_minute']]
                base['departure_time_minute_unix'] = (base['departure_time_minute']
                                                      - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                df_dist = base.merge(df_dist, on='departure_time_minute', how='left')
                df_dist = df_dist.fillna(method='bfill')
                df_dist['total_journey_time'] = df_dist['arrival_time_unix'] \
                    - df_dist['departure_time_minute_unix']
                df_dist = df_dist.dropna(subset=['total_journey_time'])
                data = df_dist['total_journey_time']

                try:
                    # Fit dist to data, fixing the location parameter at zero
                    params = st.gamma.fit(data, floc=0)
                    y, x = np.histogram(data)
                    x = (x + np.roll(x, -1))[:-1] / 2.0

                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = st.gamma.pdf(x, *arg, loc=loc, scale=scale)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    results.append([data_frame_ref, departure_time_hour,
                                    departure_stop_id, arrival_stop_id,
                                    arg[0], scale, sse])
                except Exception as e:
                    print(e)
                    continue

        # Only bother with this if we actually have distributions...
        if len(results) == 0:
            print("No distributions for data_frame_ref {}, departure_time_hour {}, skipping..."
                  .format(data_frame_ref, departure_time_hour))
        else:
            print("Writing distributions to Redshift...")
            df_results = pd.DataFrame(results, columns=['data_frame_ref', 'departure_time_hour',
                                                        'departure_stop_id', 'arrival_stop_id',
                                                        'shape', 'scale', 'sse'])
            pr.pandas_to_redshift(data_frame=df_results,
                                  redshift_table_name='distributions_gamma',
                                  append=True)

    pr.close_up_shop()
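# The rows written to distributions_gamma store the fitted gamma shape and
# scale (with location fixed at zero), so a consumer can rebuild the
# travel-time distribution directly. A small sketch with hypothetical
# parameter values:
import scipy.stats as st

shape, scale = 2.3, 180.0  # hypothetical values read back from distributions_gamma
dist = st.gamma(shape, loc=0, scale=scale)
print(dist.mean())       # expected journey time in seconds
print(dist.ppf(0.95))    # 95th-percentile journey time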
def enviar_df_para_redshift(self, df):
    # "Send DataFrame to Redshift": write df to the table configured on the instance
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name=self.redshift_table_name)
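# A hedged usage sketch: the enclosing class is not shown, so this minimal
# wrapper is assumed (only redshift_table_name is required by the method), and
# the class and table names are hypothetical. Connections via
# pr.connect_to_redshift / pr.connect_to_s3 are assumed to be open already.
import pandas as pd
import pandas_redshift as pr

class RedshiftUploader:  # hypothetical class name
    def __init__(self, redshift_table_name):
        self.redshift_table_name = redshift_table_name

    def enviar_df_para_redshift(self, df):  # Portuguese: "send df to Redshift"
        pr.pandas_to_redshift(data_frame=df,
                              redshift_table_name=self.redshift_table_name)

uploader = RedshiftUploader('analytics.exemplo')  # hypothetical table
uploader.enviar_df_para_redshift(pd.DataFrame({'id': [1]}))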