def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()

        for year in years:
            usecols = [3, 6, 7, 9, 10]
            names = ['station', 'date', 'time', 'entries', 'exits']
            url_part1: str = 's3://' + in_bucket + '/turnstile_'
            url_part2: str = '.txt'

            # urls for all Saturdays (weekday == 5, Monday == 0) in the month range
            # for the year; day 0 entries from itermonthdays2 are padding and skipped
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month)
                + prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates={'datetime': ['date', 'time']},
                             date_parser=row_ops.clean_transit_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'entries': row_ops.clean_num,
                                 'exits': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)

        status = True

    except Exception as err:
        raise err

    else:
        client.close()
        return status

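# The url construction above relies on a prefix_zero helper that is not shown in this
# section. A minimal sketch, assuming it only zero-pads single-digit month and day
# numbers so they match the turnstile_YYMMDD file names (hypothetical helper; the real
# one may live in a shared helpers module):
def prefix_zero(num: int) -> str:
    # e.g. 3 -> '03', 11 -> '11'
    return str(num).zfill(2)
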
def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        for year in years:
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year
            df = dd.read_parquet(path=s3_in_url,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            # first differences of the cumulative entry/exit counters
            df['delex'] = df['exits'].diff()
            df['delent'] = df['entries'].diff()
            df = df.drop(['exits', 'entries'], axis=1)
            df = df.dropna()

            # discard rows outside the Tukey fences (1.5 * IQR beyond the quartiles)
            delex_lo_q = df['delex'].quantile(.25)
            delent_lo_q = df['delent'].quantile(.25)
            delex_hi_q = df['delex'].quantile(.75)
            delent_hi_q = df['delent'].quantile(.75)
            delex_iqr = delex_hi_q - delex_lo_q
            delent_iqr = delent_hi_q - delent_lo_q
            discard = (df['delex'] < delex_lo_q - 1.5 * delex_iqr) | \
                      (df['delex'] > delex_hi_q + 1.5 * delex_iqr) | \
                      (df['delent'] < delent_lo_q - 1.5 * delent_iqr) | \
                      (df['delent'] > delent_hi_q + 1.5 * delent_iqr)
            df = df.loc[~discard]

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='lz4',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_transit %s' % str(err))
        client.close()
        raise err

    client.close()
    return True

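# The filter above keeps rows whose per-interval deltas fall inside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. A small, self-contained pandas illustration of the same
# rule (illustrative only, not part of the pipeline):
import pandas as pd

def tukey_keep_mask(s: pd.Series) -> pd.Series:
    # boolean mask that is True for values inside the fences,
    # mirroring ~discard in perform_transit_dask above
    q1, q3 = s.quantile(.25), s.quantile(.75)
    iqr = q3 - q1
    return (s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)
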
def regroup_dask(task_type: str, years: List[str], resample_freq: str,
                 filter_key: str, filter_val: str) -> bool:
    try:
        # determine in and out buckets and split_by from task type map
        in_bucket: str = task_map.task_type_map[task_type]['in']
        out_bucket: str = task_map.task_type_map[task_type]['out']
        split_by: List[str] = task_map.task_type_map[task_type]['split_by']
        date_cols: List[str] = task_map.task_type_map[task_type]['date_cols']
        dtypes: Dict = task_map.task_type_map[task_type]['dtypes']
        print('fetched in, out and split_by for task_type %(task)s' % {'task': task_type})

        # read files from in bucket and concat into one df
        s3_options: Dict = ps.fetch_s3_options()
        client: Client = dask.create_dask_client(num_workers=8)

        # create out bucket
        ps.create_bucket(out_bucket)

        s3_in_url: str = 's3://' + in_bucket + '/'
        s3_sub_path: str = resample_freq + '/' + filter_key + filter_val + '/'
        if task_type == 'rg-tsfare':
            s3_sub_path = ''
        df = dd.concat([
            dd.read_csv(urlpath=s3_in_url + year + '/' + s3_sub_path + '*',
                        storage_options=s3_options,
                        parse_dates=date_cols,
                        dtype=dtypes)
            for year in years
        ])
        print('read files from in bucket and concatenated into one df')

        fillna_dict: Dict = {key: 0 for key in dtypes}
        df = df.fillna(fillna_dict)

        if task_type == 'rg-tsfare':
            s3_sub_path = resample_freq + '/'

        # write one csv per group under out_bucket/out_path
        df.groupby(split_by).apply(partial(write_group_to_csv,
                                           split_by=split_by,
                                           out_bucket=out_bucket,
                                           out_path=s3_sub_path),
                                   meta=('int')).compute()

    except Exception as err:
        print('Error: %(error)s in regrouper for task_type %(task)s' % {
            'error': err,
            'task': task_type
        })
        raise err

    return True

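# write_group_to_csv is referenced above but not defined in this section. A minimal
# sketch of what such a helper could look like, assuming it writes each group to a csv
# object under out_bucket/out_path keyed by the group's split_by values and returns an
# int (matching the meta hint above). Hypothetical implementation, shown only to make
# the groupby().apply() call concrete:
import pandas as pd

def write_group_to_csv(group: pd.DataFrame, split_by: List[str],
                       out_bucket: str, out_path: str) -> int:
    # derive a file name like '<val1>_<val2>.csv' from the split_by values
    key = '_'.join(str(group[col].iloc[0]) for col in split_by)
    s3_url = 's3://' + out_bucket + '/' + out_path + key + '.csv'
    # assumes ps.fetch_s3_options() returns credentials usable by s3fs/pandas
    group.to_csv(s3_url, index=False, storage_options=ps.fetch_s3_options())
    return len(group)
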
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        usecols = [1, 2, 4, 5]
        names = ['speed', 'traveltime', 'datetime', 'linkid']
        for year in years:
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + '*' + year + '.csv'
            df = dd.read_csv(urlpath=s3_in_url,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='GZIP',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_traffic %s' % str(err))
        client.close()
        raise err

    client.close()
    return True

def perform_cabs_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    client: Client = dask.create_dask_client(num_workers=8)
    special_case: bool = False
    normal_case: bool = False
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        for year in years:
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year

            # 2016 data lives under both /special/ and /normal/ prefixes;
            # later years only /special/, earlier years only /normal/
            if int(year) == 2016:
                special_case = True
                normal_case = True
            elif int(year) > 2016:
                special_case = True
                normal_case = False
            elif int(year) < 2016:
                special_case = False
                normal_case = True

            if special_case:
                clean_cabs_at_path(special=True,
                                   s3_in_url=s3_in_url + '/special/',
                                   s3_out_url=s3_out_url + '/special/',
                                   s3_options=s3_options)

            if normal_case:
                clean_cabs_at_path(special=False,
                                   s3_in_url=s3_in_url + '/normal/',
                                   s3_out_url=s3_out_url + '/normal/',
                                   s3_options=s3_options)

    except Exception as err:
        print('error in perform_cabs %s' % str(err))
        client.close()
        raise err

    client.close()
    return True

def to_parquet(df: dd.DataFrame, out_bucket: str, folder: str, compute: bool = True) -> bool:
    try:
        s3_out_url: str = 's3://' + out_bucket + '/' + folder
        s3_options: Dict = ps.fetch_s3_options()
        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=compute,
                      compression='lz4',
                      storage_options=s3_options)
    except Exception as err:
        print('error while saving to parquet to path %(path)s - %(error)s' % {
            'path': out_bucket + '/' + folder,
            'error': str(err)
        })
        raise err
    else:
        return True

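# Usage note: the cleaning tasks above call this helper once per year, e.g.
#     to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)
# which writes the partitions under s3://<out_bucket>/<year>/ as lz4-compressed
# fastparquet files.
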
def run_pipeline(task_type: str) -> bool:
    type_map: Dict = task_type_map[task_type]
    in_bucket: str = type_map['in']
    out_bucket: str = type_map['out']
    cols: Dict[str, str] = type_map['cols']
    converters: Dict[str, Callable] = type_map['converters']
    dtypes: Dict[str, str] = type_map['dtypes']
    index_col: str = type_map['index']['col']
    is_sorted: bool = type_map['index']['sorted']
    row_op: Callable = type_map['row_op']
    diff: Dict = type_map['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = type_map['aggr_func']

    try:
        # client = Client(address='dscheduler:8786')
        s3_in_url: str = 's3://' + in_bucket + '/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        # df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/' + in_bucket + '/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters)

        # rename columns
        df = df.rename(columns=cols)
        df.compute()

        # apply row-wise operation per partition, sorting by the index column
        # first when the input is not already sorted
        if is_sorted:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()
        else:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .set_index(index_col).sort_index().reset_index()
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'),
                                   meta=dtypes)

        # drop na values
        df = df.dropna()

        # filter to the requested weekday while the datetime is still a column
        if filter_by_key == 'weekday':
            df = df.loc[df[index_col].dt.weekday == filter_by_val]

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)

        # resample using frequency and aggregate function specified
        df = compose(df.resample(resample_freq), aggr_func)

        # save in out bucket
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/' + out_bucket + '/*.*')

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True

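# run_pipeline reads its configuration from task_type_map. The keys it consumes are
# visible above; the entry below is a purely illustrative sketch of that shape, with
# every value a placeholder (the real map is defined elsewhere in the project):
example_task_type_map_entry: Dict = {
    'in': 'transit-raw',                          # source bucket
    'out': 'transit-clean',                       # destination bucket
    'cols': {'STATION': 'station', 'ENTRIES': 'entries', 'EXITS': 'exits'},
    'converters': {'ENTRIES': row_ops.clean_num, 'EXITS': row_ops.clean_num},
    'dtypes': {'station': 'object', 'entries': 'int64', 'exits': 'int64'},
    'index': {'col': 'datetime', 'sorted': False},
    'row_op': lambda row: row,                    # row-wise clean-up callable
    'diff': {'compute': True, 'col': 'exits', 'new_col': 'DELEXITS'},
    'aggr_func': sum
}
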
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        for year in years:
            # available months differ by year
            if year in ['2016', '2017']:
                month_st = 1
                month_end = 13
            elif year == '2015':
                month_st = 4
                month_end = 13
            elif year == '2018':
                month_st = 1
                month_end = 10

            usecols = [1, 2, 4, 5]
            names = ['speed', 'traveltime', 'datetime', 'linkid']
            url_part1: str = 's3://' + in_bucket + '/'
            url_part2: str = '.csv'

            # urls for all months in the month range for the year
            urls: List[str] = [
                url_part1 + prefix_zero(month) + year + url_part2
                for month in range(month_st, month_end)
            ]

            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)

        status = True

    except Exception as err:
        raise err

    else:
        client.close()
        return status

def perform_tsfare_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [
                'date', 'STATION', 'FF', 'SEN/DIS', '7-D AFAS UNL',
                '30-D AFAS/RMF UNL', 'JOINT RR TKT', '7-D UNL', '30-D UNL',
                '14-D RFM UNL', '1-D UNL', '14-D UNL', '7D-XBUS PASS', 'TCMC',
                'RF 2 TRIP', 'RR UNL NO TRADE', 'TCMC ANNUAL MC', 'MR EZPAY EXP',
                'MR EZPAY UNL', 'PATH 2-T', 'AIRTRAIN FF', 'AIRTRAIN 30-D',
                'AIRTRAIN 10-T', 'AIRTRAIN MTHLY', 'STUDENTS'
            ]
            url_part1: str = 's3://' + in_bucket + '/fares_'
            url_part2: str = '.csv'

            # urls for all Saturdays (weekday == 5) in the month range for the year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month)
                + prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=0,
                             usecols=usecols,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             parse_dates=['date'],
                             converters={
                                 'STATION': str.strip,
                                 'FF': row_ops.clean_num,
                                 'SEN/DIS': row_ops.clean_num,
                                 '7-D AFAS UNL': row_ops.clean_num,
                                 '30-D AFAS/RMF UNL': row_ops.clean_num,
                                 'JOINT RR TKT': row_ops.clean_num,
                                 '7-D UNL': row_ops.clean_num,
                                 '30-D UNL': row_ops.clean_num,
                                 '14-D RFM UNL': row_ops.clean_num,
                                 '1-D UNL': row_ops.clean_num,
                                 '14-D UNL': row_ops.clean_num,
                                 '7D-XBUS PASS': row_ops.clean_num,
                                 'TCMC': row_ops.clean_num,
                                 'RF 2 TRIP': row_ops.clean_num,
                                 'RR UNL NO TRADE': row_ops.clean_num,
                                 'TCMC ANNUAL MC': row_ops.clean_num,
                                 'MR EZPAY EXP': row_ops.clean_num,
                                 'MR EZPAY UNL': row_ops.clean_num,
                                 'PATH 2-T': row_ops.clean_num,
                                 'AIRTRAIN FF': row_ops.clean_num,
                                 'AIRTRAIN 30-D': row_ops.clean_num,
                                 'AIRTRAIN 10-T': row_ops.clean_num,
                                 'AIRTRAIN MTHLY': row_ops.clean_num,
                                 'STUDENTS': row_ops.clean_num
                             },
                             encoding='utf-8')

            dd.to_csv(df=df,
                      filename='s3://' + out_bucket + '/' + year + '/',
                      storage_options=s3_options)

        status = True

    except Exception as err:
        raise err

    else:
        client.close()
        return status

def perform_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    group: Dict = task_type_map['group']
    index_col: str = task_type_map['index']['col']
    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    s3_options: Dict = ps.fetch_s3_options()
    client: Client = dask.create_dask_client(num_workers=8)
    try:
        for year in years:
            s3_in_url: str = 's3://' + in_bucket + '/' + year + '/'
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/' \
                              + resample_freq + '/' + filter_by_key + str(filter_by_val) + '/'
            path: str = ''
            print('s3 url %s' % s3_in_url)

            # cab data is split into special/normal layouts by year
            if task_type in ['rs-gcabs', 'rs-ycabs']:
                if int(year) >= 2016:
                    path = '/special/'
                elif int(year) < 2016:
                    path = '/normal/'

            df = dd.read_parquet(path=s3_in_url + path,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            # 2016 has both layouts - read the normal layout as well and concat
            if task_type in ['rs-gcabs', 'rs-ycabs'] and int(year) == 2016:
                df_2 = dd.read_parquet(path=s3_in_url + '/normal/',
                                       storage_options=s3_options,
                                       engine='fastparquet')
                df = dd.concat([df, df_2], axis=0)

            partitions = df.npartitions
            if partitions < 5:
                print('repartitioning to 5')
                df = df.repartition(npartitions=5)
            df = client.persist(df)

            # filter
            if filter_by_key == 'weekday':
                df = df.loc[df[index_col].dt.weekday == filter_by_val]

            if group['compute']:
                grouper_cols = group['by_cols']
                aggr_func = group['aggr_func']
                meta_cols = group['meta']
                cols = [
                    col for col in meta_cols.keys()
                    if col not in grouper_cols + [index_col]
                ]
                meta_types = [
                    meta_cols[key] for key in meta_cols.keys()
                    if key not in grouper_cols + [index_col]
                ]
                print('meta_cols %s' % meta_cols)
                index = [index_col] + grouper_cols
                index_levels: List[List] = [[] for level in index]
                meta: pd.DataFrame = pd.DataFrame(columns=cols,
                                                  index=pd.MultiIndex(index_levels,
                                                                      index_levels,
                                                                      names=index))

                # resample using frequency and aggregate function specified
                df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)]
                                + grouper_cols)[cols]. \
                    apply(aggr_func, meta=meta).reset_index()

                print('after grouping and resampling %s' % str(df.shape))

            # save in out bucket
            dd.to_csv(df=df,
                      filename=s3_out_url,
                      storage_options=s3_options)

    except Exception as err:
        print('error in perform_dask %s' % str(err))
        client.close()
        raise err

    client.close()
    return True

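# resample_map is consumed above but not defined in this section. Based on the keys it
# is read with (freq, filter_by.key, filter_by.value), an entry is assumed to look like
# the sketch below; the concrete values are illustrative only (weekday 5 = Saturday,
# matching the Saturday-only file selection elsewhere in this module):
example_resample_map: Dict = {
    'freq': '1H',
    'filter_by': {'key': 'weekday', 'value': 5}
}
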