Example #1
def perform_transit_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [3, 6, 7, 9, 10]
            names = ['station', 'date', 'time', 'entries', 'exits']
            url_part1: str = 's3://' + in_bucket + '/turnstile_'
            url_part2: str = ".txt"
            # urls for all saturdays in month range for year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month) +
                prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates={'datetime': ['date', 'time']},
                             date_parser=row_ops.clean_transit_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'entries': row_ops.clean_num,
                                 'exits': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df,
                       out_bucket=out_bucket,
                       folder=year + '/',
                       compute=True)

    except Exception as err:
        raise err

    else:
        status = True
        client.close()
        return status
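Example #1 (and several later examples) relies on two project helpers that are not shown: prefix_zero and to_parquet. A minimal sketch of what they plausibly look like, assuming zero-padded month/day tokens in the file URLs and a thin wrapper around dask's parquet writer (the real wrapper presumably also forwards the project's S3 credentials):

from typing import Any
import dask.dataframe as dd


def prefix_zero(num: int) -> str:
    # Pad a month or day to two digits, e.g. 3 -> '03', so the generated URLs
    # match source file names such as 'turnstile_170304.txt'.
    return str(num).zfill(2)


def to_parquet(df: dd.DataFrame, out_bucket: str, folder: str,
               compute: bool = True) -> None:
    # Hypothetical wrapper over dask.dataframe.to_parquet that writes to
    # s3://<out_bucket>/<folder>; the real helper likely also passes
    # storage_options fetched via ps.fetch_s3_options().
    dd.to_parquet(df,
                  's3://' + out_bucket + '/' + folder,
                  engine='fastparquet',
                  compression='lz4',
                  compute=compute)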
Example #2
def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year

            df = dd.read_parquet(path=s3_in_url,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            df['delex'] = df['exits'].diff()
            df['delent'] = df['entries'].diff()
            df = df.drop(['exits', 'entries'], axis=1)
            df = df.dropna()

            delex_lo_q = df['delex'].quantile(.25)
            delent_lo_q = df['delent'].quantile(.25)
            delex_hi_q = df['delex'].quantile(.75)
            delent_hi_q = df['delent'].quantile(.75)
            delex_iqr = delex_hi_q - delex_lo_q
            delent_iqr = delent_hi_q - delent_lo_q
            discard = (df['delex'] < delex_lo_q - 1.5 * delex_iqr) | \
                      (df['delex'] > delex_hi_q + 1.5 * delex_iqr) | \
                      (df['delent'] < delent_lo_q - 1.5 * delent_iqr) | \
                      (df['delent'] > delent_hi_q + 1.5 * delent_iqr)
            df = df.loc[~discard]

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='lz4',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_transit %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
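The outlier filter in Example #2 is the standard 1.5 * IQR (Tukey fence) rule applied to the entry/exit deltas. A toy pandas check of the same rule, using made-up data:

import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3, 4, 100])
lo_q, hi_q = s.quantile(.25), s.quantile(.75)
iqr = hi_q - lo_q
# keep only values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; the 100 is dropped
kept = s[(s >= lo_q - 1.5 * iqr) & (s <= hi_q + 1.5 * iqr)]
print(kept.tolist())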
Example #3
def regroup_dask(task_type: str, years: List[str], resample_freq: str,
                 filter_key: str, filter_val: str) -> bool:
    try:
        # determine in and out buckets
        # and split_by from task type map
        in_bucket: str = task_map.task_type_map[task_type]['in']
        out_bucket: str = task_map.task_type_map[task_type]['out']
        split_by: List[str] = task_map.task_type_map[task_type]['split_by']
        date_cols: List[str] = task_map.task_type_map[task_type]['date_cols']
        dtypes: Dict = task_map.task_type_map[task_type]['dtypes']
        print('fetched in out and split_by for task_type %(task)s' %
              {'task': task_type})

        # read files from in bucket and concat into one df
        s3_options: Dict = ps.fetch_s3_options()
        client: Client = dask.create_dask_client(num_workers=8)

        # create out bucket
        ps.create_bucket(out_bucket)

        s3_in_url: str = 's3://' + in_bucket + '/'
        s3_sub_path: str = resample_freq + '/' + filter_key + filter_val + '/'
        if task_type == 'rg-tsfare':
            s3_sub_path = ''
        df = dd.concat([
            dd.read_csv(urlpath=s3_in_url + year + '/' + s3_sub_path + '*',
                        storage_options=s3_options,
                        parse_dates=date_cols,
                        dtype=dtypes) for year in years
        ])

        print('read files from in bucket and concat-ted into one df')
        fillna_dict: Dict = {key: 0 for key in dtypes}
        df = df.fillna(fillna_dict)
        if task_type == 'rg-tsfare':
            s3_sub_path = resample_freq + '/'
        df.groupby(split_by).apply(partial(write_group_to_csv,
                                           split_by=split_by,
                                           out_bucket=out_bucket,
                                           out_path=s3_sub_path),
                                   meta=('int')).compute()

    except Exception as err:
        print('Error: %(error)s in regrouper for task_type %(task)s' % {
            'error': err,
            'task': task_type
        })
        raise err

    client.close()

    return True
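write_group_to_csv is defined elsewhere in the project; from the partial(...) call it receives each group plus split_by, out_bucket and out_path, and meta=('int') suggests it returns an integer. A hypothetical sketch under those assumptions (the per-group key layout and the return value are guesses):

from typing import List
import pandas as pd


def write_group_to_csv(group: pd.DataFrame,
                       split_by: List[str],
                       out_bucket: str,
                       out_path: str) -> int:
    # Hypothetical: write one group's rows under a per-group key in the out
    # bucket and return the row count (assumes s3fs and default credentials).
    if group.empty:
        return 0
    key = '/'.join(str(group[col].iloc[0]) for col in split_by)
    group.to_csv('s3://' + out_bucket + '/' + key + '/' + out_path + 'part.csv',
                 index=False)
    return len(group)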
Example #4
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        usecols = [1, 2, 4, 5]
        names = ['speed', 'traveltime', 'datetime', 'linkid']

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + '*' + year + '.csv'

            df = dd.read_csv(urlpath=s3_in_url,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='GZIP',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_traffic %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example #5
def perform_cabs_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    special_case: bool = False
    normal_case: bool = False
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year
            if int(year) == 2016:
                special_case = True
                normal_case = True
            elif int(year) > 2016:
                special_case = True
                normal_case = False
            elif int(year) < 2016:
                special_case = False
                normal_case = True

            if special_case:
                clean_cabs_at_path(special=True,
                                   s3_in_url=s3_in_url + '/special/',
                                   s3_out_url=s3_out_url + '/special/',
                                   s3_options=s3_options)

            if normal_case:
                clean_cabs_at_path(special=False,
                                   s3_in_url=s3_in_url + '/normal/',
                                   s3_out_url=s3_out_url + '/normal/',
                                   s3_options=s3_options)

    except Exception as err:
        print('error in perform_cabs %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example #6
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        for year in years:
            if year in ['2016', '2017']:
                month_st = 1
                month_end = 13
            elif year == '2015':
                month_st = 4
                month_end = 13
            elif year == '2018':
                month_st = 1
                month_end = 10
            usecols = [1, 2, 4, 5]
            names = ['speed', 'traveltime', 'datetime', 'linkid']
            url_part1: str = 's3://' + in_bucket + '/'
            url_part2: str = ".csv"
            # urls for the monthly traffic files in the month range for the year
            urls: List[str] = [
                url_part1 + prefix_zero(month) + year + url_part2
                for month in range(month_st, month_end)
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df,
                       out_bucket=out_bucket,
                       folder=year + '/',
                       compute=True)

            #dd.to_csv(df=df,
            #          filename='s3://'+out_bucket+'/'+year+'/',
            #          #name_function=lambda i: out_file_prefix + '_' + str(i),
            #          storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        status = True
        client.close()
        return status
Example #7
def perform_tsfare_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [
                'date', 'STATION', 'FF', 'SEN/DIS', '7-D AFAS UNL',
                '30-D AFAS/RMF UNL', 'JOINT RR TKT', '7-D UNL', '30-D UNL',
                '14-D RFM UNL', '1-D UNL', '14-D UNL', '7D-XBUS PASS', 'TCMC',
                'RF 2 TRIP', 'RR UNL NO TRADE', 'TCMC ANNUAL MC',
                'MR EZPAY EXP', 'MR EZPAY UNL', 'PATH 2-T', 'AIRTRAIN FF',
                'AIRTRAIN 30-D', 'AIRTRAIN 10-T', 'AIRTRAIN MTHLY', 'STUDENTS'
            ]
            url_part1: str = 's3://' + in_bucket + '/fares_'
            url_part2: str = ".csv"
            # urls for all saturdays in month range for year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month) +
                prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=0,
                             usecols=usecols,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             parse_dates=['date'],
                             converters={
                                 'STATION': str.strip,
                                 'FF': row_ops.clean_num,
                                 'SEN/DIS': row_ops.clean_num,
                                 '7-D AFAS UNL': row_ops.clean_num,
                                 '30-D AFAS/RMF UNL': row_ops.clean_num,
                                 'JOINT RR TKT': row_ops.clean_num,
                                 '7-D UNL': row_ops.clean_num,
                                 '30-D UNL': row_ops.clean_num,
                                 '14-D RFM UNL': row_ops.clean_num,
                                 '1-D UNL': row_ops.clean_num,
                                 '14-D UNL': row_ops.clean_num,
                                 '7D-XBUS PASS': row_ops.clean_num,
                                 'TCMC': row_ops.clean_num,
                                 'RF 2 TRIP': row_ops.clean_num,
                                 'RR UNL NO TRADE': row_ops.clean_num,
                                 'TCMC ANNUAL MC': row_ops.clean_num,
                                 'MR EZPAY EXP': row_ops.clean_num,
                                 'MR EZPAY UNL': row_ops.clean_num,
                                 'PATH 2-T': row_ops.clean_num,
                                 'AIRTRAIN FF': row_ops.clean_num,
                                 'AIRTRAIN 30-D': row_ops.clean_num,
                                 'AIRTRAIN 10-T': row_ops.clean_num,
                                 'AIRTRAIN MTHLY': row_ops.clean_num,
                                 'STUDENTS': row_ops.clean_num
                             },
                             encoding='utf-8')
            #to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)
            dd.to_csv(
                df=df,
                filename='s3://' + out_bucket + '/' + year + '/',
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        status = True
        client.close()
        return status
Example #8
def perform_cabs_dask(task_type: str, years: List[str]) -> bool:
    file_prefix: str
    cab_type: str = task_type.split('-', 1)[1]
    if cab_type == 'gcabs':
        file_prefix = 'green'
    elif cab_type == 'ycabs':
        file_prefix = 'yellow'

    task_type_map: Dict = task_map.task_type_map[task_type]
    out_bucket: str = task_type_map['out']
    cols: Dict[str, str] = task_type_map['cols']
    parse_dates: bool = task_type_map['dates']['parse']
    date_parser: Optional[Callable]
    if parse_dates:
        date_parser = task_type_map['dates']['parser']
    else:
        date_parser = None
    #dtypes: Dict[str, str] = task_type_map['dtypes']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        special_case: bool = False
        normal_case: bool = False
        month_st_sp: int
        month_end_sp: int
        month_st_norm: int
        month_end_norm: int
        for year in years:
            if int(year) == 2016:
                special_case = True
                normal_case = True
                month_st_sp = 7
                month_end_sp = 13
                month_st_norm = 1
                month_end_norm = 7
            elif int(year) > 2016:
                special_case = True
                normal_case = False
                month_st_sp = 1
                month_end_sp = 13
            elif int(year) < 2016:
                special_case = False
                normal_case = True
                month_st_norm = 1
                month_end_norm = 13

            if special_case:
                if task_type == 'dl-gcabs':
                    usecols = [1, 2, 5, 6, 7, 8]
                    names = [
                        'pudatetime', 'dodatetime', 'pulocationid',
                        'dolocationid', 'passengers', 'distance'
                    ]
                else:
                    usecols = [1, 2, 3, 4, 7, 8]
                    names = [
                        'pudatetime', 'dodatetime', 'passengers', 'distance',
                        'pulocationid', 'dolocationid'
                    ]

                urls = [
                    'https://s3.amazonaws.com/nyc-tlc/trip+data/' +
                    file_prefix + '_tripdata_' + year + '-' +
                    prefix_zero(month) + '.csv'
                    for month in range(month_st_sp, month_end_sp)
                ]

                df = dd.read_csv(urlpath=urls,
                                 header=None,
                                 usecols=usecols,
                                 names=names,
                                 parse_dates=['pudatetime', 'dodatetime'],
                                 date_parser=date_parser,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 converters={
                                     'passengers': row_ops.clean_num,
                                     'distance': row_ops.clean_num,
                                     'dolocationid': row_ops.clean_num,
                                     'pulocationid': row_ops.clean_num
                                 },
                                 encoding='utf-8')
                to_parquet(df=df,
                           out_bucket=out_bucket,
                           folder=year + '/special/',
                           compute=True)

            if normal_case:
                if task_type == 'dl-gcabs':
                    usecols = [1, 2, 5, 6, 7, 8, 9, 10]
                    names = [
                        'pudatetime', 'dodatetime', 'pulongitude',
                        'pulatitude', 'dolongitude', 'dolatitude',
                        'passengers', 'distance'
                    ]
                else:
                    usecols = [1, 2, 3, 4, 5, 6, 9, 10]
                    names = [
                        'pudatetime', 'dodatetime', 'passengers', 'distance',
                        'pulongitude', 'pulatitude', 'dolongitude',
                        'dolatitude'
                    ]
                urls = [
                    'https://s3.amazonaws.com/nyc-tlc/trip+data/' +
                    file_prefix + '_tripdata_' + year + '-' +
                    prefix_zero(month) + '.csv'
                    for month in range(month_st_norm, month_end_norm)
                ]
                df = dd.read_csv(urlpath=urls,
                                 header=None,
                                 usecols=usecols,
                                 names=names,
                                 parse_dates=['pudatetime', 'dodatetime'],
                                 date_parser=date_parser,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 converters={
                                     'passengers': row_ops.clean_num,
                                     'distance': row_ops.clean_num,
                                     'dolongitude': row_ops.clean_num,
                                     'dolatitude': row_ops.clean_num,
                                     'pulongitude': row_ops.clean_num,
                                     'pulatitude': row_ops.clean_num
                                 },
                                 encoding='utf-8')

                to_parquet(df=df,
                           out_bucket=out_bucket,
                           folder=year + '/normal/',
                           compute=True)

    except Exception as err:
        raise err

    else:
        status = True
        client.close()
        return status
Example #9
def perform_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    group: Dict = task_type_map['group']
    index_col: str = task_type_map['index']['col']

    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    s3_options: Dict = ps.fetch_s3_options()

    client: Client = dask.create_dask_client(num_workers=8)

    try:
        for year in years:
            s3_in_url: str = 's3://' + in_bucket + '/' + year + '/'
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/' \
                              + resample_freq + '/' + filter_by_key+str(filter_by_val) + '/'
            path: str = ''
            print('s3 url %s' % s3_in_url)
            if task_type in ['rs-gcabs', 'rs-ycabs']:
                if int(year) >= 2016:
                    path = '/special/'
                elif int(year) < 2016:
                    path = '/normal/'

            #resample_at_path(s3_in_url+path,
            #                 s3_out_url,
            #                 s3_options,
            #                 group,
            #                 index_col)

            df = dd.read_parquet(path=s3_in_url + path,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            if task_type in ['rs-gcabs', 'rs-ycabs'] and int(year) == 2016:
                #resample_at_path(s3_in_url + '/normal/',
                #                 s3_out_url,
                #                 s3_options,
                #                 group,
                #                 index_col,
                #                 'out2')
                df_2 = dd.read_parquet(path=s3_in_url + '/normal/',
                                       storage_options=s3_options,
                                       engine='fastparquet')
                df = dd.concat([df, df_2], axis=0)

            partitions = df.npartitions
            if partitions < 5:
                print('repartitioning to 5')
                df = df.repartition(npartitions=5)
                client.persist(df)

            # filter
            if filter_by_key == 'weekday':
                df = df.loc[df[index_col].dt.weekday == filter_by_val]

            if group['compute']:
                grouper_cols = group['by_cols']
                aggr_func = group['aggr_func']
                meta_cols = group['meta']
                cols = [
                    col for col in meta_cols.keys()
                    if col not in grouper_cols + [index_col]
                ]
                meta_types = [
                    meta_cols[key] for key in meta_cols.keys()
                    if key not in grouper_cols + [index_col]
                ]
                print('meta_cols %s' % meta_cols)
                index = [index_col] + grouper_cols
                index_levels: List[List] = [[] for level in index]
                meta: pd.DataFrame = pd.DataFrame(columns=cols,
                                                  index=pd.MultiIndex(
                                                      index_levels,
                                                      index_levels,
                                                      names=index))

                # resample using frequency and aggregate function specified
                df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
                    apply(aggr_func, meta=meta).reset_index()
                # df = df.resample(resample_freq).sum()
                # print('after resampling')

            print('after grouping and resampling %s' % str(df.shape))

            # save in out bucket
            dd.to_csv(
                df=df,
                filename=s3_out_url,
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

            #dd.to_parquet(df=df,
            #              path=s3_out_url,
            #              engine='fastparquet',
            #compute=True,
            #write_index=True,
            #              compression='lz4',
            #              storage_options=s3_options)

    except Exception as err:
        print('error in perform_dask %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
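A minimal driver for these entry points might look like the sketch below. The keys 'dl-gcabs' and 'rs-gcabs' appear in the examples above, but the full set of task-type keys lives in task_map.task_type_map, so treat them as assumptions:

if __name__ == '__main__':
    years = ['2015', '2016', '2017']
    perform_cabs_dask('dl-gcabs', years)  # download and clean green-cab trips
    perform_dask('rs-gcabs', years)       # resample the cleaned trips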