from pyarrow.parquet import ParquetFile


def process_single_parquet_partition(parquet_location, callback):
    # Open the file lazily; row groups are read one at a time so the
    # whole partition never has to fit in memory at once.
    parquet_file = ParquetFile(source=parquet_location)
    num_row_groups = parquet_file.num_row_groups

    print(
        "----------------------------------------------------------------------------------"
    )
    print("%d row groups for partition: %s" %
          (num_row_groups, parquet_location))

    for index in range(num_row_groups):
        row_df = parquet_file.read_row_group(
            index, columns=["id", "img_binary"]).to_pandas()
        # DataFrame.info() prints its summary and returns None, so call it
        # directly rather than wrapping it in print().
        row_df.info(verbose=True)
        callback(row_df)
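For context, a minimal usage sketch (not from the original project; the file name and callback are illustrative): it writes a small two-row-group file with pandas, then streams it through the helper above.

import pandas as pd

# Write 10 rows with row_group_size=5, producing two row groups.
pd.DataFrame({"id": range(10), "img_binary": [b"\x00"] * 10}).to_parquet(
    "partition.parquet", row_group_size=5)

# Each callback invocation receives one row group as a DataFrame.
process_single_parquet_partition(
    "partition.parquet",
    callback=lambda df: print("got %d rows" % len(df)))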
Example #2
    def __init__(self,
                 parquets,
                 img_root='',
                 past=0,
                 future=0,
                 stride=1,
                 cameras=['front-forward'],
                 transform=None,
                 load_from_azure=False):
        # State columns plus one image-timestamp column per requested camera.
        columns = [
            'speed_state',
            'curvature_invm_state',
            'run_id_noseginfix',
        ] + [cam + '_image_timestamp_rgb' for cam in cameras]

        # For loading images from Azure blob storage.
        azure_loader = AzureImageLoader() if load_from_azure else None

        # Open a dataframe for each run_id and construct datasets.
        datasets = []
        count = 0
        for i, parquet in enumerate(parquets):
            pqfile = ParquetFile(parquet, memory_map=False)
            num_row_groups = pqfile.metadata.num_row_groups
            for j in range(num_row_groups):
                if count % 100 == 0:
                    print('initializing parquet %d/%d run %d/%d' %
                          (i + 1, len(parquets), j + 1, num_row_groups))

                # read_row_group returns a pyarrow Table; len() is its
                # row count.
                dataframe = pqfile.read_row_group(j, columns=columns)

                # Keep only row groups long enough to yield at least one
                # full (past, present, future) window at the given stride.
                if len(dataframe) > (past + 1 + future) * stride:
                    datasets.append(
                        SingleWayveDataset(dataframe, img_root, past, future,
                                           stride, cameras, transform,
                                           azure_loader))
                count += 1
        super().__init__(datasets)
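The length filter is the one non-obvious piece of logic here: a sample needs `past` frames behind and `future` frames ahead of the current frame, each `stride` rows apart. A quick arithmetic sketch (not part of the original listing):

# With past=2, future=2, stride=5, one sample spans
# (past + 1 + future) * stride = 25 consecutive rows, so any row group
# with 25 rows or fewer is skipped.
past, future, stride = 2, 2, 5
print((past + 1 + future) * stride)  # 25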
Example #3
File: io.py Project: suzaku/ray
from pyarrow.parquet import ParquetFile


def _read_parquet_row_group(path, columns, row_group_id, kwargs=None):
    """Read a single parquet row group given the file path."""
    # Avoid a mutable default argument; fall back to an empty dict.
    kwargs = kwargs or {}
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
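A usage note (the file name is illustrative, not from the source project): the `kwargs` dict is forwarded straight to `ParquetFile.read_row_group`, so pyarrow options such as `use_threads` can be passed through.

# Read row group 0 of a file, forwarding an extra pyarrow option.
df = _read_parquet_row_group(
    "data.parquet", columns=["id"], row_group_id=0,
    kwargs={"use_threads": False})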