from pyarrow.parquet import ParquetFile


def process_single_parquet_partition(parquet_location, callback):
    parquet_file = ParquetFile(source=parquet_location)
    num_row_groups = parquet_file.num_row_groups
    print("----------------------------------------------------------------------------------")
    print("%d row groups for partition: %s" % (num_row_groups, parquet_location))
    # Stream the partition one row group at a time so the whole file is never
    # materialized in memory at once.
    for index in range(num_row_groups):
        row_df = parquet_file.read_row_group(index, columns=["id", "img_binary"]).to_pandas()
        # DataFrame.info() prints directly and returns None, so wrapping it in
        # print() would only emit an extra "None" line.
        row_df.info(verbose=True)
        callback(row_df)
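# A minimal usage sketch for process_single_parquet_partition (not part of the
# original source): the path and the callback below are hypothetical, and it
# assumes a parquet file that actually contains "id" and "img_binary" columns.
def _print_row_count(row_df):
    # Hypothetical callback: report how many rows each row group produced.
    print('callback received %d rows' % len(row_df))


process_single_parquet_partition('data/part-00000.parquet', _print_row_count)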
def __init__(self, parquets, img_root='', past=0, future=0, stride=1,
             cameras=['front-forward'], transform=None, load_from_azure=False):
    columns = [
        'speed_state',
        'curvature_invm_state',
        'run_id_noseginfix',
    ] + [cam + '_image_timestamp_rgb' for cam in cameras]
    # for loading images from azure blob storage
    azure_loader = AzureImageLoader() if load_from_azure else None
    # open a dataframe for each run_id and construct datasets
    datasets = []
    count = 0
    for i, parquet in enumerate(parquets):
        pqfile = ParquetFile(parquet, memory_map=False)
        num_row_groups = pqfile.metadata.num_row_groups
        for j in range(num_row_groups):
            if count % 100 == 0:
                print('initializing parquet %d/%d run %d/%d' %
                      (i + 1, len(parquets), j + 1, num_row_groups))
            dataframe = pqfile.read_row_group(j, columns=columns)
            # skip row groups too short to yield one full (past, current, future) clip
            if len(dataframe) > (past + 1 + future) * stride:
                datasets.append(
                    SingleWayveDataset(dataframe, img_root, past, future, stride,
                                       cameras, transform, azure_loader))
            count += 1
    super().__init__(datasets)
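# A self-contained sketch (not from the original source) of the length check
# used above: a sample needs `past` history frames, one current frame, and
# `future` frames, taken every `stride` rows, so a row group must hold strictly
# more than (past + 1 + future) * stride rows to yield at least one full clip.
past, future, stride = 2, 4, 5
min_rows_exclusive = (past + 1 + future) * stride
print('row groups with <= %d rows are skipped' % min_rows_exclusive)  # 35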
def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
    """Read a parquet row group from the given file path."""
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
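# A self-contained sketch exercising _read_parquet_row_group, assuming pyarrow
# and pandas are installed; the column names and data below are placeholders
# written to a temporary file so the example can actually run.
import os
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

tmp_path = os.path.join(tempfile.mkdtemp(), 'example.parquet')
pq.write_table(
    pa.Table.from_pandas(pd.DataFrame({'id': [1, 2, 3], 'value': [0.1, 0.2, 0.3]})),
    tmp_path,
    row_group_size=2,  # forces two row groups: rows [0, 1] and row [2]
)
df = _read_parquet_row_group(tmp_path, columns=['id'], row_group_id=0)
print(df)  # only the first row group: ids 1 and 2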