Example #1
0
 def __init__(self, prefix_path,
              train_path=None, val_path=None, runs_path=None):
     """Create a store rooted at *prefix_path* on the local filesystem.

     The optional train/val/runs paths are forwarded unchanged to the
     base store's initializer.
     """
     # Every path in this store is backed by the local disk.
     self._fs = pa.LocalFileSystem()
     super(LocalStore, self).__init__(
         prefix_path,
         train_path=train_path,
         val_path=val_path,
         runs_path=runs_path)
Example #2
0
def pa_fs(path):
    """Resolve *path* to a ``(path, pyarrow filesystem)`` pair.

    Supported forms:
      * ``hdfs://host:port/file`` -- returns the path with the scheme
        stripped and an HDFS connection.
      * ``s3...`` -- not supported; raises ``ValueError``.
      * anything else, optionally ``file://``-prefixed -- returned with
        the ``file://`` prefix stripped and a local filesystem.

    Returns:
        tuple: (path without its scheme prefix, pyarrow filesystem).

    Raises:
        ValueError: if *path* is an s3 URL.
    """
    # Require the full "hdfs://" scheme: checking only "hdfs" also
    # matched plain local names like "hdfs_data.csv" and then sliced
    # seven characters off them incorrectly.
    if path.startswith("hdfs://"):  # hdfs://url:port/file_path
        fs = pa.hdfs.connect()
        return path[len("hdfs://"):], fs
    if path.startswith("s3"):
        raise ValueError("aws s3 is not supported for now")
    # Local path; drop an explicit "file://" scheme if present.
    if path.startswith("file://"):
        path = path[len("file://"):]
    return path, pa.LocalFileSystem()
Example #3
0
def parquet_file_schema(file_name):
    """Read column names and their Numba types from a Parquet file.

    Args:
        file_name: path to the Parquet file.  An ``hdfs://`` prefix
            selects HDFS; any other path is read from the local disk.

    Returns:
        tuple: (list of column names, list of Numba types obtained by
        mapping each column's physical Parquet type through
        ``_pq_type_to_numba``).
    """
    import pyarrow.parquet as pq
    import pyarrow as pa

    # Choose the filesystem from the path scheme.
    if file_name.startswith("hdfs://"):
        fs = pa.hdfs.connect()
    else:
        fs = pa.LocalFileSystem()
    with fs.open(file_name) as _file:
        f = pq.ParquetFile(_file)
        col_names = f.schema.names
        # Translate each column's physical storage type to Numba's type.
        col_types = [
            _pq_type_to_numba[f.schema.column(i).physical_type]
            for i in range(len(col_names))
        ]
    return col_names, col_types
Example #4
0
 def __init__(self, prefix_path, *args, **kwargs):
     """Create a store rooted at *prefix_path* on the local filesystem,
     forwarding all extra arguments to the base store unchanged."""
     # Local-disk filesystem backs every path handled by this store.
     local_fs = pa.LocalFileSystem()
     self._fs = local_fs
     super(LocalStore, self).__init__(prefix_path, *args, **kwargs)