Example #1
def test_null_count(store, column, expected_null_count):
    serialiser = ParquetSerializer(chunk_size=2)

    df = pd.DataFrame({
        "no_nulls_int": [1, 2, 3, 4, 5, 6],
        "partial_nulls_int": [1, 2, 3, None, None, None],
        "no_nulls_float": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6],
        "partial_nulls_float": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
        "partial_nulls_obj": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
        "no_nulls_obj": ["1.1", "2", "3", "vier", "fuenfeinhalb", "6.6"],
        "partial_nulls_obj_mixed": [1.0, 2.2, None, np.nan, np.nan, 6.6],
        "nulls_reverse_rg": [3.3, np.nan, 1.0, 2.0, np.nan, -1.1],
    })

    key = serialiser.store(store, "prefix", df)
    reader = pa.BufferReader(store.get(key))
    parquet_file = ParquetFile(reader)
    col_idx = parquet_file.reader.column_name_idx(column)

    assert parquet_file.num_row_groups == 3

    for idx in range(0, 3):
        rg = parquet_file.metadata.row_group(idx)
        stats = rg.column(col_idx).statistics
        assert stats.null_count == expected_null_count[idx]
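For context, a minimal standalone sketch (not part of the test above) of how per-row-group null_count statistics can be inspected with plain pyarrow; the /tmp path and column values are illustrative:

import pyarrow as pa
import pyarrow.parquet as pq

# Write a small table split into row groups of two rows each.
table = pa.table({"a": [1, None, 3, None]})
pq.write_table(table, "/tmp/null_count_demo.parquet", row_group_size=2)

pf = pq.ParquetFile("/tmp/null_count_demo.parquet")
for rg_idx in range(pf.num_row_groups):
    stats = pf.metadata.row_group(rg_idx).column(0).statistics
    print(rg_idx, stats.null_count)  # one null per row group for this data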
Example #2
File: io.py Project: lyh0208/modin
def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs):
    from pyarrow.parquet import ParquetFile

    if not columns:
        pf = ParquetFile(path)
        columns = [
            name for name in pf.metadata.schema.names
            if not PQ_INDEX_REGEX.match(name)
        ]
    num_splits = min(len(columns),
                     RayBlockPartitions._compute_num_partitions())
    # Each item in this list will be a column of the original df
    # partitioned into smaller pieces along rows.
    # We need to transpose the oids array to fit our schema.
    blk_partitions = np.array([
        _read_parquet_column._submit(args=(path, col, num_splits, kwargs),
                                     num_return_vals=num_splits + 1)
        for col in columns
    ]).T
    remote_partitions = np.array([[RayRemotePartition(obj) for obj in row]
                                  for row in blk_partitions[:-1]])
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)
    new_manager = PandasDataManager(RayBlockPartitions(remote_partitions),
                                    index, columns)
    df = DataFrame(data_manager=new_manager)
    return df
Example #3
    def load(self, rowgroup_spec):
        """Loads data form a single rowgroup from the dataset.

        Reads a single rowgroup from a dataset. Returns a list of dictionary with still encoded data.
        If worker_predicate was passed to the constructor, the predicate is first applied to the columns specified
        by the predicate. The rest of the columns are loaded only if at least one row matches the predicate.

        A rowgroup will be loaded from local cache, if cache contains an instance of the rowgroup.

        If ngram not None was passed to the constructor, the function returns a dictionary structured according to
        NGram definition.

        :param rowgroup_spec: A dictionary containing the following fields: 'row_group': ParquetDatasetPiece object
          describing a rowgroup to be loaded; 'shuffle_row_drop_partition' a tuple with
          (this_partition, num_of_partitions)
        :return: A dictionary indexed by field names, or a dictionary defined by NGram spec.
        """
        piece = rowgroup_spec['row_group']
        shuffle_row_drop_partition = rowgroup_spec[
            'shuffle_row_drop_partition']

        # Open the piece's file via the dataset's pyarrow filesystem
        with self._dataset.fs.open(piece.path) as piece_file_handle:
            parquet_file = ParquetFile(piece_file_handle)

            if not isinstance(self._local_cache, NullCache):
                if self._worker_predicate:
                    raise RuntimeError(
                        'Local cache is not supported together with predicates, '
                        'unless the dataset is partitioned by the column the predicate operates on.'
                    )
                if shuffle_row_drop_partition[1] != 1:
                    raise RuntimeError(
                        'Local cache is not supported together with shuffle_row_drop_partitions > 1'
                    )

            if self._worker_predicate:
                all_cols = self._load_rows_with_predicate(
                    parquet_file, piece, self._worker_predicate,
                    shuffle_row_drop_partition)
            else:
                # Using a hash of the dataset url with the relative path in order to:
                #  1. Make sure that if a common cache serves multiple processes (e.g. redis), we don't have conflicts
                #  2. The dataset url is hashed to make sure we don't create overly long keys, which may be incompatible
                #     with some cache implementations
                #  3. Still leave the relative path and the piece_index in plain text to make it easier to debug
                cache_key = '{}:{}:{}'.format(
                    hashlib.md5(
                        urlunparse(self._dataset_url_parsed).encode(
                            'utf-8')).hexdigest(), piece.path, piece.row_group)
                all_cols = self._local_cache.get(
                    cache_key, lambda: self._load_rows(
                        parquet_file, piece, shuffle_row_drop_partition))

        if self._ngram:
            all_cols_as_ngrams = self._ngram.form_ngram(data=all_cols,
                                                        schema=self._schema)
            return all_cols_as_ngrams
        else:
            return all_cols
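The cache-key construction described in the comments above can be isolated into a small helper; the sketch below is only illustrative (make_cache_key and the URL are hypothetical, not part of the original code):

import hashlib

def make_cache_key(dataset_url, piece_path, row_group_index):
    # Hash the (potentially long) dataset URL; keep path and row group readable for debugging.
    url_hash = hashlib.md5(dataset_url.encode('utf-8')).hexdigest()
    return '{}:{}:{}'.format(url_hash, piece_path, row_group_index)

print(make_cache_key('hdfs://namenode/datasets/example', 'part-00000.parquet', 3))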
Example #4
File: io.py Project: suzaku/ray
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.
    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only support pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_row_group function.
    """
    pf = ParquetFile(path)

    n_rows = pf.metadata.num_rows
    chunksize = n_rows // get_npartitions()
    n_row_groups = pf.metadata.num_row_groups

    idx_regex = re.compile(r'__index_level_\d+__')
    columns = [
        name for name in pf.metadata.schema.names if not idx_regex.match(name)
    ]

    df_from_row_groups = [
        _read_parquet_row_group.remote(path, columns, i, kwargs)
        for i in range(n_row_groups)
    ]
    split_dfs = ray.get(
        [_split_df.remote(df, chunksize) for df in df_from_row_groups])
    df_remotes = list(chain.from_iterable(split_dfs))

    return DataFrame(row_partitions=df_remotes, columns=columns)
Example #5
def parse_parquet(infile):
    """
    Parse a parquet file and get the columns and index from it.
    """
    import pandas as pd
    from pyarrow.parquet import ParquetFile

    parquet = ParquetFile(infile)
    metadata = parquet.metadata
    schema = metadata.schema.to_arrow_schema()

    columns = [
        metadata.schema.column(col_i).name
        for col_i in range(metadata.num_columns)
    ]
    index_cols = [col for col in columns if "__index_level_" in col]
    assert len(index_cols) <= 1

    if len(index_cols) == 1:
        index_col = index_cols[0]
        index = pd.read_parquet(infile, columns=[index_col]).index.values
        index_used = True
    else:
        index_col = "__non-existing-col__"
        index = list(range(parquet.metadata.num_rows))
        index_used = False

    columns = [col for col in columns if col != index_col]
    return columns, index, index_used, schema
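A brief usage sketch of parse_parquet, assuming a hypothetical local file written by pandas (so that an __index_level_0__ column is present):

import pandas as pd

pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30]).to_parquet("/tmp/parse_demo.parquet")
columns, index, index_used, schema = parse_parquet("/tmp/parse_demo.parquet")
print(columns, index_used)  # expected: ['a'] True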
Example #6
    def process_single_parquet_partition(parquet_location, callback):
        parquet_file = ParquetFile(source=parquet_location)
        num_row_groups = parquet_file.num_row_groups

        print(
            "----------------------------------------------------------------------------------"
        )
        print("%d row groups for partition: %s" %
              (num_row_groups, parquet_location))

        for index in range(0, num_row_groups):
            row_df = parquet_file.read_row_group(
                index, columns=["id", "img_binary"]).to_pandas()
            print(row_df.info(verbose=True))
            callback(row_df)
Example #7
    def process(self, piece_index, worker_predicate,
                shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate criteria
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(self._dataset_url_parsed.path,
                                              filesystem=self._filesystem,
                                              validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Open the piece's file via the dataset's pyarrow filesystem
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError(
                    'Local cache is not supported together with predicates, '
                    'unless the dataset is partitioned by the column the predicate operates on.'
                )
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError(
                    'Local cache is not supported together with shuffle_row_drop_partitions > 1'
                )

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(
                parquet_file, piece, worker_predicate,
                shuffle_row_drop_partition)
        else:
            # Using a hash of the dataset url with the relative path in order to:
            #  1. Make sure that if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. The dataset url is hashed to make sure we don't create overly long keys, which may be incompatible
            #     with some cache implementations
            #  3. Still leave the relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(
                hashlib.md5(
                    urlunparse(
                        self._dataset_url_parsed).encode('utf-8')).hexdigest(),
                piece.path, piece_index)
            all_cols = self._local_cache.get(
                cache_key, lambda: self._load_rows(parquet_file, piece,
                                                   shuffle_row_drop_partition))

        if self._ngram:
            all_cols = self._ngram.form_ngram(data=all_cols,
                                              schema=self._schema)

        if all_cols:
            self.publish_func(all_cols)
Example #8
    def cache_generator(
        cls,
        glob_path,
        reads_per_file=3,
        resamples=1,
        shuffle=False,
        infinite=False,
    ):
        filenames = sorted(glob2.glob(glob_path))
        if len(filenames) == 0:
            raise Exception(
                f"{cls.__name__}.cache_generator() - invalid glob_path: {glob_path}"
            )

        gc.collect()
        # sleep(1)   # sleep(1) is required to allow measurement of the garbage collector
        while True:
            for filename in filenames:
                num_rows = ParquetFile(filename).metadata.num_rows
                cache_size = math.ceil(num_rows / reads_per_file)
                for n_read in range(reads_per_file):
                    gc.collect()
                    # sleep(1)   # sleep(1) is required to allow measurement of the garbage collector
                    cache = (
                        pd.read_parquet(filename)
                        # .set_index('image_id', drop=True)  # WARN: Don't do this, it breaks other things
                        .iloc[cache_size * n_read:cache_size * (n_read + 1)]
                        .copy()
                    )
                    for resample in range(resamples):
                        if shuffle:
                            cache = cache.sample(frac=1)
                        yield cache
            if not infinite: break
Example #9
    def get_table_column_names_and_types(
            self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
        filesystem, path = FileSource.create_filesystem_and_path(
            self.path, self._file_options.s3_endpoint_override)
        schema = ParquetFile(
            path if filesystem is None else filesystem.open_input_file(path)
        ).schema_arrow
        return zip(schema.names, map(str, schema.types))
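For context, a minimal standalone sketch of how schema_arrow yields the (name, type) pairs returned above; the file and its columns are stand-ins:

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"name": ["a"], "value": [1.0]}), "/tmp/features_demo.parquet")
schema = pq.ParquetFile("/tmp/features_demo.parquet").schema_arrow
for column_name, column_type in zip(schema.names, map(str, schema.types)):
    print(column_name, column_type)  # e.g. name string / value double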
Example #10
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            original_path = path
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Partitioned Columns in Parquet")
                return cls.single_worker_read(original_path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
Example #11
File: io.py Project: mbrukman/modin
    def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """

        from pyarrow.parquet import ParquetFile

        if cls.read_parquet_remote_task is None:
            return super(RayIO, cls).read_parquet(path, engine, columns,
                                                  **kwargs)

        if not columns:
            pf = ParquetFile(path)
            columns = [
                name for name in pf.metadata.schema.names
                if not PQ_INDEX_REGEX.match(name)
            ]
        num_partitions = cls.frame_mgr_cls._compute_num_partitions()
        num_splits = min(len(columns), num_partitions)
        # Each item in this list will be a list of column names of the original df
        column_splits = (len(columns) // num_partitions if len(columns) %
                         num_partitions == 0 else
                         len(columns) // num_partitions + 1)
        col_partitions = [
            columns[i:i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        # Each item in this list will be a list of columns of the original df
        # partitioned into smaller pieces along rows.
        # We need to transpose the oids array to fit our schema.
        blk_partitions = np.array([
            cls.read_parquet_remote_task._remote(
                args=(path, cols, num_splits, kwargs),
                num_return_vals=num_splits + 1,
            ) for cols in col_partitions
        ]).T
        remote_partitions = np.array(
            [[cls.frame_partition_cls(obj) for obj in row]
             for row in blk_partitions[:-1]])
        index_len = ray.get(blk_partitions[-1][0])
        index = pandas.RangeIndex(index_len)
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(remote_partitions), index, columns)

        return new_query_compiler
Example #12
    def parquet_reader(filename):
        """
        Reader interface for a single Parquet file

        Parameters:
            filename (str): The teacher parquet file name

        Returns:
            parque (obj): ParquetFile object
        """

        return ParquetFile(source=filename)
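A brief usage sketch of the returned ParquetFile object; the file below is a stand-in created just for illustration:

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.parquet import ParquetFile

pq.write_table(pa.table({"question": ["q1"], "answer": ["a1"]}), "/tmp/teacher_demo.parquet")
pf = ParquetFile(source="/tmp/teacher_demo.parquet")
print(pf.metadata.num_rows, pf.num_row_groups)
df = pf.read().to_pandas()  # read the whole file as a pandas DataFrame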
Example #13
    def __init__(self,
                 parquets,
                 img_root='',
                 past=0,
                 future=0,
                 stride=1,
                 cameras=['front-forward'],
                 transform=None,
                 load_from_azure=False):
        columns = [
            'speed_state',
            'curvature_invm_state',
            'run_id_noseginfix',
        ] + [cam + '_image_timestamp_rgb' for cam in cameras]

        # for loading images from azure blob storage
        azure_loader = AzureImageLoader() if load_from_azure else None

        # open a dataframe for each run_id and construct datasets
        datasets = []
        count = 0
        for i, parquet in enumerate(parquets):
            pqfile = ParquetFile(parquet, memory_map=False)
            num_row_groups = pqfile.metadata.num_row_groups
            for j in range(num_row_groups):
                if count % 100 == 0:
                    print('initializing parquet %d/%d run %d/%d' %
                          (i + 1, len(parquets), j + 1, num_row_groups))

                dataframe = pqfile.read_row_group(j, columns=columns)

                if len(dataframe) > (past + 1 + future) * stride:
                    datasets.append(
                        SingleWayveDataset(dataframe, img_root, past, future,
                                           stride, cameras, transform,
                                           azure_loader))
                count += 1
        super().__init__(datasets)
Example #14
def pyarrow_read(source):
    from pyarrow.parquet import ParquetFile
    import pprint

    # Source is either the filename or an Arrow file handle (which could be on HDFS)

    # TODO: figure out how to read from s3 directly
    args = {}
    if "s3://" in source:
        args["filesystem"] = "s3"

    m = ParquetFile(source, pre_buffer=True).metadata
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(m)
Example #15
def test_rowgroup_writing(store, use_categorical, chunk_size):
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)
    # Arrow 0.9.0 has a bug in writing categorical columns to more than a single
    # RowGroup: "ArrowIOError: Column 2 had 2 while previous column had 4".
    # We have special handling for that in pandas-serialiser that should be
    # removed once we switch to 0.10.0
    if use_categorical:
        df_write = df.astype({"string": "category"})
    else:
        df_write = df
    key = serialiser.store(store, "prefix", df_write)

    parquet_file = ParquetFile(store.open(key))
    assert parquet_file.num_row_groups == 2
Example #16
def test_predicate_accept_in(store, predicate_value, expected):
    df = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    predicate = ("A", "in", predicate_value)
    serialiser = ParquetSerializer(chunk_size=None)
    key = serialiser.store(store, "prefix", df)

    parquet_file = ParquetFile(store.open(key))
    row_meta = parquet_file.metadata.row_group(0)
    arrow_schema = parquet_file.schema.to_arrow_schema()
    parquet_reader = parquet_file.reader
    assert (_predicate_accepts(
        predicate,
        row_meta=row_meta,
        arrow_schema=arrow_schema,
        parquet_reader=parquet_reader,
    ) == expected)
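For context, a minimal sketch of the row-group statistics this kind of predicate pushdown relies on; the path and values are illustrative:

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"A": [0, 4, 13, 29]}), "/tmp/stats_demo.parquet")
stats = pq.ParquetFile("/tmp/stats_demo.parquet").metadata.row_group(0).column(0).statistics
print(stats.min, stats.max)  # 0 and 29 for this data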
Example #17
def assert_num_row_groups(store, dataset, part_num_rows, part_chunk_size):
    """
    Assert that the row groups of each partition match the expectation based on the
    number of rows and the chunk size
    """
    # Iterate over the partitions of each index value
    for index, partitions in dataset.indices["p"].index_dct.items():
        for part_key in partitions:
            key = dataset.partitions[part_key].files["table"]
            parquet_file = ParquetFile(store.open(key))
            if part_chunk_size[index] is None:
                assert parquet_file.num_row_groups == 1
            else:
                assert parquet_file.num_row_groups == math.ceil(
                    part_num_rows[index] / part_chunk_size[index]
                )
Example #18
def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs):
    from pyarrow.parquet import ParquetFile

    if not columns:
        pf = ParquetFile(path)
        columns = [
            name for name in pf.metadata.schema.names if not PQ_INDEX_REGEX.match(name)
        ]
    num_partitions = RayBlockPartitions._compute_num_partitions()
    num_splits = min(len(columns), num_partitions)
    # Each item in this list will be a list of column names of the original df
    column_splits = (
        len(columns) // num_partitions
        if len(columns) % num_partitions == 0
        else len(columns) // num_partitions + 1
    )
    col_partitions = [
        columns[i : i + column_splits] for i in range(0, len(columns), column_splits)
    ]
    # Each item in this list will be a list of columns of the original df
    # partitioned into smaller pieces along rows.
    # We need to transpose the oids array to fit our schema.
    blk_partitions = np.array(
        [
            _read_parquet_columns._remote(
                args=(path, cols, num_splits, kwargs), num_return_vals=num_splits + 1
            )
            for cols in col_partitions
        ]
    ).T
    remote_partitions = np.array(
        [
            [PandasOnRayRemotePartition(obj) for obj in row]
            for row in blk_partitions[:-1]
        ]
    )
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)
    new_manager = PandasQueryCompiler(
        RayBlockPartitions(remote_partitions), index, columns
    )
    df = DataFrame(query_compiler=new_manager)
    return df
Example #19
def test_persist_messages(input_messages_1, expected_df_1):
    # content of test_persist.expected.pkl based on : [{"CAD":1.3171828596,"HKD":7.7500212134,"ISK":138.6508273229,"PHP":48.5625795503,"DKK":6.3139584217,"HUF":309.7581671616,"CZK":23.2040729741,"GBP":0.7686720407,"RON":4.1381417056,"SEK":8.7889690284,"IDR":14720.101824353,"INR":73.3088672041,"BRL":5.6121340687,"RUB":77.5902418328,"HRK":6.4340263046,"JPY":105.311837081,"THB":31.1803139584,"CHF":0.9099703012,"EUR":0.8485362749,"MYR":4.1424692406,"BGN":1.6595672465,"TRY":7.8962240136,"CNY":6.6836656767,"NOK":9.2889266016,"NZD":1.5062367416,"ZAR":16.4451421298,"USD":1.0,"MXN":21.0537123462,"SGD":1.3568095036,"AUD":1.4064488757,"ILS":3.3802291048,"KRW":1138.1671616462,"PLN":3.8797624098,"date":"2020-10-19T00:00:00Z"},{"CAD":1.3171828596,"HKD":7.7500212134,"ISK":138.6508273229,"PHP":48.5625795503,"DKK":6.3139584217,"HUF":309.7581671616,"CZK":23.2040729741,"GBP":0.7686720407,"RON":4.1381417056,"SEK":8.7889690284,"IDR":14720.101824353,"INR":73.3088672041,"BRL":5.6121340687,"RUB":77.5902418328,"HRK":6.4340263046,"JPY":105.311837081,"THB":31.1803139584,"CHF":0.9099703012,"EUR":0.8485362749,"MYR":4.1424692406,"BGN":1.6595672465,"TRY":7.8962240136,"CNY":6.6836656767,"NOK":9.2889266016,"NZD":1.5062367416,"ZAR":16.4451421298,"USD":1.0,"MXN":21.0537123462,"SGD":1.3568095036,"AUD":1.4064488757,"ILS":3.3802291048,"KRW":1138.1671616462,"PLN":3.8797624098,"date":"2020-10-19T00:00:00Z"}]

    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")

    input_messages = io.TextIOWrapper(io.BytesIO(input_messages_1.encode()),
                                      encoding="utf-8")

    persist_messages(input_messages, f"test_{timestamp}")

    filenames = glob.glob(f"test_{timestamp}/*.parquet")

    df = ParquetFile(filenames[0]).read().to_pandas()

    for f in filenames:
        os.remove(f)
    os.rmdir(f"test_{timestamp}")

    assert_frame_equal(df, expected_df_1)
Example #20
def _get_partition_bounds_parquet(part, fs):
    """
    Based on the part information gathered by dask, get the partition bounds
    if available.

    """
    from pyarrow.parquet import ParquetFile

    # read the metadata from the actual file (this is again file IO, but
    # we can't rely on the schema metadata, because this is only the
    # metadata of the first piece)
    pq_metadata = None
    if "piece" in part:
        path = part["piece"][0]
        if isinstance(path, str):
            with fs.open(path, "rb") as f:
                pq_metadata = ParquetFile(f).metadata
    if pq_metadata is None:
        return None

    return _get_partition_bounds(pq_metadata.metadata)
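For reference, a minimal standalone sketch of reading just the footer metadata from an open file object, as done above; a built-in open on a stand-in local file replaces the fs.open handle that dask provides:

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.parquet import ParquetFile

pq.write_table(pa.table({"x": [1.5, 2.5]}), "/tmp/part-demo.parquet")
with open("/tmp/part-demo.parquet", "rb") as f:
    pq_metadata = ParquetFile(f).metadata
print(pq_metadata.num_rows, pq_metadata.num_row_groups)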
Example #21
def image_data_generator_application(train_hparams, model_hparams, pipeline_name):
    print("pipeline_name", pipeline_name)
    print("train_hparams", train_hparams)
    print("model_hparams", model_hparams)

    model_hparams_key = hparam_key(model_hparams)
    train_hparams_key = hparam_key(train_hparams)

    # csv_data    = pd.read_csv(f"{settings['dir']['data']}/train.csv")
    model_file  = f"{settings['dir']['models']}/{pipeline_name}/{pipeline_name}-{model_hparams_key}.hdf5"
    log_dir     = f"{settings['dir']['logs']}/{pipeline_name}/{model_hparams_key}/{train_hparams_key}"

    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    os.makedirs(log_dir,                     exist_ok=True)

    dataset_rows = ParquetFile(f"{settings['dir']['data']}/train_image_data_0.parquet").metadata.num_rows
    dataset      = DatasetDF(size=1)
    input_shape  = dataset.input_shape()
    output_shape = dataset.output_shape()
    model = MultiOutputApplication(
        input_shape=input_shape,
        output_shape=output_shape,
        **model_hparams,
    )
    model_compile(model_hparams, model, output_shape)

    # Load Pre-existing weights
    if os.path.exists( model_file ):
        try:
            model.load_weights( model_file )
            print('Loaded Weights: ', model_file)
        except Exception as exception: print('exception', exception)

    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
        load_models = (glob2.glob(f'../input/**/{os.path.basename(model_file)}')
                    +  glob2.glob(f'../input/**/{os.path.basename(model_file)}'.replace('=','')))  # Kaggle Dataset Upload removes '='
        for load_model in load_models:
            try:
                model.load_weights( load_model )
                print('Loaded Weights: ', load_model)
                # break
            except Exception as exception: print('exception', exception)

    model.summary()


    # Source: https://www.kaggle.com/jamesmcguigan/bengali-ai-image-processing
    datagen_args = {
        # "rescale":          1./255,  # "normalize": True is default in Transforms
        "zoom_range":         0.2,
        "width_shift_range":  0.1,     # we already have centering
        "height_shift_range": 0.1,     # we already have centering
        "rotation_range":     45/2,
        "shear_range":        45/2,
        # "brightness_range":   0.5,   # Prebrightness normalized
        "fill_mode":         'constant',
        "cval": 0,
        # "featurewise_center": True,             # No visible effect in plt.imgshow()
        # "samplewise_center": True,              # No visible effect in plt.imgshow()
        # "featurewise_std_normalization": True,  # No visible effect in plt.imgshow() | requires .fit()
        # "samplewise_std_normalization": True,   # No visible effect in plt.imgshow() | requires .fit()
        # "zca_whitening": True,                   # Kaggle, insufficent memory
    }
    flow_args = {}
    flow_args['train'] = {
        "transform_X":      Transforms.transform_X,
        "transform_X_args": {},  #  "normalize": True is default in Transforms
        "transform_Y":      Transforms.transform_Y,
        "batch_size":       train_hparams['batch_size'],
        "reads_per_file":   3,
        "resamples":        1,
        "shuffle":          True,
        "infinite":         True,
    }
    flow_args['valid'] = {
        **flow_args['train'],
        "resamples":  1,
    }
    flow_args['test'] = {
        **flow_args['train'],
        "resamples":  1,
        "shuffle":    False,
        "infinite":   False,
        "test":       True,
    }

    datagens = {
        "train": ParquetImageDataGenerator(**datagen_args),
        "valid": ParquetImageDataGenerator(),
        "test":  ParquetImageDataGenerator(),
    }
    # [ datagens[key].fit(train_batch) for key in datagens.keys() ]  # Not required
    fileglobs = {
        "train": f"{settings['dir']['data']}/train_image_data_[123].parquet",
        "valid": f"{settings['dir']['data']}/train_image_data_0.parquet",
        "test":  f"{settings['dir']['data']}/test_image_data_*.parquet",
    }
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
        # For the Kaggle Submission, train on all available data and rely on Kaggle Timeout
        fileglobs["train"] = f"{settings['dir']['data']}/train_image_data_*.parquet"

    generators = {
        key: datagens[key].flow_from_parquet(value, **flow_args[key])
        for key,value in fileglobs.items()
    }
    dataset_rows_per_file = {
        key: np.mean([ ParquetFile(filename).metadata.num_rows for filename in glob2.glob(fileglobs[key]) ])
        for key in fileglobs.keys()
    }
    dataset_rows_total = {
        key: sum([ ParquetFile(filename).metadata.num_rows for filename in glob2.glob(fileglobs[key]) ])
        for key in fileglobs.keys()
    }

    ### Epoch: train == one whole parquet file | valid = 1 filesystem read
    steps_per_epoch  = int(dataset_rows_per_file['train'] / flow_args['train']['batch_size'] * flow_args['train']['resamples'] )
    validation_steps = int(dataset_rows_per_file['valid'] / flow_args['valid']['batch_size'] / flow_args['train']['reads_per_file'] )
    callback         = callbacks(train_hparams, dataset, model_file, log_dir, best_only=True, verbose=1)

    timer_start = time.time()
    history = model.fit(
        generators['train'],
        validation_data  = generators['valid'],
        epochs           = train_hparams['epochs'],
        steps_per_epoch  = steps_per_epoch,
        validation_steps = validation_steps,
        verbose          = 2,
        callbacks        = callback
    )
    timer_seconds = int(time.time() - timer_start)
    model_stats   = model_stats_from_history(history, timer_seconds, best_only=True)

    return model, model_stats, output_shape
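For reference, a minimal sketch of the steps_per_epoch arithmetic used above, with illustrative numbers in place of the real ParquetFile row counts and hyperparameters:

num_rows = 50210      # e.g. ParquetFile(filename).metadata.num_rows
batch_size = 128
resamples = 1
steps_per_epoch = int(num_rows / batch_size * resamples)
print(steps_per_epoch)  # 392 with these numbers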
Example #22
File: io.py Project: suzaku/ray
def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
    """Read a parquet row_group given file_path.
    """
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
Example #23
    def get_table_column_names_and_types(
            self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
        schema = ParquetFile(self.path).schema_arrow
        return zip(schema.names, map(str, schema.types))
Example #24
    def restore_dataframe(
        store,
        key,
        filter_query=None,
        columns=None,
        predicate_pushdown_to_io=True,
        categories=None,
        predicates=None,
        date_as_object=False,
    ):
        check_predicates(predicates)
        # If we want to do columnar access we can benefit from partial reads,
        # otherwise a full read en bloc is the better option.
        if (not predicate_pushdown_to_io) or (columns is None
                                              and predicates is None):
            with pa.BufferReader(store.get(key)) as reader:
                table = pq.read_pandas(reader, columns=columns)
        else:
            if HAVE_BOTO and isinstance(store, BotoStore):
                # Parquet and seeks on S3 currently leak connections thus
                # we omit column projection to the store.
                reader = pa.BufferReader(store.get(key))
            else:
                reader = store.open(key)
                # Buffer at least 4 MB in requests. This is chosen because the default block size of the Azure
                # storage client is 4MB.
                reader = BlockBuffer(reader, 4 * 1024 * 1024)
            try:
                parquet_file = ParquetFile(reader)
                if predicates and parquet_file.metadata.num_rows > 0:
                    # We need to calculate different predicates for predicate
                    # pushdown and the later DataFrame filtering. This is required
                    # e.g. in the case where we have an `in` predicate as this has
                    # different normalized values.
                    columns_to_io = _columns_for_pushdown(columns, predicates)
                    predicates_for_pushdown = _normalize_predicates(
                        parquet_file, predicates, True)
                    predicates = _normalize_predicates(parquet_file,
                                                       predicates, False)
                    tables = _read_row_groups_into_tables(
                        parquet_file, columns_to_io, predicates_for_pushdown)

                    if len(tables) == 0:
                        if ARROW_LARGER_EQ_0130:
                            table = parquet_file.schema.to_arrow_schema(
                            ).empty_table()
                        else:
                            table = _empty_table_from_schema(parquet_file)
                    else:
                        table = pa.concat_tables(tables)
                else:
                    # ARROW-5139 Column projection with empty columns returns a table w/out index
                    if ARROW_LARGER_EQ_0130 and columns == []:
                        # Create an arrow table with expected index length.
                        df = (parquet_file.schema.to_arrow_schema().
                              empty_table().to_pandas(
                                  date_as_object=date_as_object))
                        index = pd.Int64Index(
                            pd.RangeIndex(start=0,
                                          stop=parquet_file.metadata.num_rows))
                        df = pd.DataFrame(df, index=index)
                        # convert back to table to keep downstream code untouched by this patch
                        table = pa.Table.from_pandas(df)
                    else:
                        table = pq.read_pandas(reader, columns=columns)
            finally:
                reader.close()

        table = _fix_pyarrow_07992_table(table)

        table = _fix_pyarrow_0130_table(table)

        if columns is not None:
            missing_columns = set(columns) - set(table.schema.names)
            if missing_columns:
                raise ValueError(
                    "Columns cannot be found in stored dataframe: {missing}".
                    format(missing=", ".join(sorted(missing_columns))))

        df = table.to_pandas(categories=categories,
                             date_as_object=date_as_object)
        df.columns = df.columns.map(ensure_unicode_string_type)
        if predicates:
            df = filter_df_from_predicates(df,
                                           predicates,
                                           strict_date_types=date_as_object)
        else:
            df = filter_df(df, filter_query)
        if columns is not None:
            return df.loc[:, columns]
        else:
            return df
Example #25
    def read_schema(self) -> ParquetSchema:
        return ParquetFile(self.path).schema
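For context, a minimal sketch contrasting the Parquet-level schema returned above with its Arrow equivalent; the file is a stand-in:

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"a": [1], "b": ["x"]}), "/tmp/schema_demo.parquet")
pf = pq.ParquetFile("/tmp/schema_demo.parquet")
print(pf.schema)                    # ParquetSchema, as returned by read_schema above
print(pf.schema.to_arrow_schema())  # equivalent pyarrow.Schema (same as pf.schema_arrow)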
Example #26
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if str(path).endswith(".manifest"):
            self.manifest_path = path
            if str(path).startswith(LOCAL_FILE_PREFIX):
                self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if proxy is None and proxy_port is None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)

                self.configuration = configuration

            elif proxy is not None and proxy_port is not None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

                self.configuration = configuration
            else:
                raise ValueError('wrong proxy & proxy_port configuration')

            if str(path).endswith(".manifest"):
                from obs import ObsClient
                obsClient = ObsClient(access_key_id=key,
                                      secret_access_key=secret,
                                      server=str(endpoint).replace(
                                          'http://', ''),
                                      long_conn_mode=True)
                sources = manifest.getSources(self.manifest_path, CARBON,
                                              obsClient)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path, self.configuration.conf)
            else:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.path, self.configuration.conf)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                folder_path = path
                if str(path).endswith(".manifest"):
                    folder_path = str(
                        self.file_path)[0:(str(self.file_path).rindex('/'))]
                self.pieces.append(
                    CarbonDatasetPiece(folder_path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            if str(path).endswith(".manifest"):
                sources = manifest.getSources(self.manifest_path, CARBON)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")

                try:
                    carbon_schema = CarbonSchemaReader().readSchema(
                        self.file_path)
                except Exception:
                    raise Exception("readSchema has some errors: " +
                                    self.file_path)
            else:
                try:
                    carbon_schema = CarbonSchemaReader().readSchema(self.path)
                except Exception:
                    raise Exception("readSchema has some errors")

            carbon_splits = ArrowCarbonReader().builder(self.path) \
              .getSplits(True)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                if str(path).endswith(".manifest"):
                    self.pieces.append(
                        CarbonDatasetPiece(
                            str(self.file_path)[0:(
                                str(self.file_path).rindex('/'))],
                            carbon_schema, split))
                else:
                    self.pieces.append(
                        CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None
Example #27
def image_data_generator_cnn(train_hparams: Dict,
                             model_hparams: Dict,
                             transform_X_args: Dict,
                             transform_Y_args: Dict,
                             datagen_args: Dict,
                             pipeline_name='image_data_generator_cnn',
                             model_file=None,
                             log_dir=None,
                             verbose=2,
                             load_weights=True,
                             fileglobs={}):
    combined_hparams = {
        **model_hparams,
        **train_hparams,
        **transform_X_args,
        **transform_Y_args,
        **datagen_args
    }
    train_hparams = {**settings['hparam_defaults'], **train_hparams}
    if verbose:
        print('-----')
        print("pipeline_name", pipeline_name)
        print("train_hparams", train_hparams)
        print("transform_X_args", transform_X_args)
        print("transform_Y_args", transform_Y_args)
        print("datagen_args", datagen_args)
        print("model_file", model_file)
        print("log_dir", log_dir)
        print("load_weights", load_weights)
        print('-----')

    model_hparams_key = hparam_key(model_hparams)
    train_hparams_key = hparam_key(train_hparams)
    transform_key = hparam_key(
        ChainMap(*[transform_X_args, transform_Y_args, datagen_args]))

    # csv_data    = pd.read_csv(f"{settings['dir']['data']}/train.csv")
    model_file = model_file or f"{settings['dir']['models']}/{pipeline_name}/{pipeline_name}-{model_hparams_key}.hdf5"
    log_dir = log_dir or f"{settings['dir']['logs']}/{pipeline_name}/{transform_key}/"

    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    dataset_rows = ParquetFile(
        f"{settings['dir']['data']}/train_image_data_0.parquet"
    ).metadata.num_rows
    dataset = DatasetDF(
        size=1,
        transform_X_args=transform_X_args,
        transform_Y_args=transform_Y_args,
    )
    input_shape = dataset.input_shape()
    output_shape = dataset.output_shape()
    model = MultiOutputCNN(
        input_shape=input_shape,
        output_shape=output_shape,
        **model_hparams,
    )
    model_compile(model_hparams, model, output_shape)

    # Load Pre-existing weights
    if load_weights:
        if os.path.exists(model_file):
            try:
                model.load_weights(model_file)
                print('Loaded Weights: ', model_file)
            except Exception as exception:
                print('exception', exception)

        if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
            load_models = (
                glob2.glob(f'../input/**/{os.path.basename(model_file)}') +
                glob2.glob(
                    f'../input/**/{os.path.basename(model_file)}'.replace(
                        '=', '')))  # Kaggle Dataset Upload removes '='
            for load_model in load_models:
                try:
                    model.load_weights(load_model)
                    print('Loaded Weights: ', load_model)
                    # break
                except Exception as exception:
                    print('exception', exception)

    if verbose:
        model.summary()

    flow_args = {}
    flow_args['train'] = {
        "transform_X": Transforms.transform_X,
        "transform_Y": Transforms.transform_Y,
        "transform_X_args": transform_X_args,
        "transform_Y_args": transform_Y_args,
        "batch_size": train_hparams['batch_size'],
        "reads_per_file": 2,
        "resamples": 1,
        "shuffle": True,
        "infinite": True,
    }
    flow_args['valid'] = {
        **flow_args['train'],
        "resamples": 1,
    }
    flow_args['test'] = {
        **flow_args['train'],
        "resamples": 1,
        "shuffle": False,
        "infinite": False,
        "test": True,
    }

    datagens = {
        "train": ParquetImageDataGenerator(**datagen_args),
        "valid": ParquetImageDataGenerator(),
        "test": ParquetImageDataGenerator(),
    }
    # [ datagens[key].fit(train_batch) for key in datagens.keys() ]  # Not required
    fileglobs = {
        "train": f"{settings['dir']['data']}/train_image_data_[123].parquet",
        "valid": f"{settings['dir']['data']}/train_image_data_0.parquet",
        "test": f"{settings['dir']['data']}/test_image_data_*.parquet",
        **fileglobs
    }
    ### Preserve test/train split for Kaggle
    # if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
    #     # For the Kaggle Submission, train on all available data and rely on Kaggle Timeout
    #     fileglobs["train"] = f"{settings['dir']['data']}/train_image_data_*.parquet"

    generators = {
        key: datagens[key].flow_from_parquet(value, **flow_args[key])
        for key, value in fileglobs.items()
    }
    dataset_rows_per_file = {
        key: np.mean([
            ParquetFile(filename).metadata.num_rows
            for filename in glob2.glob(fileglobs[key])
        ])
        for key in fileglobs.keys()
    }
    dataset_rows_total = {
        key: sum([
            ParquetFile(filename).metadata.num_rows
            for filename in glob2.glob(fileglobs[key])
        ])
        for key in fileglobs.keys()
    }

    ### Epoch: train == one whole parquet file | valid = 1 filesystem read
    steps_per_epoch = int(dataset_rows_per_file['train'] /
                          flow_args['train']['batch_size'] *
                          flow_args['train']['resamples'])
    validation_steps = int(dataset_rows_per_file['valid'] /
                           flow_args['valid']['batch_size'] /
                           flow_args['train']['reads_per_file'])
    callback = callbacks(combined_hparams,
                         dataset,
                         model_file,
                         log_dir,
                         best_only=True,
                         verbose=1)

    timer_start = time.time()
    history = model.fit(generators['train'],
                        validation_data=generators['valid'],
                        epochs=train_hparams['epochs'],
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps,
                        verbose=verbose,
                        callbacks=callback)
    timer_seconds = int(time.time() - timer_start)
    model_stats = model_stats_from_history(history,
                                           timer_seconds,
                                           best_only=True)

    return model, model_stats, output_shape
Example #28
    def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """

        from pyarrow.parquet import ParquetFile, ParquetDataset

        if cls.read_parquet_remote_task is None:
            return super(RayIO, cls).read_parquet(path, engine, columns,
                                                  **kwargs)

        file_path = path
        if os.path.isdir(path):
            directory = True
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    file_path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining
                # columns.
                from pyarrow import ArrowIOError

                try:
                    pd = ParquetDataset(file_path)
                except ArrowIOError:
                    pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]

        # We cannot read in a parquet file by reading in only the partitioned columns.
        # Thus, we have to remove the partition columns from the requested columns to
        # ensure that when we do the math for the blocks, the partition column
        # will be read in along with a non-partition column.
        if columns and directory and any(col in partitioned_columns
                                         for col in columns):
            columns = [
                col for col in columns if col not in partitioned_columns
            ]
            # If all of the columns wanted are partition columns, return an
            # empty dataframe with the desired columns.
            if len(columns) == 0:
                return cls.query_compiler_cls.from_pandas(
                    pandas.DataFrame(columns=partitioned_columns),
                    block_partitions_cls=cls.frame_mgr_cls,
                )

        num_partitions = cls.frame_mgr_cls._compute_num_partitions()
        num_splits = min(len(columns), num_partitions)
        # Each item in this list will be a list of column names of the original df
        column_splits = (len(columns) // num_partitions if len(columns) %
                         num_partitions == 0 else
                         len(columns) // num_partitions + 1)
        col_partitions = [
            columns[i:i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        column_widths = [len(c) for c in col_partitions]
        # Each item in this list will be a list of columns of the original df
        # partitioned into smaller pieces along rows.
        # We need to transpose the oids array to fit our schema.
        # TODO (williamma12): This part can be parallelized even more if we
        # separate the partitioned parquet file code path from the default one.
        # The workers return multiple objects for each part of the file read:
        # - The first n - 2 objects are partitions of data.
        # - The (n - 1)th object is the length of the partition.
        # - The nth object is the dtypes of the partition. We combine these to
        #   form the final dtypes below.
        blk_partitions = np.array([
            cls.read_parquet_remote_task._remote(
                args=(path, cols + partitioned_columns, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) if directory and cols == col_partitions[len(col_partitions) - 1]
            else cls.read_parquet_remote_task._remote(
                args=(path, cols, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) for cols in col_partitions
        ]).T
        # Metadata
        index_len = ray.get(blk_partitions[-2][0])
        index = pandas.RangeIndex(index_len)
        index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                            num_splits,
                                            axis=0)
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(num_splits - 1)]
        else:
            row_lengths = [
                index_chunksize if i != num_splits - 1 else index_len -
                (index_chunksize * (num_splits - 1)) for i in range(num_splits)
            ]
        # Compute dtypes concatenating the results from each of the columns splits
        # determined above. This creates a pandas Series that contains a dtype for every
        # column.
        dtypes_ids = list(blk_partitions[-1])
        dtypes = pandas.concat(ray.get(dtypes_ids), axis=0)

        blk_partitions = blk_partitions[:-2]
        remote_partitions = np.array([[
            cls.frame_partition_cls(
                blk_partitions[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            ) for j in range(len(blk_partitions[i]))
        ] for i in range(len(blk_partitions))])
        if directory:
            columns += partitioned_columns
        dtypes.index = columns
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(remote_partitions),
            index,
            columns,
            dtypes=dtypes)

        return new_query_compiler
Example #29
File: io.py Project: adgirish/ray
def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
    """Read a parquet row_group given file_path.
    """
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
Example #30
def _patched_init(self, source, **kwargs):
    self.source = source
    return ParquetFile.__old_init__(self, source, **kwargs)
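The patched constructor above only takes effect once it is installed on ParquetFile; below is a minimal sketch of how such a monkey patch is typically wired up, assuming _patched_init is accessible as a plain module-level function (an assumption, not necessarily the original project's code):

from pyarrow.parquet import ParquetFile

# Save the original constructor once, then swap in the patched one.
if not hasattr(ParquetFile, "__old_init__"):
    ParquetFile.__old_init__ = ParquetFile.__init__
    ParquetFile.__init__ = _patched_init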
Example #31
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in local filesystem or hdfs.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False
        if not columns:
            import s3fs

            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path, str) and path.startswith("hdfs://"):
                import fsspec.core

                fs, path = fsspec.core.url_to_fs(path)
                pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path,
                            s3fs.S3File) or (isinstance(path, str)
                                             and path.startswith("s3://")):
                from botocore.exceptions import NoCredentialsError

                if isinstance(path, s3fs.S3File):
                    bucket_path = path.url().split(".s3.amazonaws.com")
                    path = "s3://" + bucket_path[0].split(
                        "://")[1] + bucket_path[1]
                try:
                    fs = s3fs.S3FileSystem()
                    pd = ParquetDataset(path, filesystem=fs)
                except NoCredentialsError:
                    fs = s3fs.S3FileSystem(anon=True)
                    pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            else:
                meta = ParquetFile(path).metadata
                column_names = meta.schema.to_arrow_schema().names

            if meta is not None and meta.metadata is not None:
                pandas_metadata = meta.metadata.get(b"pandas", None)
                if pandas_metadata is not None:
                    import json

                    # This is how we convert the metadata from pyarrow to a python
                    # dictionary, from which we then get the index columns.
                    # We use these to filter out from the columns in the metadata since
                    # the pyarrow storage has no concept of row labels/index.
                    # This ensures that our metadata lines up with the partitions without
                    # extra communication steps once we have done all the remote
                    # computation.
                    index_columns = json.loads(
                        pandas_metadata.decode("utf8")).get(
                            "index_columns", [])
                    column_names = [
                        c for c in column_names if c not in index_columns
                    ]
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
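For context, a minimal standalone sketch of reading the b"pandas" key/value metadata that the index_columns handling above relies on; the file path is a stand-in and the key is only present for files written via pandas/pyarrow:

import json
import pandas as pd
from pyarrow.parquet import ParquetFile

pd.DataFrame({"a": [1, 2, 3]}).to_parquet("/tmp/pandas_meta_demo.parquet")
meta = ParquetFile("/tmp/pandas_meta_demo.parquet").metadata
pandas_metadata = meta.metadata.get(b"pandas") if meta.metadata else None
if pandas_metadata is not None:
    index_columns = json.loads(pandas_metadata.decode("utf8")).get("index_columns", [])
    print(index_columns)  # e.g. a RangeIndex descriptor or ['__index_level_0__']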