# Example 1
    def _read_with_shuffle_row_drop(self, piece, pq_file, column_names,
                                    shuffle_row_drop_partition):
        """Read a parquet piece and keep only the rows of one shuffle-drop partition.

        :param piece: parquet piece descriptor passed to ``compat_piece_read``
        :param pq_file: an already-opened parquet file object for the piece
        :param column_names: set of column names to load
        :param shuffle_row_drop_partition: ``(this_partition, num_partitions)`` tuple; the rows
            are split into ``num_partitions`` contiguous slices and only slice
            ``this_partition`` is kept
        :return: a ``pyarrow.Table`` containing only the selected rows
        """
        partition_names = self._dataset.partitions.partition_names

        # pyarrow would fail if we request a column name that the dataset is partitioned by
        table = compat_piece_read(piece,
                                  lambda _: pq_file,
                                  columns=column_names - partition_names,
                                  partitions=self._dataset.partitions)

        # Drop columns we did not explicitly request. This may happen when a table is partitioned.
        # Besides columns requested, pyarrow will also return partition values. Having these
        # unexpected fields will break some downstream code.
        loaded_column_names = set(
            column[0] for column in compat_table_columns_gen(table))
        unasked_for_columns = loaded_column_names - column_names
        if unasked_for_columns:
            table = table.drop(unasked_for_columns)

        num_rows = len(table)
        this_partition, num_partitions = shuffle_row_drop_partition

        # Guard num_rows > 0: with an empty table the divisor below would be
        # min(0, num_partitions) == 0 and raise ZeroDivisionError. An empty table
        # trivially has nothing to drop, so it is returned as-is.
        if num_partitions > 1 and num_rows > 0:
            data_frame_pandas = table.to_pandas()
            # Assign each row an approximately equal-sized partition index in [0, num_partitions)
            partition_indexes = np.floor(
                np.arange(num_rows) /
                (float(num_rows) / min(num_rows, num_partitions)))

            table = pa.Table.from_pandas(
                data_frame_pandas.loc[partition_indexes == this_partition],
                preserve_index=False)

        return table
    def _read_with_shuffle_row_drop(self, piece, pq_file, column_names,
                                    shuffle_row_drop_partition):
        """Read a parquet piece into pandas and return one shuffle-drop partition's rows.

        :param piece: parquet piece descriptor passed to ``compat_piece_read``
        :param pq_file: an already-opened parquet file object for the piece
        :param column_names: column names to load
        :param shuffle_row_drop_partition: ``(this_partition, num_partitions)`` tuple; the rows
            are split into ``num_partitions`` contiguous slices and only slice
            ``this_partition`` is kept (plus ngram carry-over rows, see below)
        :return: the selected rows as a list of dicts (``DataFrame.to_dict('records')``)
        """
        # If integer_object_nulls is set to False, nullable integer fields are returned as floats
        # with nulls translated to nans
        data_frame = compat_piece_read(
            piece,
            lambda _: pq_file,
            columns=column_names,
            partitions=self._dataset.partitions).to_pandas(
                integer_object_nulls=True)

        num_rows = len(data_frame)
        this_partition, num_partitions = shuffle_row_drop_partition

        # Guard: with num_rows == 0 the divisor below would be min(0, num_partitions) == 0
        # and raise ZeroDivisionError; an empty piece simply yields no records.
        if num_rows == 0:
            return []

        # Assign each row an approximately equal-sized partition index in [0, num_partitions)
        partition_indexes = np.floor(
            np.arange(num_rows) /
            (float(num_rows) / min(num_rows, num_partitions)))

        if self._ngram:
            # If we have an ngram we need to take elements from the next partition to build the sequence
            next_partition_indexes = np.where(
                partition_indexes >= this_partition + 1)
            if next_partition_indexes[0].size:
                next_partition_to_add = next_partition_indexes[0][0:self._ngram.length - 1]
                partition_indexes[next_partition_to_add] = this_partition

        selected_dataframe = data_frame.loc[partition_indexes == this_partition]
        return selected_dataframe.to_dict('records')
# Example 3
    def _read_with_shuffle_row_drop(self, piece, pq_file, column_names, shuffle_row_drop_partition):
        """Read a parquet piece and keep only the rows of one shuffle-drop partition.

        :param piece: parquet piece descriptor passed to ``compat_piece_read``
        :param pq_file: an already-opened parquet file object for the piece
        :param column_names: column names to load
        :param shuffle_row_drop_partition: ``(this_partition, num_partitions)`` tuple; the rows
            are split into ``num_partitions`` contiguous slices and only slice
            ``this_partition`` is kept
        :return: a ``pyarrow.Table`` containing only the selected rows
        """
        table = compat_piece_read(piece, lambda _: pq_file, columns=column_names, partitions=self._dataset.partitions)

        num_rows = len(table)
        this_partition, num_partitions = shuffle_row_drop_partition

        # Guard num_rows > 0: with an empty table the divisor below would be
        # min(0, num_partitions) == 0 and raise ZeroDivisionError. An empty table
        # trivially has nothing to drop, so it is returned as-is.
        if num_partitions > 1 and num_rows > 0:
            data_frame_pandas = table.to_pandas()
            # Assign each row an approximately equal-sized partition index in [0, num_partitions)
            partition_indexes = np.floor(np.arange(num_rows) / (float(num_rows) / min(num_rows, num_partitions)))

            table = pa.Table.from_pandas(data_frame_pandas.loc[partition_indexes == this_partition],
                                         preserve_index=False)

        return table
# Example 4
def _index_columns(piece_info, dataset_url, partitions, indexers, schema, hdfs_driver='libhdfs3'):
    """Build indexes for the dataset piece described by ``piece_info``.

    :param piece_info: description of dataset piece
    :param dataset_url: dataset location
    :param partitions: dataset partitions
    :param indexers: list of indexer objects
    :param schema: dataset schema
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :return: the same indexer objects, now populated with index data
    """
    # Resolver in executor context will get hadoop config from environment
    filesystem = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver).filesystem()

    # Build the pyarrow piece for the row group being indexed
    piece = compat_make_parquet_piece(piece_info.path, filesystem.open,
                                      row_group=piece_info.row_group,
                                      partition_keys=piece_info.partition_keys)

    # Union of all column names any of the indexers needs
    column_names = {name for indexer in indexers for name in indexer.column_names}

    # Read just those columns and turn them into a list of row dicts
    table = compat_piece_read(piece, filesystem.open, columns=list(column_names),
                              partitions=partitions)
    column_rows = table.to_pandas().to_dict('records')

    # Decode the raw parquet values according to the dataset schema
    decoded_rows = [utils.decode_row(row, schema) for row in column_rows]
    if not decoded_rows:
        raise ValueError('Cannot build index with empty decoded_rows, columns: {}, partitions: {}'
                         .format(column_names, partitions))

    # Feed the decoded rows to every indexer
    for indexer in indexers:
        indexer.build_index(decoded_rows, piece_info.piece_index)

    # Indexer objects contain the index data; it will be consolidated on the reduce phase
    return indexers