Example #1
    def _define_metadata(
        cls,
        df: pandas.DataFrame,
        column_names: ColumnNamesTypes,
    ) -> Tuple[list, int]:
        """
        Define partitioning metadata.

        Parameters
        ----------
        df : pandas.DataFrame
            The DataFrame to split.
        column_names : ColumnNamesTypes
            Column names of df.

        Returns
        -------
        column_widths : list
            Column width to use during new frame creation (number of
            columns for each partition).
        num_splits : int
            The maximum number of splits to separate the DataFrame into.
        """
        # This is the number of splits for the columns
        num_splits = min(len(column_names) or 1, NPartitions.get())
        column_chunksize = compute_chunksize(df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            # Split the columns into chunks of at most column_chunksize columns each.
            # For example, if num_splits == 4, len(column_names) == 80 and
            # column_chunksize == 32, column_widths will be [32, 32, 16, 0].
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]

        return column_widths, num_splits
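
A hedged aside on the width math above: the following standalone sketch reproduces the same column_widths computation with plain integers, assuming compute_chunksize amounts to a ceiling division of the column count by the number of splits (the real helper takes a DataFrame and may enforce a minimum block size, so treat this only as an illustration).

    import math

    def define_column_widths(num_columns, num_partitions):
        # Assumed stand-in for compute_chunksize: plain ceiling division.
        num_splits = min(num_columns or 1, num_partitions)
        column_chunksize = math.ceil(num_columns / num_splits) if num_columns else 1
        if column_chunksize > num_columns:
            # One partition only, so we do not serialize a bunch of empty objects.
            return [num_columns], 1
        # Full chunks first, then whatever remains, then zeros (mirrors the
        # nested conditional expression in _define_metadata).
        column_widths = [
            min(column_chunksize, max(0, num_columns - column_chunksize * i))
            for i in range(num_splits)
        ]
        return column_widths, num_splits

    print(define_column_widths(10, 4))  # ([3, 3, 3, 1], 4)
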
Example #2
    def from_pandas(cls, df):
        num_splits = cls._compute_num_partitions()
        put_func = cls._partition_class.put
        row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

        # Each chunk must have a RangeIndex that spans its length and width
        # according to our invariant.
        def chunk_builder(i, j):
            chunk = df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy()
            chunk.index = pandas.RangeIndex(len(chunk.index))
            chunk.columns = pandas.RangeIndex(len(chunk.columns))
            return put_func(chunk)

        parts = [[
            chunk_builder(i, j)
            for j in range(0, len(df.columns), col_chunksize)
        ] for i in range(0, len(df), row_chunksize)]
        return cls(np.array(parts))
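
To see what from_pandas does without the Modin partition machinery, here is a sketch of the same iloc grid split that keeps plain pandas chunks in a nested list instead of calling the partition class's put(); the chunk sizes are passed in directly rather than coming from compute_chunksize.

    import numpy as np
    import pandas

    def split_into_grid(df, row_chunksize, col_chunksize):
        parts = []
        for i in range(0, len(df), row_chunksize):
            row = []
            for j in range(0, len(df.columns), col_chunksize):
                chunk = df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy()
                # Keep the invariant from the example: every chunk carries a
                # RangeIndex over its own length and width.
                chunk.index = pandas.RangeIndex(len(chunk.index))
                chunk.columns = pandas.RangeIndex(len(chunk.columns))
                row.append(chunk)
            parts.append(row)
        return parts

    grid = split_into_grid(pandas.DataFrame(np.arange(20).reshape(5, 4)), 2, 3)
    print(len(grid), len(grid[0]))  # 3 row blocks x 2 column blocks
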
Example #3
    def build_index(cls, partition_ids):
        from modin.pandas import DEFAULT_NPARTITIONS

        index_len = cls.materialize(partition_ids[-2][0])
        index = pandas.RangeIndex(index_len)
        index_chunksize = compute_chunksize(
            pandas.DataFrame(index=index), DEFAULT_NPARTITIONS, axis=0
        )
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(DEFAULT_NPARTITIONS - 1)]
        else:
            row_lengths = [
                index_chunksize
                if i != DEFAULT_NPARTITIONS - 1
                else index_len - (index_chunksize * (DEFAULT_NPARTITIONS - 1))
                for i in range(DEFAULT_NPARTITIONS)
            ]
        return index, row_lengths
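
The row_lengths arithmetic above is worth isolating. The sketch below mirrors it with plain integers, assuming compute_chunksize behaves like a ceiling division with a minimum block size (the 32 used here is only an illustrative default, not necessarily what the installed Modin uses); the first branch is what keeps small frames in a single non-empty partition.

    import math

    def split_index_lengths(index_len, num_partitions, min_chunksize=32):
        # Assumed stand-in for compute_chunksize on an index-only frame.
        index_chunksize = max(min_chunksize, math.ceil(index_len / num_partitions))
        if index_chunksize > index_len:
            # Everything fits into the first partition; the rest stay empty.
            return [index_len] + [0] * (num_partitions - 1)
        # Uniform chunks, with the last partition absorbing the remainder.
        return [
            index_chunksize
            if i != num_partitions - 1
            else index_len - index_chunksize * (num_partitions - 1)
            for i in range(num_partitions)
        ]

    print(split_index_lengths(1000, 4))  # [250, 250, 250, 250]
    print(split_index_lengths(10, 4))    # [10, 0, 0, 0]
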
Example #4
 def build_index(cls, partition_ids):
     num_partitions = NPartitions.get()
     index_len = cls.materialize(partition_ids[-2][0])
     if isinstance(index_len, int):
         index = pandas.RangeIndex(index_len)
     else:
         index = index_len
         index_len = len(index)
     index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                         num_partitions,
                                         axis=0)
     if index_chunksize > index_len:
         row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)]
     else:
         row_lengths = [
             index_chunksize if i != num_partitions - 1 else index_len -
             (index_chunksize * (num_partitions - 1))
             for i in range(num_partitions)
         ]
     return index, row_lengths
Example #5
 def from_pandas(cls, df, return_dims=False):
     num_splits = cls._compute_num_partitions()
     put_func = cls._partition_class.put
     row_chunksize, col_chunksize = compute_chunksize(df, num_splits)
     parts = [[
         put_func(df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy())
         for j in range(0, len(df.columns), col_chunksize)
     ] for i in range(0, len(df), row_chunksize)]
     if not return_dims:
         return np.array(parts)
     else:
         row_lengths = [
             row_chunksize if i + row_chunksize < len(df) else
             len(df) % row_chunksize or row_chunksize
             for i in range(0, len(df), row_chunksize)
         ]
         col_widths = [
             col_chunksize if i + col_chunksize < len(df.columns) else
             len(df.columns) % col_chunksize or col_chunksize
             for i in range(0, len(df.columns), col_chunksize)
         ]
         return np.array(parts), row_lengths, col_widths
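
The `len(df) % row_chunksize or row_chunksize` idiom in the return_dims branch is the only subtle part: the modulo gives the size of the trailing partial block, and the `or` falls back to a full chunk when the division is exact. A tiny sketch of just that calculation:

    def block_lengths(total, chunksize):
        # Length of each consecutive block of size `chunksize` covering `total`
        # items; the last entry is the remainder, or a full chunk if it divides
        # evenly.
        return [
            chunksize if i + chunksize < total else total % chunksize or chunksize
            for i in range(0, total, chunksize)
        ]

    print(block_lengths(10, 4))  # [4, 4, 2]
    print(block_lengths(8, 4))   # [4, 4]
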
Example #6
    def build_index(cls, partition_ids):
        """
        Compute index and its split sizes of resulting Modin DataFrame.

        Parameters
        ----------
        partition_ids : list
            Array with references to the partitions data.

        Returns
        -------
        index : pandas.Index
            Index of resulting Modin DataFrame.
        row_lengths : list
            List with lengths of index chunks.
        """
        num_partitions = NPartitions.get()
        index_len = cls.materialize(partition_ids[-2][0])
        if isinstance(index_len, int):
            index = pandas.RangeIndex(index_len)
        else:
            index = index_len
            index_len = len(index)
        index_chunksize = compute_chunksize(
            pandas.DataFrame(index=index), num_partitions, axis=0
        )
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)]
        else:
            row_lengths = [
                index_chunksize
                if i != num_partitions - 1
                else index_len - (index_chunksize * (num_partitions - 1))
                for i in range(num_partitions)
            ]
        return index, row_lengths
Example #7
    def read_json(
        cls,
        path_or_buf=None,
        orient=None,
        typ="frame",
        dtype=True,
        convert_axes=True,
        convert_dates=True,
        keep_default_dates=True,
        numpy=False,
        precise_float=False,
        date_unit=None,
        encoding=None,
        lines=False,
        chunksize=None,
        compression="infer",
    ):
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
            "typ": typ,
            "dtype": dtype,
            "convert_axes": convert_axes,
            "convert_dates": convert_dates,
            "keep_default_dates": keep_default_dates,
            "numpy": numpy,
            "precise_float": precise_float,
            "date_unit": date_unit,
            "encoding": encoding,
            "lines": lines,
            "chunksize": chunksize,
            "compression": compression,
        }
        if cls.read_json_remote_task is None:
            return super(RayIO, cls).read_json(**kwargs)

        if not lines:
            ErrorMessage.default_to_pandas(
                "`read_json` only optimized with `lines=True`")
            return super(RayIO, cls).read_json(**kwargs)
        else:
            # TODO: Pick up the columns in an optimized way from all data
            # All rows must be read because some rows may have missing data
            # Currently assumes all rows have the same columns
            from io import BytesIO

            columns = pandas.read_json(
                BytesIO(b"" + open(path_or_buf, "rb").readline()),
                lines=True).columns
            kwargs["columns"] = columns
            empty_pd_df = pandas.DataFrame(columns=columns)

            path_or_buf = kwargs.pop("path_or_buf")

            with file_open(path_or_buf, "rb",
                           kwargs.get("compression", "infer")) as f:
                total_bytes = file_size(f)
                num_partitions = cls.frame_mgr_cls._compute_num_partitions()
                num_splits = min(len(columns), num_partitions)
                chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

                partition_ids = []
                index_ids = []
                dtypes_ids = []

                column_chunksize = compute_chunksize(empty_pd_df,
                                                     num_splits,
                                                     axis=1)
                if column_chunksize > len(columns):
                    column_widths = [len(columns)]
                    num_splits = 1
                else:
                    column_widths = [
                        column_chunksize if i != num_splits - 1 else
                        len(columns) - (column_chunksize * (num_splits - 1))
                        for i in range(num_splits)
                    ]

                while f.tell() < total_bytes:
                    start = f.tell()
                    f.seek(chunk_size, os.SEEK_CUR)
                    f.readline()
                    partition_id = cls.read_json_remote_task._remote(
                        args=(path_or_buf, num_splits, start, f.tell(),
                              kwargs),
                        num_return_vals=num_splits + 3,
                    )
                    partition_ids.append(partition_id[:-3])
                    index_ids.append(partition_id[-3])
                    dtypes_ids.append(partition_id[-2])

            row_lengths = ray.get(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))

            dtypes = (pandas.concat(ray.get(dtypes_ids), axis=1).apply(
                lambda row: find_common_type(row.values),
                axis=1).squeeze(axis=0))

            partition_ids = [[
                cls.frame_partition_cls(
                    partition_ids[i][j],
                    length=row_lengths[i],
                    width=column_widths[j],
                ) for j in range(len(partition_ids[i]))
            ] for i in range(len(partition_ids))]

            if isinstance(dtypes, pandas.Series):
                dtypes.index = columns
            else:
                dtypes = pandas.Series(dtypes, index=columns)

            new_query_compiler = cls.query_compiler_cls(
                cls.frame_mgr_cls(np.array(partition_ids)),
                new_index,
                columns,
                dtypes=dtypes,
            )
            return new_query_compiler
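
The heart of this reader is the splitting loop: seek roughly chunk_size bytes forward, then readline() so every partition boundary lands on a newline and no JSON line is cut in half. Here is a standalone sketch of that scan over a local, uncompressed file (plain byte offsets, no Ray tasks); the function name is ours, not Modin's.

    import os

    def newline_aligned_splits(path, num_partitions):
        # Return (start, end) byte ranges that cover the file and always end
        # on a line boundary, mirroring the seek/readline loop above.
        splits = []
        with open(path, "rb") as f:
            f.seek(0, os.SEEK_END)
            total_bytes = f.tell()
            f.seek(0, os.SEEK_SET)
            chunk_size = max(1, total_bytes // num_partitions)
            while f.tell() < total_bytes:
                start = f.tell()
                f.seek(chunk_size, os.SEEK_CUR)
                f.readline()  # advance to the end of the current line
                splits.append((start, min(f.tell(), total_bytes)))
        return splits

    # Each worker would then read its [start, end) slice and parse it with
    # pandas.read_json(..., lines=True).
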
Example #8
    def _read_csv_from_file_ray(cls, filepath, kwargs={}):
        """Constructs a DataFrame from a CSV file.

        Args:
            filepath (str): path to the CSV file.
            npartitions (int): number of partitions for the DataFrame.
            kwargs (dict): args excluding filepath provided to read_csv.

        Returns:
            DataFrame or Series constructed from CSV file.
        """
        names = kwargs.get("names", None)
        index_col = kwargs.get("index_col", None)
        if names is None:
            # For the sake of the empty df, we assume no `index_col` to get the correct
            # column names before we build the index. Because we pass `names` in, this
            # step has to happen without removing the `index_col` otherwise it will not
            # be assigned correctly
            kwargs["index_col"] = None
            names = pandas.read_csv(filepath,
                                    **dict(kwargs, nrows=0,
                                           skipfooter=0)).columns
            kwargs["index_col"] = index_col

        empty_pd_df = pandas.read_csv(filepath,
                                      **dict(kwargs, nrows=0, skipfooter=0))
        column_names = empty_pd_df.columns
        skipfooter = kwargs.get("skipfooter", None)
        skiprows = kwargs.pop("skiprows", None)

        usecols = kwargs.get("usecols", None)
        usecols_md = _validate_usecols_arg(kwargs.get("usecols", None))
        if usecols is not None and usecols_md[1] != "integer":
            del kwargs["usecols"]
            all_cols = pandas.read_csv(file_open(filepath, "rb"),
                                       **dict(kwargs, nrows=0,
                                              skipfooter=0)).columns
            usecols = all_cols.get_indexer_for(list(usecols_md[0]))
        parse_dates = kwargs.pop("parse_dates", False)
        partition_kwargs = dict(
            kwargs,
            header=None,
            names=names if kwargs.get("usecols") is None
            or kwargs.get("names") is not None else None,
            skipfooter=0,
            skiprows=None,
            parse_dates=parse_dates,
            usecols=usecols,
        )
        with file_open(filepath, "rb", kwargs.get("compression",
                                                  "infer")) as f:
            # Get the BOM if necessary
            prefix = b""
            if kwargs.get("encoding", None) is not None:
                prefix = f.readline()
                partition_kwargs["skiprows"] = 1
                f.seek(0, os.SEEK_SET)  # Return to beginning of file

            prefix_id = ray.put(prefix)
            partition_kwargs_id = ray.put(partition_kwargs)
            # Skip the header since we already have the header information and skip the
            # rows we are told to skip.
            kwargs["skiprows"] = skiprows
            cls._skip_header(f, kwargs)
            # Launch tasks to read partitions
            partition_ids = []
            index_ids = []
            dtypes_ids = []
            total_bytes = file_size(f)
            # Max number of partitions available
            num_parts = cls.frame_mgr_cls._compute_num_partitions()
            # This is the number of splits for the columns
            num_splits = min(len(column_names), num_parts)
            # This is the chunksize each partition will read
            chunk_size = max(1, (total_bytes - f.tell()) // num_parts)

            # Metadata
            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if len(column_names) >
                    (column_chunksize * (i + 1)) else 0 if len(column_names) <
                    (column_chunksize * i) else len(column_names) -
                    (column_chunksize * i) for i in range(num_splits)
                ]

            while f.tell() < total_bytes:
                start = f.tell()
                f.seek(chunk_size, os.SEEK_CUR)
                f.readline()  # Read a whole number of lines
                # The workers return multiple objects for each part of the file read:
                # - The first n - 2 objects are partitions of data
                # - The n - 1 object is the length of the partition or the index if
                #   `index_col` is specified. We compute the index below.
                # - The nth object is the dtypes of the partition. We combine these to
                #   form the final dtypes below.
                partition_id = cls.read_csv_remote_task._remote(
                    args=(
                        filepath,
                        num_splits,
                        start,
                        f.tell(),
                        partition_kwargs_id,
                        prefix_id,
                    ),
                    num_return_vals=num_splits + 2,
                )
                partition_ids.append(partition_id[:-2])
                index_ids.append(partition_id[-2])
                dtypes_ids.append(partition_id[-1])

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = ray.get(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = ray.get(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])
            new_index.name = empty_pd_df.index.name

        # Compute dtypes by collecting and combining all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = (pandas.concat(ray.get(dtypes_ids), axis=1).apply(
            lambda row: find_common_type(row.values), axis=1).squeeze(axis=0))

        partition_ids = [[
            cls.frame_partition_cls(partition_ids[i][j],
                                    length=row_lengths[i],
                                    width=column_widths[j])
            for j in range(len(partition_ids[i]))
        ] for i in range(len(partition_ids))]
        # If parse_dates is present, the column names that we have might not be
        # the same length as the returned column names. If we do need to modify
        # the column names, we remove the old names from the column names and
        # insert the new one at the front of the Index.
        if parse_dates is not None:
            # Check if is list of lists
            if isinstance(parse_dates, list) and isinstance(
                    parse_dates[0], list):
                for group in parse_dates:
                    new_col_name = "_".join(group)
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)
            # Check if it is a dictionary
            elif isinstance(parse_dates, dict):
                for new_col_name, group in parse_dates.items():
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)

        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)

        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(np.array(partition_ids)),
            new_index,
            column_names,
            dtypes=dtypes,
        )

        if skipfooter:
            new_query_compiler = new_query_compiler.drop(
                new_query_compiler.index[-skipfooter:])
        if kwargs.get("squeeze", False) and len(
                new_query_compiler.columns) == 1:
            return new_query_compiler[new_query_compiler.columns[0]]
        return new_query_compiler
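
One step that is easy to miss is the dtype reconciliation: every partition reports the dtypes it inferred from its own rows, the Series are concatenated column-wise so each row of the intermediate frame holds one column's candidate dtypes, and find_common_type collapses them. A small sketch of that combination on plain pandas objects; find_common_type is taken here from pandas internals (pandas.core.dtypes.cast), which appears to be where these snippets get it from.

    import numpy as np
    import pandas
    from pandas.core.dtypes.cast import find_common_type

    # Two partitions saw different slices of the file and inferred different
    # dtypes for column "a".
    dtypes_part_0 = pandas.Series({"a": np.dtype("int64"), "b": np.dtype("float64")})
    dtypes_part_1 = pandas.Series({"a": np.dtype("float64"), "b": np.dtype("float64")})

    combined = pandas.concat([dtypes_part_0, dtypes_part_1], axis=1).apply(
        lambda row: find_common_type(list(row.values)), axis=1
    )
    print(combined)  # a -> float64, b -> float64
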
Example #9
    def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """

        from pyarrow.parquet import ParquetFile, ParquetDataset

        if cls.read_parquet_remote_task is None:
            return super(RayIO, cls).read_parquet(path, engine, columns,
                                                  **kwargs)

        file_path = path
        if os.path.isdir(path):
            directory = True
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Skip metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    file_path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining
                # columns.
                from pyarrow import ArrowIOError

                try:
                    pd = ParquetDataset(file_path)
                except ArrowIOError:
                    pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]

        # Cannot read in parquet file by only reading in the partitioned column.
        # Thus, we have to remove the partition columns from the columns to
        # ensure that when we do the math for the blocks, the partition column
        # will be read in along with a non partition column.
        if columns and directory and any(col in partitioned_columns
                                         for col in columns):
            columns = [
                col for col in columns if col not in partitioned_columns
            ]
            # If all of the columns wanted are partition columns, return an
            # empty dataframe with the desired columns.
            if len(columns) == 0:
                return cls.query_compiler_cls.from_pandas(
                    pandas.DataFrame(columns=partitioned_columns),
                    block_partitions_cls=cls.frame_mgr_cls,
                )

        num_partitions = cls.frame_mgr_cls._compute_num_partitions()
        num_splits = min(len(columns), num_partitions)
        # Ceiling division: the number of columns each column partition will hold.
        column_splits = (len(columns) // num_partitions if len(columns) %
                         num_partitions == 0 else
                         len(columns) // num_partitions + 1)
        # Each item in this list is a list of column names from the original df.
        col_partitions = [
            columns[i:i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        column_widths = [len(c) for c in col_partitions]
        # Each item in this list will be a list of columns of original df
        # partitioned to smaller pieces along rows.
        # We need to transpose the oids array to fit our schema.
        # TODO (williamma12): This part can be parallelized even more if we
        # separate the partitioned parquet file code path from the default one.
        # The workers return multiple objects for each part of the file read:
        # - The first n - 2 objects are partitions of data
        # - The n - 1 object is the length of the partition.
        # - The nth object is the dtypes of the partition. We combine these to
        #   form the final dtypes below.
        blk_partitions = np.array([
            cls.read_parquet_remote_task._remote(
                args=(path, cols + partitioned_columns, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) if directory and cols == col_partitions[len(col_partitions) - 1]
            else cls.read_parquet_remote_task._remote(
                args=(path, cols, num_splits, kwargs),
                num_return_vals=num_splits + 2,
            ) for cols in col_partitions
        ]).T
        # Metadata
        index_len = ray.get(blk_partitions[-2][0])
        index = pandas.RangeIndex(index_len)
        index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                            num_splits,
                                            axis=0)
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(num_splits - 1)]
        else:
            row_lengths = [
                index_chunksize if i != num_splits - 1 else index_len -
                (index_chunksize * (num_splits - 1)) for i in range(num_splits)
            ]
        # Compute dtypes concatenating the results from each of the columns splits
        # determined above. This creates a pandas Series that contains a dtype for every
        # column.
        dtypes_ids = list(blk_partitions[-1])
        dtypes = pandas.concat(ray.get(dtypes_ids), axis=0)

        blk_partitions = blk_partitions[:-2]
        remote_partitions = np.array([[
            cls.frame_partition_cls(
                blk_partitions[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            ) for j in range(len(blk_partitions[i]))
        ] for i in range(len(blk_partitions))])
        if directory:
            columns += partitioned_columns
        dtypes.index = columns
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(remote_partitions),
            index,
            columns,
            dtypes=dtypes)

        return new_query_compiler
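
The col_partitions computation above is just a ceiling division over the list of column names, and the lengths of those groups are what become column_widths here (unlike the other readers, compute_chunksize is only used for the index axis). A minimal sketch of that grouping:

    import math

    def split_columns(columns, num_partitions):
        # Ceil-divide the column names into at most num_partitions groups and
        # report the width of each group.
        group_size = math.ceil(len(columns) / num_partitions)
        groups = [
            columns[i:i + group_size] for i in range(0, len(columns), group_size)
        ]
        return groups, [len(g) for g in groups]

    cols = ["col{}".format(i) for i in range(10)]
    groups, widths = split_columns(cols, 4)
    print(widths)  # [3, 3, 3, 1]
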
Example #10
    def _read(cls, io, **kwargs):
        if (kwargs.get("engine", None) is not None
                and kwargs.get("engine") != "openpyxl"):
            warnings.warn(
                "Modin only implements parallel `read_excel` with `openpyxl` engine, "
                'please specify `engine=None` or `engine="openpyxl"` to '
                "use Modin's parallel implementation.")
            return cls.single_worker_read(io, **kwargs)
        if sys.version_info < (3, 7):
            warnings.warn(
                "Python 3.7 or higher required for parallel `read_excel`.")
            return cls.single_worker_read(io, **kwargs)

        from zipfile import ZipFile
        from openpyxl.worksheet.worksheet import Worksheet
        from openpyxl.worksheet._reader import WorksheetReader
        from openpyxl.reader.excel import ExcelReader
        from modin.backends.pandas.parsers import PandasExcelParser

        sheet_name = kwargs.get("sheet_name", 0)
        if sheet_name is None or isinstance(sheet_name, list):
            warnings.warn(
                "`read_excel` functionality is only implemented for a single sheet at a "
                "time. Multiple sheet reading coming soon!")
            return cls.single_worker_read(io, **kwargs)

        warnings.warn("Parallel `read_excel` is a new feature! Please email "
                      "[email protected] if you run into any problems.")

        # NOTE: ExcelReader() in read-only mode does not close the file handle by
        # itself, so work around that by opening the file ourselves when we are
        # given a path and closing it afterwards.
        io_file = open(io, "rb") if isinstance(io, str) else io
        try:
            ex = ExcelReader(io_file, read_only=True)
            ex.read()
            wb = ex.wb

            # Get shared strings
            ex.read_manifest()
            ex.read_strings()
            ws = Worksheet(wb)
        finally:
            if isinstance(io, str):
                # Close only if we were the ones who opened the file.
                io_file.close()

        pandas_kw = dict(kwargs)  # preserve original kwargs
        with ZipFile(io) as z:
            from io import BytesIO

            # Convert index to sheet name in file
            if isinstance(sheet_name, int):
                sheet_name = "sheet{}".format(sheet_name + 1)
            else:
                sheet_name = "sheet{}".format(
                    wb.sheetnames.index(sheet_name) + 1)
            if any(sheet_name.lower() in name for name in z.namelist()):
                sheet_name = sheet_name.lower()
            elif any(sheet_name.title() in name for name in z.namelist()):
                sheet_name = sheet_name.title()
            else:
                raise ValueError("Sheet {} not found".format(
                    sheet_name.lower()))
            # Pass this value to the workers
            kwargs["sheet_name"] = sheet_name

            f = z.open("xl/worksheets/{}.xml".format(sheet_name))
            f = BytesIO(f.read())
            total_bytes = cls.file_size(f)

            num_partitions = NPartitions.get()
            # Read some bytes from the sheet so we can extract the XML header and first
            # line. We need to make sure we get the first line of the data as well
            # because that is where the column names are. The header information will
            # be extracted and sent to all of the nodes.
            sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
            end_of_row_tag = b"</row>"
            while end_of_row_tag not in sheet_block:
                sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
            idx_of_header_end = sheet_block.index(end_of_row_tag) + len(
                end_of_row_tag)
            sheet_header = sheet_block[:idx_of_header_end]
            # Reset the file pointer to begin at the end of the header information.
            f.seek(idx_of_header_end)
            kwargs["_header"] = sheet_header
            footer = b"</sheetData></worksheet>"
            # Use openpyxml to parse the data
            reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                     ex.shared_strings, False)
            # Attach cells to the worksheet
            reader.bind_cells()
            data = PandasExcelParser.get_sheet_data(
                ws, kwargs.get("convert_float", True))
            # Extract column names from parsed data.
            column_names = pandas.Index(data[0])
            index_col = kwargs.get("index_col", None)
            # Remove column names that are specified as `index_col`
            if index_col is not None:
                column_names = column_names.drop(column_names[index_col])

            if not all(column_names):
                # Some column names are empty; fall back to the pandas reader to get them.
                pandas_kw["nrows"] = 1
                df = pandas.read_excel(io, **pandas_kw)
                column_names = df.columns

            # Compute partition metadata upfront so it is uniform for all partitions
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
            num_splits = min(len(column_names), num_partitions)
            kwargs["fname"] = io
            # Skiprows will be used to inform a partition how many rows come before it.
            kwargs["skiprows"] = 0
            rows_to_skip = 0
            data_ids = []
            index_ids = []
            dtypes_ids = []

            # Compute column metadata
            column_chunksize = compute_chunksize(
                pandas.DataFrame(columns=column_names), num_splits, axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if len(column_names) >
                    (column_chunksize * (i + 1)) else 0 if len(column_names) <
                    (column_chunksize * i) else len(column_names) -
                    (column_chunksize * i) for i in range(num_splits)
                ]
            kwargs["num_splits"] = num_splits

            while f.tell() < total_bytes:
                args = kwargs
                args["skiprows"] = rows_to_skip
                args["start"] = f.tell()
                chunk = f.read(chunk_size)
                # This edge case can happen when we have reached the end of the data
                # but not the end of the file.
                if b"<row" not in chunk:
                    break
                row_close_tag = b"</row>"
                row_count = re.subn(row_close_tag, b"", chunk)[1]

                # Make sure we are reading at least one row.
                while row_count == 0:
                    chunk += f.read(chunk_size)
                    row_count += re.subn(row_close_tag, b"", chunk)[1]

                last_index = chunk.rindex(row_close_tag)
                f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
                args["end"] = f.tell()

                # If there is no data, exit before triggering computation.
                if b"</row>" not in chunk and b"</sheetData>" in chunk:
                    break
                # We need to make sure we include all rows, even those that have no
                # data. Getting the number of the last row will turn into the number of
                # skipped rows, so if there are any rows missing between the last row
                # seen here and the first row the next partition reads, the parser will
                # have to include those rows in that specific partition to match the
                # expected behavior. We subtract 1 here because the header is included
                # in the skip values, and we do not want to skip the header.
                rows_to_skip = (int(chunk[:last_index + len(row_close_tag)].
                                    split(b'<row r="')[-1].split(b'"')[0]) - 1)
                remote_results_list = cls.deploy(cls.parse, num_splits + 2,
                                                 args)
                data_ids.append(remote_results_list[:-2])
                index_ids.append(remote_results_list[-2])
                dtypes_ids.append(remote_results_list[-1])

                # The end of the spreadsheet
                if b"</sheetData>" in chunk:
                    break

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])

        # Compute dtypes by collecting and combining all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)

        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            data_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
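
Two small byte-level tricks carry the worksheet chunking above: re.subn counts how many complete </row> tags a chunk contains (its second return value is the number of substitutions), and rindex locates the last one so the file position can be rewound to a row boundary before the next chunk is cut. A minimal sketch of that scan over an in-memory blob with made-up row data:

    import re
    from io import BytesIO

    row_close_tag = b"</row>"
    sheet = BytesIO(b'<row r="1">a</row><row r="2">b</row><row r="3">c</row>')
    chunk = sheet.read(30)

    # Count complete rows inside the chunk.
    row_count = re.subn(row_close_tag, b"", chunk)[1]
    # Rewind to just after the last complete </row> so the next chunk starts
    # on a row boundary.
    last_index = chunk.rindex(row_close_tag)
    sheet.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)

    print(row_count, sheet.tell())  # 1 complete row; position 18, right after it
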
Example #11
    def _read(cls, filepath_or_buffer, **kwargs):
        """
        Read data from multiple `.csv` files passed with `filepath_or_buffer` simultaneously.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of read_csv function.
        **kwargs : dict
            Parameters of `read_csv` function.

        Returns
        -------
        new_query_compiler : BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        # Ensures that the file is a string file path. Otherwise, default to pandas.
        filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
        if isinstance(filepath_or_buffer, str):
            if not cls.file_exists(filepath_or_buffer):
                return cls.single_worker_read(filepath_or_buffer, **kwargs)
            filepath_or_buffer = cls.get_path(filepath_or_buffer)
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        # We read multiple csv files when the file path is a list of absolute
        # file paths. We assume that all of the files will be essentially
        # replicas of the first file but with different data values.
        glob_filepaths = filepath_or_buffer
        filepath_or_buffer = filepath_or_buffer[0]

        compression_type = cls.infer_compression(filepath_or_buffer,
                                                 kwargs.get("compression"))
        if compression_type is not None:
            if (compression_type == "gzip" or compression_type == "bz2"
                    or compression_type == "xz"):
                kwargs["compression"] = compression_type
            elif (compression_type == "zip" and sys.version_info[0] == 3
                  and sys.version_info[1] >= 7):
                # need python3.7 to .seek and .tell ZipExtFile
                kwargs["compression"] = compression_type
            else:
                return cls.single_worker_read(filepath_or_buffer, **kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        nrows = kwargs.pop("nrows", None)
        names = kwargs.get("names", None)
        index_col = kwargs.get("index_col", None)
        usecols = kwargs.get("usecols", None)
        encoding = kwargs.get("encoding", None)
        if names is None:
            # For the sake of the empty df, we assume no `index_col` to get the correct
            # column names before we build the index. Because we pass `names` in, this
            # step has to happen without removing the `index_col` otherwise it will not
            # be assigned correctly.
            names = pandas.read_csv(
                filepath_or_buffer,
                **dict(kwargs,
                       usecols=None,
                       nrows=0,
                       skipfooter=0,
                       index_col=None),
            ).columns
        elif index_col is None and not usecols:
            # When names is set to some list that is smaller than the number of columns
            # in the file, the first columns are built as a hierarchical index.
            empty_pd_df = pandas.read_csv(filepath_or_buffer,
                                          nrows=0,
                                          encoding=encoding)
            num_cols = len(empty_pd_df.columns)
            if num_cols > len(names):
                index_col = list(range(num_cols - len(names)))
                if len(index_col) == 1:
                    index_col = index_col[0]
                kwargs["index_col"] = index_col
        empty_pd_df = pandas.read_csv(filepath_or_buffer,
                                      **dict(kwargs, nrows=0, skipfooter=0))
        column_names = empty_pd_df.columns
        skipfooter = kwargs.get("skipfooter", None)
        skiprows = kwargs.pop("skiprows", None)
        usecols_md = _validate_usecols_arg(usecols)
        if usecols is not None and usecols_md[1] != "integer":
            del kwargs["usecols"]
            all_cols = pandas.read_csv(
                cls.file_open(filepath_or_buffer, "rb"),
                **dict(kwargs, nrows=0, skipfooter=0),
            ).columns
            usecols = all_cols.get_indexer_for(list(usecols_md[0]))
        parse_dates = kwargs.pop("parse_dates", False)
        partition_kwargs = dict(
            kwargs,
            header=None,
            names=names,
            skipfooter=0,
            skiprows=None,
            parse_dates=parse_dates,
            usecols=usecols,
        )
        encoding = kwargs.get("encoding", None)
        quotechar = kwargs.get(
            "quotechar",
            '"').encode(encoding if encoding is not None else "UTF-8")
        is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE

        with ExitStack() as stack:
            files = [
                stack.enter_context(
                    cls.file_open(fname, "rb", compression_type))
                for fname in glob_filepaths
            ]

            # Skip the header since we already have the header information and skip the
            # rows we are told to skip.
            if isinstance(skiprows, int) or skiprows is None:
                if skiprows is None:
                    skiprows = 0
                header = kwargs.get("header", "infer")
                if header == "infer" and kwargs.get("names", None) is None:
                    skip_header = 1
                elif isinstance(header, int):
                    skip_header = header + 1
                elif hasattr(header,
                             "__iter__") and not isinstance(header, str):
                    skip_header = max(header) + 1
                else:
                    skip_header = 0
            if kwargs.get("encoding", None) is not None:
                partition_kwargs["skiprows"] = 1
            # Launch tasks to read partitions
            partition_ids = []
            index_ids = []
            dtypes_ids = []
            # Max number of partitions available
            num_partitions = NPartitions.get()
            # This is the number of splits for the columns
            num_splits = min(len(column_names), num_partitions)
            # Metadata
            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if len(column_names) >
                    (column_chunksize * (i + 1)) else 0 if len(column_names) <
                    (column_chunksize * i) else len(column_names) -
                    (column_chunksize * i) for i in range(num_splits)
                ]

            args = {
                "num_splits": num_splits,
                **partition_kwargs,
            }

            splits = cls.partitioned_file(
                files,
                glob_filepaths,
                num_partitions=num_partitions,
                nrows=nrows,
                skiprows=skiprows,
                skip_header=skip_header,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )

            for chunks in splits:
                args.update({"chunks": chunks})
                partition_id = cls.deploy(cls.parse, num_splits + 2, args)
                partition_ids.append(partition_id[:-2])
                index_ids.append(partition_id[-2])
                dtypes_ids.append(partition_id[-1])

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])
            new_index.name = empty_pd_df.index.name

        # Compute dtypes by collecting and combining all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None

        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)
        # If parse_dates is present, the column names that we have might not be
        # the same length as the returned column names. If we do need to modify
        # the column names, we remove the old names from the column names and
        # insert the new one at the front of the Index.
        if parse_dates is not None:
            # We have to recompute the column widths if `parse_dates` is set because
            # we are not guaranteed to have the correct information regarding how many
            # columns are on each partition.
            column_widths = None
            # Check if is list of lists
            if isinstance(parse_dates, list) and isinstance(
                    parse_dates[0], list):
                for group in parse_dates:
                    new_col_name = "_".join(group)
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)
            # Check if it is a dictionary
            elif isinstance(parse_dates, dict):
                for new_col_name, group in parse_dates.items():
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            partition_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)

        if skipfooter:
            new_query_compiler = new_query_compiler.drop(
                new_query_compiler.index[-skipfooter:])
        if kwargs.get("squeeze", False) and len(
                new_query_compiler.columns) == 1:
            return new_query_compiler[new_query_compiler.columns[0]]
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
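
The skip_header bookkeeping near the top of the with-block is easy to gloss over: how many physical lines of each file hold header information depends on whether `header` is "infer", a single row number, or a list of row numbers (a MultiIndex header), and an explicit `names` suppresses the inferred header entirely. A small sketch of just that mapping, mirroring the branches above:

    def header_rows_to_skip(header="infer", names=None):
        # Number of leading lines in each file that contain header information
        # and must therefore be skipped by the partition readers.
        if header == "infer" and names is None:
            return 1
        if isinstance(header, int):
            return header + 1
        if hasattr(header, "__iter__") and not isinstance(header, str):
            return max(header) + 1
        return 0

    print(header_rows_to_skip())                                   # 1
    print(header_rows_to_skip(header=[0, 1]))                      # 2
    print(header_rows_to_skip(header="infer", names=["a", "b"]))   # 0
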
Example #12
    def _read(cls, path_or_buf, **kwargs):
        """
        Read data from `path_or_buf` according to the passed `read_json` `kwargs` parameters.

        Parameters
        ----------
        path_or_buf : str, path object or file-like object
            `path_or_buf` parameter of `read_json` function.
        **kwargs : dict
            Parameters of `read_json` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        path_or_buf = cls.get_path_or_buffer(path_or_buf)
        if isinstance(path_or_buf, str):
            if not cls.file_exists(path_or_buf):
                return cls.single_worker_read(path_or_buf, **kwargs)
            path_or_buf = cls.get_path(path_or_buf)
        elif not cls.pathlib_or_pypath(path_or_buf):
            return cls.single_worker_read(path_or_buf, **kwargs)
        if not kwargs.get("lines", False):
            return cls.single_worker_read(path_or_buf, **kwargs)
        columns = pandas.read_json(BytesIO(b"" +
                                           open(path_or_buf, "rb").readline()),
                                   lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        with cls.file_open(path_or_buf, "rb",
                           kwargs.get("compression", "infer")) as f:
            num_partitions = NPartitions.get()
            num_splits = min(len(columns), num_partitions)

            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(columns):
                column_widths = [len(columns)]
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if i != num_splits - 1 else len(columns) -
                    (column_chunksize * (num_splits - 1))
                    for i in range(num_splits)
                ]

            args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs}

            splits = cls.partitioned_file(
                f,
                num_partitions=num_partitions,
                is_quoting=(args.get("quoting", "") != QUOTE_NONE),
            )
            for start, end in splits:
                args.update({"start": start, "end": end})
                partition_id = cls.deploy(cls.parse, num_splits + 3, args)
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        # partition_id[-1] contains the columns for each partition, which will be useful
        # for implementing when `lines=False`.
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        dtypes = cls.get_dtypes(dtypes_ids)
        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
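
Across these JSON readers the parse tasks follow a fixed return convention: each task yields num_splits + 3 objects, where the first num_splits are column slices of the chunk, followed by the chunk's row count, its dtypes, and finally its columns (currently unused, reserved for a future lines=False path). A sketch of how such a result list is unpacked, using a hypothetical task result:

    # Hypothetical result of one parse task with num_splits == 2.
    parse_result = ["data_split_0", "data_split_1", 128, "dtypes_series", "columns_index"]

    data_splits = parse_result[:-3]   # the column slices of this chunk
    row_count = parse_result[-3]      # length of this chunk
    chunk_dtypes = parse_result[-2]   # dtypes inferred by this chunk's parser
    chunk_columns = parse_result[-1]  # kept around for a future lines=False path
    print(data_splits, row_count, chunk_dtypes, chunk_columns)
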
Example #13
    def _read(cls, path_or_buf, **kwargs):
        if isinstance(path_or_buf, str):
            if not cls.file_exists(path_or_buf):
                return cls.single_worker_read(path_or_buf, **kwargs)
            path_or_buf = cls.get_path(path_or_buf)
        elif not cls.pathlib_or_pypath(path_or_buf):
            return cls.single_worker_read(path_or_buf, **kwargs)
        if not kwargs.get("lines", False):
            return cls.single_worker_read(path_or_buf, **kwargs)
        columns = pandas.read_json(BytesIO(b"" +
                                           open(path_or_buf, "rb").readline()),
                                   lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        with cls.file_open(path_or_buf, "rb",
                           kwargs.get("compression", "infer")) as f:
            from modin.pandas import DEFAULT_NPARTITIONS

            num_partitions = DEFAULT_NPARTITIONS
            num_splits = min(len(columns), num_partitions)

            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(columns):
                column_widths = [len(columns)]
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if i != num_splits - 1 else len(columns) -
                    (column_chunksize * (num_splits - 1))
                    for i in range(num_splits)
                ]

            args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs}

            splits = cls.partitioned_file(
                f,
                num_partitions=num_partitions,
                is_quoting=(args.get("quoting", "") != QUOTE_NONE),
            )
            for start, end in splits:
                args.update({"start": start, "end": end})
                partition_id = cls.deploy(cls.parse, num_splits + 3, args)
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        # partition_id[-1] contains the columns for each partition, which will be useful
        # for implementing when `lines=False`.
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        dtypes = cls.get_dtypes(dtypes_ids)
        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame._apply_index_objs(axis=0)
        return cls.query_compiler_cls(new_frame)
Example #14
    def read(cls, path_or_buf, **kwargs):
        path_or_buf = cls.get_path(path_or_buf)
        if not kwargs.get("lines", False):
            return cls.single_worker_read(path_or_buf, **kwargs)
        columns = pandas.read_json(BytesIO(b"" +
                                           open(path_or_buf, "rb").readline()),
                                   lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        with cls.file_open(path_or_buf, "rb",
                           kwargs.get("compression", "infer")) as f:
            total_bytes = cls.file_size(f)
            from modin.pandas import DEFAULT_NPARTITIONS

            num_partitions = DEFAULT_NPARTITIONS
            num_splits = min(len(columns), num_partitions)
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(columns):
                column_widths = [len(columns)]
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if i != num_splits - 1 else len(columns) -
                    (column_chunksize * (num_splits - 1))
                    for i in range(num_splits)
                ]

            while f.tell() < total_bytes:
                start = f.tell()
                args = {
                    "fname": path_or_buf,
                    "num_splits": num_splits,
                    "start": start
                }
                args.update(kwargs)
                partition_id = cls.call_deploy(f, chunk_size, num_splits + 3,
                                               args)
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        # partition_id[-1] contains the columns for each partition, which will be useful
        # for implementing when `lines=False`.
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        dtypes = cls.get_dtypes(dtypes_ids)
        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame._apply_index_objs(axis=0)
        return cls.query_compiler_cls(new_frame)
Example No. 15
0
    def read(cls, filepath_or_buffer, **kwargs):
        if isinstance(filepath_or_buffer, str):
            if not cls.file_exists(filepath_or_buffer):
                return cls.single_worker_read(filepath_or_buffer, **kwargs)
            filepath_or_buffer = cls.get_path(filepath_or_buffer)
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        compression_type = cls.infer_compression(
            filepath_or_buffer, kwargs.get("compression", "infer")
        )
        if compression_type is not None:
            if compression_type in ("gzip", "bz2", "xz"):
                kwargs["compression"] = compression_type
            elif (
                compression_type == "zip"
                and sys.version_info[0] == 3
                and sys.version_info[1] >= 7
            ):
                # Python 3.7+ is needed to call .seek and .tell on a ZipExtFile.
                kwargs["compression"] = compression_type
            else:
                return cls.single_worker_read(filepath_or_buffer, **kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        # If infer_nrows is a significant portion of the number of rows, pandas may be
        # faster.
        infer_nrows = kwargs.get("infer_nrows", 100)
        if infer_nrows > 100:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        # TODO: replace this by reading lines from file.
        if kwargs.get("nrows") is not None:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        names = kwargs.get("names", None)
        index_col = kwargs.get("index_col", None)
        if names is None:
            # To build the empty DataFrame we assume no `index_col`, so that we get
            # the correct column names before we build the index. Because we pass
            # `names` in, this step has to happen without removing the `index_col`;
            # otherwise it will not be assigned correctly.
            names = pandas.read_fwf(
                filepath_or_buffer,
                **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None)
            ).columns
        empty_pd_df = pandas.read_fwf(
            filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
        )
        column_names = empty_pd_df.columns
        skipfooter = kwargs.get("skipfooter", None)
        skiprows = kwargs.pop("skiprows", None)
        usecols = kwargs.get("usecols", None)
        usecols_md = _validate_usecols_arg(usecols)
        if usecols is not None and usecols_md[1] != "integer":
            del kwargs["usecols"]
            all_cols = pandas.read_fwf(
                cls.file_open(filepath_or_buffer, "rb"),
                **dict(kwargs, nrows=0, skipfooter=0)
            ).columns
            usecols = all_cols.get_indexer_for(list(usecols_md[0]))
        parse_dates = kwargs.pop("parse_dates", False)
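        # The header and the rows to skip are handled here in the driver, so they
        # are disabled in the kwargs passed to each partition task.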
        partition_kwargs = dict(
            kwargs,
            header=None,
            names=names,
            skipfooter=0,
            skiprows=None,
            parse_dates=parse_dates,
            usecols=usecols,
        )
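        # The quote character is passed to the byte-level reader as bytes, encoded
        # with the user-specified encoding (UTF-8 by default).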
        encoding = kwargs.get("encoding", None)
        quotechar = kwargs.get("quotechar", '"').encode(
            encoding if encoding is not None else "UTF-8"
        )
        with cls.file_open(filepath_or_buffer, "rb", compression_type) as f:
            # Skip the header since we already have the header information and skip the
            # rows we are told to skip.
            if isinstance(skiprows, int) or skiprows is None:
                if skiprows is None:
                    skiprows = 0
                header = kwargs.get("header", "infer")
                if header == "infer" and kwargs.get("names", None) is None:
                    skiprows += 1
                elif isinstance(header, int):
                    skiprows += header + 1
                elif hasattr(header, "__iter__") and not isinstance(header, str):
                    skiprows += max(header) + 1
                for _ in range(skiprows):
                    f.readline()
            if kwargs.get("encoding", None) is not None:
                partition_kwargs["skiprows"] = 1
            # Launch tasks to read partitions
            partition_ids = []
            index_ids = []
            dtypes_ids = []
            total_bytes = cls.file_size(f)
            # Max number of partitions available
            from modin.pandas import DEFAULT_NPARTITIONS

            num_partitions = DEFAULT_NPARTITIONS
            # This is the number of splits for the columns
            num_splits = min(len(column_names), num_partitions)
            # This is the chunksize each partition will read
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

            # Metadata
            column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
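                # e.g. with num_splits == 4, len(column_names) == 5 and
                # column_chunksize == 2, column_widths will be [2, 2, 1, 0]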
                column_widths = [
                    column_chunksize
                    if len(column_names) > (column_chunksize * (i + 1))
                    else 0
                    if len(column_names) < (column_chunksize * i)
                    else len(column_names) - (column_chunksize * i)
                    for i in range(num_splits)
                ]

            while f.tell() < total_bytes:
                args = {
                    "fname": filepath_or_buffer,
                    "num_splits": num_splits,
                    **partition_kwargs,
                }
                partition_id = cls.call_deploy(
                    f, chunk_size, num_splits + 2, args, quotechar=quotechar
                )
                partition_ids.append(partition_id[:-2])
                index_ids.append(partition_id[-2])
                dtypes_ids.append(partition_id[-1])

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
            # pandas edge case: when `names` is given and `skiprows` > 1, the
            # default index starts at `skiprows - 1` instead of 0.
            if kwargs.get("names", None) is not None and skiprows > 1:
                new_index = pandas.RangeIndex(
                    skiprows - 1, new_index.stop + skiprows - 1
                )
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])
            new_index.name = empty_pd_df.index.name

        # Compute dtypes by collecting and combining the dtypes of all of the
        # partitions. The dtypes reported by different partitions can differ because
        # of the inference on the limited data each worker sees, so we use pandas to
        # compute the exact dtype over the whole column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)

        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
        # If parse_dates is present, the column names that we have might not be
        # the same length as the returned column names. If we do need to modify
        # the column names, we remove the old names from the column names and
        # insert the new one at the front of the Index.
        if parse_dates is not None:
            # We have to recompute the column widths if `parse_dates` is set because
            # we are not guaranteed to have the correct information regarding how many
            # columns are on each partition.
            column_widths = None
            # Check if it is a list of lists
            if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
                for group in parse_dates:
                    new_col_name = "_".join(group)
                    column_names = column_names.drop(group).insert(0, new_col_name)
            # Check if it is a dictionary
            elif isinstance(parse_dates, dict):
                for new_col_name, group in parse_dates.items():
                    column_names = column_names.drop(group).insert(0, new_col_name)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            partition_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)

        if skipfooter:
            new_query_compiler = new_query_compiler.drop(
                new_query_compiler.index[-skipfooter:]
            )
        if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
            return new_query_compiler[new_query_compiler.columns[0]]
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
Example No. 16
0
    def _read(cls, io, **kwargs):
        if (kwargs.get("engine", None) is not None
                and kwargs.get("engine") != "openpyxl"):
            warnings.warn(
                "Modin only implements parallel `read_excel` with `openpyxl` engine, "
                'please specify `engine=None` or `engine="openpyxl"` to '
                "use Modin's parallel implementation.")
            return cls.single_worker_read(io, **kwargs)
        if sys.version_info < (3, 7):
            warnings.warn(
                "Python 3.7 or higher required for parallel `read_excel`.")
            return cls.single_worker_read(io, **kwargs)

        from zipfile import ZipFile
        from openpyxl import load_workbook
        from openpyxl.worksheet.worksheet import Worksheet
        from openpyxl.worksheet._reader import WorksheetReader
        from openpyxl.reader.excel import ExcelReader
        from modin.backends.pandas.parsers import PandasExcelParser

        sheet_name = kwargs.get("sheet_name", 0)
        if sheet_name is None or isinstance(sheet_name, list):
            warnings.warn(
                "`read_excel` functionality is only implemented for a single sheet at a "
                "time. Multiple sheet reading coming soon!")
            return cls.single_worker_read(io, **kwargs)

        warnings.warn("Parallel `read_excel` is a new feature! Please email "
                      "[email protected] if you run into any problems.")
        wb = load_workbook(filename=io, read_only=True)
        # Get shared strings
        ex = ExcelReader(io, read_only=True)
        ex.read_manifest()
        ex.read_strings()
        ws = Worksheet(wb)
        # Convert the default sheet index 0 to the corresponding sheet name
        if sheet_name == 0:
            sheet_name = wb.sheetnames[sheet_name]
        with ZipFile(io) as z:
            from io import BytesIO

            f = z.open("xl/worksheets/{}.xml".format(sheet_name.lower()))
            f = BytesIO(f.read())
            total_bytes = cls.file_size(f)

            from modin.pandas import DEFAULT_NPARTITIONS

            num_partitions = DEFAULT_NPARTITIONS
            # Read some bytes from the sheet so we can extract the XML header and first
            # line. We need to make sure we get the first line of the data as well
            # because that is where the column names are. The header information will
            # be extracted and sent to all of the nodes.
            sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
            end_of_row_tag = b"</row>"
            while end_of_row_tag not in sheet_block:
                sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
            idx_of_header_end = sheet_block.index(end_of_row_tag) + len(
                end_of_row_tag)
            sheet_header = sheet_block[:idx_of_header_end]
            # Reset the file pointer to begin at the end of the header information.
            f.seek(idx_of_header_end)
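            # Forward the extracted XML header to every partition task via kwargs.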
            kwargs["_header"] = sheet_header
            footer = b"</sheetData></worksheet>"
            # Use openpyxl to parse the header block (which holds the column names)
            reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                     ex.shared_strings, False)
            # Attach cells to the worksheet
            reader.bind_cells()
            data = PandasExcelParser.get_sheet_data(
                ws, kwargs.get("convert_float", True))
            # Extract column names from parsed data.
            column_names = pandas.Index(data[0])
            index_col = kwargs.get("index_col", None)
            # Remove column names that are specified as `index_col`
            if index_col is not None:
                column_names = column_names.drop(column_names[index_col])
            # Compute partition metadata upfront so it is uniform for all partitions
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
            num_splits = min(len(column_names), num_partitions)
            kwargs["fname"] = io
            # Skiprows will be used to inform a partition how many rows come before it.
            kwargs["skiprows"] = 0
            row_count = 0
            data_ids = []
            index_ids = []
            dtypes_ids = []

            # Compute column metadata
            column_chunksize = compute_chunksize(
                pandas.DataFrame(columns=column_names), num_splits, axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
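                # e.g. with num_splits == 5, len(column_names) == 7 and
                # column_chunksize == 2, column_widths will be [2, 2, 2, 1, 0]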
                column_widths = [
                    column_chunksize
                    if len(column_names) > (column_chunksize * (i + 1))
                    else 0
                    if len(column_names) < (column_chunksize * i)
                    else len(column_names) - (column_chunksize * i)
                    for i in range(num_splits)
                ]
            kwargs["num_splits"] = num_splits

            while f.tell() < total_bytes:
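                # `args` aliases `kwargs`, so the running `skiprows` count
                # accumulates across chunks.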
                args = kwargs
                args["skiprows"] = row_count + args["skiprows"]
                args["start"] = f.tell()
                chunk = f.read(chunk_size)
                # This edge case can happen when we have reached the end of the data
                # but not the end of the file.
                if b"<row" not in chunk:
                    break
                row_close_tag = b"</row>"
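                # Count the complete rows in this chunk by counting `</row>` tags.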
                row_count = re.subn(row_close_tag, b"", chunk)[1]

                # Make sure we are reading at least one row.
                while row_count == 0:
                    chunk += f.read(chunk_size)
                    row_count += re.subn(row_close_tag, b"", chunk)[1]

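                # Rewind the file pointer to just past the last complete `</row>`
                # tag so that the next chunk starts on a row boundary.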
                last_index = chunk.rindex(row_close_tag)
                f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
                args["end"] = f.tell()

                # If there is no data, exit before triggering computation.
                if b"</row>" not in chunk and b"</sheetData>" in chunk:
                    break
                remote_results_list = cls.deploy(cls.parse, num_splits + 2,
                                                 args)
                data_ids.append(remote_results_list[:-2])
                index_ids.append(remote_results_list[-2])
                dtypes_ids.append(remote_results_list[-1])

                # The end of the spreadsheet
                if b"</sheetData>" in chunk:
                    break

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])

        # Compute dtypes by collecting and combining the dtypes of all of the
        # partitions. The dtypes reported by different partitions can differ because
        # of the inference on the limited data each worker sees, so we use pandas to
        # compute the exact dtype over the whole column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)

        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            data_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        return cls.query_compiler_cls(new_frame)
Example No. 17
0
    def from_pandas(cls, df, return_dims=False):
        """
        Return the partitions from pandas.DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            A pandas.DataFrame.
        return_dims : bool, default: False
            If True, return a tuple of (np.ndarray, row_lengths, col_widths);
            otherwise return only the np.ndarray of partitions.

        Returns
        -------
        np.ndarray or (np.ndarray, row_lengths, col_widths)
            A NumPy array of partitions, optionally with the row lengths and
            column widths of each partition.
        """

        def update_bar(pbar, f):
            if ProgressBar.get():
                pbar.update(1)
            return f

        num_splits = NPartitions.get()
        put_func = cls._partition_class.put
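        # Target number of rows and columns for each partition block.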
        row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

        bar_format = (
            "{l_bar}{bar}{r_bar}"
            if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True"
            else "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
        )
        if ProgressBar.get():
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    from tqdm.autonotebook import tqdm as tqdm_notebook
                except ImportError:
                    raise ImportError("Please pip install tqdm to use the progress bar")

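            # Estimate the number of partition blocks to set the progress bar total.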
            rows = max(1, round(len(df) / row_chunksize))
            cols = max(1, round(len(df.columns) / col_chunksize))
            update_count = rows * cols
            pbar = tqdm_notebook(
                total=round(update_count),
                desc="Distributing Dataframe",
                bar_format=bar_format,
            )
        else:
            pbar = None
        parts = [
            [
                update_bar(
                    pbar,
                    put_func(
                        df.iloc[i : i + row_chunksize, j : j + col_chunksize].copy()
                    ),
                )
                for j in range(0, len(df.columns), col_chunksize)
            ]
            for i in range(0, len(df), row_chunksize)
        ]
        if ProgressBar.get():
            pbar.close()
        if not return_dims:
            return np.array(parts)
        else:
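            # The last chunk along each axis holds the remainder; the `or` fallback
            # covers the case where the dimension divides evenly by the chunksize.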
            row_lengths = [
                row_chunksize
                if i + row_chunksize < len(df)
                else len(df) % row_chunksize or row_chunksize
                for i in range(0, len(df), row_chunksize)
            ]
            col_widths = [
                col_chunksize
                if i + col_chunksize < len(df.columns)
                else len(df.columns) % col_chunksize or col_chunksize
                for i in range(0, len(df.columns), col_chunksize)
            ]
            return np.array(parts), row_lengths, col_widths