Example 1
    def call_deploy(cls, fname, col_partitions, **kwargs):
        """
        Deploy remote tasks to the workers with passed parameters.

        Parameters
        ----------
        fname : str, path object or file-like object
            Name of the file to read.
        col_partitions : list
            List of arrays with column names that should be read
            by each partition.
        **kwargs : dict
            Parameters to pass to the deployed `read_*` function.

        Returns
        -------
        np.ndarray
            Array with references to the task deploy result for each partition.
        """
        return np.array(
            [
                cls.deploy(
                    cls.parse,
                    NPartitions.get() + 2,
                    dict(
                        fname=fname,
                        columns=cols,
                        num_splits=NPartitions.get(),
                        **kwargs,
                    ),
                )
                for cols in col_partitions
            ]
        ).T
Example 2
 def call_deploy(cls, fname, col_partitions, **kwargs):
     return np.array([
         cls.deploy(
             cls.parse,
             NPartitions.get() + 2,
             dict(
                 fname=fname,
                 columns=cols,
                 num_splits=NPartitions.get(),
                 **kwargs,
             ),
         ) for cols in col_partitions
     ]).T
Example 3
    def binary_operation(cls, axis, left, func, right):
        """
        Apply a function that requires two PandasDataframe objects.

        Parameters
        ----------
        axis : {0, 1}
            The axis to apply the function over (0 - rows, 1 - columns).
        left : np.ndarray
            The partitions of left PandasDataframe.
        func : callable
            The function to apply.
        right : np.ndarray
            The partitions of right PandasDataframe.

        Returns
        -------
        np.ndarray
            A NumPy array with new partitions.
        """
        if axis:
            left_partitions = cls.row_partitions(left)
            right_partitions = cls.row_partitions(right)
        else:
            left_partitions = cls.column_partitions(left)
            right_partitions = cls.column_partitions(right)
        func = cls.preprocess_func(func)
        result = np.array([
            left_partitions[i].apply(
                func,
                num_splits=NPartitions.get(),
                other_axis_partition=right_partitions[i],
            ) for i in range(len(left_partitions))
        ])
        return result if axis else result.T
Example 4
    def build_index(cls, partition_ids):
        """
        Compute the index and its split sizes for the resulting Modin DataFrame.

        Parameters
        ----------
        partition_ids : list
            List with references to the partitions' data.

        Returns
        -------
        index : pandas.Index
            Index of resulting Modin DataFrame.
        row_lengths : list
            List with lengths of index chunks.
        """
        num_partitions = NPartitions.get()
        index_len = (0 if len(partition_ids) == 0 else cls.materialize(
            partition_ids[-2][0]))
        if isinstance(index_len, int):
            index = pandas.RangeIndex(index_len)
        else:
            index = index_len
            index_len = len(index)
        index_chunksize = compute_chunksize(index_len, num_partitions)
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)]
        else:
            row_lengths = [
                index_chunksize if i != num_partitions - 1 else index_len -
                (index_chunksize * (num_partitions - 1))
                for i in range(num_partitions)
            ]
        return index, row_lengths
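
A small illustration (plain Python, not Modin code) of the row_lengths arithmetic above. Here math.ceil stands in for compute_chunksize; the first branch covers the case where compute_chunksize returns a chunk size padded up to some minimum that exceeds the index length.

import math

def sketch_row_lengths(index_len, num_partitions, index_chunksize=None):
    # index_chunksize normally comes from compute_chunksize; ceil division is
    # used here as a simple stand-in.
    if index_chunksize is None:
        index_chunksize = math.ceil(index_len / num_partitions)
    if index_chunksize > index_len:
        return [index_len] + [0] * (num_partitions - 1)
    return [
        index_chunksize
        if i != num_partitions - 1
        else index_len - index_chunksize * (num_partitions - 1)
        for i in range(num_partitions)
    ]

print(sketch_row_lengths(10, 4))                     # [3, 3, 3, 1]
print(sketch_row_lengths(3, 4))                      # [1, 1, 1, 0]
print(sketch_row_lengths(5, 4, index_chunksize=32))  # [5, 0, 0, 0]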
Example 5
    def binary_operation(cls, axis, left, func, right):
        """
        Apply a function that requires two BasePandasFrame objects.

        Parameters
        ----------
        axis : int
            The axis to apply the function over (0 - rows, 1 - columns).
        left : NumPy array
            The partitions of the left Modin Frame.
        func : callable
            The function to apply.
        right : NumPy array
            The partitions of the right Modin Frame.

        Returns
        -------
        NumPy array
            A NumPy array with new partitions.
        """
        if axis:
            left_partitions = cls.row_partitions(left)
            right_partitions = cls.row_partitions(right)
        else:
            left_partitions = cls.column_partitions(left)
            right_partitions = cls.column_partitions(right)
        func = cls.preprocess_func(func)
        result = np.array([
            left_partitions[i].apply(
                func,
                num_splits=NPartitions.get(),
                other_axis_partition=right_partitions[i],
            ) for i in range(len(left_partitions))
        ])
        return result if axis else result.T
Example 6
    def build_columns(cls, columns):
        """
        Split columns into chunks that should be read by the workers.

        Parameters
        ----------
        columns : list
            List of columns that should be read from file.

        Returns
        -------
        col_partitions : list
            List of column lists to be read by the workers.
        column_widths : list
            List with lengths of `col_partitions` subarrays
            (number of columns that should be read by workers).
        """
        num_partitions = NPartitions.get()
        column_splits = (
            len(columns) // num_partitions
            if len(columns) % num_partitions == 0
            else len(columns) // num_partitions + 1
        )
        col_partitions = [
            columns[i : i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        column_widths = [len(c) for c in col_partitions]
        return col_partitions, column_widths
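
A quick sketch of the chunking above with hypothetical inputs (plain Python; 4 stands in for NPartitions.get()):

columns = ["col%d" % i for i in range(10)]
num_partitions = 4  # stand-in for NPartitions.get()

column_splits = (
    len(columns) // num_partitions
    if len(columns) % num_partitions == 0
    else len(columns) // num_partitions + 1
)  # ceil(10 / 4) == 3
col_partitions = [
    columns[i : i + column_splits] for i in range(0, len(columns), column_splits)
]
column_widths = [len(c) for c in col_partitions]

print(column_widths)       # [3, 3, 3, 1]
print(col_partitions[-1])  # ['col9'] -- the last chunk holds the remainder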
Example 7
    def from_pandas(cls, df, return_dims=False):
        """Return the partitions from Pandas DataFrame."""
        def update_bar(pbar, f):
            if ProgressBar.get():
                pbar.update(1)
            return f

        num_splits = NPartitions.get()
        put_func = cls._partition_class.put
        row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

        bar_format = (
            "{l_bar}{bar}{r_bar}"
            if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True" else
            "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
        )
        if ProgressBar.get():
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    from tqdm.autonotebook import tqdm as tqdm_notebook
                except ImportError:
                    raise ImportError(
                        "Please pip install tqdm to use the progress bar")

            rows = max(1, round(len(df) / row_chunksize))
            cols = max(1, round(len(df.columns) / col_chunksize))
            update_count = rows * cols
            pbar = tqdm_notebook(
                total=round(update_count),
                desc="Distributing Dataframe",
                bar_format=bar_format,
            )
        else:
            pbar = None
        parts = [[
            update_bar(
                pbar,
                put_func(df.iloc[i:i + row_chunksize,
                                 j:j + col_chunksize].copy()),
            ) for j in range(0, len(df.columns), col_chunksize)
        ] for i in range(0, len(df), row_chunksize)]
        if ProgressBar.get():
            pbar.close()
        if not return_dims:
            return np.array(parts)
        else:
            row_lengths = [
                row_chunksize if i + row_chunksize < len(df) else
                len(df) % row_chunksize or row_chunksize
                for i in range(0, len(df), row_chunksize)
            ]
            col_widths = [
                col_chunksize if i + col_chunksize < len(df.columns) else
                len(df.columns) % col_chunksize or col_chunksize
                for i in range(0, len(df.columns), col_chunksize)
            ]
            return np.array(parts), row_lengths, col_widths
Example 8
    def _read(cls, filepath_or_buffer, **kwargs):
        """
        In experimental mode, we can use `*` in the filename.

        Note: the number of partitions is equal to the number of input files.
        """
        if not (isinstance(filepath_or_buffer, str) and "*" in filepath_or_buffer):
            warnings.warn("Defaulting to Modin core implementation")
            return cls.single_worker_read(
                filepath_or_buffer,
                single_worker_read=True,
                **kwargs,
            )
        pattern = filepath_or_buffer
        filepath_or_buffer = sorted(glob.glob(pattern))

        if len(filepath_or_buffer) == 0:
            raise ValueError(f"There are no files matching the pattern: {pattern}")

        partition_ids = []
        lengths_ids = []
        widths_ids = []

        if len(filepath_or_buffer) != NPartitions.get():
            # Should we repartition the data here?
            warnings.warn("Partitioning may be inefficient")

        for file_name in filepath_or_buffer:
            partition_id = cls.deploy(
                cls.parse,
                3,
                dict(
                    fname=file_name,
                    **kwargs,
                ),
            )
            partition_ids.append(partition_id[:-2])
            lengths_ids.append(partition_id[-2])
            widths_ids.append(partition_id[-1])

        lengths = cls.materialize(lengths_ids)
        widths = cls.materialize(widths_ids)

        # since num_splits is 1, only one value is needed
        partition_ids = cls.build_partition(partition_ids, lengths, [widths[0]])

        new_index = cls.frame_cls._partition_mgr_cls.get_indices(
            0, partition_ids, lambda df: df.axes[0]
        )
        new_columns = cls.frame_cls._partition_mgr_cls.get_indices(
            1, partition_ids, lambda df: df.axes[1]
        )

        return cls.query_compiler_cls(
            cls.frame_cls(partition_ids, new_index, new_columns)
        )
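
A sketch of how each deploy() result is unpacked in the loop above: as the comment notes, num_splits is 1, so parse is asked for three values per file, namely one data reference plus a length and a width reference (hypothetical placeholder values below).

partition_id = ["<data ref>", 1000, 5]  # hypothetical: [data block, row count, column count]
data_refs = partition_id[:-2]   # ["<data ref>"]
length_ref = partition_id[-2]   # 1000
width_ref = partition_id[-1]    # 5
print(data_refs, length_ref, width_ref)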
Example 9
def test_explode_all_partitions(column, ignore_index):
    # Test explode with enough rows to fill all partitions. explode should
    # expand every row in the input data into two rows. It's especially
    # important that the input data has list-like elements that must be
    # expanded at the boundaries of the partitions, e.g. at row 31.
    num_rows = NPartitions.get() * MinPartitionSize.get()
    data = {"A": [[3, 4]] * num_rows, "C": [["a", "b"]] * num_rows}
    eval_general(
        *create_test_dfs(data),
        lambda df: df.explode(column, ignore_index=ignore_index),
    )
Example 10
 def build_columns(cls, columns):
     num_partitions = NPartitions.get()
     column_splits = (len(columns) // num_partitions if len(columns) %
                      num_partitions == 0 else
                      len(columns) // num_partitions + 1)
     col_partitions = [
         columns[i:i + column_splits]
         for i in range(0, len(columns), column_splits)
     ]
     column_widths = [len(c) for c in col_partitions]
     return col_partitions, column_widths
Example 11
    def _define_metadata(
        cls,
        df: pandas.DataFrame,
        column_names: ColumnNamesTypes,
    ) -> Tuple[list, int]:
        """
        Define partitioning metadata.

        Parameters
        ----------
        df : pandas.DataFrame
            The DataFrame to split.
        column_names : ColumnNamesTypes
            Column names of df.

        Returns
        -------
        column_widths : list
            Column widths to use during new frame creation (number of
            columns for each partition).
        num_splits : int
            The maximum number of splits to separate the DataFrame into.
        """
        # This is the number of splits for the columns
        num_splits = min(len(column_names) or 1, NPartitions.get())
        column_chunksize = compute_chunksize(df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            # split columns into chunks with maximal size column_chunksize, for example
            # if num_splits == 4, len(column_names) == 80 and column_chunksize == 32,
            # column_widths will be [32, 32, 16, 0]
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]

        return column_widths, num_splits
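
The comprehension above can be checked by hand with the values from the inline comment (a standalone sketch, not Modin code):

num_splits, n_cols, column_chunksize = 4, 80, 32
column_widths = [
    column_chunksize
    if n_cols > (column_chunksize * (i + 1))
    else 0
    if n_cols < (column_chunksize * i)
    else n_cols - (column_chunksize * i)
    for i in range(num_splits)
]
print(column_widths)  # [32, 32, 16, 0]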
Example 12
 def build_index(cls, partition_ids):
     num_partitions = NPartitions.get()
     index_len = cls.materialize(partition_ids[-2][0])
     if isinstance(index_len, int):
         index = pandas.RangeIndex(index_len)
     else:
         index = index_len
         index_len = len(index)
     index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                         num_partitions,
                                         axis=0)
     if index_chunksize > index_len:
         row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)]
     else:
         row_lengths = [
             index_chunksize if i != num_partitions - 1 else index_len -
             (index_chunksize * (num_partitions - 1))
             for i in range(num_partitions)
         ]
     return index, row_lengths
Example 13
 def from_pandas(cls, df, return_dims=False):
     """Return the partitions from Pandas DataFrame."""
     num_splits = NPartitions.get()
     put_func = cls._partition_class.put
     row_chunksize, col_chunksize = compute_chunksize(df, num_splits)
     parts = [[
         put_func(df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy())
         for j in range(0, len(df.columns), col_chunksize)
     ] for i in range(0, len(df), row_chunksize)]
     if not return_dims:
         return np.array(parts)
     else:
         row_lengths = [
             row_chunksize if i + row_chunksize < len(df) else
             len(df) % row_chunksize or row_chunksize
             for i in range(0, len(df), row_chunksize)
         ]
         col_widths = [
             col_chunksize if i + col_chunksize < len(df.columns) else
             len(df.columns) % col_chunksize or col_chunksize
             for i in range(0, len(df.columns), col_chunksize)
         ]
         return np.array(parts), row_lengths, col_widths
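
A minimal sketch of the grid slicing performed above, with ceil division standing in for compute_chunksize and an identity function standing in for put_func (which in Modin ships each chunk to a worker):

import math
import pandas

df = pandas.DataFrame({"a": range(10), "b": range(10)})
num_splits = 4  # stand-in for NPartitions.get()
row_chunksize = math.ceil(len(df) / num_splits)          # 3
col_chunksize = math.ceil(len(df.columns) / num_splits)  # 1
put_func = lambda chunk: chunk  # identity instead of cls._partition_class.put

parts = [
    [
        put_func(df.iloc[i : i + row_chunksize, j : j + col_chunksize].copy())
        for j in range(0, len(df.columns), col_chunksize)
    ]
    for i in range(0, len(df), row_chunksize)
]
print([[block.shape for block in row] for row in parts])
# [[(3, 1), (3, 1)], [(3, 1), (3, 1)], [(3, 1), (3, 1)], [(1, 1), (1, 1)]]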
Example 14
    def read(cls, filepath_or_buffer, **kwargs):
        filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
        if isinstance(filepath_or_buffer, str):
            if not cls.file_exists(filepath_or_buffer):
                return cls.single_worker_read(filepath_or_buffer, **kwargs)
            filepath_or_buffer = cls.get_path(filepath_or_buffer)
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        compression_type = cls.infer_compression(
            filepath_or_buffer, kwargs.get("compression", "infer"))
        if compression_type is not None:
            if (compression_type == "gzip" or compression_type == "bz2"
                    or compression_type == "xz"):
                kwargs["compression"] = compression_type
            elif (compression_type == "zip" and sys.version_info[0] == 3
                  and sys.version_info[1] >= 7):
                # Python 3.7+ is needed to .seek() and .tell() a ZipExtFile
                kwargs["compression"] = compression_type
            else:
                return cls.single_worker_read(filepath_or_buffer, **kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        # If infer_nrows is a significant portion of the number of rows, pandas may be
        # faster.
        infer_nrows = kwargs.get("infer_nrows", 100)
        if infer_nrows > 100:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        nrows = kwargs.pop("nrows", None)
        names = kwargs.get("names", None)
        index_col = kwargs.get("index_col", None)
        if names is None:
            # For the sake of the empty df, we assume no `index_col` to get the correct
            # column names before we build the index. Because we pass `names` in, this
            # step has to happen without removing the `index_col` otherwise it will not
            # be assigned correctly
            names = pandas.read_fwf(
                filepath_or_buffer,
                **dict(kwargs,
                       usecols=None,
                       nrows=0,
                       skipfooter=0,
                       index_col=None),
            ).columns
        empty_pd_df = pandas.read_fwf(filepath_or_buffer,
                                      **dict(kwargs, nrows=0, skipfooter=0))
        column_names = empty_pd_df.columns
        skipfooter = kwargs.get("skipfooter", None)
        skiprows = kwargs.pop("skiprows", None)
        usecols = kwargs.get("usecols", None)
        usecols_md = _validate_usecols_arg(usecols)
        if usecols is not None and usecols_md[1] != "integer":
            del kwargs["usecols"]
            all_cols = pandas.read_fwf(
                cls.file_open(filepath_or_buffer, "rb"),
                **dict(kwargs, nrows=0, skipfooter=0),
            ).columns
            usecols = all_cols.get_indexer_for(list(usecols_md[0]))
        parse_dates = kwargs.pop("parse_dates", False)
        partition_kwargs = dict(
            kwargs,
            header=None,
            names=names,
            skipfooter=0,
            skiprows=None,
            parse_dates=parse_dates,
            usecols=usecols,
        )
        encoding = kwargs.get("encoding", None)
        quotechar = kwargs.get(
            "quotechar",
            '"').encode(encoding if encoding is not None else "UTF-8")
        is_quoting = kwargs.get("quoting", "") != QUOTE_NONE
        with cls.file_open(filepath_or_buffer, "rb", compression_type) as f:
            # Skip the header since we already have the header information and skip the
            # rows we are told to skip.
            if isinstance(skiprows, int) or skiprows is None:
                if skiprows is None:
                    skiprows = 0
                header = kwargs.get("header", "infer")
                if header == "infer" and kwargs.get("names", None) is None:
                    skiprows += 1
                elif isinstance(header, int):
                    skiprows += header + 1
                elif hasattr(header,
                             "__iter__") and not isinstance(header, str):
                    skiprows += max(header) + 1
            if kwargs.get("encoding", None) is not None:
                partition_kwargs["skiprows"] = 1
            # Launch tasks to read partitions
            partition_ids = []
            index_ids = []
            dtypes_ids = []
            # Max number of partitions available
            num_partitions = NPartitions.get()
            # This is the number of splits for the columns
            num_splits = min(len(column_names), num_partitions)
            # Metadata
            column_chunksize = compute_chunksize(empty_pd_df,
                                                 num_splits,
                                                 axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if len(column_names) >
                    (column_chunksize * (i + 1)) else 0 if len(column_names) <
                    (column_chunksize * i) else len(column_names) -
                    (column_chunksize * i) for i in range(num_splits)
                ]

            args = {
                "fname": filepath_or_buffer,
                "num_splits": num_splits,
                **partition_kwargs,
            }

            splits = cls.partitioned_file(
                f,
                num_partitions=num_partitions,
                nrows=nrows,
                skiprows=skiprows,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )
            for start, end in splits:
                args.update({"start": start, "end": end})
                partition_id = cls.deploy(cls.parse, num_splits + 2, args)
                partition_ids.append(partition_id[:-2])
                index_ids.append(partition_id[-2])
                dtypes_ids.append(partition_id[-1])

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
            # pandas has a really weird edge case here.
            if kwargs.get("names", None) is not None and skiprows > 1:
                new_index = pandas.RangeIndex(skiprows - 1,
                                              new_index.stop + skiprows - 1)
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])
            new_index.name = empty_pd_df.index.name

        # Compute dtypes by collecting and combining the dtypes of all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)

        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)
        # If parse_dates is present, the column names that we have might not be
        # the same length as the returned column names. If we do need to modify
        # the column names, we remove the old names from the column names and
        # insert the new one at the front of the Index.
        if parse_dates is not None:
            # We have to recompute the column widths if `parse_dates` is set because
            # we are not guaranteed to have the correct information regarding how many
            # columns are on each partition.
            column_widths = None
            # Check if is list of lists
            if isinstance(parse_dates, list) and isinstance(
                    parse_dates[0], list):
                for group in parse_dates:
                    new_col_name = "_".join(group)
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)
            # Check if it is a dictionary
            elif isinstance(parse_dates, dict):
                for new_col_name, group in parse_dates.items():
                    column_names = column_names.drop(group).insert(
                        0, new_col_name)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            partition_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)

        if skipfooter:
            new_query_compiler = new_query_compiler.drop(
                new_query_compiler.index[-skipfooter:])
        if kwargs.get("squeeze", False) and len(
                new_query_compiler.columns) == 1:
            return new_query_compiler[new_query_compiler.columns[0]]
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
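
The skiprows adjustment inside the `with` block above boils down to the following arithmetic (a standalone sketch; `header_skip` is a hypothetical helper name):

def header_skip(skiprows, header, names=None):
    # How many physical lines are skipped before the data starts.
    skiprows = skiprows or 0
    if header == "infer" and names is None:
        skiprows += 1
    elif isinstance(header, int):
        skiprows += header + 1
    elif hasattr(header, "__iter__") and not isinstance(header, str):
        skiprows += max(header) + 1
    return skiprows

print(header_skip(None, "infer"))  # 1 -- the inferred header row is skipped
print(header_skip(2, 0))           # 3 -- two skipped rows plus the header row
print(header_skip(0, [0, 1]))      # 2 -- multi-row header: skip through row 1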
Example 15
    def _read(cls, sql, con, index_col=None, **kwargs):
        """
        Read a SQL query or database table into a query compiler.

        Parameters
        ----------
        sql : str or SQLAlchemy Selectable (select or text object)
            SQL query to be executed or a table name.
        con : SQLAlchemy connectable, str, or sqlite3 connection
            Connection object to database.
        index_col : str or list of str, optional
            Column(s) to set as index(MultiIndex).
        **kwargs : dict
            Parameters to pass into `pandas.read_sql` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        try:
            import psycopg2 as pg

            if isinstance(con, pg.extensions.connection):
                con = "postgresql+psycopg2://{}:{}@{}{}/{}".format(  # Build a connection URL
                    con.info.user,  # <Username> for DB
                    con.info.password,  # Password for DB
                    con.info.host
                    if con.info.host != "/tmp" else "",  # @<Hostname>
                    (":" + str(con.info.port))
                    if con.info.host != "/tmp" else "",  # :<Port>
                    con.info.dbname,  # Database name
                )
        except ImportError:
            pass
        # In the case that we are given a SQLAlchemy Connection or Engine, the objects
        # are not pickleable. We have to convert it to the URL string and connect from
        # each of the workers.
        if not isinstance(con, str):
            warnings.warn(
                "To use parallel implementation of `read_sql`, pass the sqlalchemy "
                "connection string instead of {}.".format(type(con)))
            return cls.single_worker_read(sql,
                                          con=con,
                                          index_col=index_col,
                                          **kwargs)
        row_cnt_query = "SELECT COUNT(*) FROM ({}) as foo".format(sql)
        row_cnt = pandas.read_sql(row_cnt_query, con).squeeze()
        cols_names_df = pandas.read_sql(
            "SELECT * FROM ({}) as foo LIMIT 0".format(sql),
            con,
            index_col=index_col)
        cols_names = cols_names_df.columns
        num_partitions = NPartitions.get()
        partition_ids = []
        index_ids = []
        dtype_ids = []
        limit = math.ceil(row_cnt / num_partitions)
        for part in range(num_partitions):
            offset = part * limit
            query = "SELECT * FROM ({}) as foo LIMIT {} OFFSET {}".format(
                sql, limit, offset)
            partition_id = cls.deploy(
                cls.parse,
                num_partitions + 2,
                dict(
                    num_splits=num_partitions,
                    sql=query,
                    con=con,
                    index_col=index_col,
                    **kwargs,
                ),
            )
            partition_ids.append(
                [cls.frame_partition_cls(obj) for obj in partition_id[:-2]])
            index_ids.append(partition_id[-2])
            dtype_ids.append(partition_id[-1])
        if index_col is None:  # sum all lens returned from partitions
            index_lens = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(index_lens))
        else:  # concat index returned from partitions
            index_lst = [
                x for part_index in cls.materialize(index_ids)
                for x in part_index
            ]
            new_index = pandas.Index(index_lst).set_names(index_col)
        new_frame = cls.frame_cls(np.array(partition_ids), new_index,
                                  cols_names)
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
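
A sketch of the per-partition query generation above with hypothetical values (10 rows, 4 partitions):

import math

sql = "SELECT * FROM some_table"  # hypothetical query
row_cnt, num_partitions = 10, 4
limit = math.ceil(row_cnt / num_partitions)  # 3
for part in range(num_partitions):
    offset = part * limit
    print("SELECT * FROM ({}) as foo LIMIT {} OFFSET {}".format(sql, limit, offset))
# The last partition's OFFSET is 9, so it reads only the single remaining row.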
Example 16
    def read_sql(
        cls,
        sql,
        con,
        index_col=None,
        coerce_float=True,
        params=None,
        parse_dates=None,
        columns=None,
        chunksize=None,
        partition_column=None,
        lower_bound=None,
        upper_bound=None,
        max_sessions=None,
    ):
        """
        Read SQL query or database table into a DataFrame.

        The function extended with `Spark-like parameters <https://spark.apache.org/docs/2.0.0/api/R/read.jdbc.html>`_
        such as ``partition_column``, ``lower_bound`` and ``upper_bound``. With these
        parameters, the user will be able to specify how to partition the imported data.

        Parameters
        ----------
        sql : str or SQLAlchemy Selectable (select or text object)
            SQL query to be executed or a table name.
        con : SQLAlchemy connectable or str
             Connection to database (sqlite3 connections are not supported).
        index_col : str or list of str, optional
            Column(s) to set as index(MultiIndex).
        coerce_float : bool, default: True
            Attempts to convert values of non-string, non-numeric objects
            (like decimal.Decimal) to floating point, useful for SQL result sets.
        params : list, tuple or dict, optional
            List of parameters to pass to ``execute`` method. The syntax used
            to pass parameters is database driver dependent. Check your
            database driver documentation for which of the five syntax styles,
            described in PEP 249's paramstyle, is supported.
        parse_dates : list or dict, optional
            The behavior is as follows:

            - List of column names to parse as dates.
            - Dict of `{column_name: format string}` where format string is
              strftime compatible in case of parsing string times, or is one of
              (D, s, ns, ms, us) in case of parsing integer timestamps.
            - Dict of `{column_name: arg dict}`, where the arg dict corresponds
              to the keyword arguments of ``pandas.to_datetime``.
              Especially useful with databases without native Datetime support,
              such as SQLite.
        columns : list, optional
            List of column names to select from SQL table (only used when reading a
            table).
        chunksize : int, optional
            If specified, return an iterator where `chunksize` is the number of rows
            to include in each chunk.
        partition_column : str, optional
            Column name used for data partitioning between the workers
            (MUST be an INTEGER column).
        lower_bound : int, optional
            The minimum value to be requested from the `partition_column`.
        upper_bound : int, optional
            The maximum value to be requested from the `partition_column`.
        max_sessions : int, optional
            The maximum number of simultaneous connections allowed to use.

        Returns
        -------
        BaseQueryCompiler
            A new query compiler with imported data for further processing.
        """
        from .sql import is_distributed, get_query_info

        if not is_distributed(partition_column, lower_bound, upper_bound):
            warnings.warn("Defaulting to Modin core implementation")
            return PandasOnRayIO.read_sql(
                sql,
                con,
                index_col,
                coerce_float=coerce_float,
                params=params,
                parse_dates=parse_dates,
                columns=columns,
                chunksize=chunksize,
            )
        #  starts the distributed alternative
        cols_names, query = get_query_info(sql, con, partition_column)
        num_parts = min(NPartitions.get(), max_sessions if max_sessions else 1)
        num_splits = min(len(cols_names), num_parts)
        diff = (upper_bound - lower_bound) + 1
        min_size = diff // num_parts
        rest = diff % num_parts
        partition_ids = []
        index_ids = []
        end = lower_bound - 1
        for part in range(num_parts):
            if rest:
                size = min_size + 1
                rest -= 1
            else:
                size = min_size
            start = end + 1
            end = start + size - 1
            partition_id = _read_sql_with_offset_pandas_on_ray.options(
                num_returns=num_splits + 1).remote(
                    partition_column,
                    start,
                    end,
                    num_splits,
                    query,
                    con,
                    index_col,
                    coerce_float,
                    params,
                    parse_dates,
                    columns,
                    chunksize,
                )
            partition_ids.append([
                PandasOnRayDataframePartition(obj) for obj in partition_id[:-1]
            ])
            index_ids.append(partition_id[-1])
        new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_cls(np.array(partition_ids), new_index, cols_names))
        new_query_compiler._modin_frame.synchronize_labels(axis=0)
        return new_query_compiler
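
A sketch of how the closed range [lower_bound, upper_bound] is divided into per-partition bounds above (hypothetical bounds 1..10, 4 partitions):

lower_bound, upper_bound, num_parts = 1, 10, 4
diff = (upper_bound - lower_bound) + 1  # 10 values to read
min_size = diff // num_parts            # 2
rest = diff % num_parts                 # 2 partitions get one extra value
bounds = []
end = lower_bound - 1
for part in range(num_parts):
    if rest:
        size = min_size + 1
        rest -= 1
    else:
        size = min_size
    start = end + 1
    end = start + size - 1
    bounds.append((start, end))
print(bounds)  # [(1, 3), (4, 6), (7, 8), (9, 10)]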
Example 17
import os
import logging
import modin.pandas as pd
import pandas
import numpy as np
import uuid

RAND_LOW = 0
RAND_HIGH = 100
random_state = np.random.RandomState(seed=42)


try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
    from modin.config import TestDatasetSize, AsvImplementation

    ASV_USE_IMPL = AsvImplementation.get()
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

assert ASV_USE_IMPL in ("modin", "pandas")
Example 18
    def _read(cls, path_or_buf, **kwargs):
        """
        Read data from `path_or_buf` according to the passed `read_json` `kwargs` parameters.

        Parameters
        ----------
        path_or_buf : str, path object or file-like object
            `path_or_buf` parameter of `read_json` function.
        **kwargs : dict
            Parameters of `read_json` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        path_or_buf = cls.get_path_or_buffer(path_or_buf)
        if isinstance(path_or_buf, str):
            if not cls.file_exists(path_or_buf):
                return cls.single_worker_read(path_or_buf, **kwargs)
            path_or_buf = cls.get_path(path_or_buf)
        elif not cls.pathlib_or_pypath(path_or_buf):
            return cls.single_worker_read(path_or_buf, **kwargs)
        if not kwargs.get("lines", False):
            return cls.single_worker_read(path_or_buf, **kwargs)
        with OpenFile(path_or_buf, "rb") as f:
            columns = pandas.read_json(BytesIO(b"" + f.readline()),
                                       lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        with OpenFile(path_or_buf, "rb", kwargs.get("compression",
                                                    "infer")) as f:
            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_widths, num_splits = cls._define_metadata(
                empty_pd_df, columns)

            args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs}

            splits = cls.partitioned_file(
                f,
                num_partitions=NPartitions.get(),
            )
            for start, end in splits:
                args.update({"start": start, "end": end})
                partition_id = cls.deploy(cls.parse,
                                          num_returns=num_splits + 3,
                                          **args)
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        # partition_id[-1] contains the columns for each partition, which will be useful
        # for implementing when `lines=False`.
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        dtypes = cls.get_dtypes(dtypes_ids)
        partition_ids = cls.build_partition(partition_ids, row_lengths,
                                            column_widths)

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
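
The column sniffing at the top of the function only needs the first line of a line-delimited JSON file; a standalone sketch with a hypothetical record:

from io import BytesIO
import pandas

first_line = b'{"a": 1, "b": "x"}\n'  # hypothetical first record of the file
columns = pandas.read_json(BytesIO(b"" + first_line), lines=True).columns
print(list(columns))  # ['a', 'b']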
Example 19
import numpy as np
import pandas

from .utils import generate_dataframe, RAND_LOW, RAND_HIGH, random_string
from modin.config import NPartitions

try:
    from modin.config import TestDatasetSize, AsvImplementation

    ASV_USE_IMPL = AsvImplementation.get()
    ASV_DATASET_SIZE = TestDatasetSize.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = "modin"
    ASV_DATASET_SIZE = "Big" if NPartitions.get() >= 32 else "Small"

if ASV_DATASET_SIZE == "Big":
    BINARY_OP_DATA_SIZE = [
        (5000, 5000, 5000, 5000),
        # this case is extremely inefficient
        # (20, 500_000, 10, 1_000_000),
        (500_000, 20, 1_000_000, 10),
    ]
    UNARY_OP_DATA_SIZE = [
        (5000, 5000),
        # this case is extremely inefficient
        # (10, 1_000_000),
        (1_000_000, 10),
    ]
else:
Example 20
    def _read(cls, io, **kwargs):
        if (kwargs.get("engine", None) is not None
                and kwargs.get("engine") != "openpyxl"):
            warnings.warn(
                "Modin only implements parallel `read_excel` with `openpyxl` engine, "
                'please specify `engine=None` or `engine="openpyxl"` to '
                "use Modin's parallel implementation.")
            return cls.single_worker_read(io, **kwargs)
        if sys.version_info < (3, 7):
            warnings.warn(
                "Python 3.7 or higher required for parallel `read_excel`.")
            return cls.single_worker_read(io, **kwargs)

        from zipfile import ZipFile
        from openpyxl.worksheet.worksheet import Worksheet
        from openpyxl.worksheet._reader import WorksheetReader
        from openpyxl.reader.excel import ExcelReader
        from modin.backends.pandas.parsers import PandasExcelParser

        sheet_name = kwargs.get("sheet_name", 0)
        if sheet_name is None or isinstance(sheet_name, list):
            warnings.warn(
                "`read_excel` functionality is only implemented for a single sheet at a "
                "time. Multiple sheet reading coming soon!")
            return cls.single_worker_read(io, **kwargs)

        warnings.warn("Parallel `read_excel` is a new feature! Please email "
                      "[email protected] if you run into any problems.")

        # NOTE: ExcelReader() in read-only mode does not close the file handle by itself;
        # work around that by opening the file ourselves when we received a path
        io_file = open(io, "rb") if isinstance(io, str) else io
        try:
            ex = ExcelReader(io_file, read_only=True)
            ex.read()
            wb = ex.wb

            # Get shared strings
            ex.read_manifest()
            ex.read_strings()
            ws = Worksheet(wb)
        finally:
            if isinstance(io, str):
                # close only if it were us who opened the object
                io_file.close()

        pandas_kw = dict(kwargs)  # preserve original kwargs
        with ZipFile(io) as z:
            from io import BytesIO

            # Convert index to sheet name in file
            if isinstance(sheet_name, int):
                sheet_name = "sheet{}".format(sheet_name + 1)
            else:
                sheet_name = "sheet{}".format(
                    wb.sheetnames.index(sheet_name) + 1)
            if any(sheet_name.lower() in name for name in z.namelist()):
                sheet_name = sheet_name.lower()
            elif any(sheet_name.title() in name for name in z.namelist()):
                sheet_name = sheet_name.title()
            else:
                raise ValueError("Sheet {} not found".format(
                    sheet_name.lower()))
            # Pass this value to the workers
            kwargs["sheet_name"] = sheet_name

            f = z.open("xl/worksheets/{}.xml".format(sheet_name))
            f = BytesIO(f.read())
            total_bytes = cls.file_size(f)

            num_partitions = NPartitions.get()
            # Read some bytes from the sheet so we can extract the XML header and first
            # line. We need to make sure we get the first line of the data as well
            # because that is where the column names are. The header information will
            # be extracted and sent to all of the nodes.
            sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
            end_of_row_tag = b"</row>"
            while end_of_row_tag not in sheet_block:
                sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
            idx_of_header_end = sheet_block.index(end_of_row_tag) + len(
                end_of_row_tag)
            sheet_header = sheet_block[:idx_of_header_end]
            # Reset the file pointer to begin at the end of the header information.
            f.seek(idx_of_header_end)
            kwargs["_header"] = sheet_header
            footer = b"</sheetData></worksheet>"
            # Use openpyxl to parse the data
            reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                     ex.shared_strings, False)
            # Attach cells to the worksheet
            reader.bind_cells()
            data = PandasExcelParser.get_sheet_data(
                ws, kwargs.get("convert_float", True))
            # Extract column names from parsed data.
            column_names = pandas.Index(data[0])
            index_col = kwargs.get("index_col", None)
            # Remove column names that are specified as `index_col`
            if index_col is not None:
                column_names = column_names.drop(column_names[index_col])

            if not all(column_names):
                # some column names are empty; use the pandas reader to get the names
                pandas_kw["nrows"] = 1
                df = pandas.read_excel(io, **pandas_kw)
                column_names = df.columns

            # Compute partition metadata upfront so it is uniform for all partitions
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
            num_splits = min(len(column_names), num_partitions)
            kwargs["fname"] = io
            # Skiprows will be used to inform a partition how many rows come before it.
            kwargs["skiprows"] = 0
            rows_to_skip = 0
            data_ids = []
            index_ids = []
            dtypes_ids = []

            # Compute column metadata
            column_chunksize = compute_chunksize(
                pandas.DataFrame(columns=column_names), num_splits, axis=1)
            if column_chunksize > len(column_names):
                column_widths = [len(column_names)]
                # This prevents us from unnecessarily serializing a bunch of empty
                # objects.
                num_splits = 1
            else:
                column_widths = [
                    column_chunksize if len(column_names) >
                    (column_chunksize * (i + 1)) else 0 if len(column_names) <
                    (column_chunksize * i) else len(column_names) -
                    (column_chunksize * i) for i in range(num_splits)
                ]
            kwargs["num_splits"] = num_splits

            while f.tell() < total_bytes:
                args = kwargs
                args["skiprows"] = rows_to_skip
                args["start"] = f.tell()
                chunk = f.read(chunk_size)
                # This edge case can happen when we have reached the end of the data
                # but not the end of the file.
                if b"<row" not in chunk:
                    break
                row_close_tag = b"</row>"
                row_count = re.subn(row_close_tag, b"", chunk)[1]

                # Make sure we are reading at least one row.
                while row_count == 0:
                    chunk += f.read(chunk_size)
                    row_count += re.subn(row_close_tag, b"", chunk)[1]

                last_index = chunk.rindex(row_close_tag)
                f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
                args["end"] = f.tell()

                # If there is no data, exit before triggering computation.
                if b"</row>" not in chunk and b"</sheetData>" in chunk:
                    break
                # We need to make sure we include all rows, even those that have no
                # data. Getting the number of the last row will turn into the number of
                # skipped rows, so if there are any rows missing between the last row
                # seen here and the first row the next partition reads, the parser will
                # have to include those rows in that specific partition to match the
                # expected behavior. We subtract 1 here because the header is included
                # in the skip values, and we do not want to skip the header.
                rows_to_skip = (int(chunk[:last_index + len(row_close_tag)].
                                    split(b'<row r="')[-1].split(b'"')[0]) - 1)
                remote_results_list = cls.deploy(cls.parse, num_splits + 2,
                                                 args)
                data_ids.append(remote_results_list[:-2])
                index_ids.append(remote_results_list[-2])
                dtypes_ids.append(remote_results_list[-1])

                # The end of the spreadsheet
                if b"</sheetData>" in chunk:
                    break

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])

        # Compute dtypes by collecting and combining the dtypes of all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)

        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            data_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
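
A sketch of the `</row>` counting and last-complete-row extraction used while chunking the sheet XML above (hypothetical chunk bytes):

import re

chunk = b'<row r="1"><c>1</c></row><row r="2"><c>2</c></row><row r="3"><c>'
row_close_tag = b"</row>"
row_count = re.subn(row_close_tag, b"", chunk)[1]  # 2 complete rows in the chunk
last_index = chunk.rindex(row_close_tag)           # end of the last complete row
last_row_number = int(
    chunk[: last_index + len(row_close_tag)].split(b'<row r="')[-1].split(b'"')[0]
)
print(row_count, last_row_number)  # 2 2 -- the next partition starts after row 2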
Example 21
    def read_sql(
        cls,
        sql,
        con,
        index_col=None,
        coerce_float=True,
        params=None,
        parse_dates=None,
        columns=None,
        chunksize=None,
        partition_column=None,
        lower_bound=None,
        upper_bound=None,
        max_sessions=None,
    ):
        """Read SQL query or database table into a DataFrame.

        Args:
            sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name.
            con: SQLAlchemy connectable (engine/connection) or database string URI or DBAPI2 connection (fallback mode)
            index_col: Column(s) to set as index(MultiIndex).
            coerce_float: Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to
                          floating point, useful for SQL result sets.
            params: List of parameters to pass to execute method. The syntax used
                    to pass parameters is database driver dependent. Check your
                    database driver documentation for which of the five syntax styles,
                    described in PEP 249's paramstyle, is supported.
            parse_dates:
                         - List of column names to parse as dates.
                         - Dict of ``{column_name: format string}`` where format string is
                           strftime compatible in case of parsing string times, or is one of
                           (D, s, ns, ms, us) in case of parsing integer timestamps.
                         - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
                           to the keyword arguments of :func:`pandas.to_datetime`
                           Especially useful with databases without native Datetime support,
                           such as SQLite.
            columns: List of column names to select from SQL table (only used when reading a table).
            chunksize: If specified, return an iterator where `chunksize` is the number of rows to include in each chunk.
            partition_column: column name used to partition the data between the workers (MUST be an INTEGER column)
            lower_bound: the minimum value to be requested from the partition_column
            upper_bound: the maximum value to be requested from the partition_column
            max_sessions: the maximum number of simultaneous connections allowed to use

        Returns:
            A new query compiler with imported data for further processing.
        """
        from .sql import is_distributed, get_query_info

        if not is_distributed(partition_column, lower_bound, upper_bound):
            warnings.warn("Defaulting to Modin core implementation")
            return PandasOnRayIO.read_sql(
                sql,
                con,
                index_col,
                coerce_float=coerce_float,
                params=params,
                parse_dates=parse_dates,
                columns=columns,
                chunksize=chunksize,
            )
        #  starts the distributed alternative
        cols_names, query = get_query_info(sql, con, partition_column)
        num_parts = min(NPartitions.get(), max_sessions)
        num_splits = min(len(cols_names), num_parts)
        diff = (upper_bound - lower_bound) + 1
        min_size = diff // num_parts
        rest = diff % num_parts
        partition_ids = []
        index_ids = []
        end = lower_bound - 1
        for part in range(num_parts):
            if rest:
                size = min_size + 1
                rest -= 1
            else:
                size = min_size
            start = end + 1
            end = start + size - 1
            partition_id = _read_sql_with_offset_pandas_on_ray._remote(
                args=(
                    partition_column,
                    start,
                    end,
                    num_splits,
                    query,
                    con,
                    index_col,
                    coerce_float,
                    params,
                    parse_dates,
                    columns,
                    chunksize,
                ),
                num_returns=num_splits + 1,
            )
            partition_ids.append(
                [PandasOnRayFramePartition(obj) for obj in partition_id[:-1]])
            index_ids.append(partition_id[-1])
        new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
        return cls.query_compiler_cls(
            cls.frame_cls(np.array(partition_ids), new_index, cols_names))
Example 22
    def partitioned_file(
        cls,
        f,
        num_partitions: int = None,
        nrows: int = None,
        skiprows: int = None,
        quotechar: bytes = b'"',
        is_quoting: bool = True,
    ):
        """
        Compute chunk sizes in bytes for every partition.

        Parameters
        ----------
        f : file-like object
            File to be partitioned.
        num_partitions : int, optional
            The number of partitions to split the file into.
            If not specified, the value is taken from `NPartitions.get()`.
        nrows : int, optional
            Number of rows of the file to read.
        skiprows : int, optional
            Number of rows to skip at the beginning of the file.
        quotechar : bytes, default: b'"'
            Indicates the quote character in the file.
        is_quoting : bool, default: True
            Whether or not to consider quotes.

        Returns
        -------
        list
            List of tuples of two ints: the beginning and end byte offsets
            of each chunk.
        """
        if num_partitions is None:
            num_partitions = NPartitions.get()

        rows_skipper = cls.rows_skipper_builder(f,
                                                quotechar,
                                                is_quoting=is_quoting)
        result = []

        file_size = cls.file_size(f)

        rows_skipper(skiprows)

        start = f.tell()

        if nrows:
            read_rows_counter = 0
            partition_size = max(1, num_partitions, nrows // num_partitions)
            while f.tell() < file_size and read_rows_counter < nrows:
                if read_rows_counter + partition_size > nrows:
                    # it's possible only if is_quoting==True
                    partition_size = nrows - read_rows_counter
                outside_quotes, read_rows = cls._read_rows(
                    f,
                    nrows=partition_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )
                result.append((start, f.tell()))
                start = f.tell()
                read_rows_counter += read_rows

                # warn if the chunk boundary ended up inside a quoted field
                if is_quoting and not outside_quotes:
                    warnings.warn("File has mismatched quotes")
        else:
            partition_size = max(1, num_partitions,
                                 file_size // num_partitions)
            while f.tell() < file_size:
                outside_quotes = cls.offset(
                    f,
                    offset_size=partition_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )

                result.append((start, f.tell()))
                start = f.tell()

                # warn if the chunk boundary ended up inside a quoted field
                if is_quoting and not outside_quotes:
                    warnings.warn("File has mismatched quotes")

        return result
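For illustration, here is a standalone sketch of the same byte-range chunking idea (not Modin's dispatcher code, and it ignores quoting): split a file into roughly equal byte ranges and extend each range to the next newline so no row is cut in half.

import os

def split_by_bytes(path, num_partitions):
    # Hypothetical helper: returns a list of (start, end) byte offsets whose
    # boundaries are aligned to newline characters.
    file_size = os.path.getsize(path)
    chunk = max(1, file_size // num_partitions)
    result = []
    with open(path, "rb") as f:
        start = f.tell()
        while f.tell() < file_size:
            f.seek(min(f.tell() + chunk, file_size))
            f.readline()  # move to the end of the current row
            result.append((start, f.tell()))
            start = f.tell()
    return result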
Example n. 23
    def _read(cls, filepath_or_buffer, **kwargs):
        filepath_or_buffer_md = (cls.get_path(filepath_or_buffer)
                                 if isinstance(filepath_or_buffer, str) else
                                 cls.get_path_or_buffer(filepath_or_buffer))
        compression_infered = cls.infer_compression(filepath_or_buffer,
                                                    kwargs.get("compression"))
        use_modin_impl = cls._read_csv_check_support(filepath_or_buffer,
                                                     kwargs,
                                                     compression_infered)
        if not use_modin_impl:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        # Getting frequently used read_csv kwargs
        names = kwargs.get("names", None)
        index_col = kwargs.get("index_col", None)
        encoding = kwargs.get("encoding", None)
        skiprows = kwargs.get("skiprows")

        is_quoting = kwargs.get("quoting", "") != QUOTE_NONE
        quotechar = kwargs.get(
            "quotechar",
            '"').encode(encoding if encoding is not None else "UTF-8")

        # Define header size for further skipping (the header can be skipped because the
        # header information will be obtained later from empty_df, so the workers don't
        # need to handle it)
        header_size = cls._define_header_size(
            kwargs.get("header", "infer"),
            names,
        )
        # Since skiprows can be only integer here (non-integer skiprows trigger fallback
        # to pandas implementation for now) we can process header_size and skiprows
        # simultaneously
        skiprows = skiprows + header_size if skiprows else header_size

        # Now we need to define parameters that are common for all partitions. These
        # parameters can be `sniffed` from the empty dataframes created below
        if names is None:
            # For the sake of the empty df, we assume no `index_col` to get the correct
            # column names before we build the index. Because we pass `names` in, this
            # step has to happen without removing the `index_col` otherwise it will not
            # be assigned correctly
            names = pandas.read_csv(
                filepath_or_buffer,
                **dict(kwargs,
                       usecols=None,
                       nrows=0,
                       skipfooter=0,
                       index_col=None),
            ).columns
        elif index_col is None and not kwargs.get("usecols", None):
            # When names is set to some list that is smaller than the number of columns
            # in the file, the first columns are built as a hierarchical index.
            empty_pd_df = pandas.read_csv(filepath_or_buffer,
                                          nrows=0,
                                          encoding=encoding)
            num_cols = len(empty_pd_df.columns)
            if num_cols > len(names):
                index_col = list(range(num_cols - len(names)))
                if len(index_col) == 1:
                    index_col = index_col[0]
        empty_pd_df = pandas.read_csv(
            filepath_or_buffer,
            **dict(kwargs, nrows=0, skipfooter=0, index_col=index_col),
        )
        column_names = empty_pd_df.columns

        # Max number of partitions available
        num_partitions = NPartitions.get()
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_partitions)
        # Metadata definition
        column_widths, num_splits = cls._define_metadata(
            empty_pd_df, num_splits, column_names)

        # kwargs that will be passed to the workers
        partition_kwargs = dict(
            kwargs,
            fname=filepath_or_buffer_md,
            num_splits=num_splits,
            header=None,
            names=names,
            skipfooter=0,
            skiprows=1 if encoding is not None else None,
            nrows=None,
            compression=compression_infered,
            index_col=index_col,
        )

        with cls.file_open(filepath_or_buffer_md, "rb",
                           compression_infered) as f:
            splits = cls.partitioned_file(
                f,
                num_partitions=num_partitions,
                nrows=kwargs.get("nrows", None),
                skiprows=skiprows,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )

        partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
            splits, **partition_kwargs)

        new_query_compiler = cls._get_new_qc(
            partition_ids=partition_ids,
            index_ids=index_ids,
            dtypes_ids=dtypes_ids,
            index_col_md=index_col,
            index_name=empty_pd_df.index.name,
            column_widths=column_widths,
            column_names=column_names,
            squeeze=kwargs.get("squeeze", False),
            skipfooter=kwargs.get("skipfooter", None),
            parse_dates=kwargs.get("parse_dates", False),
        )
        return new_query_compiler
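The `nrows=0` reads above are the cheap way this dispatcher "sniffs" metadata before any worker starts. A minimal standalone illustration of the trick (the file path is hypothetical):

import pandas

# Reading with nrows=0 returns an empty frame that still carries the column
# names and index name, without materializing any data.
empty_df = pandas.read_csv("data.csv", nrows=0)
column_names = empty_df.columns
index_name = empty_df.index.name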
Example n. 24
    def _read(cls, filepath_or_buffer, columns, custom_parser, **kwargs):
        r"""
        Read data from `filepath_or_buffer` according to the passed `read_custom_text` `kwargs` parameters.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of `read_custom_text` function.
        columns : list or callable(file-like object, \*\*kwargs) -> list
            List of column names or a callable that creates column names from the opened
            file and the passed `kwargs`.
        custom_parser : callable(file-like object, \*\*kwargs) -> pandas.DataFrame
            Function that takes as input a part of the `filepath_or_buffer` file loaded into
            memory in file-like object form.
        **kwargs : dict
            Parameters of `read_custom_text` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        filepath_or_buffer_md = (
            cls.get_path(filepath_or_buffer)
            if isinstance(filepath_or_buffer, str)
            else cls.get_path_or_buffer(filepath_or_buffer)
        )
        compression_infered = cls.infer_compression(
            filepath_or_buffer, kwargs["compression"]
        )

        with OpenFile(filepath_or_buffer_md, "rb", compression_infered) as f:
            splits = cls.partitioned_file(
                f,
                num_partitions=NPartitions.get(),
                is_quoting=kwargs.pop("is_quoting"),
                nrows=kwargs["nrows"],
            )

        if callable(columns):
            with OpenFile(filepath_or_buffer_md, "rb", compression_infered) as f:
                columns = columns(f, **kwargs)
        if not isinstance(columns, pandas.Index):
            columns = pandas.Index(columns)

        empty_pd_df = pandas.DataFrame(columns=columns)
        index_name = empty_pd_df.index.name
        column_widths, num_splits = cls._define_metadata(empty_pd_df, columns)

        # kwargs that will be passed to the workers
        partition_kwargs = dict(
            kwargs,
            fname=filepath_or_buffer_md,
            num_splits=num_splits,
            nrows=None,
            compression=compression_infered,
        )

        partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
            splits, callback=custom_parser, **partition_kwargs
        )

        new_query_compiler = cls._get_new_qc(
            partition_ids=partition_ids,
            index_ids=index_ids,
            dtypes_ids=dtypes_ids,
            index_col=None,
            index_name=index_name,
            column_widths=column_widths,
            column_names=columns,
            nrows=kwargs["nrows"],
        )
        return new_query_compiler
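A hypothetical `custom_parser` matching the callable contract described in the docstring above: it receives one chunk of the source file as a file-like object plus the worker kwargs and returns a DataFrame. The pipe-delimited format is just an assumption for the sketch.

import pandas

def my_custom_parser(chunk, **kwargs):
    # `chunk` is a file-like object holding one byte range of the source file.
    lines = chunk.read().decode(kwargs.get("encoding", "utf-8")).splitlines()
    records = [line.split("|") for line in lines if line]
    return pandas.DataFrame(records, columns=kwargs.get("columns"))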
Example n. 25
    def partitioned_file(
        cls,
        files,
        fnames: List[str],
        num_partitions: int = None,
        nrows: int = None,
        skiprows: int = None,
        skip_header: int = None,
        quotechar: bytes = b'"',
        is_quoting: bool = True,
    ) -> List[List[Tuple[str, int, int]]]:
        """
        Compute chunk sizes in bytes for every partition.

        Parameters
        ----------
        files : file or list of files
            File(s) to be partitioned.
        fnames : list of str
            File name(s) of the file(s) to be partitioned.
        num_partitions : int, optional
            Number of partitions to split the files into.
            If not specified, grabs the value from `modin.config.NPartitions.get()`.
        nrows : int, optional
            Number of rows of file to read.
        skiprows : int, optional
            Number of rows to skip.
        skip_header : int, optional
            Number of header rows to skip.
        quotechar : bytes, default: b'"'
            Indicates the quote character of the files.
        is_quoting : bool, default: True
            Whether or not to consider quotes.

        Returns
        -------
        list
            List, where each element of the list is a list of tuples. Each inner tuple
            contains the data file name of the chunk, the chunk start offset, and the
            chunk end offset within its corresponding file.

        Notes
        -----
        The logic gets really complicated if we try to use the `TextFileDispatcher.partitioned_file`.
        """
        if type(files) != list:
            files = [files]

        if num_partitions is None:
            num_partitions = NPartitions.get()

        file_sizes = [cls.file_size(f) for f in files]
        partition_size = max(
            1, num_partitions, (nrows if nrows else sum(file_sizes)) // num_partitions
        )

        result = []
        split_result = []
        split_size = 0
        read_rows_counter = 0
        for f, fname, f_size in zip(files, fnames, file_sizes):
            if skiprows or skip_header:
                skip_amount = (skiprows if skiprows else 0) + (
                    skip_header if skip_header else 0
                )

                # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df.
                outside_quotes, read_rows = cls._read_rows(
                    f,
                    nrows=skip_amount,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )
                if skiprows:
                    skiprows -= read_rows
                    if skiprows > 0:
                        # We have more rows to skip than the amount read in the file.
                        continue

            start = f.tell()

            while f.tell() < f_size:
                if split_size >= partition_size:
                    # Create a new split when the split has reached partition_size.
                    # This is mainly used when we are reading row-wise partitioned files.
                    result.append(split_result)
                    split_result = []
                    split_size = 0

                # We calculate the amount that we need to read based off of how much of the split we have already read.
                read_size = partition_size - split_size

                if nrows:
                    if read_rows_counter >= nrows:
                        # Finish when we have read enough rows.
                        if len(split_result) > 0:
                            # Add last split into the result.
                            result.append(split_result)
                        return result
                    elif read_rows_counter + read_size > nrows:
                        # Ensure that we will not read more than nrows.
                        read_size = nrows - read_rows_counter

                    outside_quotes, read_rows = cls._read_rows(
                        f,
                        nrows=read_size,
                        quotechar=quotechar,
                        is_quoting=is_quoting,
                    )
                    split_size += read_rows
                    read_rows_counter += read_rows
                else:
                    outside_quotes = cls.offset(
                        f,
                        offset_size=read_size,
                        quotechar=quotechar,
                        is_quoting=is_quoting,
                    )

                split_result.append((fname, start, f.tell()))
                split_size += f.tell() - start
                start = f.tell()

                # Add outside_quotes.
                if is_quoting and not outside_quotes:
                    warnings.warn("File has mismatched quotes")

        # Add last split into the result.
        if len(split_result) > 0:
            result.append(split_result)

        return result
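Each element of the returned list describes one partition as `(file name, start offset, end offset)` tuples and may span several files. A small sketch of how a worker could consume one such split, assuming plain uncompressed text files:

def read_split(split):
    # Hypothetical helper: read the byte ranges of one split and concatenate
    # them into a single bytes object ready for parsing.
    parts = []
    for fname, start, end in split:
        with open(fname, "rb") as f:
            f.seek(start)
            parts.append(f.read(end - start))
    return b"".join(parts)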
Example n. 26
    def partitioned_file(
        cls,
        f,
        num_partitions: int = None,
        nrows: int = None,
        skiprows: int = None,
        quotechar: bytes = b'"',
        is_quoting: bool = True,
        encoding: str = None,
        newline: bytes = None,
        header_size: int = 0,
        pre_reading: int = 0,
    ):
        """
        Compute chunk sizes in bytes for every partition.

        Parameters
        ----------
        f : file-like object
            File handle of file to be partitioned.
        num_partitions : int, optional
            Number of partitions to split the file into.
            If not specified, grabs the value from `modin.config.NPartitions.get()`.
        nrows : int, optional
            Number of rows of file to read.
        skiprows : int, optional
            Number of rows to skip.
        quotechar : bytes, default: b'"'
            Indicates the quote character of the file.
        is_quoting : bool, default: True
            Whether or not to consider quotes.
        encoding : str, optional
            Encoding of `f`.
        newline : bytes, optional
            Byte or sequence of bytes indicating line endings.
        header_size : int, default: 0
            Number of rows occupied by the header.
        pre_reading : int, default: 0
            Number of rows between the header and the skipped rows that should be read.

        Returns
        -------
        list
            List of tuples, where each tuple holds two ints:
            the partition start byte offset and the partition end byte offset.
        """
        read_rows_counter = 0
        outside_quotes = True

        if num_partitions is None:
            num_partitions = NPartitions.get(
            ) - 1 if pre_reading else NPartitions.get()

        rows_skipper = cls.rows_skipper_builder(f,
                                                quotechar,
                                                is_quoting=is_quoting,
                                                encoding=encoding,
                                                newline=newline)
        result = []

        file_size = cls.file_size(f)

        rows_skipper(header_size)

        if pre_reading:
            pre_reading_start = f.tell()
            outside_quotes, read_rows = cls._read_rows(
                f,
                nrows=pre_reading,
                quotechar=quotechar,
                is_quoting=is_quoting,
                outside_quotes=outside_quotes,
                encoding=encoding,
                newline=newline,
            )
            read_rows_counter += read_rows

            result.append((pre_reading_start, f.tell()))

            # add outside_quotes
            if is_quoting and not outside_quotes:
                warnings.warn("File has mismatched quotes")

        rows_skipper(skiprows)

        start = f.tell()

        if nrows:
            partition_size = max(1, num_partitions, nrows // num_partitions)
            while f.tell() < file_size and read_rows_counter < nrows:
                if read_rows_counter + partition_size > nrows:
                    # it's possible only if is_quoting==True
                    partition_size = nrows - read_rows_counter
                outside_quotes, read_rows = cls._read_rows(
                    f,
                    nrows=partition_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                    encoding=encoding,
                    newline=newline,
                )
                result.append((start, f.tell()))
                start = f.tell()
                read_rows_counter += read_rows

                # add outside_quotes
                if is_quoting and not outside_quotes:
                    warnings.warn("File has mismatched quotes")
        else:
            partition_size = max(1, num_partitions,
                                 file_size // num_partitions)
            while f.tell() < file_size:
                outside_quotes = cls.offset(
                    f,
                    offset_size=partition_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                    encoding=encoding,
                    newline=newline,
                )

                result.append((start, f.tell()))
                start = f.tell()

                # add outside_quotes
                if is_quoting and not outside_quotes:
                    warnings.warn("File has mismatched quotes")

        return result
Example n. 27
    def _read(cls, filepath_or_buffer, **kwargs):
        """
        Read data from multiple `.csv` files passed with `filepath_or_buffer` simultaneously.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of ``read_csv`` function.
        **kwargs : dict
            Parameters of ``read_csv`` function.

        Returns
        -------
        new_query_compiler : BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        # Ensures that the file is a string file path. Otherwise, default to pandas.
        filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
        if isinstance(filepath_or_buffer, str):
            # os.altsep == None on Linux
            is_folder = any(
                filepath_or_buffer.endswith(sep) for sep in (os.sep, os.altsep) if sep
            )
            if "*" not in filepath_or_buffer and not is_folder:
                warnings.warn(
                    "Shell-style wildcard '*' must be in the filename pattern in order to read multiple "
                    + f"files at once. Did you forget it? Passed filename: '{filepath_or_buffer}'"
                )
            if not cls.file_exists(filepath_or_buffer):
                return cls.single_worker_read(filepath_or_buffer, **kwargs)
            filepath_or_buffer = cls.get_path(filepath_or_buffer)
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        # We read multiple csv files when the file path is a list of absolute file
        # paths. We assume that all of the files will be essentially replicas of the
        # first file but with different data values.
        glob_filepaths = filepath_or_buffer
        filepath_or_buffer = filepath_or_buffer[0]

        compression_type = cls.infer_compression(
            filepath_or_buffer, kwargs.get("compression")
        )
        if compression_type is not None:
            if compression_type in ("gzip", "bz2", "xz"):
                kwargs["compression"] = compression_type
            elif compression_type == "zip" and sys.version_info >= (3, 7):
                # need python3.7 to .seek and .tell ZipExtFile
                kwargs["compression"] = compression_type
            else:
                return cls.single_worker_read(filepath_or_buffer, **kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

        nrows = kwargs.pop("nrows", None)
        names = kwargs.get("names", lib.no_default)
        index_col = kwargs.get("index_col", None)
        usecols = kwargs.get("usecols", None)
        encoding = kwargs.get("encoding", None)
        if names in [lib.no_default, None]:
            # For the sake of the empty df, we assume no `index_col` to get the correct
            # column names before we build the index. Because we pass `names` in, this
            # step has to happen without removing the `index_col` otherwise it will not
            # be assigned correctly.
            names = pandas.read_csv(
                filepath_or_buffer,
                **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
            ).columns
        elif index_col is None and not usecols:
            # When names is set to some list that is smaller than the number of columns
            # in the file, the first columns are built as a hierarchical index.
            empty_pd_df = pandas.read_csv(
                filepath_or_buffer, nrows=0, encoding=encoding
            )
            num_cols = len(empty_pd_df.columns)
            if num_cols > len(names):
                index_col = list(range(num_cols - len(names)))
                if len(index_col) == 1:
                    index_col = index_col[0]
                kwargs["index_col"] = index_col
        empty_pd_df = pandas.read_csv(
            filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
        )
        column_names = empty_pd_df.columns
        skipfooter = kwargs.get("skipfooter", None)
        skiprows = kwargs.pop("skiprows", None)
        usecols_md = cls._validate_usecols_arg(usecols)
        if usecols is not None and usecols_md[1] != "integer":
            del kwargs["usecols"]
            all_cols = pandas.read_csv(
                OpenFile(filepath_or_buffer, "rb"),
                **dict(kwargs, nrows=0, skipfooter=0),
            ).columns
            usecols = all_cols.get_indexer_for(list(usecols_md[0]))
        parse_dates = kwargs.pop("parse_dates", False)
        partition_kwargs = dict(
            kwargs,
            header=None,
            names=names,
            skipfooter=0,
            skiprows=None,
            parse_dates=parse_dates,
            usecols=usecols,
        )
        encoding = kwargs.get("encoding", None)
        quotechar = kwargs.get("quotechar", '"').encode(
            encoding if encoding is not None else "UTF-8"
        )
        is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE

        with ExitStack() as stack:
            files = [
                stack.enter_context(OpenFile(fname, "rb", compression_type))
                for fname in glob_filepaths
            ]

            # Skip the header since we already have the header information and skip the
            # rows we are told to skip.
            if isinstance(skiprows, int) or skiprows is None:
                if skiprows is None:
                    skiprows = 0
                header = kwargs.get("header", "infer")
                if header == "infer" and kwargs.get("names", lib.no_default) in [
                    lib.no_default,
                    None,
                ]:
                    skip_header = 1
                elif isinstance(header, int):
                    skip_header = header + 1
                elif hasattr(header, "__iter__") and not isinstance(header, str):
                    skip_header = max(header) + 1
                else:
                    skip_header = 0
            if kwargs.get("encoding", None) is not None:
                partition_kwargs["skiprows"] = 1
            # Launch tasks to read partitions
            partition_ids = []
            index_ids = []
            dtypes_ids = []
            column_widths, num_splits = cls._define_metadata(empty_pd_df, column_names)

            args = {
                "num_splits": num_splits,
                **partition_kwargs,
            }

            splits = cls.partitioned_file(
                files,
                glob_filepaths,
                num_partitions=NPartitions.get(),
                nrows=nrows,
                skiprows=skiprows,
                skip_header=skip_header,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )

            for chunks in splits:
                args.update({"chunks": chunks})
                partition_id = cls.deploy(cls.parse, num_returns=num_splits + 2, **args)
                partition_ids.append(partition_id[:-2])
                index_ids.append(partition_id[-2])
                dtypes_ids.append(partition_id[-1])

        # Compute the index based on a sum of the lengths of each partition (by default)
        # or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])
            new_index.name = empty_pd_df.index.name

        # Compute dtypes by collecting and combining them from all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None

        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
        # If parse_dates is present, the column names that we have might not be
        # the same length as the returned column names. If we do need to modify
        # the column names, we remove the old names from the column names and
        # insert the new one at the front of the Index.
        if parse_dates is not None:
            # We have to recompute the column widths if `parse_dates` is set because
            # we are not guaranteed to have the correct information regarding how many
            # columns are on each partition.
            column_widths = None
            if isinstance(parse_dates, list):
                for date in parse_dates:
                    # Lists within the parse_dates list are sequences of
                    # CSV columns that are parsed together as a single date
                    # column. They can be a list of either string column names
                    # or integer column indices. e.g. if parse_dates is
                    # [[1, 2]] and columns at indices 1 and 2 are "b" and "c",
                    # the output dataframe has the single date column "b_c". If
                    # parse_dates is [["a", 1]] and the column at index 1 is
                    # named "b", the output dataframe has the single date
                    # column "a_b".
                    if isinstance(date, list):
                        for i, part in enumerate(date):
                            if isinstance(part, int):
                                date[i] = column_names[part]
                        new_col_name = "_".join(date)
                        column_names = column_names.drop(date).insert(0, new_col_name)
            elif isinstance(parse_dates, dict):
                for new_col_name, group in parse_dates.items():
                    column_names = column_names.drop(group).insert(0, new_col_name)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            partition_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)

        if skipfooter:
            new_query_compiler = new_query_compiler.drop(
                new_query_compiler.index[-skipfooter:]
            )
        if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
            return new_query_compiler[new_query_compiler.columns[0]]
        if index_col is None:
            new_query_compiler._modin_frame.synchronize_labels(axis=0)
        return new_query_compiler
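The column-renaming logic above mirrors how pandas itself combines columns listed together in `parse_dates`. A small self-contained check of that behavior (for pandas versions that still accept nested `parse_dates` lists):

import io
import pandas

# Columns "a" and "b" are combined into a single "a_b" date column placed first.
csv_data = io.StringIO("a,b,c\n2021-01-01,12:30:00,1\n2021-01-02,08:15:00,2\n")
df = pandas.read_csv(csv_data, parse_dates=[["a", "b"]])
print(df.columns.tolist())  # ['a_b', 'c']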
Example n. 28
    def _read(cls, sql, con, index_col=None, **kwargs):
        """
        Read a SQL query or database table into a query compiler.

        Parameters
        ----------
        sql : str or SQLAlchemy Selectable (select or text object)
            SQL query to be executed or a table name.
        con : SQLAlchemy connectable, str, sqlite3 connection, or ModinDatabaseConnection
            Connection object to database.
        index_col : str or list of str, optional
            Column(s) to set as index (MultiIndex).
        **kwargs : dict
            Parameters to pass into `pandas.read_sql` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        if isinstance(con, str):
            con = ModinDatabaseConnection("sqlalchemy", con)
        if not isinstance(con, ModinDatabaseConnection):
            warnings.warn(
                "To use parallel implementation of `read_sql`, pass either "
                + "the SQL connection string or a ModinDatabaseConnection "
                + "with the arguments required to make a connection, instead "
                + f"of {type(con)}. For documentation of ModinDatabaseConnection, see "
                + "https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html#connecting-to-a-database-for-read-sql"
            )
            return cls.single_worker_read(
                sql,
                con=con,
                index_col=index_col,
                read_sql_engine=ReadSqlEngine.get(),
                **kwargs,
            )
        row_count_query = con.row_count_query(sql)
        connection_for_pandas = con.get_connection()
        column_names_query = con.column_names_query(sql)
        row_cnt = pandas.read_sql(row_count_query, connection_for_pandas).squeeze()
        cols_names_df = pandas.read_sql(
            column_names_query, connection_for_pandas, index_col=index_col
        )
        cols_names = cols_names_df.columns
        num_partitions = NPartitions.get()
        partition_ids = []
        index_ids = []
        dtype_ids = []
        limit = math.ceil(row_cnt / num_partitions)
        for part in range(num_partitions):
            offset = part * limit
            query = con.partition_query(sql, limit, offset)
            partition_id = cls.deploy(
                cls.parse,
                num_returns=num_partitions + 2,
                num_splits=num_partitions,
                sql=query,
                con=con,
                index_col=index_col,
                read_sql_engine=ReadSqlEngine.get(),
                **kwargs,
            )
            partition_ids.append(
                [cls.frame_partition_cls(obj) for obj in partition_id[:-2]]
            )
            index_ids.append(partition_id[-2])
            dtype_ids.append(partition_id[-1])
        if index_col is None:  # sum all lens returned from partitions
            index_lens = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(index_lens))
        else:  # concat index returned from partitions
            index_lst = [
                x for part_index in cls.materialize(index_ids) for x in part_index
            ]
            new_index = pandas.Index(index_lst).set_names(index_col)
        new_frame = cls.frame_cls(np.array(partition_ids), new_index, cols_names)
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
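The partitioning above relies on wrapping the original query with per-partition `LIMIT`/`OFFSET` clauses. A minimal sketch of that idea; the exact wrapping syntax is an assumption and will differ from `ModinDatabaseConnection.partition_query` and across SQL dialects:

import math

def partition_queries(sql, row_cnt, num_partitions):
    # Each partition reads a contiguous slice of the result set.
    limit = math.ceil(row_cnt / num_partitions)
    return [
        f"SELECT * FROM ({sql}) AS _t LIMIT {limit} OFFSET {part * limit}"
        for part in range(num_partitions)
    ]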
Example n. 29
    def broadcast_axis_partitions(
        cls,
        axis,
        apply_func,
        left,
        right,
        keep_partitioning=False,
        apply_indices=None,
        enumerate_partitions=False,
        lengths=None,
    ):
        """
        Broadcast the right partitions to left and apply a function along full axis.

        Parameters
        ----------
        axis : {0, 1}
            The axis to apply and broadcast over.
        apply_func : callable
            The function to apply.
        left : np.ndarray
            The left partitions.
        right : np.ndarray
            The right partitions.
        keep_partitioning : bool, default: False
            The flag to keep partition boundaries for Modin Frame.
        apply_indices : list of ints, optional
            Indices of `axis ^ 1` to apply function over.
        enumerate_partitions : bool, default: False
            Whether or not to pass partition index into `apply_func`.
            Note that `apply_func` must be able to accept the `partition_idx` kwarg.
        lengths : list of ints, default: None
            The list of lengths to shuffle the object.

        Returns
        -------
        np.ndarray
            A new NumPy array of partition objects.
        """
        # Since we are already splitting the DataFrame back up after an
        # operation, we will just use this time to compute the number of
        # partitions as best we can right now.
        if keep_partitioning:
            num_splits = len(left) if axis == 0 else len(left.T)
        elif lengths:
            num_splits = len(lengths)
        else:
            num_splits = NPartitions.get()
        preprocessed_map_func = cls.preprocess_func(apply_func)
        left_partitions = cls.axis_partition(left, axis)
        right_partitions = None if right is None else cls.axis_partition(
            right, axis)
        # For mapping across the entire axis, we don't maintain partitioning because we
        # may want to line the partitioning up with another BlockPartitions object. Since
        # we don't need to maintain the partitioning, this gives us the opportunity to
        # load-balance the data as well.
        kw = {
            "num_splits": num_splits,
            "other_axis_partition": right_partitions,
        }
        if lengths:
            kw["_lengths"] = lengths
            kw["manual_partition"] = True

        if apply_indices is None:
            apply_indices = np.arange(len(left_partitions))

        result_blocks = np.array([
            left_partitions[i].apply(
                preprocessed_map_func,
                **kw,
                **({
                    "partition_idx": idx
                } if enumerate_partitions else {}),
            ) for idx, i in enumerate(apply_indices)
        ])
        # If we are mapping over columns, they are returned to us the same as
        # rows, so we need to transpose the returned 2D NumPy array to return
        # the structure to the correct order.
        return result_blocks.T if not axis else result_blocks
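The final transpose exists because the outer loop iterates over axis partitions: for column-wise (axis == 0) application the first array dimension ends up indexing column partitions, so it has to be flipped back. A toy illustration with plain NumPy:

import numpy as np

# Two blocks come back from each of three column partitions; transposing
# restores a (blocks-per-column x column-partitions) layout.
result_blocks = np.array([["c0_b0", "c0_b1"],
                          ["c1_b0", "c1_b1"],
                          ["c2_b0", "c2_b1"]])
print(result_blocks.shape)    # (3, 2)
print(result_blocks.T.shape)  # (2, 3)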
Example n. 30
    def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs):
        """
        Read data from `filepath_or_buffer` according to `kwargs` parameters.

        Used in `read_csv` and `read_fwf` Modin implementations.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of read functions.
        **kwargs : dict
            Parameters of read functions.

        Returns
        -------
        new_query_compiler : BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        filepath_or_buffer_md = (cls.get_path(filepath_or_buffer)
                                 if isinstance(filepath_or_buffer, str) else
                                 cls.get_path_or_buffer(filepath_or_buffer))
        compression_infered = cls.infer_compression(filepath_or_buffer,
                                                    kwargs["compression"])
        # Getting frequently used kwargs;
        # they should be defined at a higher level.
        names = kwargs["names"]
        index_col = kwargs["index_col"]
        encoding = kwargs["encoding"]
        skiprows = kwargs["skiprows"]
        header = kwargs["header"]
        # Define header size for further skipping (the header can be skipped because the
        # header information will be obtained later from empty_df, so the workers don't
        # need to handle it)
        header_size = cls._define_header_size(
            header,
            names,
        )
        (
            skiprows_md,
            pre_reading,
            skiprows_partitioning,
        ) = cls._manage_skiprows_parameter(skiprows, header_size)
        should_handle_skiprows = skiprows_md is not None and not isinstance(
            skiprows_md, int)

        use_modin_impl = cls.check_parameters_support(
            filepath_or_buffer,
            kwargs,
        )
        if not use_modin_impl:
            return cls.single_worker_read(filepath_or_buffer,
                                          callback=cls.read_callback,
                                          **kwargs)

        is_quoting = kwargs["quoting"] != QUOTE_NONE
        # In these cases we should pass additional metadata
        # to the workers to match pandas output
        pass_names = names in [
            None, lib.no_default
        ] and (skiprows is not None or kwargs["skipfooter"] != 0)

        pd_df_metadata = cls.read_callback(
            filepath_or_buffer,
            **dict(kwargs, nrows=1, skipfooter=0, index_col=index_col),
        )
        column_names = pd_df_metadata.columns
        column_widths, num_splits = cls._define_metadata(
            pd_df_metadata, column_names)

        # kwargs that will be passed to the workers
        partition_kwargs = dict(
            kwargs,
            fname=filepath_or_buffer_md,
            num_splits=num_splits,
            header_size=header_size if not pass_names else 0,
            names=names if not pass_names else column_names,
            header=header if not pass_names else "infer",
            skipfooter=0,
            skiprows=None,
            nrows=None,
            compression=compression_infered,
        )

        with OpenFile(filepath_or_buffer_md, "rb", compression_infered) as f:
            old_pos = f.tell()
            fio = io.TextIOWrapper(f, encoding=encoding, newline="")
            newline, quotechar = cls.compute_newline(
                fio, encoding, kwargs.get("quotechar", '"'))
            f.seek(old_pos)
            splits = cls.partitioned_file(
                f,
                num_partitions=NPartitions.get(),
                nrows=kwargs["nrows"] if not should_handle_skiprows else None,
                skiprows=skiprows_partitioning,
                quotechar=quotechar,
                is_quoting=is_quoting,
                encoding=encoding,
                newline=newline,
                header_size=header_size,
                pre_reading=pre_reading,
            )

        partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
            splits, callback=cls.read_callback, **partition_kwargs)

        new_query_compiler = cls._get_new_qc(
            partition_ids=partition_ids,
            index_ids=index_ids,
            dtypes_ids=dtypes_ids,
            index_col=index_col,
            index_name=pd_df_metadata.index.name,
            column_widths=column_widths,
            column_names=column_names,
            skiprows_md=skiprows_md if should_handle_skiprows else None,
            header_size=header_size,
            skipfooter=kwargs["skipfooter"],
            parse_dates=kwargs["parse_dates"],
            nrows=kwargs["nrows"] if should_handle_skiprows else None,
        )
        return new_query_compiler
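The `TextIOWrapper` dance above is how the dispatcher detects the file's real line terminator before partitioning. A rough standalone sketch of the same idea (not Modin's `compute_newline`, and it ignores quoting):

import io

def sniff_newline(binary_handle, encoding="utf-8"):
    # Read one line through a TextIOWrapper with newline translation disabled,
    # then hand the buffer back and restore the binary handle's position.
    old_pos = binary_handle.tell()
    text = io.TextIOWrapper(binary_handle, encoding=encoding, newline="")
    first_line = text.readline()
    text.detach()  # release the buffer without closing the binary handle
    binary_handle.seek(old_pos)
    for candidate in ("\r\n", "\r", "\n"):
        if first_line.endswith(candidate):
            return candidate.encode(encoding)
    return b"\n"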