def _prepare_pyarrow_usecols(cls, read_csv_kwargs):
    """
    Define `usecols` parameter in the way pyarrow can process it.

    Parameters
    ----------
    read_csv_kwargs : dict
        `read_csv` function parameters.

    Returns
    -------
    usecols_md : list
        Redefined `usecols` parameter.
    """
    usecols = read_csv_kwargs.get("usecols", None)
    engine = read_csv_kwargs.get("engine", None)
    usecols_md, usecols_names_dtypes = _validate_usecols_arg(usecols)
    if usecols_md:
        empty_pd_df = pandas.read_csv(
            **dict(
                read_csv_kwargs,
                nrows=0,
                skipfooter=0,
                usecols=None,
                engine=None if engine == "arrow" else engine,
            )
        )
        column_names = empty_pd_df.columns
        if usecols_names_dtypes == "string":
            if usecols_md.issubset(set(column_names)):
                # Columns should be sorted because pandas doesn't preserve
                # the order of `usecols`.
                usecols_md = [
                    col_name for col_name in column_names if col_name in usecols_md
                ]
            else:
                raise NotImplementedError(
                    "values passed in the `usecols` parameter don't match columns names"
                )
        elif usecols_names_dtypes == "integer":
            # Columns should be sorted because pandas doesn't preserve
            # the order of `usecols`.
            usecols_md = sorted(usecols_md)
            # `usecols` integers are 0-based positions, so the max valid
            # index is len(column_names) - 1; use `<=` to catch the
            # off-by-one case where the max index equals the column count.
            if len(column_names) <= usecols_md[-1]:
                raise NotImplementedError(
                    "max usecols value is higher than the number of columns"
                )
            usecols_md = [column_names[i] for i in usecols_md]
        elif callable(usecols_md):
            usecols_md = [
                col_name for col_name in column_names if usecols_md(col_name)
            ]
        else:
            raise NotImplementedError("unsupported `usecols` parameter")
    return usecols_md
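
# The sketch below is illustrative only (not part of the dispatcher): it
# isolates the reordering rule used above for the "string" case of `usecols`,
# where the selected names must be put back into file order because pandas
# does not preserve the order in which `usecols` was given. The data and the
# helper name are hypothetical.
def _demo_usecols_ordering():
    import pandas

    # Header of a hypothetical file, in file order.
    column_names = pandas.Index(["a", "b", "c", "d"])
    # User-supplied `usecols` after validation, order-insensitive.
    usecols_md = {"d", "a"}

    ordered = [col_name for col_name in column_names if col_name in usecols_md]
    assert ordered == ["a", "d"]  # file order, not the order the user gave
    return ordered
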
def read(cls, filepath_or_buffer, **kwargs):
    if isinstance(filepath_or_buffer, str):
        if not cls.file_exists(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        filepath_or_buffer = cls.get_path(filepath_or_buffer)
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    compression_type = cls.infer_compression(
        filepath_or_buffer, kwargs.get("compression", "infer")
    )
    if compression_type is not None:
        if compression_type in ("gzip", "bz2", "xz"):
            kwargs["compression"] = compression_type
        elif (
            compression_type == "zip"
            and sys.version_info[0] == 3
            and sys.version_info[1] >= 7
        ):
            # Need Python 3.7+ to .seek() and .tell() a ZipExtFile.
            kwargs["compression"] = compression_type
        else:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    # If `infer_nrows` is a significant portion of the number of rows, pandas
    # may be faster.
    infer_nrows = kwargs.get("infer_nrows", 100)
    if infer_nrows > 100:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    # TODO: replace this by reading lines from file.
    if kwargs.get("nrows") is not None:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the
        # correct column names before we build the index. Because we pass
        # `names` in, this step has to happen without removing the
        # `index_col`, otherwise it will not be assigned correctly.
        names = pandas.read_fwf(
            filepath_or_buffer,
            **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
        ).columns
    empty_pd_df = pandas.read_fwf(
        filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
    )
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols = kwargs.get("usecols", None)
    usecols_md = _validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        del kwargs["usecols"]
        all_cols = pandas.read_fwf(
            cls.file_open(filepath_or_buffer, "rb"),
            **dict(kwargs, nrows=0, skipfooter=0),
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    encoding = kwargs.get("encoding", None)
    quotechar = kwargs.get("quotechar", '"').encode(
        encoding if encoding is not None else "UTF-8"
    )
    with cls.file_open(filepath_or_buffer, "rb", compression_type) as f:
        # Skip the header since we already have the header information, and
        # skip the rows we are told to skip.
        if isinstance(skiprows, int) or skiprows is None:
            if skiprows is None:
                skiprows = 0
            header = kwargs.get("header", "infer")
            if header == "infer" and kwargs.get("names", None) is None:
                skiprows += 1
            elif isinstance(header, int):
                skiprows += header + 1
            elif hasattr(header, "__iter__") and not isinstance(header, str):
                skiprows += max(header) + 1
        for _ in range(skiprows):
            f.readline()
        if kwargs.get("encoding", None) is not None:
            partition_kwargs["skiprows"] = 1
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        total_bytes = cls.file_size(f)
        # Max number of partitions available
        from modin.pandas import DEFAULT_NPARTITIONS

        num_partitions = DEFAULT_NPARTITIONS
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_partitions)
        # This is the chunksize each partition will read
        chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
        # Metadata
        column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of
            # empty objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        while f.tell() < total_bytes:
            args = {
                "fname": filepath_or_buffer,
                "num_splits": num_splits,
                **partition_kwargs,
            }
            partition_id = cls.call_deploy(
                f, chunk_size, num_splits + 2, args, quotechar=quotechar
            )
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])
    # Compute the index based on a sum of the lengths of each partition (by
    # default) or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
        # pandas has a really weird edge case here.
        if kwargs.get("names", None) is not None and skiprows > 1:
            new_index = pandas.RangeIndex(
                skiprows - 1, new_index.stop + skiprows - 1
            )
    else:
        index_objs = cls.materialize(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name
    # Compute dtypes by collecting and combining the dtypes of all of the
    # partitions. The dtypes reported for different row chunks can differ
    # based on the inference over the limited data each worker sees, so we
    # use pandas to compute the exact dtype over the whole column for each
    # column. The index is set below.
    dtypes = cls.get_dtypes(dtypes_ids)
    partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
    # If `parse_dates` is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # We have to recompute the column widths if `parse_dates` is set
        # because we are not guaranteed to have the correct information
        # regarding how many columns are on each partition.
        column_widths = None
        # Check if it is a list of lists.
        if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(0, new_col_name)
        # Check if it is a dictionary.
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(0, new_col_name)
    # Set the index for the dtypes to the column names.
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_frame = cls.frame_cls(
        partition_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)
    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:]
        )
    if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    if index_col is None:
        new_query_compiler._modin_frame._apply_index_objs(axis=0)
    return new_query_compiler
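
# Illustrative sketch (not part of the dispatcher): how the `parse_dates`
# handling above rewrites column names. A list-of-lists group such as
# [["date", "time"]] is combined by pandas into a single "date_time" column,
# so the grouped names are dropped and the joined name is inserted at the
# front of the Index, mirroring pandas' output. The column names here are
# hypothetical.
def _demo_parse_dates_renaming():
    import pandas

    column_names = pandas.Index(["date", "time", "value"])
    parse_dates = [["date", "time"]]

    for group in parse_dates:
        new_col_name = "_".join(group)
        column_names = column_names.drop(group).insert(0, new_col_name)
    assert list(column_names) == ["date_time", "value"]
    return column_names
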
def _read_csv_from_file_pandas_on_ray(cls, filepath, kwargs=None):
    """Constructs a DataFrame from a CSV file.

    Args:
        filepath (str): path to the CSV file.
        kwargs (dict): args excluding filepath provided to read_csv.

    Returns:
        DataFrame or Series constructed from CSV file.
    """
    if kwargs is None:
        # Avoid a mutable default argument; this function mutates `kwargs`.
        kwargs = {}
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the
        # correct column names before we build the index. Because we pass
        # `names` in, this step has to happen without removing the
        # `index_col`, otherwise it will not be assigned correctly.
        kwargs["index_col"] = None
        names = pandas.read_csv(
            file_open(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
        ).columns
        kwargs["index_col"] = index_col
    empty_pd_df = pandas.read_csv(
        file_open(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
    )
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols = kwargs.get("usecols", None)
    usecols_md = _validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        del kwargs["usecols"]
        all_cols = pandas.read_csv(
            file_open(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names
        if kwargs.get("usecols") is None or kwargs.get("names") is not None
        else None,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    with file_open(filepath, "rb") as f:
        # Get the BOM if necessary.
        prefix = b""
        if kwargs.get("encoding", None) is not None:
            prefix = f.readline()
            partition_kwargs["skiprows"] = 1
            f.seek(0, os.SEEK_SET)  # Return to beginning of file
        prefix_id = ray.put(prefix)
        partition_kwargs_id = ray.put(partition_kwargs)
        # Skip the header since we already have the header information, and
        # skip the rows we are told to skip.
        kwargs["skiprows"] = skiprows
        cls._skip_header(f, kwargs)
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        total_bytes = file_size(f)
        # Max number of partitions available
        num_parts = cls.frame_mgr_cls._compute_num_partitions()
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_parts)
        # This is the chunksize each partition will read
        chunk_size = max(1, (total_bytes - f.tell()) // num_parts)
        while f.tell() < total_bytes:
            start = f.tell()
            f.seek(chunk_size, os.SEEK_CUR)
            f.readline()  # Read a whole number of lines
            partition_id = cls.read_csv_remote_task._remote(
                args=(
                    filepath,
                    num_splits,
                    start,
                    f.tell(),
                    partition_kwargs_id,
                    prefix_id,
                ),
                num_return_vals=num_splits + 1,
            )
            partition_ids.append(
                [cls.frame_partition_cls(obj) for obj in partition_id[:-1]]
            )
            index_ids.append(partition_id[-1])
    if index_col is None:
        new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
    else:
        new_index_ids = get_index.remote([empty_pd_df.index.name], *index_ids)
        new_index = ray.get(new_index_ids)
    # If `parse_dates` is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # Check if it is a list of lists.
        if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(0, new_col_name)
        # Check if it is a dictionary.
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(0, new_col_name)
    new_query_compiler = cls.query_compiler_cls(
        cls.frame_mgr_cls(np.array(partition_ids)), new_index, column_names
    )
    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:]
        )
    if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    return new_query_compiler
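
# Illustrative sketch (not part of the dispatcher): the splitting loop above
# seeks ahead by `chunk_size` bytes and then reads to the next newline, so
# every (start, end) byte range covers a whole number of lines and no row is
# split across partitions. The in-memory buffer here stands in for the opened
# CSV file.
def _demo_chunk_boundaries():
    import io
    import os

    f = io.BytesIO(b"a,1\nb,2\nc,3\nd,4\n")
    total_bytes = len(f.getvalue())
    chunk_size = 5
    ranges = []
    while f.tell() < total_bytes:
        start = f.tell()
        f.seek(chunk_size, os.SEEK_CUR)
        f.readline()  # snap the split point to the next line boundary
        ranges.append((start, min(f.tell(), total_bytes)))
    assert ranges == [(0, 8), (8, 16)]  # each range holds complete rows
    return ranges
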
def _read_csv_from_file_ray(cls, filepath, kwargs=None):
    """Constructs a DataFrame from a CSV file.

    Args:
        filepath (str): path to the CSV file.
        kwargs (dict): args excluding filepath provided to read_csv.

    Returns:
        DataFrame or Series constructed from CSV file.
    """
    if kwargs is None:
        # Avoid a mutable default argument; this function mutates `kwargs`.
        kwargs = {}
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the
        # correct column names before we build the index. Because we pass
        # `names` in, this step has to happen without removing the
        # `index_col`, otherwise it will not be assigned correctly.
        kwargs["index_col"] = None
        names = pandas.read_csv(
            filepath, **dict(kwargs, nrows=0, skipfooter=0)
        ).columns
        kwargs["index_col"] = index_col
    empty_pd_df = pandas.read_csv(filepath, **dict(kwargs, nrows=0, skipfooter=0))
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols = kwargs.get("usecols", None)
    usecols_md = _validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        del kwargs["usecols"]
        all_cols = pandas.read_csv(
            file_open(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names
        if kwargs.get("usecols") is None or kwargs.get("names") is not None
        else None,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    with file_open(filepath, "rb", kwargs.get("compression", "infer")) as f:
        # Get the BOM if necessary.
        prefix = b""
        if kwargs.get("encoding", None) is not None:
            prefix = f.readline()
            partition_kwargs["skiprows"] = 1
            f.seek(0, os.SEEK_SET)  # Return to beginning of file
        prefix_id = ray.put(prefix)
        partition_kwargs_id = ray.put(partition_kwargs)
        # Skip the header since we already have the header information, and
        # skip the rows we are told to skip.
        kwargs["skiprows"] = skiprows
        cls._skip_header(f, kwargs)
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        total_bytes = file_size(f)
        # Max number of partitions available
        num_parts = cls.frame_mgr_cls._compute_num_partitions()
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_parts)
        # This is the chunksize each partition will read
        chunk_size = max(1, (total_bytes - f.tell()) // num_parts)
        # Metadata
        column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of
            # empty objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        while f.tell() < total_bytes:
            start = f.tell()
            f.seek(chunk_size, os.SEEK_CUR)
            f.readline()  # Read a whole number of lines
            # The workers return multiple objects for each part of the file
            # read:
            #   - The first n - 2 objects are partitions of data.
            #   - The (n - 1)th object is the length of the partition, or the
            #     index if `index_col` is specified. We compute the index
            #     below.
            #   - The nth object is the dtypes of the partition. We combine
            #     these to form the final dtypes below.
            partition_id = cls.read_csv_remote_task._remote(
                args=(
                    filepath,
                    num_splits,
                    start,
                    f.tell(),
                    partition_kwargs_id,
                    prefix_id,
                ),
                num_return_vals=num_splits + 2,
            )
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])
    # Compute the index based on a sum of the lengths of each partition (by
    # default) or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = ray.get(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
    else:
        index_objs = ray.get(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name
    # Compute dtypes by collecting and combining the dtypes of all of the
    # partitions. The dtypes reported for different row chunks can differ
    # based on the inference over the limited data each worker sees, so we
    # use pandas to compute the exact dtype over the whole column for each
    # column. The index is set below.
    dtypes = (
        pandas.concat(ray.get(dtypes_ids), axis=1)
        .apply(lambda row: find_common_type(row.values), axis=1)
        .squeeze(axis=0)
    )
    partition_ids = [
        [
            cls.frame_partition_cls(
                partition_ids[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            )
            for j in range(len(partition_ids[i]))
        ]
        for i in range(len(partition_ids))
    ]
    # If `parse_dates` is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # Check if it is a list of lists.
        if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(0, new_col_name)
        # Check if it is a dictionary.
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(0, new_col_name)
    # Set the index for the dtypes to the column names.
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_query_compiler = cls.query_compiler_cls(
        cls.frame_mgr_cls(np.array(partition_ids)),
        new_index,
        column_names,
        dtypes=dtypes,
    )
    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:]
        )
    if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    return new_query_compiler
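
# Illustrative sketch (not part of the dispatcher): how per-partition dtypes
# are combined above. Note that `find_common_type` is a pandas-internal
# helper, so this sketch assumes a pandas version that still exposes it at
# `pandas.core.dtypes.cast`. Two partitions that inferred int64 and float64
# for the same column resolve to float64.
def _demo_combine_dtypes():
    import numpy as np
    import pandas
    from pandas.core.dtypes.cast import find_common_type

    # dtypes reported by two hypothetical row partitions.
    part0 = pandas.Series({"x": np.dtype("int64"), "y": np.dtype("O")})
    part1 = pandas.Series({"x": np.dtype("float64"), "y": np.dtype("O")})
    dtypes = pandas.concat([part0, part1], axis=1).apply(
        lambda row: find_common_type(list(row.values)), axis=1
    )
    assert dtypes["x"] == np.dtype("float64") and dtypes["y"] == np.dtype("O")
    return dtypes
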
def _read(cls, filepath_or_buffer, **kwargs):
    """
    Read data from multiple `.csv` files passed with `filepath_or_buffer` simultaneously.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        `filepath_or_buffer` parameter of `read_csv` function.
    **kwargs : dict
        Parameters of `read_csv` function.

    Returns
    -------
    new_query_compiler : BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    # Ensures that the file is a string file path. Otherwise, default to pandas.
    filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
    if isinstance(filepath_or_buffer, str):
        if not cls.file_exists(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        filepath_or_buffer = cls.get_path(filepath_or_buffer)
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    # We read multiple csv files when the file path is a list of absolute
    # file paths. We assume that all of the files will be essentially
    # replicas of the first file but with different data values.
    glob_filepaths = filepath_or_buffer
    filepath_or_buffer = filepath_or_buffer[0]
    compression_type = cls.infer_compression(
        filepath_or_buffer, kwargs.get("compression")
    )
    if compression_type is not None:
        if compression_type in ("gzip", "bz2", "xz"):
            kwargs["compression"] = compression_type
        elif (
            compression_type == "zip"
            and sys.version_info[0] == 3
            and sys.version_info[1] >= 7
        ):
            # Need Python 3.7+ to .seek() and .tell() a ZipExtFile.
            kwargs["compression"] = compression_type
        else:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    nrows = kwargs.pop("nrows", None)
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    usecols = kwargs.get("usecols", None)
    encoding = kwargs.get("encoding", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the
        # correct column names before we build the index. Because we pass
        # `names` in, this step has to happen without removing the
        # `index_col`, otherwise it will not be assigned correctly.
        names = pandas.read_csv(
            filepath_or_buffer,
            **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
        ).columns
    elif index_col is None and not usecols:
        # When `names` is set to some list that is smaller than the number of
        # columns in the file, the first columns are built as a hierarchical
        # index.
        empty_pd_df = pandas.read_csv(filepath_or_buffer, nrows=0, encoding=encoding)
        num_cols = len(empty_pd_df.columns)
        if num_cols > len(names):
            index_col = list(range(num_cols - len(names)))
            if len(index_col) == 1:
                index_col = index_col[0]
            kwargs["index_col"] = index_col
    empty_pd_df = pandas.read_csv(
        filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
    )
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols_md = _validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        del kwargs["usecols"]
        all_cols = pandas.read_csv(
            cls.file_open(filepath_or_buffer, "rb"),
            **dict(kwargs, nrows=0, skipfooter=0),
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    encoding = kwargs.get("encoding", None)
    quotechar = kwargs.get("quotechar", '"').encode(
        encoding if encoding is not None else "UTF-8"
    )
    is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE
    with ExitStack() as stack:
        files = [
            stack.enter_context(cls.file_open(fname, "rb", compression_type))
            for fname in glob_filepaths
        ]
        # Skip the header since we already have the header information, and
        # skip the rows we are told to skip.
        if isinstance(skiprows, int) or skiprows is None:
            if skiprows is None:
                skiprows = 0
            header = kwargs.get("header", "infer")
            if header == "infer" and kwargs.get("names", None) is None:
                skip_header = 1
            elif isinstance(header, int):
                skip_header = header + 1
            elif hasattr(header, "__iter__") and not isinstance(header, str):
                skip_header = max(header) + 1
            else:
                skip_header = 0
        if kwargs.get("encoding", None) is not None:
            partition_kwargs["skiprows"] = 1
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        # Max number of partitions available
        num_partitions = NPartitions.get()
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_partitions)
        # Metadata
        column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of
            # empty objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        args = {
            "num_splits": num_splits,
            **partition_kwargs,
        }
        splits = cls.partitioned_file(
            files,
            glob_filepaths,
            num_partitions=num_partitions,
            nrows=nrows,
            skiprows=skiprows,
            skip_header=skip_header,
            quotechar=quotechar,
            is_quoting=is_quoting,
        )
        for chunks in splits:
            args.update({"chunks": chunks})
            partition_id = cls.deploy(cls.parse, num_splits + 2, args)
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])
    # Compute the index based on a sum of the lengths of each partition (by
    # default) or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
    else:
        index_objs = cls.materialize(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name
    # Compute dtypes by collecting and combining the dtypes of all of the
    # partitions. The dtypes reported for different row chunks can differ
    # based on the inference over the limited data each worker sees, so we
    # use pandas to compute the exact dtype over the whole column for each
    # column. The index is set below.
    dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
    partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
    # If `parse_dates` is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # We have to recompute the column widths if `parse_dates` is set
        # because we are not guaranteed to have the correct information
        # regarding how many columns are on each partition.
        column_widths = None
        # Check if it is a list of lists.
        if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(0, new_col_name)
        # Check if it is a dictionary.
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(0, new_col_name)
    # Set the index for the dtypes to the column names.
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_frame = cls.frame_cls(
        partition_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)
    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:]
        )
    if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    if index_col is None:
        new_query_compiler._modin_frame._apply_index_objs(axis=0)
    return new_query_compiler
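
# Illustrative sketch (not part of the dispatcher): the column-width
# computation used in the readers above, extracted into a standalone helper.
# Columns are dealt out in chunks of at most `column_chunksize`; the last
# occupied split gets the remainder, and any splits past the end get width 0.
def _demo_column_widths(n_cols, num_splits, column_chunksize):
    return [
        column_chunksize
        if n_cols > (column_chunksize * (i + 1))
        else 0
        if n_cols < (column_chunksize * i)
        else n_cols - (column_chunksize * i)
        for i in range(num_splits)
    ]


# For example, 10 columns over 4 splits of size 3:
#     _demo_column_widths(10, 4, 3) == [3, 3, 3, 1]
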