Example #1
    def read(self,
             *files: os.PathLike,
             columns: typing.Optional[typing.List[str]] = None,
             **kwargs) -> pandas.DataFrame:
        # Read each non-empty file into its own frame; zero-byte files are
        # skipped so they do not trip up the underlying parquet reader.
        frames = [
            self._read(chunk=f, columns=columns, **kwargs) for f in files
            if os.path.getsize(f) > 0
        ]
        if len(frames) == 1:
            return frames[0]
        elif len(frames) > 1:
            return pandas.concat(frames, copy=True)
        # No readable data: return an empty frame rather than None.
        return pandas.DataFrame()
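
For context, here is a minimal, self-contained sketch of how a method like this might be hosted and called. The Reader class, its _read delegate, and the file names are illustrative assumptions (not part of the original snippet); _read is assumed to wrap pandas.read_parquet.

import os
import typing

import pandas


class Reader:
    """Hypothetical host class for the read() method shown above."""

    def _read(self, chunk, columns=None, **kwargs):
        # Assumed delegate: each chunk is a parquet file on disk.
        return pandas.read_parquet(chunk, columns=columns, **kwargs)

    def read(self,
             *files: os.PathLike,
             columns: typing.Optional[typing.List[str]] = None,
             **kwargs) -> pandas.DataFrame:
        frames = [
            self._read(chunk=f, columns=columns, **kwargs) for f in files
            if os.path.getsize(f) > 0
        ]
        if len(frames) == 1:
            return frames[0]
        elif len(frames) > 1:
            return pandas.concat(frames, copy=True)
        return pandas.DataFrame()


# Illustrative call: combine two parquet parts, projecting two columns.
df = Reader().read("part-0.parquet", "part-1.parquet", columns=["a", "b"])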
Example #2
    def read(self, columns=None, concat=False, truncate_extra_columns=True, **kwargs):
        """
        Reads one chunk and returns it as a Pandas data frame.  Once all chunks have been read,
        this function returns None.

        :param list[Text] columns: A list of columns to read.  They must be a subset of the columns
            defined for the Schema object.  If specified, truncate_extra_columns must be True.
        :param bool concat:  If True, all remaining chunks are read and returned as one large
            data frame.
        :param bool truncate_extra_columns: If True, only the columns defined in the schema object
            are read from the underlying parquet file (except for empty schemas, which read all
            columns regardless).  If False, any additional columns present in the underlying
            parquet file are read as well.
        :rtype: pandas.DataFrame
        """
        if columns is not None and truncate_extra_columns is False:
            raise _user_exceptions.FlyteAssertion(
                "When reading a schema object, it is not possible to both specify a set of columns to read and "
                "additionally not truncate_extra_columns.  Either columns must not be specified or "
                "truncate_extra_columns must be set to True (or not specified)."
            )

        self._access_guard()

        parquet_engine = _sdk_config.PARQUET_ENGINE.get()
        if parquet_engine not in {'fastparquet', 'pyarrow'}:
            raise _user_exceptions.FlyteAssertion(
                "environment variable parquet_engine must be one of 'pyarrow', 'fastparquet', or be unset")

        df_out = None
        if not columns:
            columns = list(self._schema.type.sdk_columns.keys())

        if len(columns) == 0 or truncate_extra_columns is False:
            columns = None

        if concat:
            frames = [
                # Workaround for a type promotion issue in fastparquet
                # TODO: follow up the issue opened in the fastparquet repo for a more general fix
                # issue URL:
                _SchemaReader._read_parquet_with_type_promotion_override(
                    chunk=chunk, columns=columns, parquet_engine=parquet_engine
                )
                # _pd.read_parquet(chunk, columns=columns, engine=parquet_engine)
                for chunk in self._chunks[self._index:]
                if _os.path.getsize(chunk) > 0
            ]
            if len(frames) == 1:
                df_out = frames[0]
            elif len(frames) > 1:
                df_out = _pd.concat(frames, copy=True)
            self._index = len(self._chunks)
        else:
            while self._index < len(self._chunks) and df_out is None:
                # Skip empty chunks so the user appears to have a continuous stream of data.
                if _os.path.getsize(self._chunks[self._index]) > 0:
                    df_out = _SchemaReader._read_parquet_with_type_promotion_override(
                        chunk=self._chunks[self._index],
                        columns=columns,
                        parquet_engine=parquet_engine,
                        **kwargs)
                self._index += 1

        if df_out is not None:
            self._schema.compare_dataframe_to_schema(df_out, read=True, column_subset=columns)

            # Make sure the columns are renamed to exactly what the user specifies.  This prevents unexpected
            # unicode v. string mismatches.  Also, if a schema is mapped with strict_names=False, the input might
            # have totally different names.
            user_columns = columns or _six.iterkeys(self._schema.type.sdk_columns)
            # User-specified columns may or may not be unicode.
            # In Python 2, dictionaries transparently match unicode and str keys that compare
            # equal (https://stackoverflow.com/a/24532329), so we exploit this to build a trivial
            # lookup dictionary: the lookup works with either unicode or str, but returns whatever
            # type the user actually used.
            user_column_dict = {c: c for c in user_columns}
            if len(self._schema.type.columns) > 0:
                # Avoid using pandas.DataFrame.rename() as this function incurs significant memory overhead
                df_out.columns = [
                    # Look up via the dict itself; user_columns may be an iterator that the
                    # dict comprehension above has already exhausted (Python 3).
                    user_column_dict.get(col, col)
                    for col in df_out.columns.values]
        return df_out
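
Per the docstring, read() follows an iterate-until-None protocol when concat is False, and collapses all remaining chunks when concat is True. A short sketch of both call patterns, assuming schema_reader is an already-constructed reader over a multi-chunk schema (the variable, column, and handler names are illustrative):

# Chunk-at-a-time: each call returns one non-empty chunk as a data frame,
# and None once every chunk has been consumed.
while True:
    chunk_df = schema_reader.read(columns=["a", "b"])
    if chunk_df is None:
        break
    handle_chunk(chunk_df)  # hypothetical per-chunk handler

# Alternatively, read all remaining chunks as one large data frame.
full_df = schema_reader.read(concat=True)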