    def _are_contents_empty(self, args: GcsfsIngestArgs,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        """Returns True if the CSV file is empty, i.e. it contains no
        non-header rows.
        """
        delegate = ReadOneGcsfsCsvReaderDelegate()
        self.csv_reader.streaming_read(args.file_path,
                                       delegate=delegate,
                                       chunk_size=1,
                                       skiprows=1)
        return delegate.df is None
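
# A minimal plain-pandas sketch of the same emptiness check, for illustration
# only: the production code above streams the file from GCS through
# self.csv_reader instead of reading a local path.
import pandas as pd

def csv_is_empty(local_path: str) -> bool:
    try:
        # skiprows=1 drops the header line; header=None stops pandas from
        # consuming the first remaining data row as column names.
        df = pd.read_csv(local_path, header=None, skiprows=1, nrows=1)
    except pd.errors.EmptyDataError:
        # Nothing left after the header, so the file has no data rows.
        return True
    return df.empty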
    def _get_validated_columns(
            self, path: GcsfsFilePath,
            file_config: DirectIngestRawFileConfig) -> List[str]:
        """Returns a list of normalized column names for the raw data file at the given path."""
        # TODO(#3020): We should not derive the columns from what we get in the uploaded raw data CSV - we should
        # instead define the set of columns we expect to see in each input CSV (with mandatory documentation) and update
        # this function to make sure that the columns in the CSV are a strict subset of the expected columns. This will
        # allow us to gracefully handle any raw data re-imports where a new column gets introduced in a later file.

        delegate = ReadOneGcsfsCsvReaderDelegate()
        self.csv_reader.streaming_read(
            path,
            delegate=delegate,
            chunk_size=1,
            nrows=1,
            **self._common_read_csv_kwargs(file_config),
        )
        df = delegate.df

        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"Unexpected type for DataFrame: [{type(df)}]")

        columns = self.remove_column_non_printable_characters(df.columns)

        # Strip whitespace from head/tail of column names
        columns = [c.strip() for c in columns]

        normalized_columns = set()
        for i, column_name in enumerate(columns):
            if not column_name:
                raise ValueError(
                    f"Found empty column name in [{file_config.file_tag}]")

            column_name = self._convert_non_allowable_bq_column_chars(
                column_name)

            # BQ doesn't allow column names to begin with a number, so we prepend an underscore in that case
            if column_name[0] in string.digits:
                column_name = "_" + column_name

            if column_name in normalized_columns:
                raise ValueError(
                    f"Multiple columns with name [{column_name}] after normalization."
                )
            normalized_columns.add(column_name)
            columns[i] = column_name

        return columns
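
# _convert_non_allowable_bq_column_chars is referenced above but not shown.
# A plausible sketch (an assumption, not the actual implementation) replaces
# every character BigQuery disallows in column names with an underscore:
import re

def convert_non_allowable_bq_column_chars(column_name: str) -> str:
    # BigQuery column names may contain only letters, digits, and underscores.
    return re.sub(r"[^a-zA-Z0-9_]", "_", column_name)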
    def _file_meets_file_line_limit(self, line_limit: int,
                                    path: GcsfsFilePath) -> bool:
        """Returns True if the file at the given path contains no more than
        line_limit rows."""
        delegate = ReadOneGcsfsCsvReaderDelegate()

        # Read a chunk up to one line bigger than the acceptable size
        self.csv_reader.streaming_read(path,
                                       delegate=delegate,
                                       chunk_size=(line_limit + 1))

        if delegate.df is None:
            # If the file is empty, it's fine.
            return True

        # If length of the only chunk is less than or equal to the acceptable
        # size, file meets line limit.
        return len(delegate.df) <= line_limit
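
# Reading a single chunk of (line_limit + 1) rows means the one chunk the
# delegate captures can exceed line_limit only if the file truly has more
# than line_limit data rows. A plain-pandas sketch of the equivalent check
# against a local file (illustration only):
import pandas as pd

def file_meets_line_limit(local_path: str, line_limit: int) -> bool:
    # nrows bounds the read; getting back line_limit + 1 rows proves the
    # file is over the limit without scanning the remainder.
    df = pd.read_csv(local_path, nrows=line_limit + 1)
    return len(df) <= line_limit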
    def _get_validated_columns(
            self, path: GcsfsFilePath,
            file_config: DirectIngestRawFileConfig) -> List[str]:
        """Returns a list of normalized column names for the raw data file at the given path."""
        # TODO(#3020): We should not derive the columns from what we get in the uploaded raw data CSV - we should
        # instead define the set of columns we expect to see in each input CSV (with mandatory documentation) and update
        # this function to make sure that the columns in the CSV are a strict subset of the expected columns. This will
        # allow us to gracefully handle any raw data re-imports where a new column gets introduced in a later file.

        delegate = ReadOneGcsfsCsvReaderDelegate()
        self.csv_reader.streaming_read(
            path,
            delegate=delegate,
            chunk_size=1,
            nrows=1,
            **self._common_read_csv_kwargs(file_config))
        df = delegate.df

        if not isinstance(df, pd.DataFrame):
            raise ValueError(f'Unexpected type for DataFrame: [{type(df)}]')

        columns = self.remove_column_non_printable_characters(df.columns)

        # Strip whitespace from head/tail of column names
        columns = [c.strip() for c in columns]

        for column_name in columns:
            if not column_name:
                raise ValueError(
                    f'Found empty column name in [{file_config.file_tag}]')

            non_allowable_chars = self._get_non_allowable_bq_column_chars(
                column_name)
            if non_allowable_chars:
                # TODO(#3020): Some regions (US_MO) are known to have unsupported chars in their column names - we will
                #  need to implement a way to reliably convert these column names.
                raise ValueError(
                    f'Column [{column_name}] for file has non-allowable characters {non_allowable_chars}.'
                )

        return columns
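
# _get_non_allowable_bq_column_chars is likewise not shown here. A
# hypothetical sketch returning the offending characters might look like:
import re
from typing import Set

def get_non_allowable_bq_column_chars(column_name: str) -> Set[str]:
    # Any character outside [a-zA-Z0-9_] is rejected by BigQuery.
    return set(re.findall(r"[^a-zA-Z0-9_]", column_name))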
    def _get_validated_columns(
            self, path: GcsfsFilePath,
            file_config: DirectIngestRawFileConfig) -> List[str]:
        """Returns a list of normalized column names for the raw data file at the given path."""
        # TODO(#3807): We should not derive the columns from what we get in the uploaded raw data CSV - we should
        # instead define the set of columns we expect to see in each input CSV (with mandatory documentation) and update
        # this function to make sure that the columns in the CSV are a strict subset of the expected columns. This will
        # allow us to gracefully handle any raw data re-imports where a new column gets introduced in a later file.

        delegate = ReadOneGcsfsCsvReaderDelegate()
        self.csv_reader.streaming_read(
            path,
            delegate=delegate,
            chunk_size=1,
            encodings_to_try=file_config.encodings_to_try(),
            nrows=1,
            **self._common_read_csv_kwargs(file_config),
        )
        df = delegate.df

        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"Unexpected type for DataFrame: [{type(df)}]")

        columns = self.remove_column_non_printable_characters(df.columns)

        # Strip whitespace from head/tail of column names
        columns = [c.strip() for c in columns]

        normalized_columns = set()
        for i, column_name in enumerate(columns):
            if not column_name:
                raise ValueError(
                    f"Found empty column name in [{file_config.file_tag}]")

            column_name = self._convert_non_allowable_bq_column_chars(
                column_name)

            # BQ doesn't allow column names to begin with a number, so we prepend an underscore in that case
            if column_name[0] in string.digits:
                column_name = "_" + column_name

            # If the capitalization of the column name doesn't match the capitalization
            # listed in the file config, update the capitalization.
            if column_name not in file_config.columns:
                caps_normalized_col = file_config.caps_normalized_col(
                    column_name)
                if caps_normalized_col:
                    column_name = caps_normalized_col

            if column_name in normalized_columns:
                raise ValueError(
                    f"Multiple columns with name [{column_name}] after normalization."
                )
            normalized_columns.add(column_name)
            columns[i] = column_name

        if len(normalized_columns) == 1:
            # A single-column file is almost always indicative of a parsing error. If
            # this column name is not registered in the file config, we throw.
            column = one(normalized_columns)
            if column not in file_config.columns:
                raise ValueError(
                    f"Found only one column: [{column}]. Columns likely did not "
                    f"parse properly. Are you using the correct separator and encoding "
                    f"for this file? If this file really has just one column, the "
                    f"column name must be registered in the raw file config before "
                    f"upload.")

        return columns
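
# one() above comes from the more_itertools package: it returns the sole
# element of an iterable and raises ValueError when the iterable is empty or
# has more than one element. That is safe here because this branch only runs
# when len(normalized_columns) == 1. For example:
from more_itertools import one

assert one({"col_a"}) == "col_a"  # exactly one element: returned as-is
# one({"a", "b"}) raises ValueError (too many elements)
# one(set()) raises ValueError (no elements)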