def build(self):
    """Validate the accumulated builder state and construct a GenericDataFile.

    Fills in defaults where possible (file format inferred from the file
    name, default block size) and raises RuntimeError for any required
    attribute that is still missing or invalid.
    """
    if self.file_path is None:
        raise RuntimeError("File path is required")
    if self.format is None:
        # Fall back to inferring the format from the file extension.
        self.format = FileFormat.from_file_name(self.file_path)
    if self.format is None:
        raise RuntimeError("File format is required")
    if self.file_size_in_bytes < 0:
        raise RuntimeError("File size is required")
    if self.record_count < 0:
        raise RuntimeError("Record count is required")
    if self.block_size_in_bytes is None:
        self.block_size_in_bytes = DataFiles.DEFAULT_BLOCK_SIZE

    # Unpartitioned files carry no partition data.
    partition = self.partition_data.copy() if self.is_partitioned else None
    metrics = Metrics(row_count=self.record_count,
                      column_sizes=self.column_sizes,
                      value_counts=self.value_counts,
                      null_value_counts=self.null_value_counts,
                      lower_bounds=self.lower_bounds,
                      upper_bounds=self.upper_bounds)

    return GenericDataFile(self.file_path,
                           self.format,
                           self.file_size_in_bytes,
                           self.block_size_in_bytes,
                           partition=partition,
                           metrics=metrics)
def entries(self, columns=None):
    """Read the manifest entries, caching them on first use.

    Args:
        columns: manifest columns to project; defaults to
            ManifestReader.ALL_COLUMNS.

    Returns:
        The cached list of ManifestEntry objects.

    Raises:
        RuntimeError: if the manifest's format cannot be determined from
            its file name, or is not Avro.

    NOTE(review): the cache ignores `columns` on subsequent calls — the
    entries projected by the first call are returned for every later
    call regardless of the columns requested. Confirm this is intended.
    """
    if columns is None:
        columns = ManifestReader.ALL_COLUMNS

    file_format = FileFormat.from_file_name(self.file.location())
    if file_format is None:
        raise RuntimeError("Unable to determine format of manifest: %s" % self.file)

    proj_schema = ManifestEntry.project_schema(self.spec.partition_type(), columns)

    if self._entries is None:
        if file_format is not FileFormat.AVRO:
            # Previously a non-Avro manifest fell through and silently
            # returned None (leaving self._fo open); fail loudly instead,
            # consistent with the non-caching reader.
            raise RuntimeError("Invalid format for manifest file: %s" % file_format)
        self._entries = []
        for read_entry in AvroToIceberg.read_avro_row(proj_schema, self._avro_reader):
            entry = ManifestEntry(schema=proj_schema,
                                  partition_type=self.spec.partition_type())
            # Avro rows preserve field order, so enumerate maps each field
            # to its positional slot in the entry.
            for i, value in enumerate(read_entry.values()):
                entry.put(i, value)
            self._entries.append(entry)
        # The underlying file object is only needed for this first read.
        self._fo.close()
        self._avro_reader = None

    return self._entries
def test_parquet():
    """PARQUET is splittable, is detected from a .parquet name, and
    add_extension produces that name."""
    base_name = "test_file"
    expected_name = "test_file.parquet"

    assert FileFormat.PARQUET.is_splittable()
    assert FileFormat.from_file_name(expected_name) == FileFormat.PARQUET
    assert FileFormat.PARQUET.add_extension(base_name) == expected_name
def test_orc():
    """ORC is splittable, is detected from an .orc name, and
    add_extension produces that name."""
    base_name = "test_file"
    expected_name = "test_file.orc"

    assert FileFormat.ORC.is_splittable()
    assert FileFormat.from_file_name(expected_name) == FileFormat.ORC
    assert FileFormat.ORC.add_extension(base_name) == expected_name
def test_avro():
    """AVRO is splittable, is detected from an .avro name, and
    add_extension produces that name."""
    base_name = "test_file"
    expected_name = "test_file.avro"

    assert FileFormat.AVRO.is_splittable()
    assert FileFormat.from_file_name(expected_name) == FileFormat.AVRO
    assert FileFormat.AVRO.add_extension(base_name) == expected_name
def entries(self, columns=None):
    """Read all manifest entries from the underlying Avro file.

    Unlike the caching reader, this opens a fresh file object per call
    and does not memoize the result.

    Args:
        columns: manifest columns to project; defaults to
            ManifestReader.ALL_COLUMNS.

    Returns:
        A list of ManifestEntry objects, one per Avro row.

    Raises:
        RuntimeError: if the manifest's format cannot be determined from
            its file name, or is not Avro.
    """
    if columns is None:
        columns = ManifestReader.ALL_COLUMNS

    # Renamed from `format` to avoid shadowing the builtin.
    file_format = FileFormat.from_file_name(self.file.location())
    if file_format is None:
        # %-formatting: `self.file` is not a str, so the original
        # "..." + self.file concatenation raised TypeError instead of
        # the intended RuntimeError.
        raise RuntimeError("Unable to determine format of manifest: %s" % self.file)
    if file_format != FileFormat.AVRO:
        # Same fix: str + FileFormat raised TypeError in the original.
        raise RuntimeError("Invalid format for manifest file: %s" % file_format)

    proj_schema = ManifestEntry.project_schema(self.spec.partition_type(), columns)

    read_entries = []
    # Context manager guarantees the file object is closed even if
    # decoding fails partway through.
    with self.file.new_fo() as fo:
        avro_reader = fastavro.reader(fo)
        for read_entry in AvroToIceberg.read_avro_row(proj_schema, avro_reader):
            entry = ManifestEntry(schema=proj_schema,
                                  partition_type=self.spec.partition_type())
            # Avro rows preserve field order; map each field to its
            # positional slot in the entry.
            for i, value in enumerate(read_entry.values()):
                entry.put(i, value)
            read_entries.append(entry)
    return read_entries
def from_stat(stat, row_count, partition_data=None, metrics=None):
    """Create a GenericDataFile from a file-status object.

    Args:
        stat: file status exposing ``path``, ``length`` and ``block_size``.
        row_count: number of records in the file.
        partition_data: optional partition data for the file.
        metrics: optional column-level metrics.

    Returns:
        A GenericDataFile describing the file at ``stat.path``.
    """
    location = stat.path
    # Renamed from `format` to avoid shadowing the builtin. NOTE(review):
    # this is None for an unrecognized extension — confirm downstream
    # callers tolerate a None format here.
    file_format = FileFormat.from_file_name(location)
    return GenericDataFile(location, file_format, stat.length, stat.block_size,
                           row_count=row_count, partition=partition_data,
                           metrics=metrics)