def build(self):
        """Validate the collected settings and create a GenericDataFile.

        The file format is derived from ``file_path`` when it has not been
        set explicitly, and the block size falls back to
        ``DataFiles.DEFAULT_BLOCK_SIZE`` when it is ``None``.

        Raises:
            RuntimeError: if the file path is missing, the format is unset
                and cannot be derived from the path, or the file size or
                record count is negative.
        """
        if self.file_path is None:
            raise RuntimeError("File path is required")

        if self.format is None:
            # No explicit format: try to derive one from the file name.
            self.format = FileFormat.from_file_name(self.file_path)
            if self.format is None:
                raise RuntimeError("File format is required")

        if self.file_size_in_bytes < 0:
            raise RuntimeError("File size is required")

        if self.record_count < 0:
            raise RuntimeError("Record count is required")

        if self.block_size_in_bytes is None:
            self.block_size_in_bytes = DataFiles.DEFAULT_BLOCK_SIZE

        # Hand the data file its own copy of the partition data; files built
        # while is_partitioned is falsy carry no partition at all.
        if self.is_partitioned:
            partition = self.partition_data.copy()
        else:
            partition = None

        file_metrics = Metrics(row_count=self.record_count,
                               column_sizes=self.column_sizes,
                               value_counts=self.value_counts,
                               null_value_counts=self.null_value_counts,
                               lower_bounds=self.lower_bounds,
                               upper_bounds=self.upper_bounds)

        return GenericDataFile(self.file_path,
                               self.format,
                               self.file_size_in_bytes,
                               self.block_size_in_bytes,
                               partition=partition,
                               metrics=file_metrics)
    def entries(self, columns=None):
        """Return the manifest entries, reading them on the first call.

        The entries are read once from ``self._avro_reader`` and cached in
        ``self._entries``; later calls return the cached list as-is, so the
        ``columns`` of the first successful call determine the projection.

        Args:
            columns: columns to project; defaults to
                ``ManifestReader.ALL_COLUMNS`` when ``None``.

        Returns:
            The cached list of ManifestEntry objects. When nothing is cached
            and the manifest is not an AVRO file, ``None`` is returned.

        Raises:
            RuntimeError: if the format cannot be determined from the
                manifest's location.
        """
        if columns is None:
            columns = ManifestReader.ALL_COLUMNS

        file_format = FileFormat.from_file_name(self.file.location())
        if file_format is None:
            raise RuntimeError("Unable to determine format of manifest: %s" %
                               self.file)

        proj_schema = ManifestEntry.project_schema(self.spec.partition_type(),
                                                   columns)

        if self._entries is None and file_format is FileFormat.AVRO:
            # Collect into a local list and publish it only after the whole
            # file has been read: a failure part-way through must not leave
            # a truncated list cached as if it were the complete manifest.
            loaded = []
            try:
                for read_entry in AvroToIceberg.read_avro_row(
                        proj_schema, self._avro_reader):
                    entry = ManifestEntry(
                        schema=proj_schema,
                        partition_type=self.spec.partition_type())
                    for i, key in enumerate(read_entry.keys()):
                        entry.put(i, read_entry[key])
                    loaded.append(entry)
            finally:
                # Release the file handle whether or not the read succeeded.
                self._fo.close()
                self._avro_reader = None
            self._entries = loaded

        return self._entries
Beispiel #3
0
def test_parquet():
    """PARQUET is splittable and maps to and from the ``.parquet`` suffix."""
    base_name = "test_file"
    parquet_name = "test_file.parquet"
    parquet = FileFormat.PARQUET

    assert parquet.is_splittable()
    # Name -> format lookup.
    assert FileFormat.from_file_name(parquet_name) == FileFormat.PARQUET
    # Format -> name: the extension is appended to the bare name.
    assert parquet_name == FileFormat.PARQUET.add_extension(base_name)
Beispiel #4
0
def test_orc():
    """ORC is splittable and maps to and from the ``.orc`` suffix."""
    base_name = "test_file"
    orc_name = "test_file.orc"
    orc = FileFormat.ORC

    assert orc.is_splittable()
    # Name -> format lookup.
    assert FileFormat.from_file_name(orc_name) == FileFormat.ORC
    # Format -> name: the extension is appended to the bare name.
    assert orc_name == FileFormat.ORC.add_extension(base_name)
Beispiel #5
0
def test_avro():
    """AVRO is splittable and maps to and from the ``.avro`` suffix."""
    base_name = "test_file"
    avro_name = "test_file.avro"
    avro = FileFormat.AVRO

    assert avro.is_splittable()
    # Name -> format lookup.
    assert FileFormat.from_file_name(avro_name) == FileFormat.AVRO
    # Format -> name: the extension is appended to the bare name.
    assert avro_name == FileFormat.AVRO.add_extension(base_name)
Beispiel #6
0
    def entries(self, columns=None):
        """Read all entries of this manifest file, projected to ``columns``.

        The manifest is opened through ``self.file.new_fo()`` and read with
        ``fastavro`` on every call; nothing is cached.

        Args:
            columns: columns to project; defaults to
                ``ManifestReader.ALL_COLUMNS`` when ``None``.

        Returns:
            A new list of ManifestEntry objects, one per row read.

        Raises:
            RuntimeError: if the format cannot be determined from the
                manifest's location, or if it is not AVRO.
        """
        if columns is None:
            columns = ManifestReader.ALL_COLUMNS

        file_format = FileFormat.from_file_name(self.file.location())
        if file_format is None:
            # Use %-formatting rather than ``+``: self.file is an object
            # (it exposes location()/new_fo()), and concatenating it to a
            # str would raise TypeError instead of this RuntimeError.
            raise RuntimeError("Unable to determine format of manifest: %s" %
                               (self.file,))

        proj_schema = ManifestEntry.project_schema(self.spec.partition_type(),
                                                   columns)
        read_entries = list()
        if file_format == FileFormat.AVRO:
            with self.file.new_fo() as fo:
                avro_reader = fastavro.reader(fo)

                for read_entry in AvroToIceberg.read_avro_row(
                        proj_schema, avro_reader):
                    entry = ManifestEntry(
                        schema=proj_schema,
                        partition_type=self.spec.partition_type())
                    # Copy the row's values into the entry by position.
                    for i, key in enumerate(read_entry.keys()):
                        entry.put(i, read_entry[key])
                    read_entries.append(entry)
        else:
            # Same reasoning as above: format the value, do not concatenate.
            raise RuntimeError("Invalid format for manifest file: %s" %
                               (file_format,))

        return read_entries
 def from_stat(stat, row_count, partition_data=None, metrics=None):
     """Create a GenericDataFile from a file status object.

     The path, length and block size are read from ``stat``; the file
     format is looked up from the path via ``FileFormat.from_file_name``.

     Args:
         stat: object exposing ``path``, ``length`` and ``block_size``.
         row_count: forwarded as the ``row_count`` keyword.
         partition_data: forwarded as the ``partition`` keyword.
         metrics: forwarded as the ``metrics`` keyword.
     """
     path = stat.path
     file_format = FileFormat.from_file_name(path)
     return GenericDataFile(
         path,
         file_format,
         stat.length,
         stat.block_size,
         row_count=row_count,
         partition=partition_data,
         metrics=metrics,
     )