def create_timestamps(self) -> pa.Table:
    """
    :return: converts the audio metadata into a data table
    """
    # Three parallel columns; the first two both hold the timestamps
    # (matching AUDIO_DF_COLUMNS), the third holds microphone samples.
    columns = ([], [], [])

    # Real data: evenly spaced timestamps per metadata entry plus its samples.
    for entry in self.metadata:
        even_ts = calc_evenly_sampled_timestamps(
            entry[0], entry[1].num_rows, self.sample_interval_micros)
        columns[0].extend(even_ts)
        columns[1].extend(even_ts)
        columns[2].extend(entry[1]["microphone"].to_numpy())

    # Gaps: synthesize the missing timestamps and fill samples with NaN.
    for gap_start, gap_end in self.gaps:
        n_missing = int((gap_end - gap_start) / self.sample_interval_micros) - 1
        gap_ts = calc_evenly_sampled_timestamps(
            gap_start + self.sample_interval_micros, n_missing,
            self.sample_interval_micros)
        nan_fill = np.full(len(gap_ts), np.nan)
        columns[0].extend(gap_ts)
        columns[1].extend(gap_ts)
        columns[2].extend(nan_fill)

    # Assemble and return the table sorted by timestamp so gap rows land
    # in their chronological positions.
    assembled = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS, list(columns))))
    order = pc.sort_indices(assembled, sort_keys=[("timestamps", "ascending")])
    return pc.take(assembled, order)
# Example #2
def test_call_function_with_memory_pool():
    """Taking via Array.take, pc.call_function, and pc.take agree."""
    data = pa.array(["foo", "bar", "baz"])
    idx = np.array([2, 2, 1])
    want = pa.array(["baz", "baz", "bar"])
    pool = pa.default_memory_pool()

    # Direct method call on the array.
    assert data.take(idx).equals(want)

    # Same kernel invoked by name with an explicit memory pool.
    via_name = pc.call_function('take', [data, idx], memory_pool=pool)
    assert via_name.equals(want)

    # Convenience wrapper, also with an explicit memory pool.
    assert pc.take(data, idx, memory_pool=pool).equals(want)
# keep only the columns the viewer consumes
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic).
# utf8_slice_codeunits keeps [0, 101) for any title length; the previous
# utf8_replace_slice(start=101, stop=1000) left characters past index 1000
# intact, so very long titles were not actually truncated.
truncated_title = pc.utf8_slice_codeunits(table.column("title"), start=0, stop=101)
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting, the accurate row index is simply 0..n-1; re-running
# sort_indices on the already-sorted table (stable sort => identity
# permutation) produced the same values at O(n log n) cost.
row_index = pa.array(np.arange(table.num_rows, dtype=np.uint32))
table = table.set_column(table.schema.get_field_index("ix"), "ix", row_index)

# the intermediate temp file is no longer needed once the table is built
temp_path.unlink()

local = fs.LocalFileSystem()

# stream the table to the target file in 10000-row record batches
with local.open_output_stream(str(target_path)) as out, \
        pa.RecordBatchStreamWriter(out, table.schema) as writer:
    writer.write_table(table, 10000)