Example 1
def test_read_files(in_memory, dataset, arrow_file):
    filename = arrow_file
    reader = ArrowReader("", None)
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset_kwargs = reader.read_files([{"filename": filename}], in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert dataset_kwargs.keys() == set(["arrow_table", "data_files", "info", "split"])
    table = dataset_kwargs["arrow_table"]
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
Example 2
def test_read_files(in_memory, dataset, arrow_file):
    filename = arrow_file
    reader = ArrowReader("", None)
    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset_kwargs = reader.read_files([{"filename": filename}], in_memory=in_memory)
    assert dataset_kwargs.keys() == set(["arrow_table", "info", "split"])
    table = dataset_kwargs["arrow_table"]
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
Example 3
def test_read_table(in_memory, dataset, arrow_file):
    filename = arrow_file
    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        table = ArrowReader.read_table(filename, in_memory=in_memory)
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
Example 4
def test_read_table(in_memory, dataset, arrow_file):
    filename = arrow_file
    previous_allocated_memory = pa.total_allocated_bytes()
    table = ArrowReader.read_table(filename, in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
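
Examples 1 and 4 track pa.total_allocated_bytes() by hand, while Examples 2 and 3 wrap the same check in the assert_arrow_memory_increases / assert_arrow_memory_doesnt_increase context managers. A minimal sketch of how such helpers could be written on top of the manual pattern; the actual helpers ship with the datasets test suite and may differ:

from contextlib import contextmanager

import pyarrow as pa


@contextmanager
def assert_arrow_memory_increases():
    # Snapshot Arrow's allocator before the block and require that it grew afterwards.
    before = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - before > 0


@contextmanager
def assert_arrow_memory_doesnt_increase():
    # Snapshot Arrow's allocator before the block and require that it did not grow.
    before = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - before <= 0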
Example 5
from pyarrow import fs

local = fs.LocalFileSystem()
# info = local.get_file_info(fs.FileSelector("weibo"))
# print(info)
# info1 = local.get_file_info(fs.FileSelector("weibo/", recursive=True))
# print(info1)

from datasets.arrow_reader import ArrowReader
from datasets import DatasetInfo

arrow_dir = "weibo/"
info_dir = "weibo/"
# Load the dataset_info.json file from the directory into a DatasetInfo object
info = DatasetInfo.from_directory(info_dir)
# Initialize an ArrowReader
myreader = ArrowReader(path=arrow_dir, info=info)
# Read the weibo_ner_corpus-train.arrow file from the directory, skipping `skip` rows
# and taking `take` rows; here we read the first 100 records
files = [{'filename': 'weibo_ner_corpus-train.arrow', 'skip': 0, 'take': 100}]
# For now, instructions is only used as a label
instructions = "train"
# Read the files and build an Arrow table, similar to a pandas DataFrame
dataset_kwargs = myreader.read_files(files=files,
                                     original_instructions=instructions)
# Print the contents of the column at index 2
print(dataset_kwargs['arrow_table'].column(2))
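
The dict returned by read_files is shaped to line up with the Dataset constructor (arrow_table, info, split; older versions also include data_files, as Example 1 shows). A minimal sketch continuing the snippet above, assuming the weibo/ files exist:

from datasets import Dataset

# Wrap the in-memory Arrow table in a Dataset; the info and split entries from
# dataset_kwargs could also be passed to the constructor.
ds = Dataset(dataset_kwargs['arrow_table'])
print(ds)        # row count and column names
print(ds[:5])    # first five records as plain Python lists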

#read_files
"""
read_files(files=files, original_instructions=instructions)
files = {list: 1} [{'filename': 'msra_ner-train.arrow', 'skip': 0, 'take': 45001}]
instructions = {NamedSplit} train