def test_nlp_file_writer():
    """test nlp file writer using shard api"""
    # Record layout: scalar id/label/rating plus three numeric-array fields.
    schema_json = {
        "id": {"type": "string"},
        "label": {"type": "number"},
        "rating": {"type": "number"},
        "input_ids": {"type": "array", "items": {"type": "number"}},
        "input_mask": {"type": "array", "items": {"type": "number"}},
        "segment_ids": {"type": "array", "items": {"type": "number"}},
    }
    data = list(get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                             "../data/mindrecord/testAclImdbData/vocab.txt",
                             10))

    header = ShardHeader()
    nlp_schema = header.build_schema(schema_json, ["segment_ids"], "nlp_schema")
    schema_id = header.add_schema(nlp_schema)
    assert schema_id == 0, 'failed on adding schema'

    assert header.add_index_fields(["id", "rating"]) == SUCCESS, \
        'failed on adding index fields.'

    writer = ShardWriter()
    file_paths = ["{}{}".format(NLP_FILE_NAME, shard) for shard in range(FILES_NUM)]
    assert writer.open(file_paths) == SUCCESS, 'failed on opening files.'

    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    assert writer.set_shard_header(header) == SUCCESS, 'failed on setting header.'
    assert writer.write_raw_nlp_data({schema_id: data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Build the secondary index database for the first shard file.
    generator = ShardIndexGenerator(os.path.realpath(file_paths[0]))
    generator.build()
    generator.write_to_db()
def test_mkv_file_writer():
    """test mkv file writer using shard api"""
    data = get_mkv_data("../data/mindrecord/testVehPerData/")
    schema_json = {
        "file_name": {"type": "string"},
        "id": {"type": "number"},
        "prelabel": {"type": "string"},
    }

    header = ShardHeader()
    img_schema = header.build_schema(schema_json, ["data"], "img_schema")
    schema_id = header.add_schema(img_schema)
    assert schema_id == 0, 'failed on building schema.'

    assert header.add_index_fields(["id", "file_name"]) == SUCCESS, \
        'failed on adding index fields.'

    writer = ShardWriter()
    file_paths = ["{}{}".format(MKV_FILE_NAME, shard) for shard in range(FILES_NUM)]
    assert writer.open(file_paths) == SUCCESS, 'failed on opening files.'

    writer.set_header_size(1 << 24)
    writer.set_page_size(1 << 25)
    assert writer.set_shard_header(header) == SUCCESS, 'failed on setting header.'
    assert writer.write_raw_cv_data({schema_id: data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Build the secondary index database for the first shard file.
    generator = ShardIndexGenerator(os.path.realpath(file_paths[0]))
    generator.build()
    generator.write_to_db()
def test_file_writer_fail_add_index():
    """test file writer, read when failed on adding index."""
    data_raw = get_data("../data/mindrecord/testImageNetData/")
    schema_json = {
        "file_name": {"type": "string"},
        "label": {"type": "number"},
    }

    header = ShardHeader()
    schema = header.build_schema(schema_json, ["data"], "img")  # create schema
    schema_id = header.add_schema(schema)  # add schema

    # Omitting the field list must raise before a status is returned.
    with pytest.raises(TypeError, match="missing 1 "):
        ret = header.add_index_fields()
        assert ret == FAILED

    # An empty field list is rejected as well.
    with pytest.raises(MRMAddIndexError):
        ret = header.add_index_fields([])
        assert ret == FAILED

    file_name = os.path.join(os.getcwd(), "test_001.mindrecord")  # set output filename

    writer = ShardWriter()  # test_file_writer
    assert writer.open([file_name]) == SUCCESS, 'failed on opening files.'
    assert writer.set_shard_header(header) == SUCCESS, 'failed on setting header.'
    assert writer.write_raw_cv_data({schema_id: data_raw}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, "commit failed"

    # ShardIndexGenerator
    generator = ShardIndexGenerator(os.path.realpath(file_name))
    generator.build()
    generator.write_to_db()

    # Read everything back, reattaching the blob field to each raw record.
    reader = ShardReader()
    reader.open(file_name)
    reader.launch()
    _, blob_fields = reader.get_blob_fields()
    index = 0
    batch = reader.get_next()
    while batch:
        for blob, raw in batch:
            raw[blob_fields[0]] = bytes(blob)
            logger.info("#item{}: {}".format(index, raw))
            index += 1
        batch = reader.get_next()
    reader.finish()
    reader.close()

    os.remove("{}".format(file_name))
    os.remove("{}.db".format(file_name))
def test_mkv_file_writer_with_exactly_schema():
    """test mkv file writer using shard api"""
    header = ShardHeader()
    # Annotation fields are parallel arrays; metadata fields are scalars.
    img_schema_json = {
        "annotation_name": {"type": "array", "items": {"type": "string"}},
        "annotation_pose": {"type": "array", "items": {"type": "string"}},
        "annotation_truncated": {"type": "array", "items": {"type": "string"}},
        "annotation_difficult": {"type": "array", "items": {"type": "string"}},
        "annotation_xmin": {"type": "array", "items": {"type": "number"}},
        "annotation_ymin": {"type": "array", "items": {"type": "number"}},
        "annotation_xmax": {"type": "array", "items": {"type": "number"}},
        "annotation_ymax": {"type": "array", "items": {"type": "number"}},
        "metadata_width": {"type": "number"},
        "metadata_height": {"type": "number"},
        "metadata_depth": {"type": "number"},
        "img_path": {"type": "string"},
        "annotation_path": {"type": "string"},
    }
    img_schema = header.build_schema(img_schema_json, ["data"], "image_schema")
    schema_id = header.add_schema(img_schema)
    assert schema_id == 0, 'failed on building schema.'

    writer = ShardWriter()
    file_paths = ["{}{}".format(MKV_FILE_NAME, shard) for shard in range(1)]
    assert writer.open(file_paths) == SUCCESS, 'failed on opening files.'

    writer.set_header_size(1 << 24)
    writer.set_page_size(1 << 25)

    image_bytes = bytes("it's a image picutre", encoding="utf8")
    # Two identical records; fresh dicts/lists are built on each iteration so
    # the writer never receives shared mutable objects.
    data = []
    for _ in range(2):
        data.append({
            "annotation_name": ["xxxxxxxxxx.jpg"],
            "annotation_pose": ["hahahahah"],
            "annotation_truncated": ["1"],
            "annotation_difficult": ["0"],
            "annotation_xmin": [100],
            "annotation_ymin": [200],
            "annotation_xmax": [300],
            "annotation_ymax": [400],
            "metadata_width": 333,
            "metadata_height": 222,
            "metadata_depth": 3,
            "img_path": "/tmp/",
            "annotation_path": "/tmp/annotation",
            "data": image_bytes,
        })

    assert writer.set_shard_header(header) == SUCCESS, 'failed on setting header.'
    assert writer.write_raw_cv_data({schema_id: data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Build the secondary index database for the shard file.
    generator = ShardIndexGenerator(os.path.realpath(file_paths[0]))
    generator.build()
    generator.write_to_db()