def test_nlp_file_writer():
    """Write NLP raw data through the shard API and generate its index.

    Builds an nlp schema with ``segment_ids`` as the blob field, indexes
    ``id`` and ``rating``, writes the data across FILES_NUM shard files,
    then runs the index generator over the first shard.
    """
    nlp_schema = {
        "id": {"type": "string"},
        "label": {"type": "number"},
        "rating": {"type": "number"},
        "input_ids": {"type": "array", "items": {"type": "number"}},
        "input_mask": {"type": "array", "items": {"type": "number"}},
        "segment_ids": {"type": "array", "items": {"type": "number"}}
    }
    raw_data = list(
        get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                     "../data/mindrecord/testAclImdbData/vocab.txt", 10))

    # Build the header: schema plus index fields.
    header = ShardHeader()
    nlp_shard_schema = header.build_schema(nlp_schema, ["segment_ids"],
                                           "nlp_schema")
    schema_id = header.add_schema(nlp_shard_schema)
    assert schema_id == 0, 'failed on adding schema'
    assert header.add_index_fields(["id", "rating"]) == SUCCESS, \
        'failed on adding index fields.'

    # Open the shard files and write the raw data.
    writer = ShardWriter()
    shard_paths = ["{}{}".format(NLP_FILE_NAME, i) for i in range(FILES_NUM)]
    assert writer.open(shard_paths) == SUCCESS, 'failed on opening files.'
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    assert writer.set_shard_header(header) == SUCCESS, \
        'failed on setting header.'
    assert writer.write_raw_nlp_data({schema_id: raw_data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Generate the sqlite index database for the written shards.
    generator = ShardIndexGenerator(os.path.realpath(shard_paths[0]))
    generator.build()
    generator.write_to_db()
def test_mkv_file_writer():
    """Write mkv raw data through the shard API and generate its index.

    Uses ``data`` as the blob field, indexes ``id`` and ``file_name``,
    writes across FILES_NUM shard files, then builds the index database.
    """
    raw_data = get_mkv_data("../data/mindrecord/testVehPerData/")
    mkv_schema = {
        "file_name": {"type": "string"},
        "id": {"type": "number"},
        "prelabel": {"type": "string"}
    }

    # Build the header: schema plus index fields.
    header = ShardHeader()
    img_schema = header.build_schema(mkv_schema, ["data"], "img_schema")
    schema_id = header.add_schema(img_schema)
    assert schema_id == 0, 'failed on building schema.'
    assert header.add_index_fields(["id", "file_name"]) == SUCCESS, \
        'failed on adding index fields.'

    # Open the shard files and write the raw data.
    writer = ShardWriter()
    shard_paths = ["{}{}".format(MKV_FILE_NAME, i) for i in range(FILES_NUM)]
    assert writer.open(shard_paths) == SUCCESS, 'failed on opening files.'
    writer.set_header_size(1 << 24)
    writer.set_page_size(1 << 25)
    assert writer.set_shard_header(header) == SUCCESS, \
        'failed on setting header.'
    assert writer.write_raw_cv_data({schema_id: raw_data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Generate the sqlite index database for the written shards.
    generator = ShardIndexGenerator(os.path.realpath(shard_paths[0]))
    generator.build()
    generator.write_to_db()
# Example #3
# 0
def test_file_writer_fail_add_index():
    """test file writer, read when failed on adding index.

    Verifies that add_index_fields rejects a missing argument (TypeError)
    and an empty field list (MRMAddIndexError), then confirms the file can
    still be written, indexed, and read back without index fields.
    """
    data_raw = get_data("../data/mindrecord/testImageNetData/")
    schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "number"
        }
    }
    header = ShardHeader()
    schema = header.build_schema(schema_json, ["data"], "img")  # create schema
    schema_id = header.add_schema(schema)  # add schema
    # Calling add_index_fields with no argument must raise TypeError
    # ("missing 1 required positional argument").
    with pytest.raises(TypeError, match="missing 1 "):
        ret = header.add_index_fields()
        assert ret == FAILED

    # An empty index-field list is rejected with MRMAddIndexError.
    with pytest.raises(MRMAddIndexError):
        index_fields = []
        ret = header.add_index_fields(index_fields)
        assert ret == FAILED

    file_name = os.path.join(os.getcwd(),
                             "test_001.mindrecord")  # set output filename
    writer = ShardWriter()  # test_file_writer
    ret = writer.open([file_name])
    assert ret == SUCCESS, 'failed on opening files.'
    ret = writer.set_shard_header(header)  # write header
    assert ret == SUCCESS, 'failed on setting header.'
    ret = writer.write_raw_cv_data({schema_id: data_raw})
    assert ret == SUCCESS, 'failed on writing raw data.'
    ret = writer.commit()  # commit data
    assert ret == SUCCESS, "commit failed"
    # ShardIndexGenerator
    generator = ShardIndexGenerator(os.path.realpath(file_name))
    generator.build()
    generator.write_to_db()

    # Read back every record; the first blob field is reattached to each
    # raw row before logging it.
    reader = ShardReader()
    ret = reader.open(file_name)
    reader.launch()
    index = 0
    _, blob_fields = reader.get_blob_fields()
    iterator = reader.get_next()
    # NOTE(review): iterator is re-fetched inside the inner for loop, so
    # get_next() is called once per consumed item — presumably intentional
    # for this reader API; do not "simplify" without checking ShardReader.
    while iterator:
        for blob, raw in iterator:
            raw[blob_fields[0]] = bytes(blob)
            logger.info("#item{}: {}".format(index, raw))
            index += 1
            iterator = reader.get_next()
    reader.finish()
    reader.close()

    # Clean up the mindrecord file and its index database.
    os.remove("{}".format(file_name))
    os.remove("{}.db".format(file_name))
def test_mkv_file_writer_with_exactly_schema():
    """Write records that exactly match the declared schema via the shard API.

    Declares a full annotation/metadata schema with ``data`` as the blob
    field, writes two identical records into a single shard file, and
    builds the index database.
    """
    header = ShardHeader()
    string_array = {"type": "array", "items": {"type": "string"}}
    number_array = {"type": "array", "items": {"type": "number"}}
    img_schema_json = {
        "annotation_name": dict(string_array),
        "annotation_pose": dict(string_array),
        "annotation_truncated": dict(string_array),
        "annotation_difficult": dict(string_array),
        "annotation_xmin": dict(number_array),
        "annotation_ymin": dict(number_array),
        "annotation_xmax": dict(number_array),
        "annotation_ymax": dict(number_array),
        "metadata_width": {"type": "number"},
        "metadata_height": {"type": "number"},
        "metadata_depth": {"type": "number"},
        "img_path": {"type": "string"},
        "annotation_path": {"type": "string"}
    }
    img_schema = header.build_schema(img_schema_json, ["data"], "image_schema")
    schema_id = header.add_schema(img_schema)
    assert schema_id == 0, 'failed on building schema.'

    writer = ShardWriter()
    shard_paths = ["{}{}".format(MKV_FILE_NAME, i) for i in range(1)]
    assert writer.open(shard_paths) == SUCCESS, 'failed on opening files.'
    writer.set_header_size(1 << 24)
    writer.set_page_size(1 << 25)

    image_bytes = bytes("it's a image picutre", encoding="utf8")

    def make_record():
        # Fresh dict (and fresh inner lists) per call so the two rows
        # share no mutable state.
        return {
            "annotation_name": ["xxxxxxxxxx.jpg"],
            "annotation_pose": ["hahahahah"],
            "annotation_truncated": ["1"],
            "annotation_difficult": ["0"],
            "annotation_xmin": [100],
            "annotation_ymin": [200],
            "annotation_xmax": [300],
            "annotation_ymax": [400],
            "metadata_width": 333,
            "metadata_height": 222,
            "metadata_depth": 3,
            "img_path": "/tmp/",
            "annotation_path": "/tmp/annotation",
            "data": image_bytes
        }

    data = [make_record() for _ in range(2)]

    assert writer.set_shard_header(header) == SUCCESS, \
        'failed on setting header.'
    assert writer.write_raw_cv_data({schema_id: data}) == SUCCESS, \
        'failed on writing raw data.'
    assert writer.commit() == SUCCESS, 'failed on committing.'

    # Generate the sqlite index database for the written shard.
    generator = ShardIndexGenerator(os.path.realpath(shard_paths[0]))
    generator.build()
    generator.write_to_db()