Example #1
def test_write_two_images_mindrecord():
    """test two images to mindrecord"""
    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_two_bytes_data(MAP_FILE_NAME)
    cv_schema_json = {
        "img_data": {
            "type": "bytes"
        },
        "label_data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "two_images_schema")
    writer.write_raw_data(data)
    writer.commit()
    assert os.path.exists(CV_FILE_NAME)
    assert os.path.exists(CV_FILE_NAME + ".db")
    read(CV_FILE_NAME, 2)

    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
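The read helper called above is defined elsewhere in the test module; a plausible minimal sketch, assuming it simply re-opens the file with FileReader and checks that every sample carries the expected number of fields (the helper's name and arguments come from the call read(CV_FILE_NAME, 2); its body is an assumption):

def read(file_name, num_fields):
    # Hypothetical reconstruction of the helper, not the original code.
    reader = FileReader(file_name)
    count = 0
    for index, sample in enumerate(reader.get_next()):
        assert len(sample) == num_fields
        count += 1
        logger.info("#item{}: {}".format(index, sample))
    assert count > 0
    reader.close()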
Example #2
def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="ssd.mindrecord", file_num=8):
    """Create MindRecord file."""
    mindrecord_dir = config.mindrecord_dir
    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    if dataset == "coco":
        images, image_path_dict, image_anno_dict = create_coco_label(is_training)
    else:
        images, image_path_dict, image_anno_dict = filter_valid_data(config.image_dir, config.anno_path)

    ssd_json = {
        "img_id": {"type": "int32", "shape": [1]},
        "image": {"type": "bytes"},
        "annotation": {"type": "int32", "shape": [-1, 5]},
    }
    writer.add_schema(ssd_json, "ssd_json")

    for img_id in images:
        image_path = image_path_dict[img_id]
        with open(image_path, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[img_id], dtype=np.int32)
        img_id = np.array([img_id], dtype=np.int32)
        row = {"img_id": img_id, "image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
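To consume the sharded output, the usual counterpart is mindspore.dataset.MindDataset; a hedged sketch (keyword names follow MindSpore 1.x and may differ in other releases; passing one shard of the set is assumed to pick up its siblings):

import mindspore.dataset as ds

# Sketch, not part of the original example: iterate the freshly written
# MindRecord set as dictionaries of numpy values.
data_set = ds.MindDataset(mindrecord_path + "0",
                          columns_list=["img_id", "image", "annotation"])
for item in data_set.create_dict_iterator(output_numpy=True):
    print(item["img_id"], len(item["image"]))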
Example #3
def data_to_mindrecord_byte_image(image_dir, anno_path, mindrecord_dir, prefix,
                                  file_num):
    """Create MindRecord file by image_dir and anno_path."""
    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    image_files, image_anno_dict = filter_valid_data(image_dir, anno_path)

    yolo_json = {
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "int64",
            "shape": [-1, 5]
        },
    }
    writer.add_schema(yolo_json, "yolo_json")

    for image_name in image_files:
        image_path = os.path.join(image_dir, image_name)
        with open(image_path, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[image_name])
        row = {"image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
Example #4
def test_cv_file_writer_no_raw():
    """test cv file writer without raw data."""
    writer = FileWriter(NLP_FILE_NAME)
    data = list(
        get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                     "../data/mindrecord/testAclImdbData/vocab.txt", 10))
    nlp_schema_json = {
        "input_ids": {
            "type": "int64",
            "shape": [1, -1]
        },
        "input_mask": {
            "type": "int64",
            "shape": [1, -1]
        },
        "segment_ids": {
            "type": "int64",
            "shape": [1, -1]
        }
    }
    writer.add_schema(nlp_schema_json, "no_raw_schema")
    writer.write_raw_data(data)
    writer.commit()
    reader = FileReader(NLP_FILE_NAME)
    count = 0
    for index, x in enumerate(reader.get_next()):
        count += 1
        assert len(x) == 3
        logger.info("#item{}: {}".format(index, x))
    assert count == 10
    reader.close()
    os.remove(NLP_FILE_NAME)
    os.remove("{}.db".format(NLP_FILE_NAME))
Example #5
def create_diff_page_size_cv_mindrecord(files_num):
    """tutorial for cv dataset writer."""
    if os.path.exists(CV1_FILE_NAME):
        os.remove(CV1_FILE_NAME)
    if os.path.exists("{}.db".format(CV1_FILE_NAME)):
        os.remove("{}.db".format(CV1_FILE_NAME))
    writer = FileWriter(CV1_FILE_NAME, files_num)
    writer.set_page_size(1 << 26)  # 64MB
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int32"
        },
        "data": {
            "type": "bytes"
        }
    }
    data = [{
        "file_name": "001.jpg",
        "label": 43,
        "data": bytes('0xffsafdafda', encoding='utf-8')
    }]
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()
Example #6
def test_cv_file_writer_shard_num_10():
    """test file writer when shard num equals 10."""
    writer = FileWriter(CV_FILE_NAME, 10)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int64"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    paths = [
        "{}{}".format(CV_FILE_NAME,
                      str(x).rjust(1, '0')) for x in range(10)
    ]
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))
Example #7
def test_cv_file_writer_without_data():
    """test cv file writer without data."""
    writer = FileWriter(CV_FILE_NAME, 1)
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int64"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.commit()
    reader = FileReader(CV_FILE_NAME)
    count = 0
    for index, x in enumerate(reader.get_next()):
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 0
    reader.close()
    os.remove(CV_FILE_NAME)
    os.remove("{}.db".format(CV_FILE_NAME))
Example #8
def test_file_writer_raw_data_038():
    """test write raw data without verify."""
    shard_num = 11
    writer = FileWriter("test_file_writer_raw_data_", shard_num)
    data_raw = get_data("../data/mindrecord/testImageNetData/")
    schema_json = {"file_name": {"type": "string"}, "label": {"type": "number"},
                   "data": {"type": "bytes"}}
    writer.add_schema(schema_json, "img_schema")
    writer.add_index(["file_name"])
    for _ in range(shard_num):
        writer.write_raw_data(data_raw, False)
    writer.commit()

    file_name = ""
    if shard_num > 1:
        file_name = '99' if shard_num > 99 else str(shard_num - 1)
    reader = FileReader("test_file_writer_raw_data_" + file_name)
    i = 0
    for _, _ in enumerate(reader.get_next()):
        i = i + 1
    assert i == shard_num * 10
    reader.close()
    if shard_num == 1:
        os.remove("test_file_writer_raw_data_")
        os.remove("test_file_writer_raw_data_.db")
        return
    for x in range(shard_num):
        n = str(x)
        if shard_num > 10:
            n = '0' + str(x) if x < 10 else str(x)
        if os.path.exists("test_file_writer_raw_data_{}".format(n)):
            os.remove("test_file_writer_raw_data_{}".format(n))
        if os.path.exists("test_file_writer_raw_data_{}.db".format(n)):
            os.remove("test_file_writer_raw_data_{}.db".format(n))
Example #9
    def file_based_convert_examples_to_features(self, input_file, output_file):
        """"Convert a set of `InputExample`s to a MindDataset file."""
        examples = self._read_tsv(input_file)

        writer = FileWriter(file_name=output_file, shard_num=1)
        nlp_schema = {
            "input_ids": {"type": "int64", "shape": [-1]},
            "input_mask": {"type": "int64", "shape": [-1]},
            "segment_ids": {"type": "int64", "shape": [-1]},
            "label_ids": {"type": "int64", "shape": [-1]},
        }
        writer.add_schema(nlp_schema, "preprocessed classification dataset")
        data = []
        for index, example in enumerate(examples):
            if index % 10000 == 0:
                logging.info("Writing example %d of %d" % (index, len(examples)))
            record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer)
            sample = {
                "input_ids": np.array(record.input_ids, dtype=np.int64),
                "input_mask": np.array(record.input_mask, dtype=np.int64),
                "segment_ids": np.array(record.segment_ids, dtype=np.int64),
                "label_ids": np.array([record.label_id], dtype=np.int64),
            }
            data.append(sample)
        writer.write_raw_data(data)
        writer.commit()
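A hedged sketch of feeding the converted file into a training pipeline, assuming MindSpore's dataset API and the output_file written above (buffer size and batch size are illustrative):

import mindspore.dataset as ds

# Sketch: select the four written columns, then shuffle and batch.
data_set = ds.MindDataset(output_file,
                          columns_list=["input_ids", "input_mask",
                                        "segment_ids", "label_ids"])
data_set = data_set.shuffle(buffer_size=1000).batch(32, drop_remainder=True)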
Example #10
def write_to_mindrecord(data, path, shard_num=1):
    """generate mindrecord"""
    if not os.path.isabs(path):
        path = os.path.abspath(path)

    writer = FileWriter(path, shard_num)
    data_schema = {
        "src_tokens": {
            "type": "int32",
            "shape": [-1]
        },
        "src_tokens_length": {
            "type": "int32",
            "shape": [-1]
        },
        "label_idx": {
            "type": "int32",
            "shape": [-1]
        }
    }
    writer.add_schema(data_schema, "fasttext")
    for item in data:
        item['src_tokens'] = np.array(item['src_tokens'], dtype=np.int32)
        item['src_tokens_length'] = np.array(item['src_tokens_length'],
                                             dtype=np.int32)
        item['label_idx'] = np.array(item['label_idx'], dtype=np.int32)
        writer.write_raw_data([item])
    writer.commit()
Example #11
def test_cv_minddataset_writer_tutorial():
    """tutorial for cv dataset writer."""
    paths = [
        "{}{}".format(CV_FILE_NAME,
                      str(x).rjust(1, '0')) for x in range(FILES_NUM)
    ]
    for x in paths:
        if os.path.exists(x):
            os.remove(x)
        if os.path.exists("{}.db".format(x)):
            os.remove("{}.db".format(x))
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_data(CV_DIR_NAME)
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int32"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))
Example #12
def test_cv_file_writer_loop_and_read():
    """tutorial for cv dataset loop writer."""
    writer = FileWriter(CV2_FILE_NAME, FILES_NUM)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {"file_name": {"type": "string"},
                      "label": {"type": "int64"}, "data": {"type": "bytes"}}
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    for row in data:
        writer.write_raw_data([row])
    writer.commit()

    reader = FileReader(CV2_FILE_NAME + "0")
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 10
    reader.close()

    paths = ["{}{}".format(CV2_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))
Example #13
def create_text_mindrecord():
    # method to create mindrecord with string data, used to generate testTextMindRecord/test.mindrecord
    from mindspore.mindrecord import FileWriter

    mindrecord_file_name = "test.mindrecord"
    data = [
        {
            "english": "This is a text file.",
            "chinese": "今天天气太好了我们一起去外面玩吧"
        },
        {
            "english": "Be happy every day.",
            "chinese": "男默女泪"
        },
        {
            "english": "Good luck to everyone.",
            "chinese": "江州市长江大桥参加了长江大桥的通车仪式"
        },
    ]
    writer = FileWriter(mindrecord_file_name)
    schema = {
        "english": {
            "type": "string"
        },
        "chinese": {
            "type": "string"
        },
    }
    writer.add_schema(schema)
    writer.write_raw_data(data)
    writer.commit()
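A small sketch to read the strings (including the Chinese samples) back, assuming the test.mindrecord written above is still on disk:

from mindspore.mindrecord import FileReader

# Sketch: iterate the written samples and print both string fields.
reader = FileReader("test.mindrecord")
for sample in reader.get_next():
    print(sample["english"], sample["chinese"])
reader.close()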
Example #14
def add_and_remove_nlp_file():
    """add/remove nlp file"""
    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for x in paths:
        if os.path.exists("{}".format(x)):
            os.remove("{}".format(x))
        if os.path.exists("{}.db".format(x)):
            os.remove("{}.db".format(x))
    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10))
    nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
                       "rating": {"type": "float32"},
                       "input_ids": {"type": "int64",
                                     "shape": [-1]},
                       "input_mask": {"type": "int64",
                                      "shape": [1, -1]},
                       "segment_ids": {"type": "int64",
                                       "shape": [2, -1]}
                       }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()
    yield "yield_nlp_data"
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))
Example #15
def write_mindrecord_tutorial():
    writer = FileWriter(MINDRECORD_FILE_NAME)
    data = get_data("./ImageNetDataSimulation")
    schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int64"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(MINDRECORD_FILE_NAME)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        count = count + 1
        # print("#item {}: {}".format(index, x))
    assert count == 20
    reader.close()
Example #16
def convert_to_mindrecord(features, labels, mindrecord_path):
    schema_json = {"id": {"type": "int32"},
                   "label": {"type": "int32"},
                   "feature": {"type": "int32", "shape": [-1]}}
    if not os.path.exists(mindrecord_path):
        os.makedirs(mindrecord_path)
    else:
        print(mindrecord_path, 'exists. Please make sure it is empty!')
    file_name = os.path.join(mindrecord_path, 'style.mindrecord')
    print('writing mindrecord into', file_name)
    def get_imdb_data(features, labels):
        data_list = []
        for i, (label, feature) in enumerate(zip(labels, features)):
            data_json = {"id": i,
                         "label": int(label),
                         "feature": feature.reshape(-1)}
            data_list.append(data_json)
        return data_list
    writer = FileWriter(file_name, shard_num=4)
    data = get_imdb_data(features, labels)
    writer.add_schema(schema_json, "style_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()
    print('done')
Example #17
def test_nlp_file_writer_tutorial():
    """tutorial for nlp file writer."""
    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(
        get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                     "../data/mindrecord/testAclImdbData/vocab.txt", 10))
    nlp_schema_json = {
        "id": {
            "type": "string"
        },
        "label": {
            "type": "int32"
        },
        "rating": {
            "type": "float32"
        },
        "input_ids": {
            "type": "int64",
            "shape": [1, -1]
        },
        "input_mask": {
            "type": "int64",
            "shape": [1, -1]
        },
        "segment_ids": {
            "type": "int64",
            "shape": [1, -1]
        }
    }
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()
Example #18
    def init_writer(mr_schema):
        """
        init writer
        """
        print("Init writer  ...")
        mr_writer = FileWriter(args.mindrecord_file,
                               args.mindrecord_partitions)

        # set the header size
        if args.mindrecord_header_size_by_bit != 24:
            header_size = 1 << args.mindrecord_header_size_by_bit
            mr_writer.set_header_size(header_size)

        # set the page size
        if args.mindrecord_page_size_by_bit != 25:
            page_size = 1 << args.mindrecord_page_size_by_bit
            mr_writer.set_page_size(page_size)

        # create the schema
        mr_writer.add_schema(mr_schema, "mindrecord_graph_schema")

        # open file and set header
        mr_writer.open_and_set_header()

        return mr_writer
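A hedged usage sketch for init_writer; graph_schema and graph_rows are hypothetical stand-ins for the schema dict and row iterable the surrounding script would supply:

# Sketch: init_writer has already called open_and_set_header(), so rows
# can be streamed in and committed directly.
mr_writer = init_writer(graph_schema)
for row in graph_rows:
    mr_writer.write_raw_data([row])
mr_writer.commit()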
Example #19
def test_cv_file_writer_absolute_path():
    """test cv file writer when file name is absolute path."""
    file_name = "/tmp/" + str(uuid.uuid4())
    writer = FileWriter(file_name, FILES_NUM)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int64"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    paths = [
        "{}{}".format(file_name,
                      str(x).rjust(1, '0')) for x in range(FILES_NUM)
    ]
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))
Example #20
def data_to_mindrecord_byte_image(is_training=True,
                                  prefix="deeptext.mindrecord",
                                  file_num=8):
    """Create MindRecord file."""
    mindrecord_dir = config.mindrecord_dir
    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    image_files, image_anno_dict = create_label(is_training)

    deeptext_json = {
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "int32",
            "shape": [-1, 6]
        },
    }
    writer.add_schema(deeptext_json, "deeptext_json")

    for image_name in image_files:
        with open(image_name, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[image_name], dtype=np.int32)
        row = {"image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
Example #21
def test_cv_file_writer_no_blob():
    """test cv file writer without blob data."""
    writer = FileWriter(CV_FILE_NAME, 1)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int64"
        }
    }
    writer.add_schema(cv_schema_json, "no_blob_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()
    reader = FileReader(CV_FILE_NAME)
    count = 0
    for index, x in enumerate(reader.get_next()):
        count += 1
        assert len(x) == 2
        logger.info("#item{}: {}".format(index, x))
    assert count == 10
    reader.close()
    os.remove(CV_FILE_NAME)
    os.remove("{}.db".format(CV_FILE_NAME))
Example #22
def voc_data_to_mindrecord(mindrecord_dir,
                           is_training,
                           prefix="ssd.mindrecord",
                           file_num=8):
    """Create MindRecord file by image_dir and anno_path."""
    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    images, image_path_dict, image_anno_dict = create_voc_label(is_training)

    ssd_json = {
        "img_id": {
            "type": "int32",
            "shape": [1]
        },
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "int32",
            "shape": [-1, 5]
        },
    }
    writer.add_schema(ssd_json, "ssd_json")

    for img_id in images:
        image_path = image_path_dict[img_id]
        with open(image_path, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[img_id], dtype=np.int32)
        img_id = np.array([img_id], dtype=np.int32)
        row = {"img_id": img_id, "image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
Example #23
def _convert_to_mindrecord(data_home, features, labels, weight_np=None, training=True):
    """
    convert imdb dataset to mindrecord dataset
    """
    if weight_np is not None:
        np.savetxt(os.path.join(data_home, 'weight.txt'), weight_np)

    # write mindrecord
    schema_json = {"id": {"type": "int32"},
                   "label": {"type": "int32"},
                   "feature": {"type": "int32", "shape": [-1]}}

    data_dir = os.path.join(data_home, "aclImdb_train.mindrecord")
    if not training:
        data_dir = os.path.join(data_home, "aclImdb_test.mindrecord")

    def get_imdb_data(features, labels):
        data_list = []
        for i, (label, feature) in enumerate(zip(labels, features)):
            data_json = {"id": i,
                         "label": int(label),
                         "feature": feature.reshape(-1)}
            data_list.append(data_json)
        return data_list

    writer = FileWriter(data_dir, shard_num=4)
    data = get_imdb_data(features, labels)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(data)
    writer.commit()
Example #24
def add_and_remove_cv_file():
    """add/remove cv file"""
    paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    try:
        for x in paths:
            if os.path.exists("{}".format(x)):
                os.remove("{}".format(x))
            if os.path.exists("{}.db".format(x)):
                os.remove("{}.db".format(x))
        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
        data = get_data(CV_DIR_NAME)
        cv_schema_json = {"id": {"type": "int32"},
                          "file_name": {"type": "string"},
                          "label": {"type": "int32"},
                          "data": {"type": "bytes"}}
        writer.add_schema(cv_schema_json, "img_schema")
        writer.add_index(["file_name", "label"])
        writer.write_raw_data(data)
        writer.commit()
        yield "yield_cv_data"
    except Exception as error:
        for x in paths:
            os.remove("{}".format(x))
            os.remove("{}.db".format(x))
        raise error
    else:
        for x in paths:
            os.remove("{}".format(x))
            os.remove("{}.db".format(x))
Example #25
    def _build_mindrecord(self, mindrecord_path):
        writer = FileWriter(file_name=mindrecord_path, shard_num=self.shard_num)
        writer.add_schema(seg_schema, "seg_schema")
        data = []
        cnt = 0
        print('number of samples:', self.num_images)
        for idx in range(len(self.images)):
            sample_ = {'file_name': os.path.basename(self.images[idx])}
            with open(self.images[idx], 'rb') as f:
                sample_['data'] = f.read()
            # re-encode the class-index mask as PNG bytes for the label field
            white_io = BytesIO()
            mask = Image.open(self.masks[idx])
            mask = Image.fromarray(self._class_to_index(np.array(mask)).astype('uint8'))
            mask.save(white_io, 'PNG')
            sample_['label'] = white_io.getvalue()
            data.append(sample_)
            cnt += 1
            # flush accumulated samples to the writer every 10 records
            if cnt % 10 == 0:
                writer.write_raw_data(data)
                data = []
        if data:
            writer.write_raw_data(data)
        writer.commit()
        print('number of samples written:', cnt)
Example #26
def convert_yolo_data_to_mindrecord():
    """Convert YOLO data to mindrecord."""

    writer = FileWriter(mindrecord_file_name, mindrecord_num)
    yolo_json = {
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "float64",
            "shape": [-1, 6]
        }
    }

    print('Loading train data...')
    image_files, anno_files = prepare_file_paths()
    dataset_size = len(anno_files)
    assert dataset_size == len(image_files)
    logger.info("#size of dataset: {}".format(dataset_size))
    data = []
    for i in range(dataset_size):
        data.append(get_data(image_files[i], anno_files[i]))

    print('Writing train data to mindrecord...')
    writer.add_schema(yolo_json, "yolo_json")
    if not data:
        raise ValueError("no data to write to mindrecord.")
    writer.write_raw_data(data)
    writer.commit()
Example #27
def data_to_mindrecord_byte_image(dataset="coco",
                                  is_training=True,
                                  prefix="fasterrcnn.mindrecord",
                                  file_num=8):
    """Create MindRecord file."""
    mindrecord_dir = config.mindrecord_dir
    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    if dataset == "coco":
        image_files, image_anno_dict = create_coco_label(is_training)
    else:
        image_files, image_anno_dict = filter_valid_data(
            config.IMAGE_DIR, config.ANNO_PATH)

    fasterrcnn_json = {
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "int32",
            "shape": [-1, 6]
        },
    }
    writer.add_schema(fasterrcnn_json, "fasterrcnn_json")

    for image_name in image_files:
        with open(image_name, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[image_name], dtype=np.int32)
        row = {"image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
Example #28
def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="maskrcnn.mindrecord", file_num=8):
    """Create MindRecord file."""
    mindrecord_dir = config.mindrecord_dir
    mindrecord_path = os.path.join(mindrecord_dir, prefix)

    writer = FileWriter(mindrecord_path, file_num)
    if dataset == "coco":
        image_files, image_anno_dict, masks, masks_shape = create_coco_label(is_training)
    else:
        print("Error unsupported other dataset")
        return

    maskrcnn_json = {
        "image": {"type": "bytes"},
        "annotation": {"type": "int32", "shape": [-1, 6]},
        "mask": {"type": "bytes"},
        "mask_shape": {"type": "int32", "shape": [-1]},
    }
    writer.add_schema(maskrcnn_json, "maskrcnn_json")

    image_files_num = len(image_files)
    for ind, image_name in enumerate(image_files):
        with open(image_name, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[image_name], dtype=np.int32)
        mask = masks[image_name]
        mask_shape = masks_shape[image_name]
        row = {"image": img, "annotation": annos, "mask": mask, "mask_shape": mask_shape}
        if (ind + 1) % 10 == 0:
            print("writing {}/{} into mindrecord".format(ind + 1, image_files_num))
        writer.write_raw_data([row])
    writer.commit()
Example #29
def data_to_mindrecord_byte_image(image_files,
                                  image_anno_dict,
                                  dst_dir,
                                  prefix="cptn_mlt.mindrecord",
                                  file_num=1):
    """Create MindRecord file."""
    mindrecord_path = os.path.join(dst_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)

    ctpn_json = {
        "image": {
            "type": "bytes"
        },
        "annotation": {
            "type": "int32",
            "shape": [-1, 5]
        },
    }
    writer.add_schema(ctpn_json, "ctpn_json")
    for image_name in image_files:
        with open(image_name, 'rb') as f:
            img = f.read()
        annos = np.array(image_anno_dict[image_name], dtype=np.int32)
        print("img name is {}, anno is {}".format(image_name, annos))
        row = {"image": img, "annotation": annos}
        writer.write_raw_data([row])
    writer.commit()
Example #30
def test_shard_4_raw_data_1():
    """test file writer when shard_num equals 4 and number of sample equals 1."""
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "number"
        }
    }
    writer.add_schema(schema_json, "img_schema")
    writer.add_index(["label"])
    data = [{"file_name": "001.jpg", "label": 1}]
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(CV_FILE_NAME + "0")
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 2
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 1
    reader.close()
    paths = [
        "{}{}".format(CV_FILE_NAME,
                      str(x).rjust(1, '0')) for x in range(FILES_NUM)
    ]
    for x in paths:
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))