def random_split_trans2mindrecord(input_file_path,
                                  output_file_path,
                                  recommendation_dataset_stats_dict,
                                  part_rows=2000000,
                                  line_per_sample=1000,
                                  train_line_count=None,
                                  test_size=0.1,
                                  seed=2020,
                                  dense_dim=13,
                                  slot_dim=26):
    """Random split data and save mindrecord"""
    if train_line_count is None:
        raise ValueError("Please provide training file line count")
    test_size = int(train_line_count * test_size)
    all_indices = [i for i in range(train_line_count)]
    np.random.seed(seed)
    np.random.shuffle(all_indices)
    print("all_indices.size:{}".format(len(all_indices)))
    test_indices_set = set(all_indices[:test_size])
    print("test_indices_set.size:{}".format(len(test_indices_set)))
    print("-----------------------" * 10 + "\n" * 2)

    train_data_list = []
    test_data_list = []
    ids_list = []
    wts_list = []
    label_list = []

    writer_train = FileWriter(os.path.join(output_file_path, " .mindrecord"),
                              21)
    writer_test = FileWriter(
        os.path.join(output_file_path, "test_input_part.mindrecord"), 3)

    schema = {
        "label": {
            "type": "float32",
            "shape": [-1]
        },
        "feat_vals": {
            "type": "float32",
            "shape": [-1]
        },
        "feat_ids": {
            "type": "int32",
            "shape": [-1]
        }
    }
    writer_train.add_schema(schema, "CRITEO_TRAIN")
    writer_test.add_schema(schema, "CRITEO_TEST")

    with open(input_file_path, encoding="utf-8") as file_in:
        items_error_size_lineCount = []
        count = 0
        train_part_number = 0
        test_part_number = 0
        for i, line in enumerate(file_in):
            count += 1
            if count % 1000000 == 0:
                print("Processed {} lines.".format(count))
            line = line.strip("\n")
            items = line.split("\t")
            if len(items) != (1 + dense_dim + slot_dim):
                items_error_size_lineCount.append(i)
                continue
            label = float(items[0])
            values = items[1:1 + dense_dim]
            cats = items[1 + dense_dim:]

            assert len(values) == dense_dim, "values.size: {}".format(
                len(values))
            assert len(cats) == slot_dim, "cats.size: {}".format(len(cats))

            ids, wts = recommendation_dataset_stats_dict.map_cat2id(
                values, cats)

            ids_list.extend(ids)
            wts_list.extend(wts)
            label_list.append(label)

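            # Every `line_per_sample` lines are packed into one sample and routed to train or test.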
            if count % line_per_sample == 0:
                if i not in test_indices_set:
                    train_data_list.append({
                        "feat_ids":
                        np.array(ids_list, dtype=np.int32),
                        "feat_vals":
                        np.array(wts_list, dtype=np.float32),
                        "label":
                        np.array(label_list, dtype=np.float32)
                    })
                else:
                    test_data_list.append({
                        "feat_ids":
                        np.array(ids_list, dtype=np.int32),
                        "feat_vals":
                        np.array(wts_list, dtype=np.float32),
                        "label":
                        np.array(label_list, dtype=np.float32)
                    })
                if train_data_list and len(train_data_list) % part_rows == 0:
                    writer_train.write_raw_data(train_data_list)
                    train_data_list.clear()
                    train_part_number += 1

                if test_data_list and len(test_data_list) % part_rows == 0:
                    writer_test.write_raw_data(test_data_list)
                    test_data_list.clear()
                    test_part_number += 1

                ids_list.clear()
                wts_list.clear()
                label_list.clear()

        if train_data_list:
            writer_train.write_raw_data(train_data_list)
        if test_data_list:
            writer_test.write_raw_data(test_data_list)
    writer_train.commit()
    writer_test.commit()

    print("-------------" * 10)
    print("items_error_size_lineCount.size(): {}.".format(
        len(items_error_size_lineCount)))
    print("-------------" * 10)
    np.save("items_error_size_lineCount.npy", items_error_size_lineCount)
def test_write_read_process_with_multi_bytes_and_array():
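    """Write samples that mix string, int, bytes and int64 array fields, then read them back in full and with several column selections."""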
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 4,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"file_name": "002.jpg", "label": 5,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8'),
             "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"file_name": "003.jpg", "label": 6,
             "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8'),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"file_name": "004.jpg", "label": 7,
             "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"file_name": "005.jpg", "label": 8,
             "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8'),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"file_name": "006.jpg", "label": 9,
             "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8'),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}
            ]

    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "image3": {"type": "bytes"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]},
              "label": {"type": "int32"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 13
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids", "source_sos_mask",
                                                                 "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["image2", "source_sos_mask",
                                                                 "image3", "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 4
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image4",
                                                                 "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image5",
                                                                 "image4", "image3", "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask", "image5", "image2",
                                                                 "source_sos_mask", "label"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def main():
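    """Convert a raw text file into MindRecord using a GPT-2 style tokenizer (vocab + merges files)."""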
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file",
                        type=str,
                        required=True,
                        help='Input raw text file.')
    parser.add_argument("--output_file",
                        type=str,
                        required=True,
                        help='Output MindRecord file.')
    parser.add_argument(
        "--num_splits",
        type=int,
        default=1,
        help='Number of partitions the MindRecord file will be split into.')
    parser.add_argument("--max_seq_length",
                        type=int,
                        required=True,
                        help='Maximum sequence length.')
    parser.add_argument("--vocab_file",
                        type=str,
                        required=True,
                        default='',
                        help='url of gpt2-vocab.json ')
    parser.add_argument("--merge_file",
                        type=str,
                        required=True,
                        default='',
                        help='url of gpt2-merges.txt ')
    parser.add_argument("--mode",
                        type=str,
                        required=True,
                        default='cnn_dailymail',
                        help='mode of dataset creation')
    args = parser.parse_args()

    tokenizer = tokenization.Tokenizer(vocab_file=args.vocab_file,
                                       merge_file=args.merge_file,
                                       mode=args.mode)
    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {
        "input_ids": {
            "type": "int64",
            "shape": [-1]
        },
        "input_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "label_ids": {
            "type": "int64",
            "shape": [-1]
        }
    }
    writer.add_schema(data_schema, "wikitext2-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from  %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_seq_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s",
                             tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s",
                             tokenizer.decode(output["input_ids"][1:]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
def test_write_read_process_with_multi_bytes():
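    """Write samples holding several bytes fields, then read them back in full and with several column selections."""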
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 43,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61,
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29,
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78,
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37,
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}
            ]
    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "label": {"type": "int32"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 7
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader2 = FileReader(file_name=mindrecord_file_name, columns=["image1", "image2", "image5"])
    count = 0
    for index, x in enumerate(reader2.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader2.close()

    reader3 = FileReader(file_name=mindrecord_file_name, columns=["image2", "image4"])
    count = 0
    for index, x in enumerate(reader3.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader3.close()

    reader4 = FileReader(file_name=mindrecord_file_name, columns=["image5", "image2"])
    count = 0
    for index, x in enumerate(reader4.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader4.close()

    reader5 = FileReader(file_name=mindrecord_file_name, columns=["image5", "image2", "label"])
    count = 0
    for index, x in enumerate(reader5.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader5.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def test_write_read_process_with_multi_array():
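    """Write samples made up of int64 array fields only, then read them back in full and with several column selections."""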
    mindrecord_file_name = "test.mindrecord"
    data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}
            ]
    writer = FileWriter(mindrecord_file_name)
    schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "source_eos_ids": {"type": "int64", "shape": [-1]},
              "source_eos_mask": {"type": "int64", "shape": [-1]},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 8
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_eos_ids", "source_eos_mask",
                                                                 "target_sos_ids", "target_sos_mask",
                                                                 "target_eos_ids", "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids",
                                                                 "target_sos_ids",
                                                                 "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask",
                                                                 "source_eos_mask",
                                                                 "source_sos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 1
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def main():
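    """Convert tab-separated source/target text files into MindRecord features for Transformer training."""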
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        type=str,
        required=True,
        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file",
                        type=str,
                        required=True,
                        help='Output MindRecord file.')
    parser.add_argument(
        "--num_splits",
        type=int,
        default=16,
        help='Number of partitions the MindRecord file will be split into.')
    parser.add_argument(
        "--vocab_file",
        type=str,
        required=True,
        help='The vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len",
                        type=bool,
                        default=False,
                        help='clip sequences to maximum sequence length.')
    parser.add_argument("--max_seq_length",
                        type=int,
                        default=128,
                        help='Maximum sequence length.')
    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.append(input_pattern)

    logging.info("*** Reading from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    output_file = args.output_file
    logging.info("*** Writing to output files ***")
    logging.info("  %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {
        "source_sos_ids": {
            "type": "int64",
            "shape": [-1]
        },
        "source_sos_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "source_eos_ids": {
            "type": "int64",
            "shape": [-1]
        },
        "source_eos_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "target_sos_ids": {
            "type": "int64",
            "shape": [-1]
        },
        "target_sos_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "target_eos_ids": {
            "type": "int64",
            "shape": [-1]
        },
        "target_eos_mask": {
            "type": "int64",
            "shape": [-1]
        }
    }
    writer.add_schema(data_schema, "tranformer hisi")

    total_written = 0
    total_read = 0

    for input_file in input_files:
        logging.info("*** Reading from   %s ***", input_file)
        with open(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break

                total_read += 1
                if total_read % 100000 == 0:
                    logging.info("%d ...", total_read)

                source_line, target_line = line.strip().split("\t")
                source_tokens = tokenizer.tokenize(source_line)
                target_tokens = tokenizer.tokenize(target_line)

                if len(source_tokens) >= args.max_seq_length or len(
                        target_tokens) >= args.max_seq_length:
                    logging.info("ignore long sentence!")
                    continue

                instance = create_training_instance(
                    source_tokens,
                    target_tokens,
                    args.max_seq_length,
                    clip_to_max_len=args.clip_to_max_len)
                if instance is None:
                    continue

                features = write_instance_to_file(writer, instance, tokenizer,
                                                  args.max_seq_length)
                total_written += 1

                if total_written <= 20:
                    logging.info("*** Example ***")
                    logging.info(
                        "source tokens: %s", " ".join([
                            tokenization.convert_to_printable(x)
                            for x in instance.source_eos_tokens
                        ]))
                    logging.info(
                        "target tokens: %s", " ".join([
                            tokenization.convert_to_printable(x)
                            for x in instance.target_sos_tokens
                        ]))

                    for feature_name in features.keys():
                        feature = features[feature_name]
                        logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
        lines = f.readlines()
    if args.shuffle:
        np.random.shuffle(lines)

    dst_dir = '/'.join(args.dst_path.split('/')[:-1])
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    print('number of samples:', len(lines))
    writer = FileWriter(file_name=args.dst_path, shard_num=args.num_shards)
    writer.add_schema(seg_schema, "seg_schema")
    cnt = 0
    data = []
    for l in lines:
        img_path, label_path = l.strip().split(' ')
        sample_ = {"file_name": img_path.split('/')[-1]}
        with open(os.path.join(args.data_root, img_path), 'rb') as f:
            sample_['data'] = f.read()
        with open(os.path.join(args.data_root, label_path), 'rb') as f:
            sample_['label'] = f.read()
        data.append(sample_)
        cnt += 1
        if cnt % 1000 == 0:
            writer.write_raw_data(data)
            print('number of samples written:', cnt)
            data = []

    if data:
        writer.write_raw_data(data)
    writer.commit()
    print('number of samples written:', cnt)
def test_case_02(add_and_remove_cv_file):  # multi-bytes
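    """Write samples that mix scalars, float/int arrays, strings and bytes, re-save them through MindDataset, and verify the round trip."""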
    data = [{
        "file_name":
        "001.jpg",
        "label":
        43,
        "float32_array":
        np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12345,
        "float64":
        1987654321.123456785,
        "source_sos_ids":
        np.array([1, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image1 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image1 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image1 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image1 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image1 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "002.jpg",
        "label":
        91,
        "float32_array":
        np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12445,
        "float64":
        1987654321.123456786,
        "source_sos_ids":
        np.array([11, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image2 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image2 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image2 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image2 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image2 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "003.jpg",
        "label":
        61,
        "float32_array":
        np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12545,
        "float64":
        1987654321.123456787,
        "source_sos_ids":
        np.array([21, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image3 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image3 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image3 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image3 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image3 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "004.jpg",
        "label":
        29,
        "float32_array":
        np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12645,
        "float64":
        1987654321.123456788,
        "source_sos_ids":
        np.array([31, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image4 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image4 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image4 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image4 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image4 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "005.jpg",
        "label":
        78,
        "float32_array":
        np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12745,
        "float64":
        1987654321.123456789,
        "source_sos_ids":
        np.array([41, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image5 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image5 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image5 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image5 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image5 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "006.jpg",
        "label":
        37,
        "float32_array":
        np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12745,
        "float64":
        1987654321.123456789,
        "source_sos_ids":
        np.array([51, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image6 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image6 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image6 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image6 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image6 bytes mno", encoding='UTF-8')
    }]
    schema = {
        "file_name": {
            "type": "string"
        },
        "float32_array": {
            "type": "float32",
            "shape": [-1]
        },
        "float64_array": {
            "type": "float64",
            "shape": [-1]
        },
        "float32": {
            "type": "float32"
        },
        "float64": {
            "type": "float64"
        },
        "source_sos_ids": {
            "type": "int32",
            "shape": [-1]
        },
        "source_sos_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "image1": {
            "type": "bytes"
        },
        "image2": {
            "type": "bytes"
        },
        "image3": {
            "type": "bytes"
        },
        "label": {
            "type": "int32"
        },
        "image4": {
            "type": "bytes"
        },
        "image5": {
            "type": "bytes"
        }
    }
    writer = FileWriter(CV_FILE_NAME1, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False)
    d1.save(CV_FILE_NAME2, FILES_NUM)
    data_value_to_list = []

    for item in data:
        new_data = {}
        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
        new_data['float32_array'] = item["float32_array"]
        new_data['float64_array'] = item["float64_array"]
        new_data['float32'] = item["float32"]
        new_data['float64'] = item["float64"]
        new_data['source_sos_ids'] = item["source_sos_ids"]
        new_data['source_sos_mask'] = item["source_sos_mask"]
        new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 6
    num_iter = 0
    for item in d2.create_dict_iterator():
        assert len(item) == 13
        for field in item:
            if isinstance(item[field], np.ndarray):
                if item[field].dtype == np.float32:
                    assert (item[field] == np.array(
                        data_value_to_list[num_iter][field],
                        np.float32)).all()
                else:
                    assert (item[field] == data_value_to_list[num_iter][field]
                            ).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 6
def test_case_00(add_and_remove_cv_file):  # only bin data
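    """Write bytes-only samples, re-save them through MindDataset, and verify the round trip."""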
    data = [{
        "image1": bytes("image1 bytes abc", encoding='UTF-8'),
        "image2": bytes("image1 bytes def", encoding='UTF-8'),
        "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image1 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image2 bytes abc", encoding='UTF-8'),
        "image2": bytes("image2 bytes def", encoding='UTF-8'),
        "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image2 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image3 bytes abc", encoding='UTF-8'),
        "image2": bytes("image3 bytes def", encoding='UTF-8'),
        "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image3 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image5 bytes abc", encoding='UTF-8'),
        "image2": bytes("image5 bytes def", encoding='UTF-8'),
        "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image5 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image6 bytes abc", encoding='UTF-8'),
        "image2": bytes("image6 bytes def", encoding='UTF-8'),
        "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image6 bytes mno", encoding='UTF-8')
    }]
    schema = {
        "image1": {
            "type": "bytes"
        },
        "image2": {
            "type": "bytes"
        },
        "image3": {
            "type": "bytes"
        },
        "image4": {
            "type": "bytes"
        },
        "image5": {
            "type": "bytes"
        }
    }
    writer = FileWriter(CV_FILE_NAME1, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False)
    d1.save(CV_FILE_NAME2, FILES_NUM)
    data_value_to_list = []

    for item in data:
        new_data = {}
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 5
    num_iter = 0
    for item in d2.create_dict_iterator():
        assert len(item) == 5
        for field in item:
            if isinstance(item[field], np.ndarray):
                assert (
                    item[field] == data_value_to_list[num_iter][field]).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 5
def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial():
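    """Write five records that each hold several image bytes fields and ndarrays, then read them back with MindDataset."""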
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    cv_schema_json = {
        "id": {
            "type": "int32"
        },
        "image_0": {
            "type": "bytes"
        },
        "image_2": {
            "type": "bytes"
        },
        "image_3": {
            "type": "bytes"
        },
        "image_4": {
            "type": "bytes"
        },
        "input_mask": {
            "type": "int32",
            "shape": [-1]
        },
        "segments": {
            "type": "float32",
            "shape": [2, 3]
        }
    }
    writer.add_schema(cv_schema_json, "two_images_schema")
    with open("../data/mindrecord/testImageNetData/images/image_00010.jpg",
              "rb") as file_reader:
        img_data = file_reader.read()
    ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
    ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
    data = []
    for i in range(5):
        item = {
            "id": i,
            "image_0": img_data,
            "image_2": img_data,
            "image_3": img_data,
            "image_4": img_data,
            "input_mask": ndarray_1,
            "segments": ndarray_2
        }
        data.append(item)
    writer.write_raw_data(data)
    writer.commit()
    assert os.path.exists(CV_FILE_NAME)
    assert os.path.exists(CV_FILE_NAME + ".db")

    # Tutorial for MindDataset.
    columns_list = [
        "id", "image_0", "image_2", "image_3", "image_4", "input_mask",
        "segments"
    ]
    num_readers = 1
    data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
    assert data_set.get_dataset_size() == 5
    num_iter = 0
    for item in data_set.create_dict_iterator():
        assert len(item) == 7
        logger.info("item: {}".format(item))
        assert item["image_0"].dtype == np.uint8
        assert (item["image_0"] == item["image_2"]).all()
        assert (item["image_3"] == item["image_4"]).all()
        assert (item["image_0"] == item["image_4"]).all()
        assert item["image_2"].dtype == np.uint8
        assert item["image_3"].dtype == np.uint8
        assert item["image_4"].dtype == np.uint8
        assert item["id"].dtype == np.int32
        assert item["input_mask"].shape == (5, )
        assert item["input_mask"].dtype == np.int32
        assert item["segments"].shape == (2, 3)
        assert item["segments"].dtype == np.float32
        num_iter += 1
    assert num_iter == 5

    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
    def transfer_coco_to_mindrecord(self,
                                    mindrecord_dir,
                                    file_name="coco_hp.train.mind",
                                    shard_num=1):
        """Create MindRecord file by image_dir and anno_path."""
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
            logger.info("Create MindRecord based on COCO_HP dataset")
        else:
            raise ValueError(
                'image_path {} or annot_path {} does not exist'.format(
                    self.image_path, self.annot_path))

        mindrecord_path = os.path.join(mindrecord_dir, file_name)
        writer = FileWriter(mindrecord_path, shard_num)
        centernet_json = {
            "image": {
                "type": "bytes"
            },
            "num_objects": {
                "type": "int32"
            },
            "keypoints": {
                "type": "int32",
                "shape": [-1, self.data_opt.num_joints * 3]
            },
            "bbox": {
                "type": "float32",
                "shape": [-1, 4]
            },
            "category_id": {
                "type": "int32",
                "shape": [-1]
            },
        }
        writer.add_schema(centernet_json, "centernet_json")

        for img_id in self.images:
            image_info = self.coco.loadImgs([img_id])
            annos = self.coco.loadAnns(self.anns[img_id])
            # get image
            img_name = image_info[0]['file_name']
            img_name = os.path.join(self.image_path, img_name)
            with open(img_name, 'rb') as f:
                image = f.read()
            # parse annos info
            keypoints = []
            category_id = []
            bbox = []
            num_objects = len(annos)
            for anno in annos:
                keypoints.append(anno['keypoints'])
                category_id.append(anno['category_id'])
                bbox.append(anno['bbox'])

            row = {
                "image": image,
                "num_objects": num_objects,
                "keypoints": np.array(keypoints, np.int32),
                "bbox": np.array(bbox, np.float32),
                "category_id": np.array(category_id, np.int32)
            }
            writer.write_raw_data([row])
        writer.commit()
        logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))
def test_issue_84():
    """test file reader when db does not match."""
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "number"
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(
        get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                     "../data/mindrecord/testAclImdbData/vocab.txt", 10))
    nlp_schema_json = {
        "id": {
            "type": "string"
        },
        "label": {
            "type": "number"
        },
        "rating": {
            "type": "number"
        },
        "input_ids": {
            "type": "array",
            "items": {
                "type": "number"
            }
        },
        "input_mask": {
            "type": "array",
            "items": {
                "type": "number"
            }
        },
        "segment_ids": {
            "type": "array",
            "items": {
                "type": "number"
            }
        }
    }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()

    reader = ShardReader()
    os.rename("imagenet.mindrecord1.db", "imagenet.mindrecord1.db.bk")
    os.rename("aclImdb.mindrecord1.db", "imagenet.mindrecord1.db")
    file_name = os.path.join(os.getcwd(), "imagenet.mindrecord1")
    with pytest.raises(Exception) as e:
        reader.open(file_name)
    assert str(e.value) == "[MRMOpenError]: error_code: 1347690596, " \
                           "error_msg: " \
                           "MindRecord File could not open successfully."

    os.rename("imagenet.mindrecord1.db", "aclImdb.mindrecord1.db")
    paths = [
        "{}{}".format(NLP_FILE_NAME,
                      str(x).rjust(1, '0')) for x in range(FILES_NUM)
    ]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))

    os.rename("imagenet.mindrecord1.db.bk", "imagenet.mindrecord1.db")
    paths = [
        "{}{}".format(CV_FILE_NAME,
                      str(x).rjust(1, '0')) for x in range(FILES_NUM)
    ]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))
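The failure exercised above comes from a shard whose paired .db index belongs to a different file set. A small sketch (plain os calls only, shard names hypothetical) that checks each shard has an index file at all before opening; a content mismatch like the rename trick above is only detected when the reader actually opens the file.

import os

def has_paired_index(shard_path):
    """Return True when both the shard and its companion .db index exist."""
    return os.path.exists(shard_path) and os.path.exists(shard_path + ".db")

# Hypothetical shard set; guard it before handing paths to ShardReader/FileReader.
shards = ["imagenet.mindrecord" + str(x) for x in range(4)]
missing = [s for s in shards if not has_paired_index(s)]
if missing:
    print("shards without an index file:", missing)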
Example #13
def main():
    """Convert parallel source/target text into bucketed MindRecord files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        type=str,
        required=True,
        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file",
                        type=str,
                        required=True,
                        help='Output MindRecord file.')
    parser.add_argument(
        "--num_splits",
        type=int,
        default=16,
        help='Number of partitions to split the output MindRecord file into.')
    parser.add_argument(
        "--src_vocab_file",
        type=str,
        required=True,
        help='The source vocabulary file that the Transformer model was trained on.')
    parser.add_argument(
        "--trg_vocab_file",
        type=str,
        required=True,
        help='The target vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len",
                        type=ast.literal_eval,
                        default=False,
                        help='Clip sequences to the maximum sequence length.')
    parser.add_argument("--max_seq_length",
                        type=int,
                        default=32,
                        help='Maximum sequence length.')
    parser.add_argument("--bucket",
                        type=ast.literal_eval,
                        default=[32],
                        help='Bucket sequence lengths.')
    args = parser.parse_args()
    tokenizer_src = tokenization.WhiteSpaceTokenizer(
        vocab_file=args.src_vocab_file)
    tokenizer_trg = tokenization.WhiteSpaceTokenizer(
        vocab_file=args.trg_vocab_file)
    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.append(input_pattern)
    logging.info("*** Read from input files ***")
    output_file = args.output_file
    logging.info("*** Write to output files ***")
    logging.info("  %s", output_file)
    total_written = 0
    total_read = 0
    feature_dict = {}
    for i in args.bucket:
        feature_dict[i] = []
    for input_file in input_files:
        logging.info("*** Reading from   %s ***", input_file)
        with open(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                total_read += 1
                if total_read % 100000 == 0:
                    logging.info("Read %d ...", total_read)
                if line.strip() == "":
                    continue
                source_line, target_line = line.strip().split("\t")
                source_tokens = tokenizer_src.tokenize(source_line)
                target_tokens = tokenizer_trg.tokenize(target_line)
                if len(source_tokens) >= args.max_seq_length or len(
                        target_tokens) >= args.max_seq_length:
                    logging.info("ignore long sentence!")
                    continue
                instance = create_training_instance(
                    source_tokens,
                    target_tokens,
                    args.max_seq_length,
                    clip_to_max_len=args.clip_to_max_len)
                if instance is None:
                    continue
                features, seq_max_bucket_length = get_instance_features(
                    instance, tokenizer_src, tokenizer_trg,
                    args.max_seq_length, args.bucket)
                for key in feature_dict:
                    if key == seq_max_bucket_length:
                        feature_dict[key].append(features)
                if total_read <= 10:
                    logging.info("*** Example ***")
                    logging.info(
                        "source tokens: %s", " ".join([
                            tokenization.convert_to_printable(x)
                            for x in instance.source_tokens
                        ]))
                    logging.info(
                        "target tokens: %s", " ".join([
                            tokenization.convert_to_printable(x)
                            for x in instance.target_tokens
                        ]))

                    for feature_name in features.keys():
                        feature = features[feature_name]
                        logging.info("%s: %s", feature_name, feature)
    for i in args.bucket:
        if args.num_splits == 1:
            output_file_name = output_file + '_' + str(i)
        else:
            output_file_name = output_file + '_' + str(i) + '_'
        writer = FileWriter(output_file_name, args.num_splits)
        data_schema = {
            "source_ids": {
                "type": "int64",
                "shape": [-1]
            },
            "source_mask": {
                "type": "int64",
                "shape": [-1]
            },
            "target_ids": {
                "type": "int64",
                "shape": [-1]
            },
            "target_mask": {
                "type": "int64",
                "shape": [-1]
            }
        }
        writer.add_schema(data_schema, "gru")
        features_ = feature_dict[i]
        logging.info("Bucket length %d has %d samples, start writing...", i,
                     len(features_))
        for item in features_:
            writer.write_raw_data([item])
            total_written += 1
        writer.commit()
    logging.info("Wrote %d total instances", total_written)
Example #14
def fsns_train_data_to_mindrecord(mindrecord_dir,
                                  prefix="data_ocr.mindrecord",
                                  file_num=8):
    """Convert FSNS training images and annotations to MindRecord files."""
    anno_file_dirs = [config.train_annotation_file]
    images, image_path_dict, image_anno_dict = create_fsns_label(
        image_dir=config.data_root, anno_file_dirs=anno_file_dirs)
    vocab, _ = initialize_vocabulary(config.vocab_path)

    data_schema = {
        "image": {
            "type": "bytes"
        },
        "label": {
            "type": "int32",
            "shape": [-1]
        },
        "decoder_input": {
            "type": "int32",
            "shape": [-1]
        },
        "decoder_mask": {
            "type": "int32",
            "shape": [-1]
        },
        "decoder_target": {
            "type": "int32",
            "shape": [-1]
        },
        "annotation": {
            "type": "string"
        }
    }

    mindrecord_path = os.path.join(mindrecord_dir, prefix)

    writer = FileWriter(mindrecord_path, file_num)
    writer.add_schema(data_schema, "ocr")

    for img_id in images:

        image_path = image_path_dict[img_id]
        annotation = image_anno_dict[img_id]

        label_max_len = config.max_text_len
        text_max_len = config.max_text_len - 2

        if len(annotation) > text_max_len:
            continue
        label = serialize_annotation(image_path, annotation, vocab)

        if label is None:
            continue

        label_len = len(label)
        decoder_input_len = label_max_len

        if label_len <= decoder_input_len:
            label = np.concatenate(
                (label, np.zeros(decoder_input_len - label_len,
                                 dtype=np.int32)))
            one_mask_len = label_len - config.go_shift
            target_weight = np.concatenate(
                (np.ones(one_mask_len, dtype=np.float32),
                 np.zeros(decoder_input_len - one_mask_len, dtype=np.float32)))
        else:
            continue

        decoder_input = (np.array(label).T).astype(np.int32)
        target_weight = (np.array(target_weight).T).astype(np.int32)

        if len(decoder_input) != len(target_weight):
            continue

        target = [decoder_input[i + 1] for i in range(len(decoder_input) - 1)]
        target = (np.array(target)).astype(np.int32)

        with open(image_path, 'rb') as f:
            img = f.read()

        row = {
            "image": img,
            "label": label,
            "decoder_input": decoder_input,
            "decoder_mask": target_weight,
            "decoder_target": target,
            "annotation": str(annotation)
        }

        writer.write_raw_data([row])
    writer.commit()
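To spot-check the result, one shard can be opened with FileReader. A minimal sketch assuming the default prefix and an eight-shard set, so the first shard is named data_ocr.mindrecord0; the directory below is a placeholder.

import os
from mindspore.mindrecord import FileReader

mindrecord_dir = "./mindrecord_ocr"  # placeholder directory
reader = FileReader(os.path.join(mindrecord_dir, "data_ocr.mindrecord0"))
for i, row in enumerate(reader.get_next()):
    print(row["annotation"], row["decoder_input"].shape, row["decoder_mask"].shape)
    if i >= 2:
        break
reader.close()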
Example #15
def test_write_read_process():
    """Write six mixed-type samples with FileWriter and verify them with FileReader."""
    mindrecord_file_name = "test.mindrecord"
    data = [{
        "file_name": "001.jpg",
        "label": 43,
        "score": 0.8,
        "mask": np.array([3, 6, 9], dtype=np.int64),
        "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
        "data": bytes("image bytes abc", encoding='UTF-8')
    }, {
        "file_name": "002.jpg",
        "label": 91,
        "score": 5.4,
        "mask": np.array([1, 4, 7], dtype=np.int64),
        "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
        "data": bytes("image bytes def", encoding='UTF-8')
    }, {
        "file_name": "003.jpg",
        "label": 61,
        "score": 6.4,
        "mask": np.array([7, 6, 3], dtype=np.int64),
        "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
        "data": bytes("image bytes ghi", encoding='UTF-8')
    }, {
        "file_name": "004.jpg",
        "label": 29,
        "score": 8.1,
        "mask": np.array([2, 8, 0], dtype=np.int64),
        "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
        "data": bytes("image bytes jkl", encoding='UTF-8')
    }, {
        "file_name": "005.jpg",
        "label": 78,
        "score": 7.7,
        "mask": np.array([3, 1, 2], dtype=np.int64),
        "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
        "data": bytes("image bytes mno", encoding='UTF-8')
    }, {
        "file_name": "006.jpg",
        "label": 37,
        "score": 9.4,
        "mask": np.array([7, 6, 7], dtype=np.int64),
        "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
        "data": bytes("image bytes pqr", encoding='UTF-8')
    }]
    writer = FileWriter(mindrecord_file_name)
    schema = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int32"
        },
        "score": {
            "type": "float64"
        },
        "mask": {
            "type": "int64",
            "shape": [-1]
        },
        "segments": {
            "type": "float32",
            "shape": [2, 2]
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
Example #16
    def transfer_coco_to_mindrecord(self,
                                    mindrecord_dir,
                                    file_name="coco_det.train.mind",
                                    shard_num=1):
        """Create MindRecord file by image_dir and anno_path."""
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
            logger.info("Create MindRecord based on COCO_HP dataset")
        else:
            raise ValueError(
                'data_dir {} or anno_path {} does not exist'.format(
                    self.image_path, self.annot_path))

        mindrecord_path = os.path.join(mindrecord_dir, file_name)
        writer = FileWriter(mindrecord_path, shard_num)

        centernet_json = {
            "img_id": {
                "type": "int32",
                "shape": [1]
            },
            "image": {
                "type": "bytes"
            },
            "num_objects": {
                "type": "int32"
            },
            "bboxes": {
                "type": "float32",
                "shape": [-1, 4]
            },
            "category_id": {
                "type": "int32",
                "shape": [-1]
            },
        }

        writer.add_schema(centernet_json, "centernet_json")

        for img_id in self.images:
            image_info = self.coco.loadImgs([img_id])
            annos = self.coco.loadAnns(self.anns[img_id])
            # get image
            img_name = image_info[0]['file_name']
            img_name = os.path.join(self.image_path, img_name)
            with open(img_name, 'rb') as f:
                image = f.read()

            bboxes = []
            category_id = []
            num_objects = len(annos)
            for anno in annos:
                bbox = self._coco_box_to_bbox(anno['bbox'])
                class_name = self.classs_dict[anno["category_id"]]
                if class_name in self.train_cls:
                    x_min, x_max = bbox[0], bbox[2]
                    y_min, y_max = bbox[1], bbox[3]
                    bboxes.append([x_min, y_min, x_max, y_max])
                    category_id.append(self.train_cls_dict[class_name])

            row = {
                "img_id": np.array([img_id], dtype=np.int32),
                "image": image,
                "num_objects": num_objects,
                "bboxes": np.array(bboxes, np.float32),
                "category_id": np.array(category_id, np.int32)
            }
            writer.write_raw_data([row])

        writer.commit()
        logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))