def random_split_trans2mindrecord(input_file_path, output_file_path, recommendation_dataset_stats_dict,
                                  part_rows=2000000, line_per_sample=1000, train_line_count=None,
                                  test_size=0.1, seed=2020, dense_dim=13, slot_dim=26):
    """Randomly split the data and save it as MindRecord."""
    if train_line_count is None:
        raise ValueError("Please provide training file line count")
    test_size = int(train_line_count * test_size)
    all_indices = [i for i in range(train_line_count)]
    np.random.seed(seed)
    np.random.shuffle(all_indices)
    print("all_indices.size:{}".format(len(all_indices)))
    test_indices_set = set(all_indices[:test_size])
    print("test_indices_set.size:{}".format(len(test_indices_set)))
    print("-----------------------" * 10 + "\n" * 2)

    train_data_list = []
    test_data_list = []
    ids_list = []
    wts_list = []
    label_list = []

    writer_train = FileWriter(os.path.join(output_file_path, "train_input_part.mindrecord"), 21)
    writer_test = FileWriter(os.path.join(output_file_path, "test_input_part.mindrecord"), 3)

    schema = {"label": {"type": "float32", "shape": [-1]},
              "feat_vals": {"type": "float32", "shape": [-1]},
              "feat_ids": {"type": "int32", "shape": [-1]}}
    writer_train.add_schema(schema, "CRITEO_TRAIN")
    writer_test.add_schema(schema, "CRITEO_TEST")

    with open(input_file_path, encoding="utf-8") as file_in:
        items_error_size_lineCount = []
        count = 0
        train_part_number = 0
        test_part_number = 0
        for i, line in enumerate(file_in):
            count += 1
            if count % 1000000 == 0:
                print("Have handled {}w lines.".format(count // 10000))
            line = line.strip("\n")
            items = line.split("\t")
            if len(items) != (1 + dense_dim + slot_dim):
                items_error_size_lineCount.append(i)
                continue
            label = float(items[0])
            values = items[1:1 + dense_dim]
            cats = items[1 + dense_dim:]

            assert len(values) == dense_dim, "values.size: {}".format(len(values))
            assert len(cats) == slot_dim, "cats.size: {}".format(len(cats))

            ids, wts = recommendation_dataset_stats_dict.map_cat2id(values, cats)
            ids_list.extend(ids)
            wts_list.extend(wts)
            label_list.append(label)

            if count % line_per_sample == 0:
                if i not in test_indices_set:
                    train_data_list.append({"feat_ids": np.array(ids_list, dtype=np.int32),
                                            "feat_vals": np.array(wts_list, dtype=np.float32),
                                            "label": np.array(label_list, dtype=np.float32)})
                else:
                    test_data_list.append({"feat_ids": np.array(ids_list, dtype=np.int32),
                                           "feat_vals": np.array(wts_list, dtype=np.float32),
                                           "label": np.array(label_list, dtype=np.float32)})

                if train_data_list and len(train_data_list) % part_rows == 0:
                    writer_train.write_raw_data(train_data_list)
                    train_data_list.clear()
                    train_part_number += 1

                if test_data_list and len(test_data_list) % part_rows == 0:
                    writer_test.write_raw_data(test_data_list)
                    test_data_list.clear()
                    test_part_number += 1

                ids_list.clear()
                wts_list.clear()
                label_list.clear()

        if train_data_list:
            writer_train.write_raw_data(train_data_list)
        if test_data_list:
            writer_test.write_raw_data(test_data_list)

    writer_train.commit()
    writer_test.commit()

    print("-------------" * 10)
    print("items_error_size_lineCount.size(): {}.".format(len(items_error_size_lineCount)))
    print("-------------" * 10)
    np.save("items_error_size_lineCount.npy", items_error_size_lineCount)
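# Hedged usage sketch for random_split_trans2mindrecord() above. Everything named
# here that is not part of the function's own signature is an assumption made for
# illustration: the CriteoStatsDict class (it only needs to expose
# map_cat2id(values, cats)), the input/output paths, and the line-count step.
import os

if __name__ == "__main__":
    stats = CriteoStatsDict()              # hypothetical stats object providing map_cat2id()
    train_txt = "./criteo_data/train.txt"  # assumed raw TSV: label + 13 dense + 26 categorical fields
    output_dir = "./criteo_mindrecord/"
    os.makedirs(output_dir, exist_ok=True)

    # The train/test split is drawn over line indices up front, so the total
    # line count must be known before the file is streamed.
    line_count = sum(1 for _ in open(train_txt, encoding="utf-8"))
    random_split_trans2mindrecord(train_txt, output_dir, stats,
                                  train_line_count=line_count,
                                  test_size=0.1, seed=2020,
                                  dense_dim=13, slot_dim=26)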
def test_write_read_process_with_multi_bytes_and_array(): mindrecord_file_name = "test.mindrecord" data = [{"file_name": "001.jpg", "label": 4, "image1": bytes("image1 bytes abc", encoding='UTF-8'), "image2": bytes("image1 bytes def", encoding='UTF-8'), "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image3": bytes("image1 bytes ghi", encoding='UTF-8'), "image4": bytes("image1 bytes jkl", encoding='UTF-8'), "image5": bytes("image1 bytes mno", encoding='UTF-8'), "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, {"file_name": "002.jpg", "label": 5, "image1": bytes("image2 bytes abc", encoding='UTF-8'), "image2": bytes("image2 bytes def", encoding='UTF-8'), "image3": bytes("image2 bytes ghi", encoding='UTF-8'), "image4": bytes("image2 bytes jkl", encoding='UTF-8'), "image5": bytes("image2 bytes mno", encoding='UTF-8'), "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, {"file_name": "003.jpg", "label": 6, "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "image1": bytes("image3 bytes abc", encoding='UTF-8'), "image2": bytes("image3 bytes def", encoding='UTF-8'), "image3": bytes("image3 bytes ghi", encoding='UTF-8'), "image4": bytes("image3 bytes jkl", encoding='UTF-8'), "image5": bytes("image3 bytes mno", encoding='UTF-8'), "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, {"file_name": "004.jpg", "label": 7, "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image4 bytes abc", encoding='UTF-8'), "image2": bytes("image4 bytes def", encoding='UTF-8'), "image3": bytes("image4 bytes ghi", encoding='UTF-8'), "image4": bytes("image4 bytes jkl", encoding='UTF-8'), "image5": bytes("image4 bytes mno", encoding='UTF-8'), "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, {"file_name": "005.jpg", "label": 8, "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), "image1": bytes("image5 bytes abc", encoding='UTF-8'), "image2": bytes("image5 bytes def", encoding='UTF-8'), "image3": bytes("image5 bytes ghi", encoding='UTF-8'), "image4": bytes("image5 bytes jkl", 
encoding='UTF-8'), "image5": bytes("image5 bytes mno", encoding='UTF-8'), "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, {"file_name": "006.jpg", "label": 9, "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), "image1": bytes("image6 bytes abc", encoding='UTF-8'), "image2": bytes("image6 bytes def", encoding='UTF-8'), "image3": bytes("image6 bytes ghi", encoding='UTF-8'), "image4": bytes("image6 bytes jkl", encoding='UTF-8'), "image5": bytes("image6 bytes mno", encoding='UTF-8'), "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} ] writer = FileWriter(mindrecord_file_name) schema = {"file_name": {"type": "string"}, "image1": {"type": "bytes"}, "image2": {"type": "bytes"}, "source_sos_ids": {"type": "int64", "shape": [-1]}, "source_sos_mask": {"type": "int64", "shape": [-1]}, "image3": {"type": "bytes"}, "image4": {"type": "bytes"}, "image5": {"type": "bytes"}, "target_sos_ids": {"type": "int64", "shape": [-1]}, "target_sos_mask": {"type": "int64", "shape": [-1]}, "target_eos_ids": {"type": "int64", "shape": [-1]}, "target_eos_mask": {"type": "int64", "shape": [-1]}, "label": {"type": "int32"}} writer.add_schema(schema, "data is so cool") writer.write_raw_data(data) writer.commit() reader = FileReader(mindrecord_file_name) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 13 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids", "source_sos_mask", "target_sos_ids"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["image2", "source_sos_mask", "image3", "target_sos_ids"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 4 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image4", "source_sos_ids"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image5", "image4", "image3", "source_sos_ids"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 5 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == 
data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask", "image5", "image2", "source_sos_mask", "label"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 5 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() os.remove("{}".format(mindrecord_file_name)) os.remove("{}.db".format(mindrecord_file_name))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file.')
    parser.add_argument("--output_file", type=str, required=True,
                        help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=1,
                        help='The MindRecord file will be split into this number of partitions.')
    parser.add_argument("--max_seq_length", type=int, required=True,
                        help='Maximum sequence length.')
    parser.add_argument("--vocab_file", type=str, required=True, default='',
                        help='url of gpt2-vocab.json')
    parser.add_argument("--merge_file", type=str, required=True, default='',
                        help='url of gpt2-merges.txt')
    parser.add_argument("--mode", type=str, required=True, default='cnn_dailymail',
                        help='mode of dataset creation')
    args = parser.parse_args()

    tokenizer = tokenization.Tokenizer(vocab_file=args.vocab_file,
                                       merge_file=args.merge_file,
                                       mode=args.mode)

    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "label_ids": {"type": "int64", "shape": [-1]}}
    writer.add_schema(data_schema, "wikitext2-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_seq_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
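# Hedged sketch of the writer-side helper used in main() above. The repository's
# actual write_instance_to_file() may behave differently; this only illustrates
# how a feature dict for the declared schema (input_ids / input_mask / label_ids,
# all variable-length int64) maps onto FileWriter.write_raw_data().
import numpy as np

def write_features_sketch(writer, input_ids, input_mask, label_ids):
    """Hypothetical helper: pack one sample and append it to the MindRecord writer."""
    features = {
        "input_ids": np.asarray(input_ids, dtype=np.int64),
        "input_mask": np.asarray(input_mask, dtype=np.int64),
        "label_ids": np.asarray(label_ids, dtype=np.int64),
    }
    writer.write_raw_data([features])  # write_raw_data() expects a list of row dicts
    return features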
def test_write_read_process_with_multi_bytes(): mindrecord_file_name = "test.mindrecord" data = [{"file_name": "001.jpg", "label": 43, "image1": bytes("image1 bytes abc", encoding='UTF-8'), "image2": bytes("image1 bytes def", encoding='UTF-8'), "image3": bytes("image1 bytes ghi", encoding='UTF-8'), "image4": bytes("image1 bytes jkl", encoding='UTF-8'), "image5": bytes("image1 bytes mno", encoding='UTF-8')}, {"file_name": "002.jpg", "label": 91, "image1": bytes("image2 bytes abc", encoding='UTF-8'), "image2": bytes("image2 bytes def", encoding='UTF-8'), "image3": bytes("image2 bytes ghi", encoding='UTF-8'), "image4": bytes("image2 bytes jkl", encoding='UTF-8'), "image5": bytes("image2 bytes mno", encoding='UTF-8')}, {"file_name": "003.jpg", "label": 61, "image1": bytes("image3 bytes abc", encoding='UTF-8'), "image2": bytes("image3 bytes def", encoding='UTF-8'), "image3": bytes("image3 bytes ghi", encoding='UTF-8'), "image4": bytes("image3 bytes jkl", encoding='UTF-8'), "image5": bytes("image3 bytes mno", encoding='UTF-8')}, {"file_name": "004.jpg", "label": 29, "image1": bytes("image4 bytes abc", encoding='UTF-8'), "image2": bytes("image4 bytes def", encoding='UTF-8'), "image3": bytes("image4 bytes ghi", encoding='UTF-8'), "image4": bytes("image4 bytes jkl", encoding='UTF-8'), "image5": bytes("image4 bytes mno", encoding='UTF-8')}, {"file_name": "005.jpg", "label": 78, "image1": bytes("image5 bytes abc", encoding='UTF-8'), "image2": bytes("image5 bytes def", encoding='UTF-8'), "image3": bytes("image5 bytes ghi", encoding='UTF-8'), "image4": bytes("image5 bytes jkl", encoding='UTF-8'), "image5": bytes("image5 bytes mno", encoding='UTF-8')}, {"file_name": "006.jpg", "label": 37, "image1": bytes("image6 bytes abc", encoding='UTF-8'), "image2": bytes("image6 bytes def", encoding='UTF-8'), "image3": bytes("image6 bytes ghi", encoding='UTF-8'), "image4": bytes("image6 bytes jkl", encoding='UTF-8'), "image5": bytes("image6 bytes mno", encoding='UTF-8')} ] writer = FileWriter(mindrecord_file_name) schema = {"file_name": {"type": "string"}, "image1": {"type": "bytes"}, "image2": {"type": "bytes"}, "image3": {"type": "bytes"}, "label": {"type": "int32"}, "image4": {"type": "bytes"}, "image5": {"type": "bytes"}} writer.add_schema(schema, "data is so cool") writer.write_raw_data(data) writer.commit() reader = FileReader(mindrecord_file_name) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 7 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader2 = FileReader(file_name=mindrecord_file_name, columns=["image1", "image2", "image5"]) count = 0 for index, x in enumerate(reader2.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader2.close() reader3 = FileReader(file_name=mindrecord_file_name, columns=["image2", "image4"]) count = 0 for index, x in enumerate(reader3.get_next()): assert len(x) == 2 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader3.close() reader4 = FileReader(file_name=mindrecord_file_name, 
columns=["image5", "image2"]) count = 0 for index, x in enumerate(reader4.get_next()): assert len(x) == 2 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader4.close() reader5 = FileReader(file_name=mindrecord_file_name, columns=["image5", "image2", "label"]) count = 0 for index, x in enumerate(reader5.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader5.close() os.remove("{}".format(mindrecord_file_name)) os.remove("{}.db".format(mindrecord_file_name))
def test_write_read_process_with_multi_array(): mindrecord_file_name = "test.mindrecord" data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64), "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} ] 
writer = FileWriter(mindrecord_file_name) schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, "source_sos_mask": {"type": "int64", "shape": [-1]}, "source_eos_ids": {"type": "int64", "shape": [-1]}, "source_eos_mask": {"type": "int64", "shape": [-1]}, "target_sos_ids": {"type": "int64", "shape": [-1]}, "target_sos_mask": {"type": "int64", "shape": [-1]}, "target_eos_ids": {"type": "int64", "shape": [-1]}, "target_eos_mask": {"type": "int64", "shape": [-1]}} writer.add_schema(schema, "data is so cool") writer.write_raw_data(data) writer.commit() reader = FileReader(mindrecord_file_name) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 8 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["source_eos_ids", "source_eos_mask", "target_sos_ids", "target_sos_mask", "target_eos_ids", "target_eos_mask"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 6 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids", "target_sos_ids", "target_eos_mask"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask", "source_eos_mask", "source_sos_mask"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 3 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_ids"]) count = 0 for index, x in enumerate(reader.get_next()): assert len(x) == 1 for field in x: if isinstance(x[field], np.ndarray): assert (x[field] == data[count][field]).all() else: assert x[field] == data[count][field] count = count + 1 logger.info("#item{}: {}".format(index, x)) assert count == 6 reader.close() os.remove("{}".format(mindrecord_file_name)) os.remove("{}.db".format(mindrecord_file_name))
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--input_file", type=str, required=True, help='Input raw text file (or comma-separated list of files).') parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.') parser.add_argument( "--num_splits", type=int, default=16, help='The MindRecord file will be split into the number of partition.') parser.add_argument( "--vocab_file", type=str, required=True, help='The vocabulary file that the Transformer model was trained on.') parser.add_argument("--clip_to_max_len", type=bool, default=False, help='clip sequences to maximum sequence length.') parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.') args = parser.parse_args() tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file) input_files = [] for input_pattern in args.input_file.split(","): input_files.append(input_pattern) logging.info("*** Reading from input files ***") for input_file in input_files: logging.info(" %s", input_file) output_file = args.output_file logging.info("*** Writing to output files ***") logging.info(" %s", output_file) writer = FileWriter(output_file, args.num_splits) data_schema = { "source_sos_ids": { "type": "int64", "shape": [-1] }, "source_sos_mask": { "type": "int64", "shape": [-1] }, "source_eos_ids": { "type": "int64", "shape": [-1] }, "source_eos_mask": { "type": "int64", "shape": [-1] }, "target_sos_ids": { "type": "int64", "shape": [-1] }, "target_sos_mask": { "type": "int64", "shape": [-1] }, "target_eos_ids": { "type": "int64", "shape": [-1] }, "target_eos_mask": { "type": "int64", "shape": [-1] } } writer.add_schema(data_schema, "tranformer hisi") total_written = 0 total_read = 0 for input_file in input_files: logging.info("*** Reading from %s ***", input_file) with open(input_file, "r") as reader: while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break total_read += 1 if total_read % 100000 == 0: logging.info("%d ...", total_read) source_line, target_line = line.strip().split("\t") source_tokens = tokenizer.tokenize(source_line) target_tokens = tokenizer.tokenize(target_line) if len(source_tokens) >= args.max_seq_length or len( target_tokens) >= args.max_seq_length: logging.info("ignore long sentence!") continue instance = create_training_instance( source_tokens, target_tokens, args.max_seq_length, clip_to_max_len=args.clip_to_max_len) if instance is None: continue features = write_instance_to_file(writer, instance, tokenizer, args.max_seq_length) total_written += 1 if total_written <= 20: logging.info("*** Example ***") logging.info( "source tokens: %s", " ".join([ tokenization.convert_to_printable(x) for x in instance.source_eos_tokens ])) logging.info( "target tokens: %s", " ".join([ tokenization.convert_to_printable(x) for x in instance.target_sos_tokens ])) for feature_name in features.keys(): feature = features[feature_name] logging.info("%s: %s", feature_name, feature) writer.commit() logging.info("Wrote %d total instances", total_written)
lines = f.readlines()
if args.shuffle:
    np.random.shuffle(lines)

dst_dir = '/'.join(args.dst_path.split('/')[:-1])
if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)

print('number of samples:', len(lines))
writer = FileWriter(file_name=args.dst_path, shard_num=args.num_shards)
writer.add_schema(seg_schema, "seg_schema")

data = []  # buffer of samples; flushed to the writer every 1000 records
cnt = 0
for l in lines:
    img_path, label_path = l.strip().split(' ')
    sample_ = {"file_name": img_path.split('/')[-1]}
    with open(os.path.join(args.data_root, img_path), 'rb') as f:
        sample_['data'] = f.read()
    with open(os.path.join(args.data_root, label_path), 'rb') as f:
        sample_['label'] = f.read()

    data.append(sample_)
    cnt += 1
    if cnt % 1000 == 0:
        writer.write_raw_data(data)
        print('number of samples written:', cnt)
        data = []

if data:
    writer.write_raw_data(data)
writer.commit()
print('number of samples written:', cnt)
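# The fragment above references seg_schema without defining it; it comes from
# earlier in the original script. A plausible definition, assuming each sample
# carries a file name plus raw image and label-mask bytes (matching the fields
# filled in the loop), is sketched here; the real schema may differ.
seg_schema = {
    "file_name": {"type": "string"},   # image file name
    "data": {"type": "bytes"},         # encoded image bytes
    "label": {"type": "bytes"},        # encoded segmentation-label bytes
}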
def test_case_02(add_and_remove_cv_file): # muti-bytes data = [{ "file_name": "001.jpg", "label": 43, "float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12345, "float64": 1987654321.123456785, "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image1 bytes abc", encoding='UTF-8'), "image2": bytes("image1 bytes def", encoding='UTF-8'), "image3": bytes("image1 bytes ghi", encoding='UTF-8'), "image4": bytes("image1 bytes jkl", encoding='UTF-8'), "image5": bytes("image1 bytes mno", encoding='UTF-8') }, { "file_name": "002.jpg", "label": 91, "float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12445, "float64": 1987654321.123456786, "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image2 bytes abc", encoding='UTF-8'), "image2": bytes("image2 bytes def", encoding='UTF-8'), "image3": bytes("image2 bytes ghi", encoding='UTF-8'), "image4": bytes("image2 bytes jkl", encoding='UTF-8'), "image5": bytes("image2 bytes mno", encoding='UTF-8') }, { "file_name": "003.jpg", "label": 61, "float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12545, "float64": 1987654321.123456787, "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image3 bytes abc", encoding='UTF-8'), "image2": bytes("image3 bytes def", encoding='UTF-8'), "image3": bytes("image3 bytes ghi", encoding='UTF-8'), "image4": bytes("image3 bytes jkl", encoding='UTF-8'), "image5": bytes("image3 bytes mno", encoding='UTF-8') }, { "file_name": "004.jpg", "label": 29, "float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12645, "float64": 1987654321.123456788, "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image4 bytes abc", encoding='UTF-8'), "image2": bytes("image4 bytes def", encoding='UTF-8'), "image3": bytes("image4 bytes ghi", encoding='UTF-8'), "image4": bytes("image4 bytes jkl", encoding='UTF-8'), "image5": bytes("image4 bytes mno", encoding='UTF-8') }, { "file_name": "005.jpg", "label": 78, "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12745, "float64": 1987654321.123456789, "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image5 bytes abc", encoding='UTF-8'), "image2": bytes("image5 bytes def", encoding='UTF-8'), "image3": 
bytes("image5 bytes ghi", encoding='UTF-8'), "image4": bytes("image5 bytes jkl", encoding='UTF-8'), "image5": bytes("image5 bytes mno", encoding='UTF-8') }, { "file_name": "006.jpg", "label": 37, "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), "float64_array": np.array([ 48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, 123414314.2141243, 87.1212122 ], dtype=np.float64), "float32": 3456.12745, "float64": 1987654321.123456789, "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int32), "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), "image1": bytes("image6 bytes abc", encoding='UTF-8'), "image2": bytes("image6 bytes def", encoding='UTF-8'), "image3": bytes("image6 bytes ghi", encoding='UTF-8'), "image4": bytes("image6 bytes jkl", encoding='UTF-8'), "image5": bytes("image6 bytes mno", encoding='UTF-8') }] schema = { "file_name": { "type": "string" }, "float32_array": { "type": "float32", "shape": [-1] }, "float64_array": { "type": "float64", "shape": [-1] }, "float32": { "type": "float32" }, "float64": { "type": "float64" }, "source_sos_ids": { "type": "int32", "shape": [-1] }, "source_sos_mask": { "type": "int64", "shape": [-1] }, "image1": { "type": "bytes" }, "image2": { "type": "bytes" }, "image3": { "type": "bytes" }, "label": { "type": "int32" }, "image4": { "type": "bytes" }, "image5": { "type": "bytes" } } writer = FileWriter(CV_FILE_NAME1, FILES_NUM) writer.add_schema(schema, "schema") writer.write_raw_data(data) writer.commit() d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False) d1.save(CV_FILE_NAME2, FILES_NUM) data_value_to_list = [] for item in data: new_data = {} new_data['file_name'] = np.asarray(item["file_name"], dtype='S') new_data['float32_array'] = item["float32_array"] new_data['float64_array'] = item["float64_array"] new_data['float32'] = item["float32"] new_data['float64'] = item["float64"] new_data['source_sos_ids'] = item["source_sos_ids"] new_data['source_sos_mask'] = item["source_sos_mask"] new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) data_value_to_list.append(new_data) d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2, num_parallel_workers=num_readers, shuffle=False) assert d2.get_dataset_size() == 6 num_iter = 0 for item in d2.create_dict_iterator(): assert len(item) == 13 for field in item: if isinstance(item[field], np.ndarray): if item[field].dtype == np.float32: assert (item[field] == np.array( data_value_to_list[num_iter][field], np.float32)).all() else: assert (item[field] == data_value_to_list[num_iter][field] ).all() else: assert item[field] == data_value_to_list[num_iter][field] num_iter += 1 assert num_iter == 6
def test_case_00(add_and_remove_cv_file):
    # only bin data
    data = [{"image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}]
    schema = {"image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer = FileWriter(CV_FILE_NAME1, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False)
    d1.save(CV_FILE_NAME2, FILES_NUM)

    data_value_to_list = []
    for item in data:
        new_data = {}
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 5
    num_iter = 0
    for item in d2.create_dict_iterator():
        assert len(item) == 5
        for field in item:
            if isinstance(item[field], np.ndarray):
                assert (item[field] == data_value_to_list[num_iter][field]).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 5
def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial():
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    cv_schema_json = {"id": {"type": "int32"},
                      "image_0": {"type": "bytes"},
                      "image_2": {"type": "bytes"},
                      "image_3": {"type": "bytes"},
                      "image_4": {"type": "bytes"},
                      "input_mask": {"type": "int32", "shape": [-1]},
                      "segments": {"type": "float32", "shape": [2, 3]}}
    writer.add_schema(cv_schema_json, "two_images_schema")
    with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader:
        img_data = file_reader.read()
    ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
    ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
    data = []
    for i in range(5):
        item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data,
                "image_4": img_data, "input_mask": ndarray_1, "segments": ndarray_2}
        data.append(item)
    writer.write_raw_data(data)
    writer.commit()
    assert os.path.exists(CV_FILE_NAME)
    assert os.path.exists(CV_FILE_NAME + ".db")

    # tutorial for MindDataset
    columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"]
    num_readers = 1
    data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
    assert data_set.get_dataset_size() == 5
    num_iter = 0
    for item in data_set.create_dict_iterator():
        assert len(item) == 7
        logger.info("item: {}".format(item))
        assert item["image_0"].dtype == np.uint8
        assert (item["image_0"] == item["image_2"]).all()
        assert (item["image_3"] == item["image_4"]).all()
        assert (item["image_0"] == item["image_4"]).all()
        assert item["image_2"].dtype == np.uint8
        assert item["image_3"].dtype == np.uint8
        assert item["image_4"].dtype == np.uint8
        assert item["id"].dtype == np.int32
        assert item["input_mask"].shape == (5,)
        assert item["input_mask"].dtype == np.int32
        assert item["segments"].shape == (2, 3)
        assert item["segments"].dtype == np.float32
        num_iter += 1
    assert num_iter == 5

    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
def transfer_coco_to_mindrecord(self, mindrecord_dir, file_name="coco_hp.train.mind", shard_num=1):
    """Create MindRecord file by image_dir and anno_path."""
    if not os.path.isdir(mindrecord_dir):
        os.makedirs(mindrecord_dir)
    if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
        logger.info("Create MindRecord based on COCO_HP dataset")
    else:
        raise ValueError('data_dir {} or anno_path {} does not exist'.format(
            self.image_path, self.annot_path))

    mindrecord_path = os.path.join(mindrecord_dir, file_name)
    writer = FileWriter(mindrecord_path, shard_num)
    centernet_json = {
        "image": {"type": "bytes"},
        "num_objects": {"type": "int32"},
        "keypoints": {"type": "int32", "shape": [-1, self.data_opt.num_joints * 3]},
        "bbox": {"type": "float32", "shape": [-1, 4]},
        "category_id": {"type": "int32", "shape": [-1]},
    }
    writer.add_schema(centernet_json, "centernet_json")

    for img_id in self.images:
        image_info = self.coco.loadImgs([img_id])
        annos = self.coco.loadAnns(self.anns[img_id])
        # get image
        img_name = image_info[0]['file_name']
        img_name = os.path.join(self.image_path, img_name)
        with open(img_name, 'rb') as f:
            image = f.read()

        # parse annos info
        keypoints = []
        category_id = []
        bbox = []
        num_objects = len(annos)
        for anno in annos:
            keypoints.append(anno['keypoints'])
            category_id.append(anno['category_id'])
            bbox.append(anno['bbox'])

        row = {"image": image,
               "num_objects": num_objects,
               "keypoints": np.array(keypoints, np.int32),
               "bbox": np.array(bbox, np.float32),
               "category_id": np.array(category_id, np.int32)}
        writer.write_raw_data([row])

    writer.commit()
    logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))
def test_issue_84():
    """test file reader when db does not match."""
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {"file_name": {"type": "string"},
                      "label": {"type": "number"},
                      "data": {"type": "bytes"}}
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                             "../data/mindrecord/testAclImdbData/vocab.txt",
                             10))
    nlp_schema_json = {"id": {"type": "string"},
                       "label": {"type": "number"},
                       "rating": {"type": "number"},
                       "input_ids": {"type": "array", "items": {"type": "number"}},
                       "input_mask": {"type": "array", "items": {"type": "number"}},
                       "segment_ids": {"type": "array", "items": {"type": "number"}}}
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()

    reader = ShardReader()
    os.rename("imagenet.mindrecord1.db", "imagenet.mindrecord1.db.bk")
    os.rename("aclImdb.mindrecord1.db", "imagenet.mindrecord1.db")
    file_name = os.path.join(os.getcwd(), "imagenet.mindrecord1")
    with pytest.raises(Exception) as e:
        reader.open(file_name)
    assert str(e.value) == "[MRMOpenError]: error_code: 1347690596, " \
                           "error_msg: " \
                           "MindRecord File could not open successfully."

    os.rename("imagenet.mindrecord1.db", "aclImdb.mindrecord1.db")
    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))

    os.rename("imagenet.mindrecord1.db.bk", "imagenet.mindrecord1.db")
    paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--input_file", type=str, required=True, help='Input raw text file (or comma-separated list of files).') parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.') parser.add_argument( "--num_splits", type=int, default=16, help='The MindRecord file will be split into the number of partition.') parser.add_argument( "--src_vocab_file", type=str, required=True, help='The vocabulary file that the Transformer model was trained on.') parser.add_argument( "--trg_vocab_file", type=str, required=True, help='The vocabulary file that the Transformer model was trained on.') parser.add_argument("--clip_to_max_len", type=ast.literal_eval, default=False, help='clip sequences to maximum sequence length.') parser.add_argument("--max_seq_length", type=int, default=32, help='Maximum sequence length.') parser.add_argument("--bucket", type=ast.literal_eval, default=[32], help='bucket sequence length') args = parser.parse_args() tokenizer_src = tokenization.WhiteSpaceTokenizer( vocab_file=args.src_vocab_file) tokenizer_trg = tokenization.WhiteSpaceTokenizer( vocab_file=args.trg_vocab_file) input_files = [] for input_pattern in args.input_file.split(","): input_files.append(input_pattern) logging.info("*** Read from input files ***") output_file = args.output_file logging.info("*** Write to output files ***") logging.info(" %s", output_file) total_written = 0 total_read = 0 feature_dict = {} for i in args.bucket: feature_dict[i] = [] for input_file in input_files: logging.info("*** Reading from %s ***", input_file) with open(input_file, "r") as reader: while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break total_read += 1 if total_read % 100000 == 0: logging.info("Read %d ...", total_read) if line.strip() == "": continue source_line, target_line = line.strip().split("\t") source_tokens = tokenizer_src.tokenize(source_line) target_tokens = tokenizer_trg.tokenize(target_line) if len(source_tokens) >= args.max_seq_length or len( target_tokens) >= args.max_seq_length: logging.info("ignore long sentence!") continue instance = create_training_instance( source_tokens, target_tokens, args.max_seq_length, clip_to_max_len=args.clip_to_max_len) if instance is None: continue features, seq_max_bucket_length = get_instance_features( instance, tokenizer_src, tokenizer_trg, args.max_seq_length, args.bucket) for key in feature_dict: if key == seq_max_bucket_length: feature_dict[key].append(features) if total_read <= 10: logging.info("*** Example ***") logging.info( "source tokens: %s", " ".join([ tokenization.convert_to_printable(x) for x in instance.source_tokens ])) logging.info( "target tokens: %s", " ".join([ tokenization.convert_to_printable(x) for x in instance.target_tokens ])) for feature_name in features.keys(): feature = features[feature_name] logging.info("%s: %s", feature_name, feature) for i in args.bucket: if args.num_splits == 1: output_file_name = output_file + '_' + str(i) else: output_file_name = output_file + '_' + str(i) + '_' writer = FileWriter(output_file_name, args.num_splits) data_schema = { "source_ids": { "type": "int64", "shape": [-1] }, "source_mask": { "type": "int64", "shape": [-1] }, "target_ids": { "type": "int64", "shape": [-1] }, "target_mask": { "type": "int64", "shape": [-1] } } writer.add_schema(data_schema, "gru") features_ = feature_dict[i] logging.info("Bucket length %d has %d samples, start writing...", i, len(features_)) for item in features_: 
writer.write_raw_data([item]) total_written += 1 writer.commit() logging.info("Wrote %d total instances", total_written)
def fsns_train_data_to_mindrecord(mindrecord_dir, prefix="data_ocr.mindrecord", file_num=8):
    anno_file_dirs = [config.train_annotation_file]
    images, image_path_dict, image_anno_dict = create_fsns_label(image_dir=config.data_root,
                                                                 anno_file_dirs=anno_file_dirs)
    vocab, _ = initialize_vocabulary(config.vocab_path)
    data_schema = {"image": {"type": "bytes"},
                   "label": {"type": "int32", "shape": [-1]},
                   "decoder_input": {"type": "int32", "shape": [-1]},
                   "decoder_mask": {"type": "int32", "shape": [-1]},
                   "decoder_target": {"type": "int32", "shape": [-1]},
                   "annotation": {"type": "string"}}

    mindrecord_path = os.path.join(mindrecord_dir, prefix)
    writer = FileWriter(mindrecord_path, file_num)
    writer.add_schema(data_schema, "ocr")

    for img_id in images:
        image_path = image_path_dict[img_id]
        annotation = image_anno_dict[img_id]

        label_max_len = config.max_text_len
        text_max_len = config.max_text_len - 2
        if len(annotation) > text_max_len:
            continue

        label = serialize_annotation(image_path, annotation, vocab)
        if label is None:
            continue

        label_len = len(label)
        decoder_input_len = label_max_len
        if label_len <= decoder_input_len:
            label = np.concatenate((label, np.zeros(decoder_input_len - label_len, dtype=np.int32)))
            one_mask_len = label_len - config.go_shift
            target_weight = np.concatenate((np.ones(one_mask_len, dtype=np.float32),
                                            np.zeros(decoder_input_len - one_mask_len, dtype=np.float32)))
        else:
            continue

        decoder_input = (np.array(label).T).astype(np.int32)
        target_weight = (np.array(target_weight).T).astype(np.int32)
        if not len(decoder_input) == len(target_weight):
            continue

        target = [decoder_input[i + 1] for i in range(len(decoder_input) - 1)]
        target = (np.array(target)).astype(np.int32)

        with open(image_path, 'rb') as f:
            img = f.read()
        row = {"image": img,
               "label": label,
               "decoder_input": decoder_input,
               "decoder_mask": target_weight,
               "decoder_target": target,
               "annotation": str(annotation)}
        writer.write_raw_data([row])

    writer.commit()
def test_write_read_process():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 43, "score": 0.8,
             "mask": np.array([3, 6, 9], dtype=np.int64),
             "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
             "data": bytes("image bytes abc", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91, "score": 5.4,
             "mask": np.array([1, 4, 7], dtype=np.int64),
             "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
             "data": bytes("image bytes def", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61, "score": 6.4,
             "mask": np.array([7, 6, 3], dtype=np.int64),
             "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
             "data": bytes("image bytes ghi", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29, "score": 8.1,
             "mask": np.array([2, 8, 0], dtype=np.int64),
             "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
             "data": bytes("image bytes jkl", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78, "score": 7.7,
             "mask": np.array([3, 1, 2], dtype=np.int64),
             "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
             "data": bytes("image bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37, "score": 9.4,
             "mask": np.array([7, 6, 7], dtype=np.int64),
             "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
             "data": bytes("image bytes pqr", encoding='UTF-8')}]

    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "label": {"type": "int32"},
              "score": {"type": "float64"},
              "mask": {"type": "int64", "shape": [-1]},
              "segments": {"type": "float32", "shape": [2, 2]},
              "data": {"type": "bytes"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def transfer_coco_to_mindrecord(self, mindrecord_dir, file_name="coco_det.train.mind", shard_num=1):
    """Create MindRecord file by image_dir and anno_path."""
    if not os.path.isdir(mindrecord_dir):
        os.makedirs(mindrecord_dir)
    if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
        logger.info("Create MindRecord based on COCO_HP dataset")
    else:
        raise ValueError('data_dir {} or anno_path {} does not exist'.format(
            self.image_path, self.annot_path))

    mindrecord_path = os.path.join(mindrecord_dir, file_name)
    writer = FileWriter(mindrecord_path, shard_num)
    centernet_json = {
        "img_id": {"type": "int32", "shape": [1]},
        "image": {"type": "bytes"},
        "num_objects": {"type": "int32"},
        "bboxes": {"type": "float32", "shape": [-1, 4]},
        "category_id": {"type": "int32", "shape": [-1]},
    }
    writer.add_schema(centernet_json, "centernet_json")

    for img_id in self.images:
        image_info = self.coco.loadImgs([img_id])
        annos = self.coco.loadAnns(self.anns[img_id])
        # get image
        img_name = image_info[0]['file_name']
        img_name = os.path.join(self.image_path, img_name)
        with open(img_name, 'rb') as f:
            image = f.read()

        bboxes = []
        category_id = []
        num_objects = len(annos)
        for anno in annos:
            bbox = self._coco_box_to_bbox(anno['bbox'])
            class_name = self.classs_dict[anno["category_id"]]
            if class_name in self.train_cls:
                x_min, x_max = bbox[0], bbox[2]
                y_min, y_max = bbox[1], bbox[3]
                bboxes.append([x_min, y_min, x_max, y_max])
                category_id.append(self.train_cls_dict[class_name])

        row = {"img_id": np.array([img_id], dtype=np.int32),
               "image": image,
               "num_objects": num_objects,
               "bboxes": np.array(bboxes, np.float32),
               "category_id": np.array(category_id, np.int32)}
        writer.write_raw_data([row])

    writer.commit()
    logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))
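# Hedged read-back sketch for the detection MindRecord written above. The column
# list mirrors the centernet_json schema, but the decode and augmentation steps
# of the real training pipeline are omitted, and the helper name is an assumption,
# not part of the original dataset class.
import mindspore.dataset as ds

def check_coco_det_mindrecord(mindrecord_path, num_parallel_workers=4):
    """Iterate the written file once and return how many rows it holds (sanity check only)."""
    data_set = ds.MindDataset(dataset_file=mindrecord_path,
                              columns_list=["img_id", "image", "num_objects", "bboxes", "category_id"],
                              num_parallel_workers=num_parallel_workers,
                              shuffle=False)
    count = 0
    for _ in data_set.create_dict_iterator():
        # "image" comes back as a raw uint8 buffer; decoding is left to the caller
        count += 1
    return count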