Code example #1
def get_all_shuffled():
    from steves_utils.ORACLE.shuffled_dataset_accessor import Shuffled_Dataset_Factory
    from steves_utils import utils
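    # NOTE: relies on module-level names not shown in this excerpt (os, tensorflow as tf, and ALL_SERIAL_NUMBERS)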

    RANGE   = len(ALL_SERIAL_NUMBERS)
    path = os.path.join(utils.get_datasets_base_path(), "all_shuffled", "output")
    print(utils.get_datasets_base_path())
    print(path)
    datasets = Shuffled_Dataset_Factory(path, train_val_test_splits=(0.6, 0.2, 0.2))

    train_ds = datasets["train_ds"]
    val_ds = datasets["val_ds"]
    test_ds = datasets["test_ds"]    

    train_ds = train_ds.map(
        lambda x: (x["IQ"],tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True
    )

    val_ds = val_ds.map(
        lambda x: (x["IQ"],tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True
    )

    test_ds = test_ds.map(
        lambda x: (x["IQ"],tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True
    )

    return train_ds, val_ds, test_ds
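
A minimal usage sketch for the function above, assuming a compiled Keras model named "model" whose input shape matches the IQ chunks (a hypothetical consumer, not part of the original module; the datasets returned by the factory appear to already be batched, since later examples unbatch them, so no extra batching is applied here):

train_ds, val_ds, test_ds = get_all_shuffled()

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,  # hypothetical epoch count
)
model.evaluate(test_ds)
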
Code example #2
    def test_for_duplicates(self):
        datasets = Shuffled_Dataset_Factory(
            self.output_path, train_val_test_splits=self.train_val_test_splits)

        train_ds = datasets["train_ds"]
        val_ds = datasets["val_ds"]
        test_ds = datasets["test_ds"]

        all_ds = train_ds.concatenate(val_ds).concatenate(test_ds)

        train_hashes = []
        for e in train_ds.unbatch():
            train_hashes.append(
                hash((
                    int(e["serial_number_id"].numpy()),
                    int(e["distance_feet"].numpy()),
                    int(e["run"].numpy()),
                    int(e["index_in_file"].numpy()),
                )))

        val_hashes = []
        for e in val_ds.unbatch():
            val_hashes.append(
                hash((
                    int(e["serial_number_id"].numpy()),
                    int(e["distance_feet"].numpy()),
                    int(e["run"].numpy()),
                    int(e["index_in_file"].numpy()),
                )))

        test_hashes = []
        for e in test_ds.unbatch():
            test_hashes.append(
                hash((
                    int(e["serial_number_id"].numpy()),
                    int(e["distance_feet"].numpy()),
                    int(e["run"].numpy()),
                    int(e["index_in_file"].numpy()),
                )))

        all_hashes = []
        for e in all_ds.unbatch():
            all_hashes.append(
                hash((
                    int(e["serial_number_id"].numpy()),
                    int(e["distance_feet"].numpy()),
                    int(e["run"].numpy()),
                    int(e["index_in_file"].numpy()),
                )))

        self.assertTrue(
            len(all_hashes) == len(train_hashes + val_hashes + test_hashes))

        self.assertTrue(
            len(train_hashes + val_hashes + test_hashes) == len(
                set(train_hashes + val_hashes + test_hashes)))
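
The four loops above build the same tuple key each time; a hypothetical helper (not part of the original test) that hashes one unbatched element would remove the repetition:

def example_hash(e):
    # Hash the metadata fields that uniquely identify a chunk
    return hash((
        int(e["serial_number_id"].numpy()),
        int(e["distance_feet"].numpy()),
        int(e["run"].numpy()),
        int(e["index_in_file"].numpy()),
    ))

train_hashes = [example_hash(e) for e in train_ds.unbatch()]
val_hashes   = [example_hash(e) for e in val_ds.unbatch()]
test_hashes  = [example_hash(e) for e in test_ds.unbatch()]
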
Code example #3
    def test_cardinality(self):
        # Because we are working with discrete files, some chunks appear to get dropped,
        # so we allow up to 1% of the data to be lost.
        acceptable_cardinality_delta_percent = 0.01

        datasets = Shuffled_Dataset_Factory(
            self.output_path, train_val_test_splits=self.train_val_test_splits)

        train_ds = datasets["train_ds"]
        val_ds = datasets["val_ds"]
        test_ds = datasets["test_ds"]

        train_count = 0
        for e in train_ds:
            train_count += e["index_in_file"].shape[0]

        val_count = 0
        for e in val_ds:
            val_count += e["index_in_file"].shape[0]

        test_count = 0
        for e in test_ds:
            test_count += e["index_in_file"].shape[0]

        expected_cardinality = self.cardinality
        expected_train_count = expected_cardinality * self.train_val_test_splits[0]
        expected_val_count = expected_cardinality * self.train_val_test_splits[1]
        expected_test_count = expected_cardinality * self.train_val_test_splits[2]

        self.assertAlmostEqual(expected_cardinality,
                               train_count + val_count + test_count,
                               delta=expected_cardinality *
                               acceptable_cardinality_delta_percent)
        self.assertAlmostEqual(train_count,
                               expected_train_count,
                               delta=expected_train_count *
                               acceptable_cardinality_delta_percent)
        self.assertAlmostEqual(val_count,
                               expected_val_count,
                               delta=expected_val_count *
                               acceptable_cardinality_delta_percent)
        self.assertAlmostEqual(test_count,
                               expected_test_count,
                               delta=expected_test_count *
                               acceptable_cardinality_delta_percent)
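
To make the 1% tolerance concrete, here is the same delta arithmetic with hypothetical numbers (a total cardinality of 1,000,000 chunks and the (0.6, 0.2, 0.2) splits used above):

cardinality = 1_000_000                      # hypothetical total number of chunks
splits = (0.6, 0.2, 0.2)
tolerance = 0.01

expected_train = cardinality * splits[0]     # 600,000
allowed_delta = expected_train * tolerance   # the observed train count may be off by up to +/- 6,000 chunks
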
Code example #4
    def test_compare_chunks_to_original(self):
        datasets = Shuffled_Dataset_Factory(
            self.output_path, train_val_test_splits=self.train_val_test_splits)

        train_ds = datasets["train_ds"]
        val_ds = datasets["val_ds"]
        test_ds = datasets["test_ds"]

        all_ds = train_ds.concatenate(val_ds).concatenate(test_ds)

        for e in all_ds.unbatch():
            original_iq = get_chunk_of_IQ_based_on_metadata_and_index(
                serial_number_id=e["serial_number_id"].numpy(),
                distance_feet=e["distance_feet"].numpy(),
                run=e["run"].numpy(),
                index=e["index_in_file"].numpy(),
                num_samps_in_chunk=ORIGINAL_PAPER_SAMPLES_PER_CHUNK)

            self.assertTrue(np.array_equal(e["IQ"].numpy(), original_iq))
Code example #5
def get_all_shuffled():
    global RANGE
    from steves_utils.ORACLE.shuffled_dataset_accessor import Shuffled_Dataset_Factory
    from steves_utils import utils

    BATCH = 256

    path = os.path.join(utils.get_datasets_base_path(), "all_shuffled",
                        "output")
    print(utils.get_datasets_base_path())
    print(path)
    datasets = Shuffled_Dataset_Factory(path,
                                        train_val_test_splits=(0.6, 0.2, 0.2),
                                        reshuffle_train_each_iteration=False)

    train_ds = datasets["train_ds"]
    val_ds = datasets["val_ds"]
    test_ds = datasets["test_ds"]

    train_ds = train_ds.map(
        lambda x: (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    val_ds = val_ds.map(lambda x:
                        (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                        num_parallel_calls=tf.data.AUTOTUNE,
                        deterministic=True)

    test_ds = test_ds.map(lambda x:
                          (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                          num_parallel_calls=tf.data.AUTOTUNE,
                          deterministic=True)

    train_ds = train_ds.unbatch().take(200000 *
                                       len(ALL_SERIAL_NUMBERS)).batch(BATCH)
    val_ds = val_ds.unbatch().take(10000 *
                                   len(ALL_SERIAL_NUMBERS)).batch(BATCH)
    test_ds = test_ds.unbatch().take(50000 *
                                     len(ALL_SERIAL_NUMBERS)).batch(BATCH)

    return train_ds, val_ds, test_ds
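
The take() calls above cap the dataset sizes; because the source is already shuffled, the classes stay roughly balanced. Assuming ALL_SERIAL_NUMBERS lists 16 transmitters (a comment in a later example notes serial_number_id ranges over [0, 15]), the caps work out as follows:

num_devices = 16    # assumed: serial_number_id ranges over [0, 15]
BATCH = 256

train_examples = 200_000 * num_devices   # 3,200,000 chunks
val_examples = 10_000 * num_devices      #   160,000 chunks
test_examples = 50_000 * num_devices     #   800,000 chunks

train_batches_per_epoch = train_examples // BATCH   # 12,500
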
Code example #6
    def test_shuffling(self):
        """
        This one is a bit hard. How do you check for randomness?

        What I ended up doing is taking the 'index_in_file' metadata field, sorting it, and comparing it to the original.
        If they aren't the same then we should be good.
        """

        datasets = Shuffled_Dataset_Factory(
            self.output_path, train_val_test_splits=self.train_val_test_splits)

        train_ds = datasets["train_ds"]
        val_ds = datasets["val_ds"]
        test_ds = datasets["test_ds"]

        for ds in (train_ds, val_ds, test_ds):
            indices = []
            for e in ds:
                indices.extend(e["index_in_file"].numpy())

            sorted_indices = copy.deepcopy(indices)
            sorted_indices.sort()

            self.assertFalse(np.array_equal(indices, sorted_indices))
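
The same check in isolation, on a small list of hypothetical index values (it can only fail spuriously if a shuffled split happens to come out already sorted, which is vanishingly unlikely at real dataset sizes):

import numpy as np

indices = [4, 0, 7, 2, 9, 1]    # hypothetical iteration order read from a split
sorted_indices = sorted(indices)

assert not np.array_equal(indices, sorted_indices)   # holds because the iteration order is not already sorted
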
Code example #7
def get_all_shuffled_windowed():
    global RANGE
    from steves_utils.ORACLE.shuffled_dataset_accessor import Shuffled_Dataset_Factory
    from steves_utils import utils

    DATASET_BATCH_SIZE = 100
    BATCH = 256
    chunk_size = 4 * ORIGINAL_PAPER_SAMPLES_PER_CHUNK
    STRIDE_SIZE = 1

    NUM_REPEATS = math.floor(
        (chunk_size - ORIGINAL_PAPER_SAMPLES_PER_CHUNK) / STRIDE_SIZE) + 1

    path = os.path.join(utils.get_datasets_base_path(),
                        "all_shuffled_chunk-512", "output")
    print(utils.get_datasets_base_path())
    print(path)
    datasets = Shuffled_Dataset_Factory(path,
                                        train_val_test_splits=(0.6, 0.2, 0.2),
                                        reshuffle_train_each_iteration=False)

    train_ds = datasets["train_ds"]
    val_ds = datasets["val_ds"]
    test_ds = datasets["test_ds"]

    train_ds = train_ds.unbatch().take(200000 * len(ALL_SERIAL_NUMBERS))
    val_ds = val_ds.unbatch().take(10000 * len(ALL_SERIAL_NUMBERS))
    test_ds = test_ds.unbatch().take(50000 * len(ALL_SERIAL_NUMBERS))

    train_ds = train_ds.map(
        lambda x: (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    val_ds = val_ds.map(lambda x:
                        (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                        num_parallel_calls=tf.data.AUTOTUNE,
                        deterministic=True)

    test_ds = test_ds.map(lambda x:
                          (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                          num_parallel_calls=tf.data.AUTOTUNE,
                          deterministic=True)

    train_ds = train_ds.map(lambda x, y: (tf.transpose(
        tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK, STRIDE_SIZE), [
            1, 0, 2
        ]), tf.repeat(tf.reshape(y, (1, RANGE)), repeats=NUM_REPEATS, axis=0)),
                            num_parallel_calls=tf.data.AUTOTUNE,
                            deterministic=True)

    # We aren't really windowing the val and test data, we are just splitting them into 128-sample chunks
    # so that they have the same shape as the train data (see the shape-check sketch after this example)
    val_ds = val_ds.map(
        lambda x, y: (
            tf.transpose(
                # See, stride == length, meaning we are just splitting the chunks, not really windowing
                tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK,
                                ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                [1, 0, 2]),
            tf.repeat(tf.reshape(y, (1, RANGE)),
                      repeats=math.floor(chunk_size /
                                         ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                      axis=0)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    test_ds = test_ds.map(
        lambda x, y: (
            tf.transpose(
                # See, stride == length, meaning we are just splitting the chunks, not really windowing
                tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK,
                                ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                [1, 0, 2]),
            tf.repeat(tf.reshape(y, (1, RANGE)),
                      repeats=math.floor(chunk_size /
                                         ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                      axis=0)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    train_ds = train_ds.unbatch().take(200000 * len(ALL_SERIAL_NUMBERS))
    val_ds = val_ds.unbatch().take(10000 * len(ALL_SERIAL_NUMBERS))
    test_ds = test_ds.unbatch().take(50000 * len(ALL_SERIAL_NUMBERS))

    train_ds = train_ds.shuffle(DATASET_BATCH_SIZE * NUM_REPEATS * 3,
                                reshuffle_each_iteration=True)

    train_ds = train_ds.batch(BATCH)
    val_ds = val_ds.batch(BATCH)
    test_ds = test_ds.batch(BATCH)

    train_ds = train_ds.prefetch(100)
    val_ds = val_ds.prefetch(100)
    test_ds = test_ds.prefetch(100)

    return train_ds, val_ds, test_ds
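
The "splitting vs. windowing" distinction in the comments above comes down to the frame step passed to tf.signal.frame. A quick shape check, assuming ORIGINAL_PAPER_SAMPLES_PER_CHUNK is 128 so that chunk_size is 512 and an IQ chunk is shaped (2, 512):

import tensorflow as tf

iq = tf.zeros((2, 512))   # hypothetical IQ chunk: 2 channels x 512 samples

# Train path: stride 1 gives a sliding window, floor((512 - 128) / 1) + 1 = 385 frames
train_windows = tf.transpose(tf.signal.frame(iq, 128, 1), [1, 0, 2])
print(train_windows.shape)   # (385, 2, 128)

# Val/test path: stride == frame length just splits the chunk into 512 / 128 = 4 pieces
split_windows = tf.transpose(tf.signal.frame(iq, 128, 128), [1, 0, 2])
print(split_windows.shape)   # (4, 2, 128)
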
Code example #8
def get_windowed_foxtrot_shuffled():
    from steves_utils.ORACLE.shuffled_dataset_accessor import Shuffled_Dataset_Factory
    from steves_utils import utils

    path = os.path.join(utils.get_datasets_base_path(), "foxtrot", "output")
    datasets = Shuffled_Dataset_Factory(path,
                                        train_val_test_splits=(0.6, 0.2, 0.2))

    train_ds = datasets["train_ds"]
    val_ds = datasets["val_ds"]
    test_ds = datasets["test_ds"]

    # count = 0
    # for e in train_ds.concatenate(val_ds).concatenate(test_ds):
    #     count += e["IQ"].shape[0]
    # print(count)
    # sys.exit(1)

    train_ds = train_ds.unbatch()
    val_ds = val_ds.unbatch()
    test_ds = test_ds.unbatch()

    # Chunk size and batch size are determined by the shuffled dataset
    chunk_size = 4 * ORIGINAL_PAPER_SAMPLES_PER_CHUNK
    STRIDE_SIZE = 1
    BATCH = 1000
    REBATCH = 500

    NUM_REPEATS = math.floor(
        (chunk_size - ORIGINAL_PAPER_SAMPLES_PER_CHUNK) / STRIDE_SIZE) + 1

    # print(RANGE)
    # sys.exit(1)

    # serial_number_id ranges from [0,15]

    # train_ds = train_ds.filter(lambda x: x["serial_number_id"] < 13 or x["serial_number_id"] > 13)
    # val_ds = val_ds.filter(lambda x: x["serial_number_id"] < 13 or x["serial_number_id"] > 13)
    # test_ds = test_ds.filter(lambda x: x["serial_number_id"] < 13 or x["serial_number_id"] > 13)

    # train_ds = train_ds.filter(lambda x: x["serial_number_id"] !=  13)
    # val_ds = val_ds.filter(lambda x: x["serial_number_id"] !=  13)
    # test_ds = test_ds.filter(lambda x: x["serial_number_id"]  != 13)

    # train_ds = train_ds.filter(lambda x: x["serial_number_id"] < 15)
    # val_ds = val_ds.filter(lambda x: x["serial_number_id"]     < 15)
    # test_ds = test_ds.filter(lambda x: x["serial_number_id"]   < 15)

    # val_ds = val_ds.filter(lambda x: x["serial_number_id"] in target_serials)
    # test_ds = test_ds.filter(lambda x: x["serial_number_id"] in target_serials)

    train_ds = train_ds.map(
        lambda x: (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    val_ds = val_ds.map(lambda x:
                        (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                        num_parallel_calls=tf.data.AUTOTUNE,
                        deterministic=True)

    test_ds = test_ds.map(lambda x:
                          (x["IQ"], tf.one_hot(x["serial_number_id"], RANGE)),
                          num_parallel_calls=tf.data.AUTOTUNE,
                          deterministic=True)

    train_ds = train_ds.map(
        lambda x, y: (
            tf.transpose(
                tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK,
                                STRIDE_SIZE),  # Somehow we get 9 frames from this
                [1, 0, 2]),
            tf.repeat(tf.reshape(y, (1, RANGE)),
                      repeats=NUM_REPEATS,
                      axis=0)  # Repeat our one hot tensor 9 times
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    val_ds = val_ds.map(
        lambda x, y: (
            tf.transpose(
                tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK,
                                ORIGINAL_PAPER_SAMPLES_PER_CHUNK
                                ),  # Somehow we get 9 frames from this
                [1, 0, 2]),
            tf.repeat(tf.reshape(y, (1, RANGE)),
                      repeats=math.floor(chunk_size /
                                         ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                      axis=0)  # Repeat our one hot tensor 9 times
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    test_ds = test_ds.map(
        lambda x, y: (
            tf.transpose(
                tf.signal.frame(x, ORIGINAL_PAPER_SAMPLES_PER_CHUNK,
                                ORIGINAL_PAPER_SAMPLES_PER_CHUNK
                                ),  # Somehow we get 9 frames from this
                [1, 0, 2]),
            tf.repeat(tf.reshape(y, (1, RANGE)),
                      repeats=math.floor(chunk_size /
                                         ORIGINAL_PAPER_SAMPLES_PER_CHUNK),
                      axis=0)  # Repeat our one hot tensor 9 times
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True)

    train_ds = train_ds.unbatch()
    val_ds = val_ds.unbatch()
    test_ds = test_ds.unbatch()

    train_ds = train_ds.shuffle(BATCH * NUM_REPEATS * 4)
    val_ds = val_ds.shuffle(BATCH * NUM_REPEATS * 4)
    test_ds = test_ds.shuffle(BATCH * NUM_REPEATS * 4)

    # for e in test_ds:
    #     print(e[1])

    # sys.exit(1)

    train_ds = train_ds.batch(REBATCH)
    val_ds = val_ds.batch(REBATCH)
    test_ds = test_ds.batch(REBATCH)

    train_ds = train_ds.prefetch(100)
    val_ds = val_ds.prefetch(100)
    test_ds = test_ds.prefetch(100)

    return train_ds, val_ds, test_ds
Code example #9
    def __init__(
        self,
        input_shuffled_ds_dir,
        input_shuffled_ds_num_samples_per_chunk,
        output_batch_size,
        output_max_file_size_MB,
        seed,
        num_windowed_examples_per_device,
        num_val_examples_per_device,
        num_test_examples_per_device,
        output_window_size,
        distances_to_filter_on,
        serials_to_filter_on,
        working_dir,
        output_format_str,
        stride_length,
        # output_format_str="shuffled_batchSize-{batch_size}_part-{part}.tfrecord_ds",
    ) -> None:
        self.serial_ids_to_filter_on                 = [serial_number_to_id(serial) for serial in  serials_to_filter_on]
        self.num_windowed_examples_per_device        = num_windowed_examples_per_device
        self.num_val_examples_per_device             = num_val_examples_per_device
        self.num_test_examples_per_device            = num_test_examples_per_device
        self.input_shuffled_ds_num_samples_per_chunk = input_shuffled_ds_num_samples_per_chunk
        self.output_batch_size                       = output_batch_size
        self.output_max_file_size_MB                 = output_max_file_size_MB
        self.seed                                    = seed
        self.output_window_size                      = output_window_size
        self.stride_length                           = stride_length

        self.window_pile_dir                         = os.path.join(working_dir, "pile_train")
        self.window_output_dir                       = os.path.join(working_dir, "train")
        self.val_pile_dir                            = os.path.join(working_dir, "pile_val")
        self.val_output_dir                          = os.path.join(working_dir, "val")
        self.test_pile_dir                           = os.path.join(working_dir, "pile_test")
        self.test_output_dir                         = os.path.join(working_dir, "test")

        # If necessary we can customize these
        self.window_output_format_str                = output_format_str
        self.val_output_format_str                   = output_format_str
        self.test_output_format_str                  = output_format_str

        self.num_devices = len(self.serial_ids_to_filter_on)
        
        # Yeah it's pretty hacky since we don't really need to split the dataset into test and val, but
        # it's already written and tested
        datasets = Shuffled_Dataset_Factory(
            input_shuffled_ds_dir, train_val_test_splits=(0.6, 0.2, 0.2), reshuffle_train_each_iteration=False
        )

        self.train_ds = datasets["train_ds"].unbatch()
        self.val_ds = datasets["val_ds"].unbatch()
        self.test_ds = datasets["test_ds"].unbatch()

        self.og_datasets = {
            "train_ds": self.train_ds, 
            "val_ds": self.val_ds,
            "test_ds": self.test_ds
        }

        # Since we are windowing, the number of examples we take from the original dataset is smaller
        # than the actual number of windows we want to generate
        replication_factor = math.floor((input_shuffled_ds_num_samples_per_chunk - output_window_size)/stride_length + 1)
        num_train_examples_to_get_per_device = math.ceil(num_windowed_examples_per_device/replication_factor)

        # These are a little different: since the stride equals the output window size, we are just
        # splitting ("un-chunking") each chunk rather than truly windowing it
        num_val_examples_per_device = math.ceil(num_val_examples_per_device / (input_shuffled_ds_num_samples_per_chunk/output_window_size))
        num_test_examples_per_device = math.ceil(num_test_examples_per_device / (input_shuffled_ds_num_samples_per_chunk/output_window_size))

        # print("Fetching {} Train Chunks Per Device".format(num_train_examples_to_get_per_device))
        # print("Fetching {} Val Chunks Per Device".format(num_val_examples_per_device))
        # print("Fetching {} Tess Chunks Per Device".format(num_test_examples_per_device))
        # print("Replication Factor:", replication_factor)

        self.train_ds = Windowed_Dataset_Shuffler.build_per_device_filtered_dataset(
            distances_to_filter_on=distances_to_filter_on,
            serial_ids_to_filter_on=self.serial_ids_to_filter_on,
            num_examples_per_serial_id=num_train_examples_to_get_per_device,
            ds=self.train_ds,
        )
        self.val_ds = Windowed_Dataset_Shuffler.build_per_device_filtered_dataset(
            distances_to_filter_on=distances_to_filter_on,
            serial_ids_to_filter_on=self.serial_ids_to_filter_on,
            num_examples_per_serial_id=num_val_examples_per_device,
            ds=self.val_ds,
        )
        self.test_ds = Windowed_Dataset_Shuffler.build_per_device_filtered_dataset(
            distances_to_filter_on=distances_to_filter_on,
            serial_ids_to_filter_on=self.serial_ids_to_filter_on,
            num_examples_per_serial_id=num_test_examples_per_device,
            ds=self.test_ds,
        )

        # print("Train Length Before Windowing:", utils.get_iterator_cardinality(self.train_ds))
        # print("Val Length Before Windowing:", utils.get_iterator_cardinality(self.val_ds))
        # print("Test Length Before Windowing:", utils.get_iterator_cardinality(self.test_ds))

        self.train_ds = self.window_ds(self.train_ds, self.stride_length)
        self.val_ds   = self.window_ds(self.val_ds, output_window_size)
        self.test_ds  = self.window_ds(self.test_ds, output_window_size)

        # self.train_ds = self.train_ds.batch(1000)
        # self.val_ds = self.val_ds.batch(1000)
        # self.test_ds = self.test_ds.batch(1000)

        # print("Train Length:", utils.get_iterator_cardinality(self.train_ds))
        # print("Val Length:", utils.get_iterator_cardinality(self.val_ds))
        # print("Test Length:", utils.get_iterator_cardinality(self.test_ds))


        # raise Exception("Done")

        # This is another straight-up hack: the val and test splits aren't really shuffled, we're just
        # using the shuffler machinery to write the DS to file
        self.train_shuffler = self.make_train_shuffler()
        self.val_shuffler   = self.make_val_shuffler()
        self.test_shuffler  = self.make_test_shuffler()
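
To illustrate the accounting in this constructor, here is the same arithmetic with hypothetical parameters (512-sample input chunks, 128-sample output windows, stride 1, and the per-device counts used in the earlier examples):

import math

input_samples_per_chunk = 512   # hypothetical
output_window_size = 128        # hypothetical
stride_length = 1

# Each input chunk yields this many overlapping training windows
replication_factor = math.floor((input_samples_per_chunk - output_window_size) / stride_length + 1)   # 385

# To end up with 200,000 windows per device, only ~520 chunks per device need to be read
num_windowed_examples_per_device = 200_000
num_train_chunks_per_device = math.ceil(num_windowed_examples_per_device / replication_factor)        # 520

# Val/test chunks are split, not windowed: each chunk yields 512 / 128 = 4 non-overlapping windows
num_val_examples_per_device = 10_000
num_val_chunks_per_device = math.ceil(num_val_examples_per_device / (input_samples_per_chunk / output_window_size))   # 2,500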