Example #1
def motif_discovery_raw(train_file, test_file):
    subset_size = 690 * 190

    x_shape = len(range(101))
    train_gen = gen_from_fasta(train_file, None)
    test_gen = gen_from_fasta(test_file, None)

    # datasets
    batch_size = 512
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10).map(vectorize_text)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    x_val, y_val = [], []
    for d in val_ds:
        x_val.append(d[0])
        y_val.append(d[1])
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)

    return x_shape, train_ds, validation_data, test_ds
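
A possible way to consume the values returned above, sketched as an assumption rather than part of the original example; build_model stands in for any tf.keras model whose input matches x_shape, and train_file, test_file and the epoch count are placeholders.

# Hypothetical usage sketch for motif_discovery_raw (build_model and the epoch count are assumed)
x_shape, train_ds, validation_data, test_ds = motif_discovery_raw(train_file, test_file)
model = build_model(x_shape)  # placeholder: any Keras model taking inputs of length x_shape
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_ds, validation_data=validation_data, epochs=5)
model.evaluate(test_ds)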
Example #2
def make_generator(src_dir, valid_rate, input_size, batch_size):

    # Create the ImageDataGenerator instance
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Create the generator
    # --- training data
    train_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Create the generator
    # --- validation data
    valid_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap in a tf.data.Dataset
    # --- training data generator
    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    # Wrap in a tf.data.Dataset
    # --- validation data generator
    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    # Repeat each Dataset indefinitely
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n
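
Because both returned datasets call repeat(), a Keras training loop has to bound each epoch itself. A minimal sketch, assuming a compiled model named model and placeholder directory, image size, batch size and epoch values:

import math

train_ds, n_train, valid_ds, n_valid = make_generator('data/images', 0.2, (224, 224), 32)
model.fit(train_ds,
          steps_per_epoch=math.ceil(n_train / 32),
          validation_data=valid_ds,
          validation_steps=math.ceil(n_valid / 32),
          epochs=10)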
Example #3
def make_generator(src_dir, valid_rate, input_size, batch_size):
    '''Function that builds Dataset generators.
    Datasets are created via the flow dir -> generator -> Dataset.
    Directory names under src_dir automatically become the class names (flow_from_directory).
    Tune the ImageDataGenerator parameters to the target (e.g. road signs do not need vertical flips).
    '''
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Build the data generator automatically from the directory structure and names
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    cls_info = {v: k for k, v in train_generator.class_indices.items()}

    return train_ds, train_generator.n, valid_ds, valid_generator.n, cls_info
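
The cls_info dictionary returned above maps class indices back to directory names, which is useful when decoding predictions. A small sketch under assumed names; model and image_batch are placeholders:

import numpy as np

train_ds, n_train, valid_ds, n_valid, cls_info = make_generator(src_dir, 0.2, (224, 224), 32)
# ... train `model` on train_ds / valid_ds (not shown) ...
probs = model.predict(image_batch)
predicted_names = [cls_info[int(i)] for i in np.argmax(probs, axis=1)]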
Example #4
def make_generator(src_dir, valid_rate, input_size, batch_size):

    # Create the instance
    # --- ImageDataGenerator class
    train_datagen = ImageDataGenerator(rescale=1 / 255,
                                       validation_split=valid_rate)

    # Create the generator
    # --- load the training data
    # --- 250 * (1 - 0.2) = 200
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Create the generator
    # --- load the validation data
    # --- 250 * 0.2 = 50
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap in a tf.data.Dataset
    # --- training data generator
    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    # Wrap in a tf.data.Dataset
    # --- validation data generator
    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    # Repeat each Dataset indefinitely
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n
Example #5
def h3(file, word_size=3, region_size=0, expand=True):
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size and update options
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val

    encode_func = encode(word_size, region_size, expand=expand)
    x_shape = encoded_shape(sequences[0],
                            word_size,
                            region_size,
                            expand=expand)

    train_gen = gen_from_arrays(x_train, y_train, encode_func)
    val_gen = gen_from_arrays(x_val, y_val, encode_func)
    test_gen = gen_from_arrays(x_test, y_test, encode_func)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).prefetch(prefetch)

    x_val_encode, y_val_encode = [], []
    for x, y in val_gen():
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)

    return x_shape, train_ds, validation_data, test_ds
Example #6
    def to_tf_dataset(self, dataset, shuffle):
        cfg = self.cfg

        keys = {"image", "bboxes", "categories"}
        _dataset = Dataset.from_generator(
            lambda: iter(dataset),
            {k: v
             for k, v in self.dtypes.items() if k in keys},
            {k: v
             for k, v in self.shapes.items() if k in keys},
        )

        _dataset = _dataset.map(self.add_features, cfg.num_workers)

        if shuffle and cfg.shuffle_buffer_size > 0:
            _dataset = _dataset.shuffle(
                buffer_size=cfg.shuffle_buffer_size,
                seed=cfg.seed,
                reshuffle_each_iteration=cfg.reshuffle_each_iteration,
            )

        if cfg.batch_size > 1:
            keys = {
                "image", "bboxes", "categories", "kpt", "ct", "wh", "offset"
            }
            _dataset = _dataset.padded_batch(
                cfg.batch_size * cfg.n_gpus,
                {k: v
                 for k, v in self.shapes.items() if k in keys},
                {k: v
                 for k, v in self.pad_values.items() if k in keys},
                drop_remainder=True,
            )
        return _dataset
Example #7
def eval_dataset(params: HParams, iterator: ner_data.Generator):
    """ test function for tf estimator """
    data = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                  iterator.datashape())

    data = data.padded_batch(params.batch_size, iterator.datashape())
    return data
Example #8
    def prepare_train_generator(self):
        image_names = glob.glob(self.dir_name +
                                "/training_data/images/images/*.jpg")
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.png"))
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.bmp"))
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.tif"))
        sample_img = cv2.imread(image_names[0])
        target_shape = (sample_img.shape[0], sample_img.shape[1])

        crop_generator = CropGenerator(self.dir_name, target_shape)

        #image_dataset = tf.data.Dataset.list_files(self.dir_name + '/training_data/images/images/*')
        total_dataset = Dataset.range(1, 8).interleave(
            lambda x: Dataset.from_generator(
                CropGenerator(self.dir_name, target_shape),
                output_types=(tf.float32, tf.float32)),
            cycle_length=8)
        total_dataset = total_dataset.shuffle(buffer_size=20)
        #total_dataset = total_dataset.cache("./data_cache.")
        total_dataset = total_dataset.repeat()
        total_dataset = total_dataset.prefetch(buffer_size=20)
        data_tf = total_dataset.make_one_shot_iterator().get_next()
        return data_tf, crop_generator()
Example #9
 def get_dataset(self):
     dataset = Dataset.from_generator(self.image_generator, tf.float32,
                                      self.output_shape)
     dataset = dataset.repeat()
     dataset = dataset.batch(self.batch_size)
     dataset = dataset.prefetch(5)
     return dataset
Example #10
def h3_raw(file):
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size and update options
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val

    x_shape = len(sequences[0])

    train_gen = gen_from_arrays(x_train, y_train, None)
    val_gen = gen_from_arrays(x_val, y_val, None)
    test_gen = gen_from_arrays(x_test, y_test, None)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    val_ds = Dataset.from_generator(val_gen, output_types, output_shapes)
    val_ds = val_ds.map(vectorize_text).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).map(vectorize_text).prefetch(prefetch)
    x_val_encode, y_val_encode = [], []
    for x, y in val_ds:
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)

    return x_shape, train_ds, validation_data, test_ds
Example #11
def train_dataset(params: HParams, iterator: ner_data.Generator):
    """ train function for tf estimator """
    data = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                  iterator.datashape())

    data = data.shuffle(params.shuffle_buffer_size)
    data = data.padded_batch(params.batch_size, iterator.datashape())
    data = data.prefetch(None)
    return data
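
One way the two helpers above are typically wired into tf.estimator, sketched as an assumption; estimator, params and iterator are placeholders for objects the surrounding project would provide:

# input_fn may return a tf.data.Dataset, so a closure is enough to defer construction
estimator.train(input_fn=lambda: train_dataset(params, iterator))
eval_metrics = estimator.evaluate(input_fn=lambda: eval_dataset(params, iterator))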
Example #12
def make_submission_file(folder_name):
    '''
    Creates a submission file
    '''

    # creates a dataframe with images to test
    pairs = pd.read_csv('sample_submission.csv',
                        usecols=['img_pair'],
                        squeeze=True)
    num_pairs = pairs.size
    images = pairs.str.split(pat='-', expand=True)

    # load model
    """ model = tf.keras.models.load_model('Training Plots/{}/trained_model.h5'.format(folder_name),
                                       custom_objects={'L2Norm2Prob': models.L2Norm2Prob, 'probability_logistic_loss': losses.probability_logistic_loss,
                                                       'pos_prob': losses.pos_prob, 'neg_prob': losses.neg_prob,
                                                       'pos_dist': losses.pos_dist, 'neg_dist': losses.neg_dist,
                                                       'ROC_custom_metric': losses.ROC_custom_metric}) """

    model = make_facenet_based_model()
    model.load_weights(
        'Training Plots/{}/saved_weights.h5'.format(folder_name))

    def prediction_input_generator(images):

        for i, img_pair in images.iterrows():
            img_1 = pre_processing(cv2.imread('test/' + img_pair.iloc[0]))
            img_2 = pre_processing(cv2.imread('test/' + img_pair.iloc[1]))

            # here, the third entry (Negative_input) is not needed to compute the predictions, but it is required because the model takes three inputs
            yield {
                'Anchor_input': img_1,
                'Positive_input': img_2,
                'Negative_input': img_1
            }

    dataset = Dataset.from_generator(
        lambda: prediction_input_generator(images),
        output_types=({
            'Anchor_input': tf.float32,
            'Positive_input': tf.float32,
            'Negative_input': tf.float32
        }))
    dataset = dataset.batch(1)

    predictions = model.predict(x=dataset, steps=num_pairs)
    predictions = predictions[0::4]

    dataframe = pd.read_csv('sample_submission.csv')
    dataframe.is_related = predictions

    dataframe.to_csv('Training Plots/{}/submission.csv'.format(folder_name),
                     index=False)

    # predictions were sliced to every 4th element above because the model has 4 outputs (an artifact of how training and its metrics were set up)
    return predictions
Example #13
def get_test(batch_size, shuffle=False, test_size=0.2):
    test = Dataset.from_generator(image_generator,
                                  output_types=(tf.float32),
                                  output_shapes=(tf.TensorShape(
                                      (512, 512, 3))),
                                  args=[test_size, True])
    if shuffle:
        test = test.shuffle(10)
    test = test.repeat().batch(batch_size)
    return test
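
Since get_test() repeats the dataset forever, prediction needs an explicit step count. A sketch under assumed names; model and the test-set size are placeholders:

import math

batch_size = 4
test_ds = get_test(batch_size)
n_test = 200  # placeholder: actual number of test images
predictions = model.predict(test_ds, steps=math.ceil(n_test / batch_size))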
Example #14
 def to_tensorflow_dataset(self):
     if not self._weighted:
         ds = Dataset.from_generator(
             self._flattened_gen,
             output_types=(float16, float32),
             output_shapes=((self._target_size[0], self._target_size[1], 3),
                            (1, )),
         )
     else:
         ds = Dataset.from_generator(
             self._flattened_gen,
             output_types=(float16, float32, float32),
             output_shapes=(
                 (self._target_size[0], self._target_size[1], 3),
                 (1, ),
                 (1, ),
             ),
         )
     ds = ds.cache()
     ds = ds.shuffle(len(self._paths), reshuffle_each_iteration=True)
     ds = ds.batch(self._batch_size, num_parallel_calls=AUTOTUNE)
     ds = ds.prefetch(AUTOTUNE)
     return ds
Example #15
def motif_discovery(train_file,
                    test_file,
                    word_size=3,
                    region_size=2,
                    expand=True):
    subset_size = 690 * 190

    x_shape = encoded_shape(range(101), word_size, region_size, expand=expand)
    encode_func = encode(word_size, region_size, expand=expand)
    train_gen = gen_from_fasta(train_file, encode_func)
    test_gen = gen_from_fasta(test_file, encode_func)

    # datasets
    batch_size = 512
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(
        batch_size).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).prefetch(prefetch)

    x_val, y_val = [], []
    for d in val_ds:
        x_val.append(d[0])
        y_val.append(d[1])
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)

    return x_shape, train_ds, validation_data, test_ds
Example #16
def file_input_fn_predict(input_files):
    # d = tf.data.Dataset.from_tensor_slices(input_files)
    # d = dataset.interleave(lambda x:tf.data.TextLineDataset(x).map(parse_line), cycle_length=4, block_length=16)
    def generate_fn():
        for input_file in input_files:
            with open(input_file,'r') as fp:
                for line in fp:
                    # model_item format:
                    # 
                    model_item = create_model_item(line)
                    yield model_item
    dataset = Dataset.from_generator(
        generate_fn,
        # from_generator also needs output_types; tf.int64 is assumed here for the token IDs
        output_types=tf.int64,
        output_shapes=(tf.TensorShape([seq_length])))
    return dataset
Example #17
def input_fn_images(image_paths, epoch=1, batch_size=1, image_shape=(512, 512)):
    """
    input function for Estimator
    :param image_paths: list: list of path of png file with images
    :param epoch: int: number of epoch
    :param batch_size: int: batch size
    :param image_shape: (int, int): the target image size
    :return: dataset for Estimator
    """
    dataset = Dataset.from_generator(generator=image_generator, output_types=(tf.float32),
                                     output_shapes=(tf.TensorShape([512, 512, 3])),
                                     args=(image_paths, image_shape))
    dataset = dataset.repeat(epoch).batch(batch_size)

    return dataset
Example #18
    def _get_dataset(self, filenames):
        # get number of data points
        num_transitions = 0
        for file in tqdm(filenames, desc='Count dataset samples'):
            with open(file, 'rb') as f:
                num_transitions += len(pickle.load(f))

        # define dataset generator
        def data_generator():
            shuffled_filenames = random.sample(filenames, len(filenames))
            for file_ix, filename in enumerate(shuffled_filenames):
                with open(filename, 'rb') as f:
                    data = pickle.load(f)
                for transition in data:
                    rgb = transition[0]['images'][..., :3]
                    d = transition[0]['images'][..., [-1]]
                    if self.config['view'] < 0:
                        # get random view
                        view_ix = random.randint(0, len(d) - 1)
                    else:
                        view_ix = self.config['view']
                    rgb = rgb[view_ix]
                    d = d[view_ix]

                    obs = np.concatenate([
                        transition[0]['observation'].flatten(),
                        transition[0]['desired_goal'].flatten()
                    ])
                    gt_aux = transition[0]['observation'].flatten()[3:6]
                    gt_output = transition[1].flatten()

                    yield rgb, d, obs, gt_aux, gt_output

        model_params = self.config['model_params']
        data_shapes = tuple([
            tf.TensorShape([model_params['image_size']] * 2 + [3]),
            tf.TensorShape([model_params['image_size']] * 2 + [1]),
            tf.TensorShape([model_params['obs_dim']]),
            tf.TensorShape([3]),
            tf.TensorShape([model_params['output_dim']])
        ])
        dataset = Dataset.from_generator(data_generator,
                                         output_types=tuple([tf.float32] * 5),
                                         output_shapes=data_shapes)
        dataset = dataset.prefetch(self.config['batch_size'] * 8).repeat()
        dataset = dataset.batch(self.config['batch_size'])
        return dataset, num_transitions
Example #19
def make_triplet_dataset(families, positive_relations):
    # =============================================================================
    #     Dataset Generator that returns a random anchor, positive and negative images each time it is called
    # =============================================================================

    dataset = Dataset.from_generator(
        lambda: make_triplet_generator(families, positive_relations),
        output_types=({
            'Anchor_input': tf.float32,
            'Positive_input': tf.float32,
            'Negative_input': tf.float32
        }, tf.int64))

    # batches the dataset
    dataset = dataset.batch(batch_size)

    return dataset
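
The underlying generator draws random triplets indefinitely, so training needs explicit step counts. A sketch with assumed names; triplet_model, the family/relation splits and the step values are placeholders:

train_ds = make_triplet_dataset(train_families, train_relations)
val_ds = make_triplet_dataset(val_families, val_relations)
triplet_model.fit(train_ds,
                  steps_per_epoch=200,
                  validation_data=val_ds,
                  validation_steps=50,
                  epochs=10)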
Example #20
def input_fn(image_paths, watermark_paths, num_epochs=2, batch_size=5):
    """
    input function for Estimator, uses the generator for this

    :param image_paths: list: list of full paths to the images
    :param watermark_paths: list: list of full paths to the watermark masks
    :param num_epochs: int: the number of epoch
    :param batch_size: int: batch size
    :return: tensorflow dataset for Estimator
    """
    dataset = Dataset.from_generator(generator=generator, output_types=(tf.float32, tf.float32),
                                     output_shapes=(tf.TensorShape([512, 512, 3]), tf.TensorShape([512, 512, 1])),
                                     args=(image_paths, watermark_paths))
    dataset = dataset.repeat(num_epochs).batch(batch_size)

    return dataset
Example #21
    def to_tf_test_dataset(self, dataset):
        cfg = self.cfg

        keys = {"image", "file"}
        _dataset = Dataset.from_generator(
            lambda: iter(dataset),
            {k: v
             for k, v in self.dtypes.items() if k in keys},
            {k: v
             for k, v in self.shapes.items() if k in keys},
        )

        if cfg.batch_size > 1:
            _dataset = _dataset.batch(
                cfg.batch_size,
                drop_remainder=False,
            )
        return _dataset
Example #22
def input_fn(loader, ids, batch_size=None, random_state=None, mode=TRAIN_MODE):
    """Provides the input data for training, evaluation or prediction.

    Data is returned in the format used by tf.estimator.Estimator.

    Args:
        loader: The DataLoader instance to handle the loading of datapoints.
        ids (list<str>): The IDs of the datapoints to that will be needed.
        batch_size (int): The number of samples that comprise a batch. None if not used (in pred and eval mode)
        random_state (numpy.random.RandomState): Random state instance to shuffle the dataset prior to batching. Random states are used to enable different permutations of the training set for different epochs while ensuring that data is not prefetched.
        mode (int): The mode of the estimator to load the correct data. Defined as TRAIN_MODE = 0, EVAL_MODE = 1, PRED_MODE = 2.
    Returns:
        next_batch: A nested structure of tensors that iterate over the dataset.
            Every iteration contains a batch of data.
    """
    def load():
        for i in ids:
            feat_seq, align_seq, _ = loader.load(i)
            length = feat_seq.shape[0]
            label_seq = align_seqs_to_breaking_labels(align_seq, length)
            # print('<Loaded %s>' % (i,))
            if mode == TRAIN_MODE or mode == EVAL_MODE:
                yield {'features': feat_seq, 'length': length}, label_seq
            elif mode == PRED_MODE:
                yield {'features': feat_seq, 'length': length}

    dtypes = ({'features': tf.float32, 'length': tf.int32}, tf.float32)
    shapes = ({
        'features': tf.TensorShape([None, N_FEATURES]),
        'length': tf.TensorShape([])
    }, tf.TensorShape([None]))

    if mode == TRAIN_MODE and random_state is not None:
        ids = shuffle(ids, random_state=random_state)
    elif mode == EVAL_MODE or mode == PRED_MODE or batch_size is None:
        batch_size = len(ids)

    if mode == PRED_MODE:
        dtypes = dtypes[0]
        shapes = shapes[0]

    dataset = Dataset.from_generator(load, dtypes, shapes)
    dataset = dataset.padded_batch(batch_size, shapes)
    return dataset.make_one_shot_iterator().get_next()
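
A sketch of how this input_fn might be handed to a TF1-style estimator; loader, the ID lists and estimator are placeholders, and the lambdas defer graph construction until the estimator calls them:

import numpy as np

rng = np.random.RandomState(0)
estimator.train(input_fn=lambda: input_fn(loader, train_ids, batch_size=32,
                                          random_state=rng, mode=TRAIN_MODE))
predictions = list(estimator.predict(input_fn=lambda: input_fn(loader, test_ids, mode=PRED_MODE)))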
Example #23
 def _build_dataset(self, generator, output_types, output_shapes, 
                    batch_size, buffer_size, num_parallel_calls, take=None):
     
     if num_parallel_calls > 1:
         return _ParallelDataset(generator=generator,
                                 output_types=output_types, 
                                 output_shapes=output_shapes,
                                 batch_size=batch_size, 
                                 num_parallel_calls=num_parallel_calls, 
                                 take=take)
     else:
         dataset = tf_Dataset.from_generator(generator=generator,
                              output_types=output_types,
                              output_shapes=output_shapes).batch(batch_size).prefetch(buffer_size)
         
         if take is not None:
             dataset = dataset.take(take)
         
         return dataset
Example #24
## reading data
train_file = dir + 'motif_discovery-train.txt'
valid_file = dir + 'motif_discovery-valid.txt'
test_file = dir + 'motif_discovery-test.txt'
ytrain = get_label(train_file)
yval = get_label(valid_file)
ytest = get_label(test_file)

train_gen = customize_generator(train_file)
valid_gen = customize_generator(valid_file)
test_gen = customize_generator(test_file)
output_types = (tf.float32, tf.float32)
prefetch = tf.data.experimental.AUTOTUNE
xtrain_seq = Dataset.from_generator(
    train_gen, output_types=output_types,
    output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch)
xval_seq = Dataset.from_generator(
    valid_gen, output_types=output_types,
    output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch)
xtest_seq = Dataset.from_generator(
    test_gen, output_types=output_types,
    output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch)

latent_size = 30
seq_len = 101
encoder = keras.Sequential([
    keras.Input(shape=(seq_len, )),
    keras.layers.Embedding(seq_len, latent_size),
    keras.layers.LSTM(latent_size, return_sequences=False),
])
Example #25
            image_tensor: the TensorFlow tensor of the image.
            category_tensor: the TensorFlow tensor of the category.

        """
        for data in self.segment:
            with data.open() as fp:
                image_tensor = tf.convert_to_tensor(np.array(Image.open(fp)) /
                                                    255,
                                                    dtype=tf.float32)
            category = self.category_to_index[
                data.label.classification.category]
            category_tensor = tf.convert_to_tensor(category, dtype=tf.int32)
            yield image_tensor, category_tensor
            # """"""


"""Build a tensorflow dataset and run it"""
ACCESS_KEY = "Accesskey-*****"

dataset = Dataset.from_generator(
    MNISTSegment(GAS(ACCESS_KEY), "train"),
    output_signature=(
        tf.TensorSpec(shape=(28, 28), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
).batch(4)

for index, (image, label) in enumerate(dataset):
    print(f"{index}: {label}")
""""""
Example #26
def main():
    path = "corti-data-manager/tests/data/"
    entity_file_path = os.path.join(path, "may10-entity.json")
    intent_file_path = os.path.join(path, "may10-intent.json")

    loss_retained = 10
    maxlen = 30
    pattern_dist_history = {}
    loss_dist_history = {}
    list_labels = []  # logical form predicates/actions

    embedding_dims = 300
    lstm_units = 128

    num_features = 512
    num_labels = 0
    # Data Generator
    conf = process_conf(
        "corti-data-manager/tests/data/confs/stream_data_gen.yml")
    print(conf)
    sqg = StreamingDataGenerator(conf)
    number_of_patterns = len(sqg.streaming_parser.patterns) + 5
    print("Number of patterns: ", number_of_patterns)
    print(sqg.total_patterns)

    tokenizer = get_tokenizer(sqg)
    print("Vocab Size: ", len(tokenizer.word_counts))
    vocab_size = len(tokenizer.word_counts)

    list_labels = get_labels(intent_file_path)
    label_encoder = LabelEncoder()
    label_encoder.fit(list_labels)
    integer_encoded = label_encoder.transform(list_labels)
    num_labels = int(np.max(integer_encoded) + 1)
    print("Number of labels: ", num_labels)
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoder.fit(integer_encoded)
    # print(onehot_encoder.transform(integer_encoded))

    pdir = "test_norm"
    path1 = "data/" + pdir + "/train.json"
    path2 = "data/" + pdir + "/label.json"
    path3 = "data/" + pdir + "/index.json"
    with open(path1) as json_file:
        queries = json.load(json_file)
    with open(path2) as json_file:
        labels = json.load(json_file)
    with open(path3) as json_file:
        indexes = json.load(json_file)
    print(queries[0])
    print(len(queries))
    datagen = data_generator(tokenizer, maxlen, label_encoder, onehot_encoder,
                             queries, labels, indexes)
    checkpoint_prefix = "saved/after3/"
    # Model
    model = LSTMClassifier(vocab_size + 2, num_labels, embedding_dims,
                           lstm_units)
    optimizer = tf.train.AdamOptimizer()
    root = tf.train.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    root.restore(tf.train.latest_checkpoint(checkpoint_prefix))

    pattern_dist = tf.math.softmax(np.zeros(number_of_patterns)).numpy()
    avg_loss = np.ones_like(pattern_dist) * 10

    BUFFER_SIZE = len(queries)
    losses = defaultdict(lambda: [0] * loss_retained)
    counts = defaultdict(int)
    queries_shape = [None, maxlen]
    labels_shape = [None, num_labels]
    indexes_shape = [
        None,
    ]
    minibatch_size = len(queries)
    dataset = Dataset.from_generator(
        datagen,
        args=[minibatch_size],
        output_types=(tf.int32, tf.int32, tf.int32),
        output_shapes=(tf.TensorShape(queries_shape),
                       tf.TensorShape(labels_shape),
                       tf.TensorShape(indexes_shape))).shuffle(BUFFER_SIZE)
    for i, (queries, labels, indexes) in enumerate(dataset):
        print(i)
        predictions = model(queries)
        # print(queries[0],labels[0],predictions[0])
        print(np.argmax(labels[0]), np.argmax(predictions[0]))
        # [batch_size, num_classes]
        accu = evaluate_accuracy(labels, predictions)
        np.set_printoptions(threshold=np.inf)
        print("accuracy: ", accu)
        loss = tf.losses.softmax_cross_entropy(labels,
                                               predictions,
                                               reduction=Reduction.NONE)
        print(loss.shape)
        loss_reduced = tf.math.reduce_mean(loss)
        nplabels = labels.numpy()
        for i, (a_type, a_loss) in enumerate(zip(indexes, loss)):
            a_type = int(a_type.numpy())
            idx = int(counts[a_type] % loss_retained)
            max_idx = np.argmax(nplabels[i])
            losses[a_type][idx] = float(np.mean(a_loss.numpy()))
            counts[a_type] = counts.get(a_type, 0.0) + 1.0
        for a_type in losses:
            num = int(counts[a_type])
            if num >= loss_retained:
                avg_loss[a_type] = np.mean(losses[a_type])
            else:
                avg_loss[a_type] = np.mean(losses[a_type][:num])
    print('iter:{0} --> Average Loss:{1}'.format(i, loss_reduced.numpy()))
    with open("output/test/pattern_losses_uniform.txt", "w") as f:
        f.write(json.dumps(list(avg_loss)))
Example #27
is_training_pl = True

bn_decay = True

n_pc = get_number_pc()

tf.reset_default_graph()

# np.random.seed(42)
tf.set_random_seed(2019)

with tf.Graph().as_default():
    with tf.device('/gpu:0'):
        dataset = Dataset.from_generator(
            generator, (tf.float32),
            output_shapes=(tf.TensorShape([NUM_POINT, 3])),
            args=([BATCH_SIZE * int(n_pc / BATCH_SIZE)]))
        # dataset = Dataset.from_generator(generator, (tf.float32, tf.float32), output_shapes=(tf.TensorShape([1000]), tf.TensorShape([ NUM_POINT, 3])))
        # dataset = dataset.repeat(1)
        dataset = dataset.shuffle(100)
        dataset = dataset.batch(BATCH_SIZE)

        # iterator = dataset.make_one_shot_iterator()
        iterator = dataset.make_initializable_iterator()

        # features, pointclouds_pl = iterator.get_next()
        pointclouds_pl = iterator.get_next()
        pointclouds_pl = tf.reshape(pointclouds_pl, (BATCH_SIZE, NUM_POINT, 3))
        print("---------------------->   ", pointclouds_pl)
        # print(batch)
Example #28
                         test_batch_size=test_batch_size,
                         train_proportion=train_proportion,
                         class_proportion=class_proportion)
train_index_generator, test_index_generator = data_stream.split_by_patch_id(
    features[['image']], features[['destroyed']])
train_generator = data_stream.get_train_data_generator_from_index(
    [features['image'], features['destroyed']], train_index_generator)

test_indices = list(test_index_generator)
test_generator = data_stream.get_test_data_generator_from_index(
    features['image'], test_indices)

num_batches = ceil(len(features) / space['batch_size'])
num_batches_test = len(test_indices)
# Fit model and predict
train_dataset = Dataset.from_generator(lambda: train_generator,
                                       (tf.float32, tf.int32))
print('Training with space: \n')
print(space)
model = Model(**space)
model.fit_generator(train_dataset,
                    steps_per_epoch=num_batches,
                    verbose=1,
                    **space)

test_dataset = Dataset.from_generator(lambda: test_generator, tf.float32)
predictions = model.predict_generator(test_dataset, steps=num_batches_test)

test_indices_flattened = test_indices[0]
for index in test_indices[1:]:
    test_indices_flattened = test_indices_flattened.append(index)
Example #29
def main():

    # Get arguments
    args = get_argument()
    models_dir = args.models_directory
    if args.log_frequency != 'epoch' and args.log_frequency != 'batch':
        log_frequency = int(args.log_frequency)
    else:
        log_frequency = args.log_frequency

    # Prepare training dataset
    train_data = Dataset.from_generator(
        generator=onet_generator(args.batch_size, args.steps_per_epoch, args.data_folder),
        output_types=(tf.uint8, (tf.float32, tf.float32, tf.float32)),
        output_shapes=(tf.TensorShape([args.batch_size, 48, 48, 3]), (tf.TensorShape([args.batch_size, 3]), tf.TensorShape([args.batch_size, 5]), tf.TensorShape([args.batch_size, 11])))
    )
    train_data = train_data.map(augment_v2)

    # Prepare validation dataset
    val_data = load_validation_data(args.data_folder, args.batch_size)

    # Stop training if no improvements are made
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=args.early_stopping,
        mode='min'
    )

    # Model checkpoints
    model_checkpoint = ModelCheckpoint(
        filepath=models_dir + '/epoch_{epoch:04d}_val_loss_{val_loss:.4f}.hdf5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True,
        mode='min'
    )

    # Learning rate decay
    lr_decay = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=args.lr_decay_patience,
        mode='min',
        min_delta=args.lr_decay_min_delta
    )

    # Set up Tensorboard
    tensorboard = TensorBoard(
        log_dir=models_dir + '/log',
        write_graph=False,
        profile_batch=0,
        update_freq=log_frequency
    )

    # Create and compile the model from scratch
    model = onet()

    # Load the pre-trained model for finetuning
    model.load_weights(filepath=args.onet, by_name=True)

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        loss=[
            BCE_with_sti(args.hard_sample_mining, args.num_back),
            MSE_with_sti(args.hard_sample_mining, args.num_back),
            MSE_with_sti(args.hard_sample_mining, args.num_back)
        ],
        metrics=[[accuracy_(), recall_()], None, None],
        loss_weights=[1, 0.5, 1]
    )

    # Create folders
    if not path.exists(models_dir):
        os.makedirs(models_dir)
    if not path.exists(models_dir + '/log'):
        os.mkdir(models_dir + '/log')

    # Train the model
    history = model.fit(
        x=train_data,
        epochs=args.num_epochs,
        callbacks=[early_stopping, model_checkpoint, lr_decay, tensorboard],
        validation_data=val_data,
        steps_per_epoch=args.steps_per_epoch
    )
Example #30
 def get_input_fn():
     types, shapes = self.input_pipeline.feed_shape_type_def()
     tf_dataset = Dataset.from_generator(dataset_encoded, types[0],
                                         shapes[0])
     return tf_dataset.batch(1)