Example No. 1
    def __call__(self, protocol, subset='train'):

        self.initialize(protocol, subset=subset)

        batch_size = self.batch_size
        batches_per_epoch = self.batches_per_epoch

        generators = []
        if self.parallel:

            for _ in range(self.parallel):
                generator = self.generator()
                batches = batchify(generator,
                                   self.signature,
                                   batch_size=batch_size,
                                   prefetch=batches_per_epoch)
                generators.append(batches)
        else:
            generator = self.generator()
            batches = batchify(generator,
                               self.signature,
                               batch_size=batch_size,
                               prefetch=0)
            generators.append(batches)

        while True:
            # get `batches_per_epoch` batches from each generator
            for batches in generators:
                for _ in range(batches_per_epoch):
                    yield next(batches)
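
Note: none of the examples on this page define batchify itself. As a rough mental model only, here is a minimal self-contained sketch of the pattern these calls assume: grouping a sample generator into fixed-size batches according to a signature. The toy_batchify below is hypothetical and ignores the prefetch (background prefetching) and incomplete options used elsewhere on this page.

import numpy as np

def toy_batchify(samples, signature, batch_size=32):
    # accumulate samples field by field, then stack each field into a batch
    batch = {key: [] for key in signature}
    for sample in samples:
        for key in signature:
            batch[key].append(sample[key])
        if len(batch[next(iter(signature))]) == batch_size:
            yield {key: np.stack(values) for key, values in batch.items()}
            batch = {key: [] for key in signature}

# usage: 8 samples grouped into 2 batches of 4
samples = ({'X': np.random.randn(10, 3), 'y': i % 2} for i in range(8))
signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'scalar'}}
for b in toy_batchify(samples, signature, batch_size=4):
    print(b['X'].shape, b['y'].shape)  # (4, 10, 3) (4,)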
Example No. 2
    def postprocess_ndarray(self, X):
        """Embed (fixed-length) sequences

        Parameters
        ----------
        X : (batch_size, n_samples, n_features) numpy array
            Batch of input sequences

        Returns
        -------
        fX : (batch_size, n_dimensions) numpy array
            Batch of sequence embeddings.
        """

        batch_size, n_samples, n_features = X.shape

        # this test is needed because .apply() may be called
        # with an ndarray of arbitrary size as input
        if batch_size <= self.batch_size:
            X = torch.tensor(X, dtype=torch.float32, device=self.device)
            cpu = torch.device('cpu')
            return self.model(X).detach().to(cpu).numpy()

        # if X contains too large a batch, split it into smaller batches...
        batches = batchify(iter(X), {'@': (None, np.stack)},
                           batch_size=self.batch_size,
                           incomplete=True, prefetch=0)

        # ... and process them in order, before re-concatenating them
        return np.vstack([self.postprocess_ndarray(x) for x in batches])
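
Stripped of the model and of batchify, the split-and-recurse idea above reduces to chunking the first axis and re-stacking the per-chunk results in order; the last chunk may be smaller, which is what incomplete=True allows for. A plain-numpy sketch (process_in_chunks is hypothetical):

import numpy as np

def process_in_chunks(X, fn, max_batch=32):
    # apply `fn` at most `max_batch` rows at a time,
    # then re-concatenate along the first axis, preserving order
    if len(X) <= max_batch:
        return fn(X)
    return np.vstack([fn(X[i:i + max_batch])
                      for i in range(0, len(X), max_batch)])

X = np.random.randn(70, 10, 3)
fX = process_in_chunks(X, lambda x: x.mean(axis=(1, 2)).reshape(-1, 1))
print(fX.shape)  # (70, 1)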
Example No. 3
    def __call__(self, protocol, subset='train'):
        """(Parallelized) batch generator"""

        # pre-load useful information about protocol once and for all
        self.initialize(protocol, subset=subset)

        # number of batches needed to complete an epoch
        batches_per_epoch = self.batches_per_epoch

        generators = []

        if self.parallel:
            for _ in range(self.parallel):

                # initialize one sample generator
                samples = self.samples()

                # batchify it and make sure at least
                # `batches_per_epoch` batches are prefetched.
                batches = batchify(samples, self.signature,
                                   batch_size=self.batch_size,
                                   prefetch=batches_per_epoch)

                # add batch generator to the list of (background) generators
                generators.append(batches)
        else:

            # initialize one sample generator
            samples = self.samples()

            # batchify it without prefetching
            batches = batchify(samples, self.signature,
                               batch_size=self.batch_size, prefetch=0)

            # add it to the list of generators
            # NOTE: this list will only contain one generator
            generators.append(batches)

        # loop on (background) generators indefinitely
        while True:
            for batches in generators:
                # yield `batches_per_epoch` batches from current generator
                # so that each epoch is covered by exactly one generator
                for _ in range(batches_per_epoch):
                    yield next(batches)
Example No. 4
    def _get_batch_generator_y(self, data_h5):
        """Get batch generator

        Parameters
        ----------
        data_h5 : str
            Path to HDF5 file containing precomputed sequences.
            It must have two aligned datasets 'X' and 'y'.

        Returns
        -------
        batch : dict
            Dictionary with keys 'batch_generator' (iterable),
            'batches_per_epoch' (int), 'n_classes' (int),
            and 'classes' (array of unique labels).
        """

        fp = h5py.File(data_h5, mode='r')
        h5_X = fp['X']
        h5_y = fp['y']

        # keep track of number of labels and rename labels to integers
        unique, y = np.unique(h5_y, return_inverse=True)
        n_classes = len(unique)

        index_generator = random_label_index(y,
                                             per_label=self.per_label,
                                             return_label=False)

        def generator():
            while True:
                i = next(index_generator)
                yield {'X': h5_X[i], 'y': y[i]}

        signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'ndarray'}}
        batch_size = self.per_batch * self.per_fold * self.per_label
        batch_generator = batchify(generator(),
                                   signature,
                                   batch_size=batch_size)

        batches_per_epoch = n_classes // (self.per_batch * self.per_fold) + 1

        return {
            'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique
        }
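
To make the sizing arithmetic above concrete, here is a quick worked example with purely hypothetical values for per_batch, per_fold and per_label:

# hypothetical settings, for illustration only
per_batch, per_fold, per_label = 3, 4, 2
n_classes = 50

batch_size = per_batch * per_fold * per_label  # 3 * 4 * 2 = 24 sequences per batch
batches_per_epoch = n_classes // (per_batch * per_fold) + 1  # 50 // 12 + 1 = 5
print(batch_size, batches_per_epoch)  # 24 5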
Example No. 5
def train(x_paths, y_paths, weights_dir, x_paths_audio=None):
    n_samples, n_positive = statistics(y_paths)

    # estimate performance of "majority class" baseline
    baseline = 100 * n_positive / n_samples
    print('Baseline = {0:.1f}%'.format(baseline))

    # estimate number of batches per epoch
    steps_per_epoch = n_samples // BATCH_SIZE

    # create batch generator
    generator = get_generator(x_paths, y_paths)
    if CATEGORICAL:
        signature = ({'type': 'ndarray'}, {'type': 'ndarray'})
    else:
        signature = ({'type': 'ndarray'}, {'type': 'scalar'})

    batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)
    print(INPUT_DIMS)
    # create model
    if CATEGORICAL:
        model = StackedLSTM()((25, INPUT_DIMS))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
    else:

        model = StackedLSTM(final_activation="sigmoid",
                            n_classes=1)((25, INPUT_DIMS))
        # alternative with only 4 LSTM units:
        # model = StackedLSTM(lstm=[4], mlp=[], final_activation="sigmoid",
        #                     n_classes=1)((25, INPUT_DIMS))

        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

    # train model
    model_h5 = weights_dir + '/{epoch:04d}.h5'
    callbacks = [ModelCheckpoint(model_h5, period=1)]
    model.fit_generator(batch_generator,
                        steps_per_epoch,
                        epochs=1000,
                        verbose=1,
                        callbacks=callbacks,
                        workers=1)
Example No. 6
def validate_ev(x_paths, y_paths, input_model):

    generator = get_generator(x_paths, y_paths, forever=False)
    signature = ({'type': 'ndarray'}, {'type': 'scalar'})
    batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)

    Y_true, Y_pred = [], []
    for X, y in batch_generator:
        # Y_pred.append(model.predict(X)[:, :, 1].reshape((-1, 1)))
        # Y_true.append(y[:, :, 1].reshape((-1, 1)))
        Y_pred.append(input_model.predict(X).reshape((-1, 1)))
        Y_true.append(y.reshape((-1, 1)))

    y_true = np.vstack(Y_true)
    # check y_pred values
    y_pred = np.vstack(Y_pred)

    return y_true, y_pred
Example No. 7
def validate(x_paths, y_paths, weights_dir):

    epoch = 0
    f = open(weights_dir + "/list_test_prec_rec_auc", 'w')
    while True:

        # sleep until next epoch is finished
        model_h5 = weights_dir + '/{epoch:04d}.h5'.format(epoch=epoch)
        if not os.path.isfile(model_h5):
            time.sleep(10)
            continue
        model = load_model(model_h5)

        generator = get_generator(x_paths, y_paths, forever=False)
        if CATEGORICAL:
            signature = ({'type': 'ndarray'}, {'type': 'ndarray'})
        else:
            signature = ({'type': 'ndarray'}, {'type': 'scalar'})
        batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)

        Y_true, Y_pred = [], []
        for X, y in batch_generator:
            # Y_pred.append(model.predict(X)[:, :, 1].reshape((-1, 1)))
            # Y_true.append(y[:, :, 1].reshape((-1, 1)))
            Y_pred.append(model.predict(X).reshape((-1, 1)))
            Y_true.append(y.reshape((-1, 1)))

        y_true = np.vstack(Y_true)
        y_pred = np.vstack(Y_pred)

        # NOTE: despite its name, `auc` stores average precision, not ROC AUC
        # auc = roc_auc_score(y_true, y_pred, average='macro', sample_weight=None)
        auc = average_precision_score(y_true,
                                      y_pred,
                                      average='macro',
                                      sample_weight=None)

        print('#{epoch:04d} {auc:.4f}%'.format(epoch=epoch + 1, auc=100 * auc))
        f.write("{},".format(auc))
        f.flush()

        epoch += 1
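
For reference, scikit-learn's average_precision_score takes binary ground truth and continuous scores; a self-contained check, using column vectors shaped like the np.vstack outputs above:

import numpy as np
from sklearn.metrics import average_precision_score

y_true = np.array([0, 0, 1, 1]).reshape((-1, 1))
y_pred = np.array([0.1, 0.4, 0.35, 0.8]).reshape((-1, 1))
print(average_precision_score(y_true, y_pred, average='macro'))  # ~0.83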
Example No. 8
    def embed(self, embedding, X, internal=False):
        """Apply embedding on sequences

        Parameters
        ----------
        embedding : keras.Model
            Current state of embedding network
        X : (n_sequences, n_samples, n_features) numpy array
            Batch of input sequences
        internal : bool, optional
            Set to True to return internal representation

        Returns
        -------
        fX : (n_sequences, ...) numpy array
            Batch of embeddings.

        """

        if internal:

            embed = K.function(
                [embedding.get_layer(name='input').input, K.learning_phase()],
                [embedding.get_layer(name='internal').output])

            # split large batch in smaller batches if needed
            if len(X) > self.batch_size:
                batch_generator = batchify(iter(X), {'type': 'ndarray'},
                                           batch_size=self.batch_size,
                                           incomplete=True)
                # pass a list (np.vstack does not accept a bare generator)
                fX = np.vstack([embed([x, 0])[0] for x in batch_generator])
            else:
                fX = embed([X, 0])[0]

        else:
            fX = embedding.predict(X, batch_size=self.batch_size)

        return fX.astype(self.float_autograd_)
Example No. 9
    (training_file, development_file, BATCH_SIZE,
     optimizer_name, output_path) = process_arguments()

    (training_no_samples, training_sequence_no_samples,
     validation_no_samples, validation_sequence_no_samples,
     validation_start, development_no_samples,
     development_sequence_no_samples, index_arr_train,
     index_arr_validate, index_array_dev) = utils.set_no_samples(
         training_file, development_file, True, USE_VALIDATION,
         TRAINING_RATIO, VALIDATION_RATIO, SEQUENCE_LENGTH, STEP)

    # create batch generator
    signature = ({'type': 'ndarray'}, {'type': 'ndarray'})

    training_generator = utils.lstm_generator(
        training_file, "training", validation_start, index_arr_train,
        index_arr_validate, SEQUENCE_LENGTH, STEP, FIRST_DERIVATIVE,
        SECOND_DERIVATIVE)
    batch_training_generator = batchify(training_generator,
                                        signature,
                                        batch_size=BATCH_SIZE)

    steps_per_epoch_train, _, _ = utils.calculate_steps_per_epoch(
        training_sequence_no_samples, 0, 0, batch_size=BATCH_SIZE)

    training_percentage = utils.compute_samples_majority_class(
        training_file, type="training", start=0, end=training_no_samples)
    print("Training set +ve label percentage: " + str(training_percentage))

    if SAVE_SEQUENCES:
        validation_generator = utils.lstm_generator(training_file,
                                                    "validation",
                                                    validation_start,
                                                    index_arr_train,
                                                    index_arr_validate,
Example No. 10
    def _get_batch_generator_z(self, data_h5):
        """"""

        fp = h5py.File(data_h5, mode='r')
        h5_X = fp['X']
        h5_y = fp['y']
        h5_z = fp['z']

        df = pd.DataFrame({'y': h5_y, 'z': h5_z})
        z_groups = df.groupby('z')

        y_groups = [group.y.iloc[0] for _, group in z_groups]

        # keep track of number of labels and rename labels to integers
        unique, y = np.unique(y_groups, return_inverse=True)
        n_classes = len(unique)

        index_generator = random_label_index(y,
                                             per_label=self.per_label,
                                             return_label=True,
                                             repeat=False)

        def generator():
            while True:
                # get next group
                i, label = next(index_generator)

                # select at most 10 sequences of current group
                selector = list(z_groups.get_group(i).index)
                selector = np.random.choice(selector,
                                            size=min(10, len(selector)),
                                            replace=False)

                X = np.array(h5_X[sorted(selector)])
                n = X.shape[0]
                yield {'X': X, 'y': label, 'n': n}

        signature = {'X': {'type': 'ndarray'},
                     'y': {'type': 'scalar'},
                     'n': {'type': 'complex'}}

        batch_size = self.per_batch * self.per_fold * self.per_label
        batch_generator = batchify(generator(),
                                   signature,
                                   batch_size=batch_size)

        batches_per_epoch = n_classes // (self.per_batch * self.per_fold) + 1

        return {
            'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique
        }
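
The grouping-and-sampling step in the generator above (via z_groups.get_group and np.random.choice) can be isolated as follows, with tiny made-up data:

import numpy as np
import pandas as pd

# made-up data: rows sharing a 'z' value form one group with a single label 'y'
df = pd.DataFrame({'y': ['a', 'a', 'b', 'b', 'b'], 'z': [0, 0, 1, 1, 1]})
z_groups = df.groupby('z')

# pick at most 2 row indices from group z == 1, without replacement
selector = list(z_groups.get_group(1).index)
selector = np.random.choice(selector, size=min(2, len(selector)), replace=False)
print(np.sort(selector))  # e.g. [2 4]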
Example No. 11
    def _get_batch_generator_y(self, data_h5):
        """Get batch generator

        Parameters
        ----------
        data_h5 : str
            Path to HDF5 file containing precomputed sequences.
            It must have two aligned datasets 'X' and 'y'.

        Returns
        -------
        batch : dict
            Dictionary with keys 'batch_generator' (iterable),
            'batches_per_epoch' (int), 'n_classes' (int),
            and 'classes' (array of unique labels).
        """

        fp = h5py.File(data_h5, mode='r')
        h5_X = fp['X']
        h5_y = fp['y']

        # keep track of number of labels and rename labels to integers
        unique, y = np.unique(h5_y, return_inverse=True)
        n_classes = len(unique)

        # iterates over sequences of class jC
        # in random order, and forever
        def class_generator(jC):
            indices = np.where(y == jC)[0]
            while True:
                np.random.shuffle(indices)
                for i in indices:
                    yield i

        def generator():

            centers = np.arange(n_classes)
            class_generators = [class_generator(jC) for jC in centers]

            previous_label = None

            while True:

                # loop over centers in random order
                np.random.shuffle(centers)
                for iC in centers:

                    try:
                        # get "per_fold" closest centers to current centers
                        distances = cdist(self.fC_[iC, np.newaxis],
                                          self.fC_,
                                          metric=self.metric)[0]
                    except AttributeError:
                        # when on_train_begin hasn't been called yet,
                        # attribute fC_ doesn't exist --> fake it
                        distances = np.random.rand(len(centers))
                        distances[iC] = 0.

                    closest_centers = np.argpartition(
                        distances, self.per_fold)[:self.per_fold]

                    # corner case where last center of previous loop
                    # is the same as first center of current loop
                    if closest_centers[0] == previous_label:
                        closest_centers[:-1] = closest_centers[1:]
                        closest_centers[-1] = previous_label

                    for jC in closest_centers:
                        for _ in range(self.per_label):
                            i = next(class_generators[jC])
                            yield {'X': h5_X[i], 'y': y[i]}
                        previous_label = jC

        signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'ndarray'}}
        batch_size = self.per_batch * self.per_fold * self.per_label
        batch_generator = batchify(generator(),
                                   signature,
                                   batch_size=batch_size)

        # each fold contains one center and its `per_fold` closest centers
        # therefore, the only way to be sure that we've seen every class in
        # one epoch is to go through `n_classes` folds,
        # i.e. n_classes / per_batch batches
        batches_per_epoch = n_classes // self.per_batch

        return {
            'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique
        }
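
The closest-centers selection above relies on np.argpartition, which returns the indices of the per_fold smallest distances without fully sorting the array; a quick illustration:

import numpy as np

distances = np.array([0.0, 2.1, 0.3, 5.0, 0.7, 1.2])
per_fold = 3

# indices of the 3 smallest distances (their relative order is unspecified)
closest_centers = np.argpartition(distances, per_fold)[:per_fold]
print(np.sort(closest_centers))  # [0 2 4]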