Example #1
    def __init__(self, data, label=None, batch_size=1, shuffle=False,
                 last_batch_handle='pad', data_name='data',
                 label_name='softmax_label'):
        super(SparseMatrixDataIter, self).__init__(batch_size)

        assert(isinstance(data, scipy.sparse.csr.csr_matrix))
        
        self.data = _init_data(data, allow_empty=False, default_name=data_name)
        self.label = _init_data(label, allow_empty=True, default_name=label_name)
        self.num_data = self.data[0][1].shape[0]

        # shuffle data
        if shuffle:
            d = self.data[0][1]
            if len(self.label[0][1]) > 0:
                l = self.label[0][1]
                ds, dl = sk_shuffle(d, l)
                self.data = _init_data(ds, allow_empty=False, default_name=data_name)
                self.label = _init_data(dl, allow_empty=True, default_name=label_name)
            else:
                ds = sk_shuffle(d)
                self.data = _init_data(ds, allow_empty=False, default_name=data_name)

        # batching
        if last_batch_handle == 'discard':
            new_n = self.data[0][1].shape[0] - self.data[0][1].shape[0] % batch_size
            self.num_data = new_n

        self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label]
        assert self.num_data >= batch_size, "batch_size needs to be smaller than data size."
        self.cursor = -batch_size
        self.batch_size = batch_size
        self.last_batch_handle = last_batch_handle
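Every snippet below leans on the same helper; a minimal sketch of the shared call pattern, assuming sk_shuffle is sklearn.utils.shuffle imported under an alias (the imports are not shown in these examples):

# Minimal sketch of the call pattern the examples share, assuming sk_shuffle
# is sklearn.utils.shuffle imported under an alias.
import numpy as np
from sklearn.utils import shuffle as sk_shuffle

X = np.arange(10).reshape(5, 2)
y = np.array([0, 1, 0, 1, 0])

# Any number of equally sized arrays is shuffled in unison; shuffled
# copies are returned and the inputs are left untouched.
X_shuf, y_shuf = sk_shuffle(X, y, random_state=42)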
Example #2
def get_mask_splits(dim, pick_name=None, model_type='Mobilenet', bw=False):

    #Train Set
    train_paths, train_labels = get_mask_classes('Train')
    train_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in train_paths])

    #Test Set
    test_paths, test_labels = get_mask_classes('Test')
    test_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in test_paths])
    test_images, test_labels = sk_shuffle(test_images, test_labels)

    #Validation Set
    val_paths, val_labels = get_mask_classes('Validation')
    val_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in val_paths])
    val_images, val_labels = sk_shuffle(val_images, val_labels)

    tts = train_images, test_images, train_labels, test_labels, val_images, val_labels

    if pick_name:
        print('Pickling The Data')
        pickle.dump(tts,
                    open(f'../Pickles/TTSMask_{pick_name}.p', 'wb'),
                    protocol=4)
        print('Finished Pickling')
    return tts
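A hedged counterpart to the pickling step above, reloading the tuple that get_mask_splits saves; the file name is illustrative only:

# Reload the tuple saved by get_mask_splits above; 'demo' stands in for
# whatever pick_name was used (illustrative only).
import pickle

with open('../Pickles/TTSMask_demo.p', 'rb') as fh:
    (train_images, test_images, train_labels,
     test_labels, val_images, val_labels) = pickle.load(fh)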
Example #3
def balance_dataset(dataset, balance_ratio):
    """Reduce the number of unrelated data examples to match the related ones.
    Input:
        dataset: pandas dataframe containing the data
        balance_ratio: precentage of the balancing: 0.5 = equal balancing
    """

    RELATION_RATIO = balance_ratio

    labelMatrix = dataset['label'].to_numpy()
    numberOfRelations = np.count_nonzero(labelMatrix)
    relationRatio = numberOfRelations / len(dataset)

    if relationRatio < RELATION_RATIO:
        dataset['labelAbs'] = dataset['label'].abs()

        print('-----DATA IS UNBALANCED CURRENT SIZE: ' + str(len(dataset)) +
              ' CLASS RATIO: ' + str(relationRatio) + ' ... BALANCING DATA')

        shuffled = sk_shuffle(dataset)

        orderedDataset = shuffled.sort_values(by=['labelAbs'], ascending=False)
        cutOff = int(1 / RELATION_RATIO * numberOfRelations)

        balanced = sk_shuffle(orderedDataset.head(cutOff))
        balanced = balanced.drop('labelAbs', axis=1)

        print('-----BALANCED DATASET WITH SIZE: ' + str(len(balanced)))
        return balanced
    else:

        print('-----DATASET IS ALREADY BALANCED - CLASS RATIO: ' +
              str(relationRatio) + '-----')

        return dataset
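A toy run of balance_dataset, assuming pandas is imported as pd and that 'label' is numeric with zero marking unrelated rows, as the docstring implies:

# Toy illustration of balance_dataset above; 'text' is a stand-in column.
import pandas as pd

df = pd.DataFrame({
    'text': ['sample %d' % i for i in range(10)],
    'label': [1, 0, 0, 0, 0, 1, 0, 0, 0, 0],   # 2 related, 8 unrelated rows
})
balanced = balance_dataset(df, balance_ratio=0.5)
# cutOff = int(1 / 0.5 * 2) = 4, so roughly 4 rows survive the balancing
print(len(balanced))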
Example #4
def image_generator_ysn(x,
                        y=None,
                        batch_size=64,
                        shuffle=True,
                        enable_shift=True):
    """
    Arguments:
        x: input image of size (n_x, n_h, n_w, n_ch)
        y: second image of same size as x. Use only if the same operation is to be performed on another image y. Most likely should be None.
        batch_size: size of batches of the generator
        shuffle: enable or disable shuffling of data in the beginneing.
        enable_shift: enable or disable vertical/horizontal shift. If disabled, this acts as a simple generator with batch_size.
    """
    if shuffle:
        if y is not None:
            x, y = sk_shuffle(x, y)
        else:
            x = sk_shuffle(x)
    n_x = len(x)
    n_h = x.shape[1]
    n_w = x.shape[2]
    i = 0

    # Generator loop
    while True:
        # Get the batch data
        batch_x = x[i:min(i + batch_size, n_x)]
        if y is not None:
            batch_y = y[i:min(i + batch_size, n_x)]

        if enable_shift:
            # horizontal shift by a random amount; rolling by -n_w or 0 leaves
            # the image unchanged, so a zero net shift is slightly more likely
            shift = np.random.randint(-n_w, n_w, 1)
            batch_x = np.roll(batch_x, shift=shift, axis=2)

            # vertical shift (bounded by the image height)
            shift = np.random.randint(-n_h, n_h, 1)
            batch_x = np.roll(batch_x, shift=shift, axis=1)

        # Handle batch increment/end
        i += batch_size
        if i >= n_x:
            i = 0

        # Yield data
        if y is not None:
            yield (batch_x, batch_y)
        else:
            yield (batch_x)
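A short usage sketch for image_generator_ysn above; the array shape is illustrative only:

# Draw two shifted batches from random data using image_generator_ysn above.
import numpy as np

x_demo = np.random.rand(8, 32, 32, 3)        # (n_x, n_h, n_w, n_ch)
gen = image_generator_ysn(x_demo, batch_size=4, shuffle=True, enable_shift=True)
first_batch = next(gen)                       # shape (4, 32, 32, 3)
second_batch = next(gen)                      # next 4 images, randomly shifted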
Example #5
def make_hdf5(filenames, out_file, crop=None):

    color_files = ['raw/frames/%s' % f for f in filenames]
    sketch_files = ['raw/sketch/%s' % f for f in filenames]

    color_imgs = parallelize(read_img, color_files, verbose=10)
    sketch_imgs = parallelize(read_img, sketch_files, verbose=10, params=1)

    img_data = [(c, s, f)
                for c, s, f in zip(color_imgs, sketch_imgs, filenames)
                if (c is not None and s is not None)]

    img_data = sk_shuffle(img_data)
    color_imgs, sketch_imgs, files = zip(*img_data)
    color_imgs = np.asanyarray(color_imgs)
    sketch_imgs = np.asanyarray(sketch_imgs)
    files = np.array(files)

    # crop images if required
    if crop is not None:
        color_imgs = center_crop(color_imgs, crop)
        sketch_imgs = center_crop(sketch_imgs, crop)

    img_mean = np.mean(color_imgs, 0)

    with h5py.File(out_file, 'w') as hf:
        hf.create_dataset('img_files', data=files)
        hf.create_dataset('col_sketch_data', data=color_imgs)
        hf.create_dataset('bw_sketch_data', data=sketch_imgs)
        hf.create_dataset('col_reference_data', data=color_imgs)
        hf.create_dataset('img_mean', data=img_mean)
Example #6
    def __init__(
        self,
        X,
        y,
        x_val=None,
        y_val=None,
        x_test=None,
        y_test=None,
        val_split=0.2,
        test_split=0.1,
        num_workers=0,
        random_state=1234,
        shuffle=True,
        batch_size: int = 16,
        pin_memory=True,
        drop_last=False,
        *args,
        **kwargs,
    ) -> None:

        super().__init__(*args, **kwargs)
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pin_memory = pin_memory
        self.drop_last = drop_last

        # shuffle x and y
        if shuffle and _SKLEARN_AVAILABLE:
            X, y = sk_shuffle(X, y, random_state=random_state)
        elif shuffle and not _SKLEARN_AVAILABLE:  # pragma: no cover
            raise ModuleNotFoundError(
                "You want to use shuffle function from `scikit-learn` which is not installed yet."
            )

        val_split = 0 if x_val is not None or y_val is not None else val_split
        test_split = 0 if x_test is not None or y_test is not None else test_split

        hold_out_split = val_split + test_split
        if hold_out_split > 0:
            val_split = val_split / hold_out_split
            hold_out_size = math.floor(len(X) * hold_out_split)
            x_holdout, y_holdout = X[:hold_out_size], y[:hold_out_size]
            test_i_start = int(val_split * hold_out_size)
            x_val_hold_out, y_val_holdout = x_holdout[:test_i_start], y_holdout[:test_i_start]
            x_test_hold_out, y_test_holdout = x_holdout[test_i_start:], y_holdout[test_i_start:]
            X, y = X[hold_out_size:], y[hold_out_size:]

        # if don't have x_val and y_val create split from X
        if x_val is None and y_val is None and val_split > 0:
            x_val, y_val = x_val_hold_out, y_val_holdout

        # if don't have x_test, y_test create split from X
        if x_test is None and y_test is None and test_split > 0:
            x_test, y_test = x_test_hold_out, y_test_holdout

        self._init_datasets(X, y, x_val, y_val, x_test, y_test)
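A stand-alone sketch of the hold-out arithmetic used above, with assumed example values:

# Stand-alone sketch of the hold-out split arithmetic used above
# (array sizes and split fractions are assumptions for illustration).
import math
import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
val_split, test_split = 0.2, 0.2

hold_out_split = val_split + test_split               # 0.4
hold_out_size = math.floor(len(X) * hold_out_split)   # 4 samples held out
val_size = int((val_split / hold_out_split) * hold_out_size)  # 2 for validation
x_holdout, y_holdout = X[:hold_out_size], y[:hold_out_size]
x_val, y_val = x_holdout[:val_size], y_holdout[:val_size]
x_test, y_test = x_holdout[val_size:], y_holdout[val_size:]
X_train, y_train = X[hold_out_size:], y[hold_out_size:]   # remaining 6 samples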
Example #7
    def __init__(self,
                 paths=["", ""],
                 batch_size=32,
                 augment=False,
                 seed=1,
                 domain="A",
                 name="No Name",
                 shuffle=True):
        # super(MyDataset, self).__init__()
        self.batch_size = batch_size
        self.paths = paths
        self.augment = augment
        self.seed = seed
        self.domain = domain
        self.name = name
        # train_generator, train_steps

        print("Loading file...")
        self.X_train, self.Y_train = np.load(paths[0]), np.load(paths[1])
        print("+ Done.")
        print("Total datasets samples: {}".format(len(self.Y_train)))
        if shuffle:
            print("Shuffling datasets...")
            self.X_train, self.Y_train = sk_shuffle(self.X_train, self.Y_train)
            print("+ Done.")

        print("Preprocessing: rescale pixel value to (-1,1)...")
        self.X_train = self.preprocessing(self.X_train)
        print("+ Done.")

        self.generator, self.steps = self.my_generator(
            batch_size=self.batch_size, augment=self.augment, seed=self.seed)
Example #8
    def fit_transform(self, X, y):
        if isinstance(X, np.ndarray):
            self.X = pd.DataFrame(X)
            self.y = pd.Series(y)
        else:
            self.X = X.copy(deep=True)
            self.y = y.copy(deep=True)
        if not isinstance(self.X, pd.DataFrame):
            raise ValueError('%s is not supported' % type(X))
        self.shape_before = self.X.shape

        self.X, self.col_was_null = self.__impute(self.X)

        self._label_encoder = None
        self._onehot_encoder = None
        self.X, self.del_columns = self.__encode(self.X)

        self._standardizer = None
        if self.standardize:
            self.X = self.__standardize(self.X)

        if self.shuffle:
            self.X, self.y = sk_shuffle(self.X,
                                        self.y,
                                        random_state=self.random_state)
Example #9
    def fit_svm(self,
                sess,
                train_data,
                validation_data,
                validation_label,
                epochs,
                input_tensor=None,
                shuffle=False):
        input_tensor = input_tensor if input_tensor is not None else self.input_tensor
        batches = int(len(train_data) / self.batch_size)

        data = train_data

        print('SVM train')
        for i in range(epochs):
            if shuffle:
                data = sk_shuffle(train_data, random_state=self.seed)
            for b in range(batches):
                batch_data = data[b * self.batch_size:(b + 1) *
                                  self.batch_size]
                sess.run([self.svm_optimizer, self.svm_loss],
                         feed_dict={input_tensor: batch_data})
            epoch_loss = sess.run(self.loss,
                                  feed_dict={input_tensor: train_data}) / len(train_data)
            epoch_loss_svm = sess.run(self.svm_loss,
                                      feed_dict={input_tensor: train_data}) / len(train_data)
            predictions = sess.run(self.output,
                                   feed_dict={input_tensor: validation_data})
            print('Epoch:', i + 1, 'Loss:', epoch_loss_svm, 'AUROC:',
                  roc_auc_score(validation_label, predictions))
Example #10
def batch_generator(data_dir, image_paths, steering_angles, batch_size,
                    is_training):
    """
    Generate training image give image paths and associated steering angles
    """
    #images = np.empty([batch_size, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS])
    #steers = np.empty(batch_size)
    while True:
        i = 0
        images = []
        steers = []
        for index in np.random.permutation(image_paths.shape[0]):
            center, left, right = image_paths[index]
            steering_angle = steering_angles[index]
            # eliminate samples with steering close to zero
            if abs(float(steering_angle)) < STRAIGHT_STEERING:
                continue
            # augmentation
            if is_training and np.random.rand() < 0.6:
                image, steering_angle = augment(data_dir, center, left, right,
                                                steering_angle)
            else:
                image = load_image(data_dir, center)
            # add the image and steering angle to the batch
            images.append(preprocess(image, False))
            steers.append(steering_angle)
            i += 1
            if i == batch_size:
                break

        X_train = np.array(images)
        y_train = np.array(steers)
        # print("current batch size: {}".format(len(X_train)))
        yield sk_shuffle(X_train, y_train)
Example #11
    def fit_transform(self, X, y):
        if isinstance(X, np.ndarray):
            self.X = pd.DataFrame(X)
            self.y = pd.Series(y)
        else:
            self.X = X.copy(deep=True)
            if isinstance(y, pd.Series):
                self.y = y.copy(deep=True)
            else:
                self.y = y.iloc[:, 0]  # Convert Dataframe to Series
        if not isinstance(self.X, pd.DataFrame):
            raise ValueError(f'{type(X)} is not supported')
        if len(X) != len(y):
            raise ValueError(('Found input variables with inconsistent '
                             f'numbers of samples: [{len(X)}, {len(y)}]'))
        self.shape_before = self.X.shape

        self.X, self.col_was_null = self.__impute(self.X)

        self._label_encoder = None
        self._onehot_encoder = None
        self.X, self.del_columns = self.__encode(self.X)

        self._standardizer = None
        if self.standardize:
            self.X = self.__standardize(self.X)

        if self.shuffle:
            self.X, self.y = sk_shuffle(self.X, self.y,
                                        random_state=self.random_state)
Example #12
def generate_train_data(dataframe, nbr_classes, img_root_path, shuffle=True, 
                        augment=False, img_width=224, img_height=224, model='vgg16'):
    N = dataframe.shape[0]
    if shuffle:
        dataframe = sk_shuffle(dataframe)
    X_train = np.zeros((N, img_width, img_height, 3))
    Y_train = np.zeros((N, nbr_classes))
    for index, row in dataframe.iterrows():
        driver_id = row['subject']
        classname = row['classname']
        label = int(classname[-1])
        img_name = row['img']
        img_path = os.path.join(img_root_path, 'train', classname, img_name)

        img = load_img(img_path, img_width)
        X_train[index] = img
        Y_train[index, label] = 1
    
    X_train = X_train.astype(np.float16)
    if model == 'vgg16':
        X_train = vgg16_preprocess_input(X_train)
    elif model == 'inceptv3':
        X_train = incept3_preprocess_input(X_train)

    return X_train, Y_train
Example #13
def corrected_holdout_prediction(sorted_sessions, fg, tm_rng, clf, vctr):
    comp_corr_x, comp_corr_y = data_for_checkout_correction(sorted_sessions, fg, tm_rng)
    sh_comp_corr_x, sh_comp_corr_y = sk_shuffle(comp_corr_x, comp_corr_y)
    tr_sh_comp_corr_y = np.asarray(sh_comp_corr_y, dtype=str)
    preds = clf.predict(vctr.transform(to_lof_strings(sh_comp_corr_x)))
    print('accuracy corrected holdout: ' + str(metrics.accuracy_score(y_true=tr_sh_comp_corr_y, y_pred=preds)))
    return None
Example #14
    def yield_batches(self, batch_size, split_type, shuffle=True):
        """
        Yield batches of defined batch size.
        :param batch_size: (int) size of batches
        :param split_type: (string) set to yield from (train/val/test)
        :param shuffle: (boolean) shuffle on every epoch
        :return: (ndarray, ndarray) features and labels (generator)
        """
        with h5py.File(self.tmp_storage_path) as fr:
            count = fr[self.y_prefix + split_type][:].shape[0]
            indexes = np.arange(0, count)

            while 1:
                # shuffle in place
                if shuffle:
                    random.shuffle(indexes)

                # yield batches
                for index in range(0, count, batch_size):
                    batch_indexes = sorted(indexes[index:min(index + batch_size, count)])

                    X = fr[self.X_prefix + split_type][batch_indexes, :]
                    y = fr[self.y_prefix + split_type][batch_indexes]

                    if shuffle:
                        X, y = sk_shuffle(X, y)

                    if self.noise and split_type == "train":
                        X += np.random.normal(0, self.noise, X.shape)

                    yield (X, y)
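A stand-alone sketch of the same shuffled-index batching pattern outside the class; the file path and dataset names are illustrative, not taken from the example above:

import h5py
import numpy as np

def hdf5_batches(path, x_name, y_name, batch_size):
    # Batch an HDF5-backed dataset by shuffling an index array once and
    # reading each batch with sorted indices (h5py needs increasing indices).
    with h5py.File(path, 'r') as fr:
        count = fr[y_name].shape[0]
        indexes = np.arange(count)
        np.random.shuffle(indexes)
        for start in range(0, count, batch_size):
            batch_idx = sorted(indexes[start:start + batch_size])
            yield fr[x_name][batch_idx], fr[y_name][batch_idx]

# e.g. for X, y in hdf5_batches('data.h5', 'X_train', 'y_train', 64): ...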
Example #15
def make_hdf5(filenames, out_file, train=False, shuffle=True):

    imgs = parallelize(read_img, filenames, 2)
    act_labels = parallelize(label_actions, filenames)
    obj_labels = parallelize(label_objects, filenames)

    # remove junk images (and corresponding labels)
    junk_idx = set([i for i in range(len(imgs)) if imgs[i] is None])
    imgs = remove_idx(imgs, junk_idx)
    img_files = remove_idx(filenames, junk_idx)
    act_labels = remove_idx(act_labels, junk_idx)
    obj_labels = remove_idx(obj_labels, junk_idx)

    if shuffle:
        imgs, img_files, act_labels, obj_labels = sk_shuffle(
            imgs, img_files, act_labels, obj_labels)

    img_data = np.asanyarray(imgs)
    act_labels = np.array(act_labels)
    obj_labels = np.array(obj_labels)
    img_files = np.array(img_files)

    print(img_data.shape)

    with h5py.File(out_file, 'w') as hf:
        hf.create_dataset('image_files', data=img_files)
        hf.create_dataset('images', data=img_data)
        hf.create_dataset('obj_labels', data=obj_labels)
        hf.create_dataset('act_labels', data=act_labels)
        hf.attrs['num_acts'] = len(class_dict)  #imsitu
        hf.attrs['num_objs'] = 1000  #imagenet

        if train:
            hf.create_dataset('img_mean', data=img_mean)
Example #16
def batch_generator(dataframe, nbr_classes, img_root_path, batch_size, shuffle=True, augment=False,
                     return_label=True, img_width=224, img_height=224, model='vgg16'):
    N = dataframe.shape[0]
    if shuffle:
        dataframe = sk_shuffle(dataframe)
    batch_index = 0
    while True:
        current_index = (batch_index * batch_size) % N
        if N >= (current_index + batch_size):
            current_batch_size = batch_size
            batch_index += 1
        else:
            current_batch_size = N - current_index
            batch_index = 0
            if shuffle:
                dataframe = sk_shuffle(dataframe)
        
        X_batch = np.zeros((current_batch_size, img_width, img_height, 3))
        Y_batch = np.zeros((current_batch_size, nbr_classes))
        
        for i in range(current_index, current_index + current_batch_size):
            # positional indexing so the shuffle above actually changes batch order
            row = dataframe.iloc[i]
            driver_id = row['subject']
            classname = row['classname']
            label = int(classname[-1])
            img_name = row['img']
            img_path = os.path.join(img_root_path, 'train', classname, img_name)

            img = load_img(img_path, img_width)
            X_batch[i - current_index] = img
            if return_label:
                Y_batch[i - current_index, label] = 1

        if augment:
            X_batch = X_batch.astype(np.uint8)
            X_batch = seq.augment_images(X_batch)

        X_batch = X_batch.astype(np.float16)
        if model == 'vgg16':
            X_batch = vgg16_preprocess_input(X_batch)
        elif model == 'inceptv3':
            X_batch = incept3_preprocess_input(X_batch)

        if return_label:
            yield (X_batch, Y_batch)
        else:
            yield X_batch
Example #17
def balance_classes(X: np.ndarray, Y: list, batch_size: int):
    """
    Makes sure each batch has an equal amount of data from each class.
    Perfect balance

    Args:

        X: input features
        Y: mixed labels (ints)
        batch_size: the ultimate batch size
    """
    nb_classes = len(set(Y))

    nb_batches = math.ceil(len(Y) / batch_size)

    # sort by classes
    final_batches_x = [[] for i in range(nb_batches)]
    final_batches_y = [[] for i in range(nb_batches)]

    # Y needs to be np arr
    Y = np.asarray(Y)

    # pick chunk size for each class using the largest split
    chunk_size = []
    for class_i in range(nb_classes):
        mask = Y == class_i
        y = Y[mask]
        chunk_size.append(math.ceil(len(y) / nb_batches))
    chunk_size = max(chunk_size)
    # force chunk size to be even
    if chunk_size % 2 != 0:
        chunk_size -= 1

    # divide each class into each batch
    for class_i in range(nb_classes):
        mask = Y == class_i
        x = X[mask]
        y = Y[mask]

        # shuffle items in the class
        x, y = sk_shuffle(x, y, random_state=123)

        # divide the class into the batches
        for i_start in range(0, len(y), chunk_size):
            batch_i = i_start // chunk_size
            i_end = i_start + chunk_size

            if len(final_batches_x) > batch_i:
                final_batches_x[batch_i].append(x[i_start: i_end])
                final_batches_y[batch_i].append(y[i_start: i_end])

    # merge into full dataset
    final_batches_x = [np.concatenate(x, axis=0) for x in final_batches_x if len(x) > 0]
    final_batches_x = np.concatenate(final_batches_x, axis=0)

    final_batches_y = [np.concatenate(x, axis=0) for x in final_batches_y if len(x) > 0]
    final_batches_y = np.concatenate(final_batches_y, axis=0)

    return final_batches_x, final_batches_y
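A toy run of balance_classes above, with assumed shapes, showing that consecutive blocks mix both classes:

# Toy run of balance_classes above: two classes of four samples each.
import numpy as np

X_demo = np.arange(16).reshape(8, 2)
Y_demo = [0, 0, 0, 0, 1, 1, 1, 1]
bx, by = balance_classes(X_demo, Y_demo, batch_size=4)
# Each consecutive block of 4 labels now contains both classes,
# e.g. [0 0 1 1 0 0 1 1].
print(by)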
Example #18
    def train(self, x, y, minibatch_size):
        for i in range(self.epoch):

            x, y = sk_shuffle(x, y)

            for j in range(0, x.shape[0], minibatch_size):
                # use the current minibatch rather than the full dataset
                x_batch = x[j:j + minibatch_size]
                y_batch = y[j:j + minibatch_size]
                output = self.forward_prop(x_batch)
                self.back_prop(x_batch, y_batch, output)
Example #19
def corrected_data_prediction(sorted_sessions, fg, tm_rng, clf, vctr):
    comp_corr_x, comp_corr_y = data_for_checkout_correction(sorted_sessions, fg, tm_rng)
    sh_comp_corr_x, sh_comp_corr_y = sk_shuffle(comp_corr_x, comp_corr_y)
    tr_sh_comp_corr_y = np.asarray(sh_comp_corr_y, dtype=str)
    clf.fit(vctr.transform(to_lof_strings(sh_comp_corr_x)), tr_sh_comp_corr_y)
    cv_accuracy_comp = cv.cross_val_score(clf, vctr.transform(to_lof_strings(sh_comp_corr_x)), tr_sh_comp_corr_y, cv=5)
    print(cv_accuracy_comp)
    return cv_accuracy_comp
Example #20
def rnn_train(dataset, config_params, vocab, umls_vocab):
    global params, setup_NN

    global X, U, Y, Z, Mask, i2t, t2i, w2i, i2w, splits, numTags, emb_w, umls_v
    params = config_params
    umls_v = umls_vocab
    if 'CRF_MODEL_ON' in params and params['CRF_MODEL_ON']:
        sl.info('CRF IS ON. CRF_MODELS WILL BE USED')
        if params['mode'] == 1:
            from bionlp.taggers.rnn_feature.networks.approx_network import setup_NN
            sl.info('MODE :Using the Approximate Message Passing framework')
        elif params['mode'] == -1:
            from bionlp.taggers.rnn_feature.networks.network import setup_NN
            sl.info('MODE : Modeling only the unary potentials')
        else:
            sl.info('MODE : Modeling both unary and binary potentials')
            from bionlp.taggers.rnn_feature.networks.dual_network import setup_NN
    else:
        sl.info(
            'CRF IS NOT ON. This tagger only supports CRF models. A default CRF_MODEL will be used.'
        )
        params['mode'] = 1
        params['CRF_MODEL_ON'] = True
        from bionlp.taggers.rnn_feature.networks.approx_network import setup_NN
        sl.info('MODE :Using the Approximate Message Passing framework')

    sl.info('Using the parameters:\n {0}'.format(json.dumps(params, indent=2)))

    # Preparing Dataset

    sl.info('Preparing entire dataset for Neural Net computation ...')
    (X, U, Z, Y), numTags, emb_w, t2i, w2i = preprocess.load_data(
        dataset, params, entire_note=params['document'], vocab=vocab)

    X, U, Y, Z, Mask = preprocess.pad_and_mask(X, U, Y, Z, params['maxlen'])
    sl.info(
        'Total non zero entries in the Mask Inputs are {0}. This number should be equal to total number of tokens in the entire dataset'
        .format(sum(sum(_) for _ in Mask)))
    if params['shuffle'] == 1:
        X, U, Y, Z, Mask = sk_shuffle(X, U, Y, Z, Mask, random_state=0)
    i2t = {v: k for k, v in t2i.items()}
    i2w = {v: k for k, v in w2i.items()}
    splits = data_utils.make_cross_validation_sets(
        len(Y), params['folds'], training_percent=params['training-percent'])
    try:
        if params['trainable'] is False:
            (o, l, p) = evaluate_run()
        elif params['deploy'] == 1:
            (o, l, p) = deploy_run(splits[0], params)
        elif params['cross-validation'] == 0:
            (o, l, p) = single_run()
        else:
            (o, l, p) = cross_validation_run()
    except IOError as e:
        if e.errno != errno.EINTR:
            raise
        else:
            print(" EINTR ERROR CAUGHT. YET AGAIN ")
Example #21
def test_dataloader():
    seed_everything()

    X = np.random.rand(5, 2)
    y = np.random.rand(5)
    x_val = np.random.rand(2, 2)
    y_val = np.random.rand(2)
    x_test = np.random.rand(1, 2)
    y_test = np.random.rand(1)

    shuffled_X, shuffled_y = sk_shuffle(X, y, random_state=1234)

    # -----------------------------
    # train
    # -----------------------------
    loaders = SklearnDataModule(X=X, y=y, val_split=0.2, test_split=0.2, random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[2:])
    assert np.all(val_loader.dataset.X == shuffled_X[0])
    assert np.all(test_loader.dataset.X == shuffled_X[1])
    assert np.all(train_loader.dataset.Y == shuffled_y[2:])

    # -----------------------------
    # train + val
    # -----------------------------
    loaders = SklearnDataModule(X=X, y=y, x_val=x_val, y_val=y_val, test_split=0.2, random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[1:])
    assert np.all(val_loader.dataset.X == x_val)
    assert np.all(test_loader.dataset.X == shuffled_X[0])

    # -----------------------------
    # train + test
    # -----------------------------
    loaders = SklearnDataModule(
        X=X, y=y, x_test=x_test, y_test=y_test, val_split=0.2, random_state=1234, drop_last=True
    )
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[1:])
    assert np.all(val_loader.dataset.X == shuffled_X[0])
    assert np.all(test_loader.dataset.X == x_test)

    # -----------------------------
    # train + val + test
    # -----------------------------
    loaders = SklearnDataModule(X, y, x_val, y_val, x_test, y_test, random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X)
    assert np.all(val_loader.dataset.X == x_val)
    assert np.all(test_loader.dataset.X == x_test)
Example #22
def save_cluster_sample(sb_st1_x, sb_st1_y, sb_st2_x, sb_st2_y):
    sb_st1_x.extend(sb_st2_x)
    sb_st1_y.extend(sb_st2_y)
    shuffled_dat_x, shuffled_dat_y = sk_shuffle(sb_st1_x, sb_st1_y)
    with open('dat_f_retraining.txt', 'w') as fh:
        for item, label in zip(shuffled_dat_x, shuffled_dat_y):
            fh.write(str(' '.join(item)) + ',' + str(label) + '\n')
    return None
Example #24
    def _create_prob_slices_file(HP, subjects, filename, bundle, shuffle=True):

        mask_dir = join(C.HOME, HP.DATASET_FOLDER)

        input_dir = HP.MULTI_PARENT_PATH

        combined_slices = []
        mask_slices = []

        for s in subjects:
            print("processing subject {}".format(s))

            probs_x = nib.load(join(input_dir, "UNet_x_" + str(HP.CV_FOLD), "probmaps", s + "_probmap.nii.gz")).get_data()
            probs_y = nib.load(join(input_dir, "UNet_y_" + str(HP.CV_FOLD), "probmaps", s + "_probmap.nii.gz")).get_data()
            probs_z = nib.load(join(input_dir, "UNet_z_" + str(HP.CV_FOLD), "probmaps", s + "_probmap.nii.gz")).get_data()
            # probs_x = DatasetUtils.scale_input_to_unet_shape(probs_x, HP.DATASET, HP.RESOLUTION)
            # probs_y = DatasetUtils.scale_input_to_unet_shape(probs_y, HP.DATASET, HP.RESOLUTION)
            # probs_z = DatasetUtils.scale_input_to_unet_shape(probs_z, HP.DATASET, HP.RESOLUTION)
            combined = np.stack((probs_x, probs_y, probs_z), axis=4)  # (73, 87, 73, 18, 3)  #not working alone: one dim too much for UNet -> reshape
            combined = np.reshape(combined, (combined.shape[0], combined.shape[1], combined.shape[2],
                                             combined.shape[3] * combined.shape[4]))    # (73, 87, 73, 3*18)

            # print("combined shape after", combined.shape)

            mask_data = ImgUtils.create_multilabel_mask(HP, s, labels_type=HP.LABELS_TYPE)
            if HP.DATASET == "HCP_2mm":
                #use "HCP" because for mask we need downscaling
                mask_data = DatasetUtils.scale_input_to_unet_shape(mask_data, "HCP", HP.RESOLUTION)
            elif HP.DATASET == "HCP_2.5mm":
                # use "HCP" because for mask we need downscaling
                mask_data = DatasetUtils.scale_input_to_unet_shape(mask_data, "HCP", HP.RESOLUTION)
            else:
                # Mask has same resolution as probmaps -> we can use same resizing
                mask_data = DatasetUtils.scale_input_to_unet_shape(mask_data, HP.DATASET, HP.RESOLUTION)

            # Save as Img
            img = nib.Nifti1Image(combined, ImgUtils.get_dwi_affine(HP.DATASET, HP.RESOLUTION))
            nib.save(img, join(HP.EXP_PATH, "combined", s + "_combinded_probmap.nii.gz"))


            combined = DatasetUtils.scale_input_to_unet_shape(combined, HP.DATASET, HP.RESOLUTION)
            assert (combined.shape[2] == mask_data.shape[2])

            #Save as Slices
            for z in range(combined.shape[2]):
                combined_slices.append(combined[:, :, z, :])
                mask_slices.append(mask_data[:, :, z, :])

        if shuffle:
            combined_slices, mask_slices = sk_shuffle(combined_slices, mask_slices, random_state=9)

        if HP.TRAIN:
            np.save(filename + "_data.npy", combined_slices)
            np.save(filename + "_seg.npy", mask_slices)
Example #25
def __get_repr(labels, label_list, emb, shuffle=True):
    # sort all labels so the encodings are consistent
    label_list_sorted = sorted(label_list)

    X = []
    y = []
    for node, node_labels in labels.items():
        X.append(emb[node])
        y.append(get_label_repr(node_labels, label_list_sorted))
    if shuffle:
        return sk_shuffle(numpy.asarray(X), numpy.asarray(y))
    return numpy.asarray(X), numpy.asarray(y)
Example #26
    def fit_transform(self, X, y=None):
        if isinstance(X, np.ndarray):
            self.X = pd.DataFrame(X)
            if y is not None:
                self.y = pd.Series(y)
        else:
            self.X = X.copy(deep=True)
            if y is not None:
                if isinstance(y, pd.Series):
                    self.y = y.copy(deep=True)
                else:
                    self.y = y.iloc[:, 0]  # Convert Dataframe to Series
        if not isinstance(self.X, pd.DataFrame):
            raise ValueError(f'{type(X)} is not supported')
        if y is not None and len(X) != len(y):
            raise ValueError(('Found input variables with inconsistent '
                              f'numbers of samples: [{len(X)}, {len(y)}]'))
        self.shape_before = self.X.shape

        self.X, self.col_was_null = self.__impute(self.X)

        self._label_encoder = None
        self._onehot_encoder = None
        self.X, self.del_columns = self.__encode(self.X)

        self._standardizer = None
        if self.standardize:
            self.X = self.__standardize(self.X)

        if self.columns is not None:
            self.X = self.X[self.columns]

        if self.shuffle:
            if self.y is not None:
                self.X, self.y = sk_shuffle(self.X,
                                            self.y,
                                            random_state=self.random_state)
            else:
                self.X = sk_shuffle(self.X, random_state=self.random_state)
Example #27
def data_generator(
        X, y, batch_size, target_size=(224, 224, 3),
        preprocessor=resnet_preprocessor, shuffle=False):
    start = 0
    end = start + batch_size
    n = X.shape[0]
    if shuffle:
        X, y = sk_shuffle(X, y)
    while True:
        X_batch = X[start: end]
        y_batch = y[start: end]
        X_resized = np.array([scipy.misc.imresize(x, target_size) for x in X_batch])
        X_preprocessed = preprocessor(X_resized)

        start += batch_size
        end += batch_size
        if start >= n:
            start = 0
            end = batch_size
            if shuffle:
                X, y = sk_shuffle(X, y)
        yield (X_preprocessed, y_batch)
Example #28
def shuffle_data(X, y):
    '''
    shuffle the training data
    '''
    X, y = sk_shuffle(X, y)

    X_train = X[:, :, :, None]
    y_train = y[:]

    print("Training Data Size: ", X_train.shape, y_train.shape)
    print("Loading data complete")
    print("Shuffling of data complete")
    return X_train, y_train
Example #29
def generator(samples, batch_size=32, with_flipped=True):
    """
    Generates the needed images in the given batch size

    Args:
        samples: list: The complete list of samples that should be used
        batch_size: int: The size of batches that should be used
        with_flipped: bool: Should the flipped images also be used.

    Yields:

        X_train, the images for the training
        y_train, the labels for the training
    """
    num_samples = len(samples)
    # loop forever so the generator never runs out of batches
    while 1:
        # sk_shuffle returns shuffled copies, so the result must be reassigned
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            images = []
            measurements = []
            for batch_sample in batch_samples:
                # Add the main image
                image = read_image(batch_sample[0])
                measurement = batch_sample[1]
                images.append(image)
                measurements.append(measurement)
                # Flipped image
                if with_flipped:
                    flipped, steering = flip_image(image, measurement)
                    images.append(flipped)
                    measurements.append(steering)

            X_train = np.array(images)
            y_train = np.array(measurements)
            yield sk_shuffle(X_train, y_train)
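A hedged consumption sketch for this generator; the (image path, steering) layout of each sample is inferred from how batch_sample is indexed, and the paths are illustrative:

# Pull one batch from the generator above. read_image/flip_image are the
# helpers this example already assumes; the sample paths are illustrative.
train_samples = [('IMG/center_001.jpg', 0.0),
                 ('IMG/center_002.jpg', -0.1)]
train_gen = generator(train_samples, batch_size=32, with_flipped=True)
X_batch, y_batch = next(train_gen)   # 4 images here: each sample plus its flip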
Example #30
    def __init__(self, path, oversample=None,
                 label_filter=lambda x: x.endswith('.jpg'),
                 issynthetic=False, test_size=0.2,
                 im_range=[0, 55000], isTrain=True):
        if not path.endswith('train'): path = os.path.join(path, 'train')
        path = os.path.join(path, 'images')
        path = pathlib.Path(path)
        self.isTrain = isTrain
        img_paths_train=[]
        
        if not issynthetic:
#             in_impaths = list(path.glob('*/*/*.jpg')) if im_range=='all' else list(path.glob('*/*/*.jpg'))[im_range[0]:im_range[1]]
            # use only standard views
            for std_view in regions_dict_standard:
                pathk = '*/' + std_view + '/im.jpg'
                img_paths_train += list(path.glob(pathk))
        else:
#             in_impaths = list(path.glob('*/*/*/*.jpg')) if im_range=='all' else list(path.glob('*/*/*/*.jpg'))[im_range[0]:im_range[1]]
             # use only standard views
            for std_view in regions_dict_standard:
                pathk = '*/' + std_view + '/*/im.jpg'
                img_paths_train += list(path.glob(pathk))
        
        in_impaths = img_paths_train if im_range=='all' else img_paths_train[im_range[0]:im_range[1]]

        self.train_impaths, self.val_impaths, self.train_idxs, self.val_idxs = train_test_split(in_impaths,range(len(in_impaths)), test_size= test_size, random_state=42)
        assert callable(label_filter)
        self.label_filter=label_filter
        self.calculus_filter = lambda x: x.startswith('calculus') and x.endswith('.jpg')
        
        lbpaths = []
        impaths = []
        all_impaths = self.train_impaths if self.isTrain else self.val_impaths
        
        for impath in all_impaths:
            impath = impath.as_posix()
            lbpath = os.path.dirname(impath).replace('images', 'masks', 1)
            lbpath_dict = {'calculus':[]}
            
            calculus_lbpath = [os.path.join(lbpath, lbname) for lbname in os.listdir(lbpath) if self.calculus_filter(lbname)]
            lbpath_dict['calculus'].extend(calculus_lbpath)
            lbpaths.append(lbpath_dict)
            impaths.append(impath)
                
            if self.isTrain:
                if isinstance(oversample, int) and oversample >= 1 and len(calculus_lbpath) > 0:
                    lbpaths += [lbpath_dict] * oversample
                    impaths += [impath] * oversample
        impaths, lbpaths = sk_shuffle(impaths, lbpaths, random_state=42)
        
        self.impaths = impaths
        self.lbpaths = lbpaths
Example #31
def generator(samples, batch_size = 32, angle_offset = 0.2):
    num_samples = len(samples)
    while 1:
        shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]
            
            images = []
            angles = []
            for batch_sample in batch_samples:
                # Center Image
                name = "./IMG/" + batch_sample[0].split("\\")[-1]
                center_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                center_angle = float(batch_sample[3])
                images.append(center_image)
                angles.append(center_angle)
                
                # Flip image
                images.append(np.fliplr(center_image))
                angles.append(-center_angle)
                
                # Left Image
                name = "./IMG/" + batch_sample[1].split("\\")[-1]
                left_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                left_angle = float(batch_sample[3]) + angle_offset
                images.append(left_image)
                angles.append(left_angle)
                
                # Flip image
                images.append(np.fliplr(left_image))
                angles.append(-left_angle)
                
                
                # Right Image
                name = "./IMG/" + batch_sample[2].split("\\")[-1]
                right_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                right_angle = float(batch_sample[3]) - angle_offset
                images.append(right_image)
                angles.append(right_angle)
                
                # Flip image
                images.append(np.fliplr(right_image))
                angles.append(-right_angle)
                
                
            X_train = np.array(images)
            y_train = np.array(angles)
            yield sk_shuffle(X_train, y_train)                  
Example #32
    def gof(self,
            sample_a,
            sample_b,
            n_k,
            shuffle=False,
            seed=None,
            log_matches=False):
        """
        Carry out the test of goodness of fit between two samples
        """
        # data processing
        mixed_samples = np.concatenate((sample_a, sample_b), axis=0)
        classes = np.concatenate(
            (np.ones(len(sample_a)), np.zeros(len(sample_b))),
            axis=0).astype(dtype=np.int32)

        if shuffle:
            mixed_samples, classes = sk_shuffle(mixed_samples,
                                                classes,
                                                random_state=seed)

        # instantiate variables
        self.n_a = len(sample_a)
        self.n_b = len(sample_b)
        self.n_k = n_k
        self.n = self.n_a + self.n_b
        self.mu_T = self._calculate_mu(self.n_a, self.n_b)
        self.sigma_T = self._calculate_sigma(self.n_a, self.n_b, self.n_k)
        self._fit(mixed_samples)
        self.shuffled = shuffle

        # run test
        R = self._calculate_mixed_sample_statistic(mixed_samples, classes, n_k,
                                                   log_matches)
        # collect results
        self.neighbour_same_class = R[0]
        self.consecutive_neighbour = R[1]
        if log_matches:
            self.match_log = np.unique(R[2], return_counts=True)
        else:
            self.match_log = None

        # post processing
        self.T = (self.n_k * (self.n))**-1 * self.neighbour_same_class
        self.p_value = self._calculate_p_val(self.T, self.mu_T, self.sigma_T)

        return self.T, self.p_value
Example #33
        def _input_fn():
            # shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
            num_threads = multiprocessing.cpu_count() if multi_threading else 1
            buffer_size = 2 * batch_size + 1

            self.logger.info("")
            self.logger.info("* data input_fn:")
            self.logger.info("================")
            self.logger.info("Mode: {}".format(mode))
            self.logger.info("Batch size: {}".format(batch_size))
            self.logger.info("Epoch count: {}".format(num_epochs))
            self.logger.info("Thread count: {}".format(num_threads))
            self.logger.info("Shuffle: {}".format(shuffle))
            self.logger.info("================")
            self.logger.info("")

            data = inputs
            if shuffle:
                self.logger.info('shuffle data manually.')
                data = inputs.iloc[sk_shuffle(np.arange(len(inputs)))]

            dataset = tf.data.Dataset.from_generator(generate_fn(data),
                                                     output_type, output_shape)
            dataset = dataset.skip(skip_header_lines)
            dataset = dataset.map(zip_map, num_parallel_calls=num_threads)
            # if shuffle:
            #     dataset = dataset.shuffle(buffer_size)
            padded_shapes = OrderedDict(zip(output_key, output_shape))
            if not is_serving:
                padded_shapes = padded_shapes, padded_shapes.pop(
                    metadata.TARGET_NAME)

            dataset = dataset.padded_batch(batch_size, padded_shapes) \
                             .prefetch(buffer_size=tf.contrib.data.AUTOTUNE) \
                             .repeat(num_epochs)

            iterator = dataset.make_initializable_iterator()
            hook.iterator_initializer_func = lambda sess: sess.run(iterator.initializer)
            if is_serving:
                # dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features, None
            else:
                features, target = iterator.get_next()
                return features, target
Example #35
    def train(self, examples, param_grid=None, grid_search_folds=5,
              grid_search=True, grid_objective='f1_score_micro',
              grid_jobs=None, shuffle=True):
        '''
        Train a classification model and return the model, score, feature
        vectorizer, scaler, label dictionary, and inverse label dictionary.

        :param examples: The examples to train the model on.
        :type examples: ExamplesTuple
        :param param_grid: The parameter grid to search through for grid
                           search. If unspecified, a default parameter grid
                           will be used.
        :type param_grid: list of dicts mapping from strs to
                          lists of parameter values
        :param grid_search_folds: The number of folds to use when doing the
                                  grid search, or a mapping from
                                  example IDs to folds.
        :type grid_search_folds: int or dict
        :param grid_search: Should we do grid search?
        :type grid_search: bool
        :param grid_objective: The objective function to use when doing the
                               grid search.
        :type grid_objective: function
        :param grid_jobs: The number of jobs to run in parallel when doing the
                          grid search. If unspecified or 0, the number of
                          grid search folds will be used.
        :type grid_jobs: int
        :param shuffle: Shuffle examples (e.g., for grid search CV.)
        :type shuffle: bool

        :return: The best grid search objective function score, or 0 if we're
                 not doing grid search.
        :rtype: float
        '''
        # seed the random number generator so that randomized algorithms are
        # replicable
        rand_seed = 123456789
        np.random.seed(rand_seed)

        # Shuffle so that the folds are random for the inner grid search CV.
        # You can't shuffle a scipy sparse matrix in place, so unfortunately
        # we make a copy of everything (and then get rid of the old version)
        if shuffle:
            ids, classes, features = sk_shuffle(examples.ids, examples.classes,
                                                examples.features,
                                                random_state=rand_seed)
            examples = ExamplesTuple(ids, classes, features,
                                     examples.feat_vectorizer)

        # call train setup to set up the vectorizer, the labeldict, and the
        # scaler
        self._train_setup(examples)

        # select features
        xtrain = self.feat_selector.fit_transform(examples.features)

        # Convert to dense if necessary
        if self._use_dense_features:
            try:
                xtrain = xtrain.todense()
            except MemoryError:
                if self._model_type in _REQUIRES_DENSE:
                    reason = ('{} does not support sparse ' +
                              'matrices.').format(self._model_type)
                else:
                    reason = ('{} feature scaling requires a dense ' +
                              'matrix.').format(self._feature_scaling)
                raise MemoryError('Ran out of memory when converting training' +
                                  ' data to dense. This was required because ' +
                                  reason)

        # Scale features if necessary
        if self._model_type != 'MultinomialNB':
            xtrain = self.scaler.fit_transform(xtrain)

        # Instantiate an estimator and get the default parameter grid to search
        estimator, default_param_grid = self._create_estimator()

        # use label dict transformed version of examples.classes if doing
        # classification
        if self._model_type not in _REGRESSION_MODELS:
            classes = np.array([self.label_dict[label] for label in
                                examples.classes])
        else:
            classes = examples.classes

        # set up a grid searcher if we are asked to
        if grid_search:
            # set up grid search folds
            if isinstance(grid_search_folds, int):
                if not grid_jobs:
                    grid_jobs = grid_search_folds
                else:
                    grid_jobs = min(grid_search_folds, grid_jobs)
                folds = grid_search_folds
            else:
                # use the number of unique fold IDs as the number of grid jobs
                if not grid_jobs:
                    grid_jobs = len(np.unique(grid_search_folds))
                else:
                    grid_jobs = min(len(np.unique(grid_search_folds)),
                                    grid_jobs)
                # Only retain IDs within folds if they're in grid_search_folds
                dummy_label = next(itervalues(grid_search_folds))
                labels = [grid_search_folds.get(curr_id, dummy_label) for
                          curr_id in examples.ids]
                folds = FilteredLeaveOneLabelOut(labels, grid_search_folds,
                                                 examples)

            # Use default parameter grid if we weren't passed one
            if not param_grid:
                param_grid = default_param_grid

            # If we're using a correlation metric for doing binary
            # classification, override the estimator's predict function
            if (grid_objective in _CORRELATION_METRICS and
                    self._model_type not in _REGRESSION_MODELS):
                estimator.predict_normal = estimator.predict
                estimator.predict = _predict_binary

            # limit the number of grid_jobs to be no higher than five or the
            # number of cores for the machine, whichever is lower
            grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)

            grid_searcher = GridSearchCV(estimator, param_grid,
                                         scoring=grid_objective, cv=folds,
                                         n_jobs=grid_jobs,
                                         pre_dispatch=grid_jobs)

            # run the grid search for hyperparameters
            grid_searcher.fit(xtrain, classes)
            self._model = grid_searcher.best_estimator_
            grid_score = grid_searcher.best_score_
        else:
            self._model = estimator.fit(xtrain, classes)
            grid_score = 0.0

        return grid_score
Example #36
    def cross_validate(self, examples, stratified=True, cv_folds=10,
                       grid_search=False, grid_search_folds=5, grid_jobs=None,
                       grid_objective='f1_score_micro', prediction_prefix=None,
                       param_grid=None, shuffle=True):
        '''
        Cross-validates a given model on the training examples.

        :param examples: The data to cross-validate learner performance on.
        :type examples: ExamplesTuple
        :param stratified: Should we stratify the folds to ensure an even
                           distribution of classes for each fold?
        :type stratified: bool
        :param cv_folds: The number of folds to use for cross-validation, or
                         a mapping from example IDs to folds.
        :type cv_folds: int or dict
        :param grid_search: Should we do grid search when training each fold?
                            Note: This will make this take *much* longer.
        :type grid_search: bool
        :param grid_search_folds: The number of folds to use when doing the
                                  grid search (ignored if cv_folds is set to
                                  a dictionary mapping examples to folds).
        :type grid_search_folds: int
        :param grid_jobs: The number of jobs to run in parallel when doing the
                          grid search. If unspecified or 0, the number of
                          grid search folds will be used.
        :type grid_jobs: int
        :param grid_objective: The objective function to use when doing the
                               grid search.
        :type grid_objective: function
        :param param_grid: The parameter grid to search through for grid
                           search. If unspecified, a default parameter
                           grid will be used.
        :type param_grid: list of dicts mapping from strs to
                          lists of parameter values
        :param prediction_prefix: If saving the predictions, this is the
                                  prefix that will be used for the filename.
                                  It will be followed by ".predictions"
        :type prediction_prefix: str
        :param shuffle: Shuffle examples before splitting into folds for CV.
        :type shuffle: bool

        :return: The confusion matrix, overall accuracy, per-class PRFs, and
                 model parameters for each fold.
        :rtype: list of 4-tuples
        '''
        # seed the random number generator so that randomized algorithms are
        # replicable
        rand_seed = 123456789
        np.random.seed(rand_seed)

        # Shuffle so that the folds are random for CV.
        # You can't shuffle a scipy sparse matrix in place, so unfortunately
        # we make a copy of everything (and then get rid of the old version)
        if shuffle:
            ids, classes, features = sk_shuffle(examples.ids, examples.classes,
                                                examples.features,
                                                random_state=rand_seed)
            examples = ExamplesTuple(ids, classes, features,
                                     examples.feat_vectorizer)

        # call train setup
        self._train_setup(examples)

        # setup the cross-validation iterator
        if isinstance(cv_folds, int):
            stratified = (stratified and
                          not self._model_type in _REGRESSION_MODELS)
            kfold = (StratifiedKFold(examples.classes, n_folds=cv_folds) if
                     stratified else KFold(len(examples.classes),
                                           n_folds=cv_folds))
        else:
            # if we have a mapping from IDs to folds, use it for the overall
            # cross-validation as well as the grid search within each
            # training fold.  Note that this means that the grid search
            # will use K-1 folds because the Kth will be the test fold for
            # the outer cross-validation.
            # Only retain IDs within folds if they're in grid_search_folds
            dummy_label = next(itervalues(cv_folds))
            labels = [cv_folds.get(curr_id, dummy_label) for curr_id in
                      examples.ids]
            # Only retain IDs within folds if they're in cv_folds
            kfold = FilteredLeaveOneLabelOut(labels, cv_folds, examples)
            grid_search_folds = cv_folds

        # handle each fold separately and accumulate the predictions and the
        # numbers
        results = []
        grid_search_scores = []
        append_predictions = False
        for train_index, test_index in kfold:
            # Train model
            self._model = None  # prevent feature vectorizer from being reset.
            train_tuple = ExamplesTuple(ids=examples.ids[train_index],
                                        classes=examples.classes[train_index],
                                        features=examples.features[train_index],
                                        feat_vectorizer=examples.feat_vectorizer)
            grid_search_scores.append(self.train(train_tuple,
                                                 grid_search_folds=grid_search_folds,
                                                 grid_search=grid_search,
                                                 grid_objective=grid_objective,
                                                 param_grid=param_grid,
                                                 grid_jobs=grid_jobs,
                                                 shuffle=False))
            # note: there is no need to shuffle again within each fold,
            # regardless of what the shuffle keyword argument is set to.

            # Evaluate model
            test_tuple = ExamplesTuple(ids=examples.ids[test_index],
                                       classes=examples.classes[test_index],
                                       features=examples.features[test_index],
                                       feat_vectorizer=examples.feat_vectorizer)
            results.append(self.evaluate(test_tuple,
                                         prediction_prefix=prediction_prefix,
                                         append=append_predictions,
                                         grid_objective=grid_objective))

            append_predictions = True

        # return list of results for all folds
        return results, grid_search_scores