def __init__(self, data, label=None, batch_size=1, shuffle=False,
             last_batch_handle='pad', data_name='data',
             label_name='softmax_label'):
    super(SparseMatrixDataIter, self).__init__(batch_size)

    assert isinstance(data, scipy.sparse.csr.csr_matrix)

    self.data = _init_data(data, allow_empty=False, default_name=data_name)
    self.label = _init_data(label, allow_empty=True, default_name=label_name)
    self.num_data = self.data[0][1].shape[0]

    # shuffle data
    if shuffle:
        d = self.data[0][1]
        if len(self.label[0][1]) > 0:
            l = self.label[0][1]
            ds, dl = sk_shuffle(d, l)
            self.data = _init_data(ds, allow_empty=False, default_name=data_name)
            self.label = _init_data(dl, allow_empty=True, default_name=label_name)
        else:
            ds = sk_shuffle(d)
            self.data = _init_data(ds, allow_empty=False, default_name=data_name)

    # batching
    if last_batch_handle == 'discard':
        new_n = self.data[0][1].shape[0] - self.data[0][1].shape[0] % batch_size
        self.num_data = new_n

    self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label]
    assert self.num_data >= batch_size, "batch_size needs to be smaller than data size."
    self.cursor = -batch_size
    self.batch_size = batch_size
    self.last_batch_handle = last_batch_handle
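# Usage sketch for the iterator above (a minimal example, assuming the
# enclosing class is named SparseMatrixDataIter and follows the MXNet DataIter
# convention; the data/label arguments mirror the __init__ signature).
import numpy as np
import scipy.sparse

features = scipy.sparse.csr_matrix(np.random.rand(100, 20))
labels = np.random.randint(0, 2, size=100)
data_iter = SparseMatrixDataIter(features, labels, batch_size=10, shuffle=True)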
def get_mask_splits(dim, pick_name=None, model_type='Mobilenet', bw=False):
    # Train Set
    train_paths, train_labels = get_mask_classes('Train')
    train_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in train_paths])
    train_dict = dict(images=train_images, labels=train_labels)

    # Test Set
    test_paths, test_labels = get_mask_classes('Test')
    test_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in test_paths])
    test_images, test_labels = sk_shuffle(test_images, test_labels)

    # Validation Set
    val_paths, val_labels = get_mask_classes('Validation')
    val_images = np.array(
        [get_image_value(i, dim, bw, model_type) for i in val_paths])
    val_images, val_labels = sk_shuffle(val_images, val_labels)

    tts = train_images, test_images, train_labels, test_labels, val_images, val_labels
    if pick_name:
        print('Pickling The Data')
        pickle.dump(tts, open(f'../Pickles/TTSMask_{pick_name}.p', 'wb'), protocol=4)
        print('Finished Pickling')
    return tts
def balance_dataset(dataset, balance_ratio):
    """Reduce the number of unrelated data examples to match the related ones.

    Input:
        dataset: pandas dataframe containing the data
        balance_ratio: percentage of the balancing: 0.5 = equal balancing
    """
    RELATION_RATIO = balance_ratio
    labelMatrix = dataset['label'].to_numpy()
    numberOfRelations = np.count_nonzero(labelMatrix)
    relationRatio = numberOfRelations / len(dataset)

    if relationRatio < RELATION_RATIO:
        dataset['labelAbs'] = dataset['label'].abs()
        print('-----DATA IS UNBALANCED CURRENT SIZE: ' + str(len(dataset)) +
              ' CLASS RATIO: ' + str(relationRatio) + ' ... BALANCING DATA')
        # sort so the related (non-zero label) rows come first, then cut off the tail
        shuffled = sk_shuffle(dataset)
        orderedDataset = shuffled.sort_values(by=['labelAbs'], ascending=False)
        cutOff = int(1 / RELATION_RATIO * numberOfRelations)
        balanced = sk_shuffle(orderedDataset.head(cutOff))
        balanced = balanced.drop('labelAbs', axis=1)
        print('-----BALANCED DATASET WITH SIZE: ' + str(len(balanced)))
        return balanced
    else:
        print('-----DATASET IS ALREADY BALANCED - CLASS RATIO: ' +
              str(relationRatio) + '-----')
        return dataset
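# Usage sketch (a minimal example, assuming 'label' holds 0 for unrelated rows
# and non-zero for related ones, which is how balance_dataset counts relations).
import numpy as np
import pandas as pd

df = pd.DataFrame({'label': [1, 0, 0, 0, 0, 0, 0, 1],
                   'text': list('abcdefgh')})
balanced = balance_dataset(df, balance_ratio=0.5)
print(len(balanced))  # 2 related rows -> cutoff of int(1/0.5 * 2) = 4 rows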
def image_generator_ysn(x, y=None, batch_size=64, shuffle=True, enable_shift=True):
    """
    Arguments:
    x: input image of size (n_x, n_h, n_w, n_ch)
    y: second image of the same size as x. Use only if the same operation is to
       be performed on another image y. Most likely should be None.
    batch_size: size of batches of the generator
    shuffle: enable or disable shuffling of data at the beginning.
    enable_shift: enable or disable vertical/horizontal shift. If disabled, this
        acts as a simple generator with batch_size.
    """
    if shuffle:
        if y is not None:
            x, y = sk_shuffle(x, y)
        else:
            x = sk_shuffle(x)

    n_x = len(x)
    n_h = x.shape[1]
    n_w = x.shape[2]
    i = 0

    # Generator loop
    while True:
        # Get the batch data
        batch_x = x[i:min(i + batch_size, n_x)]
        if y is not None:
            batch_y = y[i:min(i + batch_size, n_x)]

        if enable_shift:
            # horizontal shift by a random amount; there is a slightly higher
            # chance of zero shift, since a shift of -n_w wraps around to no
            # shift at all
            shift = np.random.randint(-n_w, n_w, 1)
            batch_x = np.roll(batch_x, shift=shift, axis=2)
            # vertical shift (bounded by the height n_h, which matters for
            # non-square images)
            shift = np.random.randint(-n_h, n_h, 1)
            batch_x = np.roll(batch_x, shift=shift, axis=1)

        # Handle batch increment/end
        i += batch_size
        if i >= n_x:
            i = 0

        # Yield data
        if y is not None:
            yield (batch_x, batch_y)
        else:
            yield batch_x
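# Usage sketch: wrap a small random image batch and pull one batch out.
import numpy as np

x = np.random.rand(10, 32, 48, 3)  # (n_x, n_h, n_w, n_ch)
gen = image_generator_ysn(x, batch_size=4)
batch = next(gen)
print(batch.shape)  # (4, 32, 48, 3)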
def make_hdf5(filenames, out_file, crop=None):
    color_files = ['raw/frames/%s' % f for f in filenames]
    sketch_files = ['raw/sketch/%s' % f for f in filenames]
    color_imgs = parallelize(read_img, color_files, verbose=10)
    sketch_imgs = parallelize(read_img, sketch_files, verbose=10, params=1)

    img_data = [(c, s, f) for c, s, f in zip(color_imgs, sketch_imgs, filenames)
                if (c is not None and s is not None)]
    img_data = sk_shuffle(img_data)
    color_imgs, sketch_imgs, files = zip(*img_data)

    color_imgs = np.asanyarray(color_imgs)
    sketch_imgs = np.asanyarray(sketch_imgs)
    files = np.array(files)

    # crop images if required
    if crop is not None:
        color_imgs = center_crop(color_imgs, crop)
        sketch_imgs = center_crop(sketch_imgs, crop)

    img_mean = np.mean(color_imgs, 0)

    with h5py.File(out_file, 'w') as hf:
        hf.create_dataset('img_files', data=files)
        hf.create_dataset('col_sketch_data', data=color_imgs)
        hf.create_dataset('bw_sketch_data', data=sketch_imgs)
        hf.create_dataset('col_reference_data', data=color_imgs)
        hf.create_dataset('img_mean', data=img_mean)
def __init__(
    self,
    X,
    y,
    x_val=None,
    y_val=None,
    x_test=None,
    y_test=None,
    val_split=0.2,
    test_split=0.1,
    num_workers=0,
    random_state=1234,
    shuffle=True,
    batch_size: int = 16,
    pin_memory=True,
    drop_last=False,
    *args,
    **kwargs,
) -> None:
    super().__init__(*args, **kwargs)

    self.num_workers = num_workers
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.pin_memory = pin_memory
    self.drop_last = drop_last

    # shuffle x and y
    if shuffle and _SKLEARN_AVAILABLE:
        X, y = sk_shuffle(X, y, random_state=random_state)
    elif shuffle and not _SKLEARN_AVAILABLE:  # pragma: no cover
        raise ModuleNotFoundError(
            "You want to use the `shuffle` function from `scikit-learn`, which is not installed yet."
        )

    val_split = 0 if x_val is not None or y_val is not None else val_split
    test_split = 0 if x_test is not None or y_test is not None else test_split

    hold_out_split = val_split + test_split
    if hold_out_split > 0:
        val_split = val_split / hold_out_split
        hold_out_size = math.floor(len(X) * hold_out_split)
        x_holdout, y_holdout = X[:hold_out_size], y[:hold_out_size]
        test_i_start = int(val_split * hold_out_size)
        x_val_holdout, y_val_holdout = x_holdout[:test_i_start], y_holdout[:test_i_start]
        x_test_holdout, y_test_holdout = x_holdout[test_i_start:], y_holdout[test_i_start:]
        X, y = X[hold_out_size:], y[hold_out_size:]

    # if we don't have x_val and y_val, create the split from X
    if x_val is None and y_val is None and val_split > 0:
        x_val, y_val = x_val_holdout, y_val_holdout

    # if we don't have x_test and y_test, create the split from X
    if x_test is None and y_test is None and test_split > 0:
        x_test, y_test = x_test_holdout, y_test_holdout

    self._init_datasets(X, y, x_val, y_val, x_test, y_test)
def __init__(self, paths=["", ""], batch_size=32, augment=False, seed=1,
             domain="A", name="No Name", shuffle=True):
    # super(MyDataset, self).__init__()
    self.batch_size = batch_size
    self.paths = paths
    self.augment = augment
    self.seed = seed
    self.domain = domain
    self.name = name

    # train_generator, train_steps
    print("Loading file...")
    self.X_train, self.Y_train = np.load(paths[0]), np.load(paths[1])
    print("+ Done.")
    print("Total dataset samples: {}".format(len(self.Y_train)))

    if shuffle:
        print("Shuffling dataset...")
        self.X_train, self.Y_train = sk_shuffle(self.X_train, self.Y_train)
        print("+ Done.")

    print("Preprocessing: rescale pixel values to (-1, 1)...")
    self.X_train = self.preprocessing(self.X_train)
    print("+ Done.")

    self.generator, self.steps = self.my_generator(
        batch_size=self.batch_size, augment=self.augment, seed=self.seed)
def fit_transform(self, X, y):
    if isinstance(X, np.ndarray):
        self.X = pd.DataFrame(X)
        self.y = pd.Series(y)
    else:
        self.X = X.copy(deep=True)
        self.y = y.copy(deep=True)

    if not isinstance(self.X, pd.DataFrame):
        raise ValueError('%s is not supported' % type(X))

    self.shape_before = self.X.shape

    self.X, self.col_was_null = self.__impute(self.X)

    self._label_encoder = None
    self._onehot_encoder = None
    self.X, self.del_columns = self.__encode(self.X)

    self._standardizer = None
    if self.standardize:
        self.X = self.__standardize(self.X)

    if self.shuffle:
        self.X, self.y = sk_shuffle(self.X, self.y, random_state=self.random_state)
def fit_svm(self, sess, train_data, validation_data, validation_label, epochs,
            input_tensor=None, shuffle=False):
    input_tensor = input_tensor if input_tensor is not None else self.input_tensor
    batches = int(len(train_data) / self.batch_size)
    data = train_data
    print('SVM train')
    for i in range(epochs):
        if shuffle:
            data = sk_shuffle(train_data, random_state=self.seed)
        for b in range(batches):
            batch_data = data[b * self.batch_size:(b + 1) * self.batch_size]
            sess.run([self.svm_optimizer, self.svm_loss],
                     feed_dict={input_tensor: batch_data})
        epoch_loss = sess.run(self.loss,
                              feed_dict={input_tensor: train_data}) / len(train_data)
        epoch_loss_svm = sess.run(self.svm_loss,
                                  feed_dict={input_tensor: train_data}) / len(train_data)
        predictions = sess.run(self.output,
                               feed_dict={input_tensor: validation_data})
        print('Epoch:', i + 1, 'Loss:', epoch_loss_svm,
              'AUROC:', roc_auc_score(validation_label, predictions))
def batch_generator(data_dir, image_paths, steering_angles, batch_size, is_training):
    """
    Generate training images given image paths and associated steering angles
    """
    # images = np.empty([batch_size, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS])
    # steers = np.empty(batch_size)
    while True:
        i = 0
        images = []
        steers = []
        for index in np.random.permutation(image_paths.shape[0]):
            center, left, right = image_paths[index]
            steering_angle = steering_angles[index]
            # eliminate samples with steering close to zero
            if abs(float(steering_angle)) < STRAIGHT_STEERING:
                continue
            # augmentation
            if is_training and np.random.rand() < 0.6:
                image, steering_angle = augment(data_dir, center, left, right,
                                                steering_angle)
            else:
                image = load_image(data_dir, center)
            # add the image and steering angle to the batch
            images.append(preprocess(image, False))
            steers.append(steering_angle)
            i += 1
            if i == batch_size:
                break
        X_train = np.array(images)
        y_train = np.array(steers)
        # print("current batch size: {}".format(len(X_train)))
        yield sk_shuffle(X_train, y_train)
def fit_transform(self, X, y):
    if isinstance(X, np.ndarray):
        self.X = pd.DataFrame(X)
        self.y = pd.Series(y)
    else:
        self.X = X.copy(deep=True)
        if isinstance(y, pd.Series):
            self.y = y.copy(deep=True)
        else:
            self.y = y.iloc[:, 0]  # convert DataFrame to Series

    if not isinstance(self.X, pd.DataFrame):
        raise ValueError(f'{type(X)} is not supported')
    if len(X) != len(y):
        raise ValueError('Found input variables with inconsistent '
                         f'numbers of samples: [{len(X)}, {len(y)}]')

    self.shape_before = self.X.shape

    self.X, self.col_was_null = self.__impute(self.X)

    self._label_encoder = None
    self._onehot_encoder = None
    self.X, self.del_columns = self.__encode(self.X)

    self._standardizer = None
    if self.standardize:
        self.X = self.__standardize(self.X)

    if self.shuffle:
        self.X, self.y = sk_shuffle(self.X, self.y, random_state=self.random_state)
def generate_train_data(dataframe, nbr_classes, img_root_path, shuffle=True,
                        augment=False, img_width=224, img_height=224, model='vgg16'):
    N = dataframe.shape[0]
    if shuffle:
        # reset the index after shuffling; otherwise writing via the original
        # row index below silently restores the original order
        dataframe = sk_shuffle(dataframe).reset_index(drop=True)

    X_train = np.zeros((N, img_width, img_height, 3))
    Y_train = np.zeros((N, nbr_classes))

    for index, row in dataframe.iterrows():
        driver_id = row['subject']
        classname = row['classname']
        label = int(classname[-1])
        img_name = row['img']
        img_path = os.path.join(img_root_path, 'train', classname, img_name)
        img = load_img(img_path, img_width)
        X_train[index] = img
        Y_train[index, label] = 1

    X_train = X_train.astype(np.float16)
    if model == 'vgg16':
        X_train = vgg16_preprocess_input(X_train)
    elif model == 'inceptv3':
        X_train = incept3_preprocess_input(X_train)

    return X_train, Y_train
def corrected_holdout_prediction(sorted_sessions, fg, tm_rng, clf, vctr):
    comp_corr_x, comp_corr_y = data_for_checkout_correction(sorted_sessions, fg, tm_rng)
    sh_comp_corr_x, sh_comp_corr_y = sk_shuffle(comp_corr_x, comp_corr_y)
    tr_sh_comp_corr_y = np.asarray(sh_comp_corr_y, dtype='string')
    preds = clf.predict(vctr.transform(to_lof_strings(sh_comp_corr_x)))
    print('accuracy corrected holdout: ' +
          str(metrics.accuracy_score(y_true=tr_sh_comp_corr_y, y_pred=preds)))
    return None
def yield_batches(self, batch_size, split_type, shuffle=True):
    """
    Yield batches of a defined batch size.

    :param batch_size: (int) size of batches
    :param split_type: (string) set to yield from (train/val/test)
    :param shuffle: (boolean) shuffle on every epoch
    :return: (ndarray, ndarray) features and labels (generator)
    """
    with h5py.File(self.tmp_storage_path, 'r') as fr:
        count = fr[self.y_prefix + split_type][:].shape[0]
        indexes = np.arange(0, count)
        while 1:
            # shuffle in place
            if shuffle:
                random.shuffle(indexes)
            # yield batches (h5py requires sorted indices for fancy indexing)
            for index in range(0, count, batch_size):
                batch_indexes = sorted(indexes[index:min(index + batch_size, count)])
                X = fr[self.X_prefix + split_type][batch_indexes, :]
                y = fr[self.y_prefix + split_type][batch_indexes]
                if shuffle:
                    X, y = sk_shuffle(X, y)
                if self.noise and split_type == "train":
                    X += np.random.normal(0, self.noise, X.shape)
                yield (X, y)
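# Usage sketch (hypothetical instance name `loader`; assumes it was built with
# tmp_storage_path pointing at an HDF5 file containing '<X_prefix>train' and
# '<y_prefix>train' datasets, as yield_batches expects).
gen = loader.yield_batches(batch_size=32, split_type='train')
X, y = next(gen)
print(X.shape, y.shape)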
def make_hdf5(filenames, out_file, train=False, shuffle=True):
    imgs = parallelize(read_img, filenames, 2)
    act_labels = parallelize(label_actions, filenames)
    obj_labels = parallelize(label_objects, filenames)

    # remove junk images (and corresponding labels)
    junk_idx = set([i for i in range(len(imgs)) if imgs[i] is None])
    imgs = remove_idx(imgs, junk_idx)
    img_files = remove_idx(filenames, junk_idx)
    act_labels = remove_idx(act_labels, junk_idx)
    obj_labels = remove_idx(obj_labels, junk_idx)

    if shuffle:
        imgs, img_files, act_labels, obj_labels = sk_shuffle(
            imgs, img_files, act_labels, obj_labels)

    img_data = np.asanyarray(imgs)
    act_labels = np.array(act_labels)
    obj_labels = np.array(obj_labels)
    img_files = np.array(img_files)
    print(img_data.shape)

    with h5py.File(out_file, 'w') as hf:
        hf.create_dataset('image_files', data=img_files)
        hf.create_dataset('images', data=img_data)
        hf.create_dataset('obj_labels', data=obj_labels)
        hf.create_dataset('act_labels', data=act_labels)
        hf.attrs['num_acts'] = len(class_dict)  # imsitu
        hf.attrs['num_objs'] = 1000  # imagenet
        if train:
            # per-pixel mean of the training images (assumed here, mirroring
            # the colour/sketch variant of make_hdf5 above; the original
            # referenced an undefined img_mean)
            img_mean = np.mean(img_data, 0)
            hf.create_dataset('img_mean', data=img_mean)
def batch_generator(dataframe, nbr_classes, img_root_path, batch_size,
                    shuffle=True, augment=False, return_label=True,
                    img_width=224, img_height=224, model='vgg16'):
    N = dataframe.shape[0]
    if shuffle:
        # reset the index so the positional lookups below follow the new order
        dataframe = sk_shuffle(dataframe).reset_index(drop=True)

    batch_index = 0
    while True:
        current_index = (batch_index * batch_size) % N
        if N >= (current_index + batch_size):
            current_batch_size = batch_size
            batch_index += 1
        else:
            current_batch_size = N - current_index
            batch_index = 0
            if shuffle:
                dataframe = sk_shuffle(dataframe).reset_index(drop=True)

        X_batch = np.zeros((current_batch_size, img_width, img_height, 3))
        Y_batch = np.zeros((current_batch_size, nbr_classes))

        for i in range(current_index, current_index + current_batch_size):
            row = dataframe.loc[i, :]
            driver_id = row['subject']
            classname = row['classname']
            label = int(classname[-1])
            img_name = row['img']
            img_path = os.path.join(img_root_path, 'train', classname, img_name)
            img = load_img(img_path, img_width)
            X_batch[i - current_index] = img
            if return_label:
                Y_batch[i - current_index, label] = 1

        if augment:
            X_batch = X_batch.astype(np.uint8)
            X_batch = seq.augment_images(X_batch)

        X_batch = X_batch.astype(np.float16)
        if model == 'vgg16':
            X_batch = vgg16_preprocess_input(X_batch)
        elif model == 'inceptv3':
            X_batch = incept3_preprocess_input(X_batch)

        if return_label:
            yield (X_batch, Y_batch)
        else:
            yield X_batch
def balance_classes(X: np.ndarray, Y: list, batch_size: int):
    """
    Makes sure each batch has an equal amount of data from each class.
    Perfect balance.

    Args:
        X: input features
        Y: mixed labels (ints)
        batch_size: the ultimate batch size
    """
    nb_classes = len(set(Y))
    nb_batches = math.ceil(len(Y) / batch_size)

    # sort by classes
    final_batches_x = [[] for i in range(nb_batches)]
    final_batches_y = [[] for i in range(nb_batches)]

    # Y needs to be np arr
    Y = np.asarray(Y)

    # pick chunk size for each class using the largest split
    chunk_size = []
    for class_i in range(nb_classes):
        mask = Y == class_i
        y = Y[mask]
        chunk_size.append(math.ceil(len(y) / nb_batches))
    chunk_size = max(chunk_size)
    # force chunk size to be even
    if chunk_size % 2 != 0:
        chunk_size -= 1

    # divide each class into each batch
    for class_i in range(nb_classes):
        mask = Y == class_i
        x = X[mask]
        y = Y[mask]

        # shuffle items in the class
        x, y = sk_shuffle(x, y, random_state=123)

        # divide the class into the batches
        for i_start in range(0, len(y), chunk_size):
            batch_i = i_start // chunk_size
            i_end = i_start + chunk_size

            if len(final_batches_x) > batch_i:
                final_batches_x[batch_i].append(x[i_start:i_end])
                final_batches_y[batch_i].append(y[i_start:i_end])

    # merge into full dataset
    final_batches_x = [np.concatenate(x, axis=0) for x in final_batches_x if len(x) > 0]
    final_batches_x = np.concatenate(final_batches_x, axis=0)

    final_batches_y = [np.concatenate(x, axis=0) for x in final_batches_y if len(x) > 0]
    final_batches_y = np.concatenate(final_batches_y, axis=0)

    return final_batches_x, final_batches_y
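# Usage sketch: 20 samples over 2 classes packed into class-balanced batches.
# With batch_size=4 there are 5 batches, so each class contributes chunks of 2
# per batch and every batch of 4 holds 2 samples from each class.
import numpy as np

X = np.random.rand(20, 3)
Y = [0] * 10 + [1] * 10
bx, by = balance_classes(X, Y, batch_size=4)
print(bx.shape, by[:8])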
def train(self, x, y, minibatch_size):
    for i in range(self.epoch):
        x, y = sk_shuffle(x, y)
        for j in range(0, x.shape[0], minibatch_size):
            # slice out the current minibatch instead of always propagating
            # the full dataset, which made the inner loop a no-op
            x_batch = x[j:j + minibatch_size]
            y_batch = y[j:j + minibatch_size]
            output = self.forward_prop(x_batch)
            self.back_prop(x_batch, y_batch, output)
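# Usage sketch (hypothetical instance name `net`; assumes it is an instance of
# the class that defines forward_prop/back_prop and has an `epoch` attribute).
import numpy as np

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=(100, 1))
net.train(X, y, minibatch_size=16)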
def corrected_data_prediction(sorted_sessions, fg, tm_rng, clf, vctr):
    comp_corr_x, comp_corr_y = data_for_checkout_correction(sorted_sessions, fg, tm_rng)
    sh_comp_corr_x, sh_comp_corr_y = sk_shuffle(comp_corr_x, comp_corr_y)
    tr_sh_comp_corr_y = np.asarray(sh_comp_corr_y, dtype='string')
    clf.fit(vctr.transform(to_lof_strings(sh_comp_corr_x)), tr_sh_comp_corr_y)
    cv_accuracy_comp = cv.cross_val_score(
        clf, vctr.transform(to_lof_strings(sh_comp_corr_x)), tr_sh_comp_corr_y, cv=5)
    print(cv_accuracy_comp)
    return cv_accuracy_comp
def rnn_train(dataset, config_params, vocab, umls_vocab):
    global params, setup_NN
    global X, U, Y, Z, Mask, i2t, t2i, w2i, i2w, splits, numTags, emb_w, umls_v
    params = config_params
    umls_v = umls_vocab

    if 'CRF_MODEL_ON' in params and params['CRF_MODEL_ON']:
        sl.info('CRF IS ON. CRF_MODELS WILL BE USED')
        if params['mode'] == 1:
            from bionlp.taggers.rnn_feature.networks.approx_network import setup_NN
            sl.info('MODE: Using the Approximate Message Passing framework')
        elif params['mode'] == -1:
            from bionlp.taggers.rnn_feature.networks.network import setup_NN
            sl.info('MODE: Modeling only the unary potentials')
        else:
            sl.info('MODE: Modeling both unary and binary potentials')
            from bionlp.taggers.rnn_feature.networks.dual_network import setup_NN
    else:
        sl.info('CRF IS NOT ON. This tagger only supports CRF models. '
                'A default CRF_MODEL will be used.')
        params['mode'] = 1
        params['CRF_MODEL_ON'] = True
        from bionlp.taggers.rnn_feature.networks.approx_network import setup_NN
        sl.info('MODE: Using the Approximate Message Passing framework')

    sl.info('Using the parameters:\n {0}'.format(json.dumps(params, indent=2)))

    # Preparing Dataset
    sl.info('Preparing entire dataset for Neural Net computation ...')
    (X, U, Z, Y), numTags, emb_w, t2i, w2i = preprocess.load_data(
        dataset, params, entire_note=params['document'], vocab=vocab)
    X, U, Y, Z, Mask = preprocess.pad_and_mask(X, U, Y, Z, params['maxlen'])
    sl.info('Total non-zero entries in the mask inputs: {0}. This number should '
            'equal the total number of tokens in the entire dataset'
            .format(sum(sum(_) for _ in Mask)))

    if params['shuffle'] == 1:
        X, U, Y, Z, Mask = sk_shuffle(X, U, Y, Z, Mask, random_state=0)

    i2t = {v: k for k, v in t2i.items()}
    i2w = {v: k for k, v in w2i.items()}
    splits = data_utils.make_cross_validation_sets(
        len(Y), params['folds'], training_percent=params['training-percent'])

    try:
        if params['trainable'] is False:
            (o, l, p) = evaluate_run()
        elif params['deploy'] == 1:
            (o, l, p) = deploy_run(splits[0], params)
        elif params['cross-validation'] == 0:
            (o, l, p) = single_run()
        else:
            (o, l, p) = cross_validation_run()
    except IOError as e:
        if e.errno != errno.EINTR:
            raise
        else:
            print(" EINTR ERROR CAUGHT. YET AGAIN ")
def test_dataloader():
    seed_everything()

    X = np.random.rand(5, 2)
    y = np.random.rand(5)
    x_val = np.random.rand(2, 2)
    y_val = np.random.rand(2)
    x_test = np.random.rand(1, 2)
    y_test = np.random.rand(1)

    shuffled_X, shuffled_y = sk_shuffle(X, y, random_state=1234)

    # -----------------------------
    # train
    # -----------------------------
    loaders = SklearnDataModule(X=X, y=y, val_split=0.2, test_split=0.2,
                                random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[2:])
    assert np.all(val_loader.dataset.X == shuffled_X[0])
    assert np.all(test_loader.dataset.X == shuffled_X[1])
    assert np.all(train_loader.dataset.Y == shuffled_y[2:])

    # -----------------------------
    # train + val
    # -----------------------------
    loaders = SklearnDataModule(X=X, y=y, x_val=x_val, y_val=y_val,
                                test_split=0.2, random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[1:])
    assert np.all(val_loader.dataset.X == x_val)
    assert np.all(test_loader.dataset.X == shuffled_X[0])

    # -----------------------------
    # train + test
    # -----------------------------
    loaders = SklearnDataModule(
        X=X, y=y, x_test=x_test, y_test=y_test, val_split=0.2,
        random_state=1234, drop_last=True
    )
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X[1:])
    assert np.all(val_loader.dataset.X == shuffled_X[0])
    assert np.all(test_loader.dataset.X == x_test)

    # -----------------------------
    # train + val + test
    # -----------------------------
    loaders = SklearnDataModule(X, y, x_val, y_val, x_test, y_test,
                                random_state=1234, drop_last=True)
    train_loader = loaders.train_dataloader()
    val_loader = loaders.val_dataloader()
    test_loader = loaders.test_dataloader()
    assert np.all(train_loader.dataset.X == shuffled_X)
    assert np.all(val_loader.dataset.X == x_val)
    assert np.all(test_loader.dataset.X == x_test)
def save_cluster_sample(sb_st1_x, sb_st1_y, sb_st2_x, sb_st2_y):
    sb_st1_x.extend(sb_st2_x)
    sb_st1_y.extend(sb_st2_y)
    shuffled_dat_x, shuffled_dat_y = sk_shuffle(sb_st1_x, sb_st1_y)
    with open('dat_f_retraining.txt', 'w') as fh:
        for item, label in zip(shuffled_dat_x, shuffled_dat_y):
            fh.write(' '.join(item) + ',' + str(label) + '\n')
    return None
def _create_prob_slices_file(HP, subjects, filename, bundle, shuffle=True):
    mask_dir = join(C.HOME, HP.DATASET_FOLDER)
    input_dir = HP.MULTI_PARENT_PATH

    combined_slices = []
    mask_slices = []

    for s in subjects:
        print("processing subject {}".format(s))

        probs_x = nib.load(join(input_dir, "UNet_x_" + str(HP.CV_FOLD),
                                "probmaps", s + "_probmap.nii.gz")).get_data()
        probs_y = nib.load(join(input_dir, "UNet_y_" + str(HP.CV_FOLD),
                                "probmaps", s + "_probmap.nii.gz")).get_data()
        probs_z = nib.load(join(input_dir, "UNet_z_" + str(HP.CV_FOLD),
                                "probmaps", s + "_probmap.nii.gz")).get_data()
        # probs_x = DatasetUtils.scale_input_to_unet_shape(probs_x, HP.DATASET, HP.RESOLUTION)
        # probs_y = DatasetUtils.scale_input_to_unet_shape(probs_y, HP.DATASET, HP.RESOLUTION)
        # probs_z = DatasetUtils.scale_input_to_unet_shape(probs_z, HP.DATASET, HP.RESOLUTION)

        combined = np.stack((probs_x, probs_y, probs_z), axis=4)  # (73, 87, 73, 18, 3)
        # not working alone: one dim too much for UNet -> reshape
        combined = np.reshape(combined, (combined.shape[0], combined.shape[1],
                                         combined.shape[2],
                                         combined.shape[3] * combined.shape[4]))  # (73, 87, 73, 3*18)
        # print("combined shape after", combined.shape)

        mask_data = ImgUtils.create_multilabel_mask(HP, s, labels_type=HP.LABELS_TYPE)
        if HP.DATASET in ("HCP_2mm", "HCP_2.5mm"):
            # use "HCP" because for the mask we need downscaling
            mask_data = DatasetUtils.scale_input_to_unet_shape(mask_data, "HCP", HP.RESOLUTION)
        else:
            # mask has the same resolution as the probmaps -> we can use the same resizing
            mask_data = DatasetUtils.scale_input_to_unet_shape(mask_data, HP.DATASET, HP.RESOLUTION)

        # save as image
        img = nib.Nifti1Image(combined, ImgUtils.get_dwi_affine(HP.DATASET, HP.RESOLUTION))
        nib.save(img, join(HP.EXP_PATH, "combined", s + "_combinded_probmap.nii.gz"))

        combined = DatasetUtils.scale_input_to_unet_shape(combined, HP.DATASET, HP.RESOLUTION)
        assert combined.shape[2] == mask_data.shape[2]

        # save as slices
        for z in range(combined.shape[2]):
            combined_slices.append(combined[:, :, z, :])
            mask_slices.append(mask_data[:, :, z, :])

    if shuffle:
        combined_slices, mask_slices = sk_shuffle(combined_slices, mask_slices,
                                                  random_state=9)

    if HP.TRAIN:
        np.save(filename + "_data.npy", combined_slices)
        np.save(filename + "_seg.npy", mask_slices)
def __get_repr(labels, label_list, emb, shuffle=True):
    # sort all labels so the encodings are consistent
    label_list_sorted = sorted(label_list)
    X = []
    y = []
    for node, node_labels in labels.items():
        X.append(emb[node])
        y.append(get_label_repr(node_labels, label_list_sorted))
    if shuffle:
        return sk_shuffle(numpy.asarray(X), numpy.asarray(y))
    return numpy.asarray(X), numpy.asarray(y)
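# Usage sketch (a minimal example; assumes __get_repr is module-level, since
# it takes no self, and that get_label_repr multi-hot encodes a node's labels
# against the sorted label list, as used above).
labels = {'n1': ['a'], 'n2': ['a', 'b']}
emb = {'n1': [0.1, 0.2], 'n2': [0.3, 0.4]}
X, y = __get_repr(labels, ['a', 'b'], emb)
print(X.shape, y.shape)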
def fit_transform(self, X, y=None):
    self.y = None  # default when no labels are supplied
    if isinstance(X, np.ndarray):
        self.X = pd.DataFrame(X)
        if y is not None:
            self.y = pd.Series(y)
    else:
        self.X = X.copy(deep=True)
        if y is not None:
            if isinstance(y, pd.Series):
                self.y = y.copy(deep=True)
            else:
                self.y = y.iloc[:, 0]  # convert DataFrame to Series

    if not isinstance(self.X, pd.DataFrame):
        raise ValueError(f'{type(X)} is not supported')
    if y is not None and len(X) != len(y):
        raise ValueError('Found input variables with inconsistent '
                         f'numbers of samples: [{len(X)}, {len(y)}]')

    self.shape_before = self.X.shape

    self.X, self.col_was_null = self.__impute(self.X)

    self._label_encoder = None
    self._onehot_encoder = None
    self.X, self.del_columns = self.__encode(self.X)

    self._standardizer = None
    if self.standardize:
        self.X = self.__standardize(self.X)

    if self.columns is not None:
        self.X = self.X[self.columns]

    if self.shuffle:
        if self.y is not None:
            self.X, self.y = sk_shuffle(self.X, self.y, random_state=self.random_state)
        else:
            self.X = sk_shuffle(self.X, random_state=self.random_state)
def data_generator(
        X, y,
        batch_size,
        target_size=(224, 224, 3),
        preprocessor=resnet_preprocessor,
        shuffle=False):
    start = 0
    end = start + batch_size
    n = X.shape[0]

    if shuffle:
        X, y = sk_shuffle(X, y)

    while True:
        X_batch = X[start:end]
        y_batch = y[start:end]
        X_resized = np.array([scipy.misc.imresize(x, target_size) for x in X_batch])
        X_preprocessed = preprocessor(X_resized)

        start += batch_size
        end += batch_size
        if start >= n:
            start = 0
            end = batch_size
            if shuffle:
                X, y = sk_shuffle(X, y)

        yield (X_preprocessed, y_batch)
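# Usage sketch (assumes the resnet_preprocessor used as the default above is
# importable, and that scipy.misc.imresize is available, i.e. an older SciPy).
import numpy as np

X = np.random.randint(0, 255, size=(32, 64, 64, 3), dtype=np.uint8)
y = np.random.randint(0, 2, size=32)
gen = data_generator(X, y, batch_size=8, shuffle=True)
X_batch, y_batch = next(gen)
print(X_batch.shape, y_batch.shape)  # (8, 224, 224, 3) (8,)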
def shuffle_data(X, y):
    '''
    Shuffle the training data.
    '''
    X, y = sk_shuffle(X, y)
    X_train = X[:, :, :, None]
    y_train = y[:]
    print("Training Data Size: ", X_train.shape, y_train.shape)
    print("Loading data complete")
    print("Shuffling of data complete")
    return X_train, y_train
def generator(samples, batch_size=32, with_flipped=True):
    """
    Generates the needed images in the given batch size.

    Args:
        samples: list: The complete list of samples that should be used
        batch_size: int: The size of batches that should be used
        with_flipped: bool: Should the flipped images also be used.

    Yields:
        X_train, the images for the training
        y_train, the labels for the training
    """
    num_samples = len(samples)
    # Always
    while 1:
        # sk_shuffle returns a shuffled copy, so the result must be reassigned
        samples = sk_shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            images = []
            measurements = []
            for batch_sample in batch_samples:
                # Add the main image
                image = read_image(batch_sample[0])
                measurement = batch_sample[1]
                images.append(image)
                measurements.append(measurement)

                # Flipped image
                if with_flipped:
                    flipped, steering = flip_image(image, measurement)
                    images.append(flipped)
                    measurements.append(steering)

            X_train = np.array(images)
            y_train = np.array(measurements)
            yield sk_shuffle(X_train, y_train)
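# Usage sketch (hypothetical names: train_samples is a list of
# (image_path, steering) pairs and model is a compiled Keras model; the
# generator above matches the classic fit_generator training loop).
train_gen = generator(train_samples, batch_size=32)
model.fit_generator(train_gen,
                    steps_per_epoch=len(train_samples) // 32,
                    epochs=5)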
def __init__(self, path, oversample=None, label_filter=lambda x: x.endswith('.jpg'),
             issynthetic=False, test_size=0.2, im_range=[0, 55000], isTrain=True):
    if not path.endswith('train'):
        path = os.path.join(path, 'train')
    path = os.path.join(path, 'images')
    path = pathlib.Path(path)
    self.isTrain = isTrain

    img_paths_train = []
    if not issynthetic:
        # in_impaths = list(path.glob('*/*/*.jpg')) if im_range=='all' else list(path.glob('*/*/*.jpg'))[im_range[0]:im_range[1]]
        # use only standard views
        for std_view in regions_dict_standard:
            pathk = '*/' + std_view + '/im.jpg'
            img_paths_train += list(path.glob(pathk))
    else:
        # in_impaths = list(path.glob('*/*/*/*.jpg')) if im_range=='all' else list(path.glob('*/*/*/*.jpg'))[im_range[0]:im_range[1]]
        # use only standard views
        for std_view in regions_dict_standard:
            pathk = '*/' + std_view + '/*/im.jpg'
            img_paths_train += list(path.glob(pathk))

    in_impaths = img_paths_train if im_range == 'all' else img_paths_train[im_range[0]:im_range[1]]
    self.train_impaths, self.val_impaths, self.train_idxs, self.val_idxs = train_test_split(
        in_impaths, range(len(in_impaths)), test_size=test_size, random_state=42)

    assert callable(label_filter)
    self.label_filter = label_filter
    self.calculus_filter = lambda x: x.startswith('calculus') and x.endswith('.jpg')

    lbpaths = []
    impaths = []
    all_impaths = self.train_impaths if self.isTrain else self.val_impaths
    for impath in all_impaths:
        impath = impath.as_posix()
        lbpath = os.path.dirname(impath).replace('images', 'masks', 1)
        lbpath_dict = {'calculus': []}
        calculus_lbpath = [os.path.join(lbpath, lbname) for lbname in os.listdir(lbpath)
                           if self.calculus_filter(lbname)]
        lbpath_dict['calculus'].extend(calculus_lbpath)
        lbpaths.append(lbpath_dict)
        impaths.append(impath)
        if self.isTrain:
            if isinstance(oversample, int) and oversample >= 1 and len(calculus_lbpath) > 0:
                # ipdb.set_trace()
                lbpaths += [lbpath_dict] * oversample
                impaths += [impath] * oversample

    impaths, lbpaths = sk_shuffle(impaths, lbpaths, random_state=42)
    self.impaths = impaths
    self.lbpaths = lbpaths
def generator(samples, batch_size=32, angle_offset=0.2):
    num_samples = len(samples)
    while 1:
        shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            images = []
            angles = []
            for batch_sample in batch_samples:
                # Center Image
                name = "./IMG/" + batch_sample[0].split("\\")[-1]
                center_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                center_angle = float(batch_sample[3])
                images.append(center_image)
                angles.append(center_angle)
                # Flip image
                images.append(np.fliplr(center_image))
                angles.append(-center_angle)

                # Left Image
                name = "./IMG/" + batch_sample[1].split("\\")[-1]
                left_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                left_angle = float(batch_sample[3]) + angle_offset
                images.append(left_image)
                angles.append(left_angle)
                # Flip image
                images.append(np.fliplr(left_image))
                angles.append(-left_angle)

                # Right Image
                name = "./IMG/" + batch_sample[2].split("\\")[-1]
                right_image = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB)
                right_angle = float(batch_sample[3]) - angle_offset
                images.append(right_image)
                angles.append(right_angle)
                # Flip image
                images.append(np.fliplr(right_image))
                angles.append(-right_angle)

            X_train = np.array(images)
            y_train = np.array(angles)
            yield sk_shuffle(X_train, y_train)
def gof(self, sample_a, sample_b, n_k, shuffle=False, seed=None, log_matches=False):
    """
    Carry out the test of goodness of fit between two samples.
    """
    # data processing
    mixed_samples = np.concatenate((sample_a, sample_b), axis=0)
    classes = np.concatenate(
        (np.ones(len(sample_a)), np.zeros(len(sample_b))),
        axis=0).astype(dtype=np.int32)
    if shuffle:
        mixed_samples, classes = sk_shuffle(mixed_samples, classes, random_state=seed)

    # instantiate variables
    self.n_a = len(sample_a)
    self.n_b = len(sample_b)
    self.n_k = n_k
    self.n = self.n_a + self.n_b
    self.mu_T = self._calculate_mu(self.n_a, self.n_b)
    self.sigma_T = self._calculate_sigma(self.n_a, self.n_b, self.n_k)
    self._fit(mixed_samples)
    self.shuffled = shuffle

    # run test
    R = self._calculate_mixed_sample_statistic(mixed_samples, classes, n_k, log_matches)

    # collect results
    self.neighbour_same_class = R[0]
    self.consecutive_neighbour = R[1]
    if log_matches:
        self.match_log = np.unique(R[2], return_counts=True)
    else:
        self.match_log = None

    # post processing
    self.T = (self.n_k * self.n)**-1 * self.neighbour_same_class
    self.p_value = self._calculate_p_val(self.T, self.mu_T, self.sigma_T)
    return self.T, self.p_value
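# Usage sketch (hypothetical instance name `tester`; assumes it is an instance
# of the class defining gof and its _calculate_* helpers): compare two 1-D
# Gaussian samples with the k-nearest-neighbour two-sample statistic above.
import numpy as np

a = np.random.normal(0.0, 1.0, size=(200, 1))
b = np.random.normal(0.5, 1.0, size=(200, 1))
T, p_value = tester.gof(a, b, n_k=5, shuffle=True, seed=0)
print(T, p_value)  # a small p_value suggests the samples differ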
def _input_fn():
    # shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    num_threads = multiprocessing.cpu_count() if multi_threading else 1
    buffer_size = 2 * batch_size + 1

    self.logger.info("")
    self.logger.info("* data input_fn:")
    self.logger.info("================")
    self.logger.info("Mode: {}".format(mode))
    self.logger.info("Batch size: {}".format(batch_size))
    self.logger.info("Epoch count: {}".format(num_epochs))
    self.logger.info("Thread count: {}".format(num_threads))
    self.logger.info("Shuffle: {}".format(shuffle))
    self.logger.info("================")
    self.logger.info("")

    data = inputs
    if shuffle:
        self.logger.info('shuffle data manually.')
        data = inputs.iloc[sk_shuffle(np.arange(len(inputs)))]

    dataset = tf.data.Dataset.from_generator(generate_fn(data), output_type, output_shape)
    dataset = dataset.skip(skip_header_lines)
    dataset = dataset.map(zip_map, num_parallel_calls=num_threads)
    # if shuffle:
    #     dataset = dataset.shuffle(buffer_size)

    padded_shapes = OrderedDict(zip(output_key, output_shape))
    if not is_serving:
        padded_shapes = padded_shapes, padded_shapes.pop(metadata.TARGET_NAME)
    dataset = dataset.padded_batch(batch_size, padded_shapes) \
        .prefetch(buffer_size=tf.contrib.data.AUTOTUNE) \
        .repeat(num_epochs)

    iterator = dataset.make_initializable_iterator()
    hook.iterator_initializer_func = lambda sess: sess.run(iterator.initializer)

    if is_serving:
        # dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return features, None
    else:
        features, target = iterator.get_next()
        return features, target
def train(self, examples, param_grid=None, grid_search_folds=5, grid_search=True,
          grid_objective='f1_score_micro', grid_jobs=None, shuffle=True):
    '''
    Train a classification model and return the model, score, feature
    vectorizer, scaler, label dictionary, and inverse label dictionary.

    :param examples: The examples to train the model on.
    :type examples: ExamplesTuple
    :param param_grid: The parameter grid to search through for grid search.
                       If unspecified, a default parameter grid will be used.
    :type param_grid: list of dicts mapping from strs to lists of parameter values
    :param grid_search_folds: The number of folds to use when doing the grid
                              search, or a mapping from example IDs to folds.
    :type grid_search_folds: int or dict
    :param grid_search: Should we do grid search?
    :type grid_search: bool
    :param grid_objective: The objective function to use when doing the grid search.
    :type grid_objective: function
    :param grid_jobs: The number of jobs to run in parallel when doing the grid
                      search. If unspecified or 0, the number of grid search
                      folds will be used.
    :type grid_jobs: int
    :param shuffle: Shuffle examples (e.g., for grid search CV.)
    :type shuffle: bool

    :return: The best grid search objective function score, or 0 if we're
             not doing grid search.
    :rtype: float
    '''
    # seed the random number generator so that randomized algorithms are
    # replicable
    rand_seed = 123456789
    np.random.seed(rand_seed)

    # Shuffle so that the folds are random for the inner grid search CV.
    # You can't shuffle a scipy sparse matrix in place, so unfortunately
    # we make a copy of everything (and then get rid of the old version)
    if shuffle:
        ids, classes, features = sk_shuffle(examples.ids, examples.classes,
                                            examples.features,
                                            random_state=rand_seed)
        examples = ExamplesTuple(ids, classes, features,
                                 examples.feat_vectorizer)

    # call train setup to set up the vectorizer, the labeldict, and the scaler
    self._train_setup(examples)

    # select features
    xtrain = self.feat_selector.fit_transform(examples.features)

    # Convert to dense if necessary
    if self._use_dense_features:
        try:
            xtrain = xtrain.todense()
        except MemoryError:
            if self._model_type in _REQUIRES_DENSE:
                reason = ('{} does not support sparse ' +
                          'matrices.').format(self._model_type)
            else:
                reason = ('{} feature scaling requires a dense ' +
                          'matrix.').format(self._feature_scaling)
            raise MemoryError('Ran out of memory when converting training' +
                              ' data to dense. This was required because ' +
                              reason)

    # Scale features if necessary
    if self._model_type != 'MultinomialNB':
        xtrain = self.scaler.fit_transform(xtrain)

    # Instantiate an estimator and get the default parameter grid to search
    estimator, default_param_grid = self._create_estimator()

    # use label dict transformed version of examples.classes if doing
    # classification
    if self._model_type not in _REGRESSION_MODELS:
        classes = np.array([self.label_dict[label] for label
                            in examples.classes])
    else:
        classes = examples.classes

    # set up a grid searcher if we are asked to
    if grid_search:
        # set up grid search folds
        if isinstance(grid_search_folds, int):
            if not grid_jobs:
                grid_jobs = grid_search_folds
            else:
                grid_jobs = min(grid_search_folds, grid_jobs)
            folds = grid_search_folds
        else:
            # use the number of unique fold IDs as the number of grid jobs
            if not grid_jobs:
                grid_jobs = len(np.unique(grid_search_folds))
            else:
                grid_jobs = min(len(np.unique(grid_search_folds)), grid_jobs)
            # Only retain IDs within folds if they're in grid_search_folds
            dummy_label = next(itervalues(grid_search_folds))
            labels = [grid_search_folds.get(curr_id, dummy_label)
                      for curr_id in examples.ids]
            folds = FilteredLeaveOneLabelOut(labels, grid_search_folds, examples)

        # Use default parameter grid if we weren't passed one
        if not param_grid:
            param_grid = default_param_grid

        # If we're using a correlation metric for doing binary
        # classification, override the estimator's predict function
        if (grid_objective in _CORRELATION_METRICS and
                self._model_type not in _REGRESSION_MODELS):
            estimator.predict_normal = estimator.predict
            estimator.predict = _predict_binary

        # limit the number of grid_jobs to be no higher than five or the
        # number of cores for the machine, whichever is lower
        grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)

        grid_searcher = GridSearchCV(estimator, param_grid,
                                     scoring=grid_objective, cv=folds,
                                     n_jobs=grid_jobs, pre_dispatch=grid_jobs)

        # run the grid search for hyperparameters
        grid_searcher.fit(xtrain, classes)
        self._model = grid_searcher.best_estimator_
        grid_score = grid_searcher.best_score_
    else:
        self._model = estimator.fit(xtrain, classes)
        grid_score = 0.0

    return grid_score
def cross_validate(self, examples, stratified=True, cv_folds=10,
                   grid_search=False, grid_search_folds=5, grid_jobs=None,
                   grid_objective='f1_score_micro', prediction_prefix=None,
                   param_grid=None, shuffle=True):
    '''
    Cross-validates a given model on the training examples.

    :param examples: The data to cross-validate learner performance on.
    :type examples: ExamplesTuple
    :param stratified: Should we stratify the folds to ensure an even
                       distribution of classes for each fold?
    :type stratified: bool
    :param cv_folds: The number of folds to use for cross-validation, or
                     a mapping from example IDs to folds.
    :type cv_folds: int or dict
    :param grid_search: Should we do grid search when training each fold?
                        Note: This will make this take *much* longer.
    :type grid_search: bool
    :param grid_search_folds: The number of folds to use when doing the
                              grid search (ignored if cv_folds is set to
                              a dictionary mapping examples to folds).
    :type grid_search_folds: int
    :param grid_jobs: The number of jobs to run in parallel when doing the
                      grid search. If unspecified or 0, the number of
                      grid search folds will be used.
    :type grid_jobs: int
    :param grid_objective: The objective function to use when doing the grid search.
    :type grid_objective: function
    :param param_grid: The parameter grid to search through for grid search.
                       If unspecified, a default parameter grid will be used.
    :type param_grid: list of dicts mapping from strs to lists of parameter values
    :param prediction_prefix: If saving the predictions, this is the prefix
                              that will be used for the filename. It will be
                              followed by ".predictions"
    :type prediction_prefix: str
    :param shuffle: Shuffle examples before splitting into folds for CV.
    :type shuffle: bool

    :return: The confusion matrix, overall accuracy, per-class PRFs, and
             model parameters for each fold.
    :rtype: list of 4-tuples
    '''
    # seed the random number generator so that randomized algorithms are
    # replicable
    rand_seed = 123456789
    np.random.seed(rand_seed)

    # Shuffle so that the folds are random for CV.
    # You can't shuffle a scipy sparse matrix in place, so unfortunately
    # we make a copy of everything (and then get rid of the old version)
    if shuffle:
        ids, classes, features = sk_shuffle(examples.ids, examples.classes,
                                            examples.features,
                                            random_state=rand_seed)
        examples = ExamplesTuple(ids, classes, features,
                                 examples.feat_vectorizer)

    # call train setup
    self._train_setup(examples)

    # setup the cross-validation iterator
    if isinstance(cv_folds, int):
        stratified = (stratified and
                      self._model_type not in _REGRESSION_MODELS)
        kfold = (StratifiedKFold(examples.classes, n_folds=cv_folds)
                 if stratified
                 else KFold(len(examples.classes), n_folds=cv_folds))
    else:
        # if we have a mapping from IDs to folds, use it for the overall
        # cross-validation as well as the grid search within each
        # training fold. Note that this means that the grid search
        # will use K-1 folds because the Kth will be the test fold for
        # the outer cross-validation.
        # Only retain IDs within folds if they're in cv_folds
        dummy_label = next(itervalues(cv_folds))
        labels = [cv_folds.get(curr_id, dummy_label)
                  for curr_id in examples.ids]
        kfold = FilteredLeaveOneLabelOut(labels, cv_folds, examples)
        grid_search_folds = cv_folds

    # handle each fold separately and accumulate the predictions and the
    # numbers
    results = []
    grid_search_scores = []
    append_predictions = False
    for train_index, test_index in kfold:
        # Train model
        self._model = None  # prevent feature vectorizer from being reset
        train_tuple = ExamplesTuple(ids=examples.ids[train_index],
                                    classes=examples.classes[train_index],
                                    features=examples.features[train_index],
                                    feat_vectorizer=examples.feat_vectorizer)
        grid_search_scores.append(
            self.train(train_tuple,
                       grid_search_folds=grid_search_folds,
                       grid_search=grid_search,
                       grid_objective=grid_objective,
                       param_grid=param_grid,
                       grid_jobs=grid_jobs,
                       shuffle=False))
        # note: there is no need to shuffle again within each fold,
        # regardless of what the shuffle keyword argument is set to.

        # Evaluate model
        test_tuple = ExamplesTuple(ids=examples.ids[test_index],
                                   classes=examples.classes[test_index],
                                   features=examples.features[test_index],
                                   feat_vectorizer=examples.feat_vectorizer)
        results.append(self.evaluate(test_tuple,
                                     prediction_prefix=prediction_prefix,
                                     append=append_predictions,
                                     grid_objective=grid_objective))
        append_predictions = True

    # return list of results for all folds
    return results, grid_search_scores