import numpy as np


def batch_iter(data, batch_size, num_epochs, seed=None, fill=False):
    """Generate a batch iterator for a dataset."""
    rng = np.random.RandomState(seed)  # renamed from `random` to avoid shadowing the stdlib module
    data = np.array(data)
    data_length = len(data)
    # Round up so a final partial batch is still emitted.
    num_batches_per_epoch = data_length // batch_size
    if data_length % batch_size != 0:
        num_batches_per_epoch += 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch.
        shuffle_indices = rng.permutation(np.arange(data_length))
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_length)
            selected_indices = shuffle_indices[start_index:end_index]
            # If we don't have enough data left for a whole batch, fill it randomly.
            if fill and end_index >= data_length:
                num_missing = batch_size - len(selected_indices)
                selected_indices = np.concatenate([
                    selected_indices,
                    rng.randint(0, data_length, num_missing),
                ])
            yield data[selected_indices]
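# Usage sketch for batch_iter above; the toy data and parameters are
# illustrative, not from the original source.
import numpy as np

toy_data = np.arange(10)
for batch in batch_iter(toy_data, batch_size=4, num_epochs=2, seed=0, fill=True):
    print(batch.shape)  # every batch has exactly 4 elements because fill=True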
from numpy import random  # NumPy's random module; the stdlib one has no permutation()


def random_selections(data, n=999999999):
    """Iterate through random permutations of the given data.

    Each "epoch" contains all the samples exactly once.
    """
    count = 0
    while True:
        selection = random.permutation(len(data))
        for index in selection:
            if count >= n:
                return
            yield data[index]
            count += 1
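# Illustrative call to random_selections above; the cap n=5 stops mid-epoch.
samples = list(random_selections([10, 20, 30], n=5))
print(len(samples))  # 5 -- one full shuffled epoch plus two more draws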
# Assumes: import numpy as np; from sklearn.cross_decomposition import PLSRegression;
# NUM_PERMUTATION and a two-argument permutation() helper defined at module level.
def get_pls_scores_permutation(self, gArray, mArray, gSizes, mSizes,
                               numPermutation=NUM_PERMUTATION):
    '''g and m are names carried over from legacy code, with no particular meaning.

    Permutation is done within each data slice, because data characteristics
    differ across time points, deltas, etc.
    '''
    SampleNumber = self.SampleNumber
    PLS = PLSRegression(n_components=3)
    scores = []
    for jj in range(numPermutation):
        if jj % 10 == 0:
            print(" Permutation --- %d" % jj)
        for g in gSizes:
            matrix1 = []
            for ii in range(g):
                matrix1.append(permutation(gArray, SampleNumber))
            matrix1 = np.array(matrix1).T
            for m in mSizes:
                matrix2 = []
                for ii in range(m):
                    matrix2.append(permutation(mArray, SampleNumber))
                matrix2 = np.array(matrix2).T
                # Fit with the wider matrix as X, the narrower as Y.
                if matrix1.shape[1] > matrix2.shape[1]:
                    PLS.fit(matrix1, matrix2)
                    PLSscore = PLS.score(matrix1, matrix2)
                else:
                    PLS.fit(matrix2, matrix1)
                    PLSscore = PLS.score(matrix2, matrix1)
                scores.append(PLSscore)
    return scores
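# Sketch of the two-argument permutation helper the method above calls; the
# signature is inferred from the call site and the body is an assumption.
from numpy import random


def permutation(arr, sample_number):
    # Draw sample_number values from arr in a random order.
    return random.permutation(arr)[:sample_number]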
# Assumes: from numpy import dot, random; from numpy.linalg import pinv
def train(self, X, Y):
    """Train the network.

    X: matrix of dimensions n x indim
    Y: column vector of dimension n x 1
    """
    # Choose random center vectors from the training set.
    rnd_idx = random.permutation(X.shape[0])[:self.numCenters]
    self.centers = [X[i, :] for i in rnd_idx]
    # Calculate activations of the RBFs.
    G = self._calcAct(X)
    # Calculate output weights via the pseudoinverse (least-squares fit).
    self.W = dot(pinv(G), Y)
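# Minimal sketch of the class context train() above assumes. The class name,
# constructor, and the Gaussian form of _calcAct are assumptions, not the
# original code; only numCenters, centers, W, and _calcAct are implied above.
import numpy as np
from numpy.linalg import norm


class RBFNet:
    def __init__(self, num_centers, beta=8.0):
        self.numCenters = num_centers
        self.beta = beta
        self.centers = None
        self.W = None

    def _calcAct(self, X):
        # Gaussian activations: one row per sample, one column per center.
        return np.exp(-self.beta * np.array(
            [[norm(x - c) ** 2 for c in self.centers] for x in X]))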
# Assumes: import pandas as pd; from numpy import random;
# from sklearn.model_selection import ShuffleSplit
def univariate_permutations(p_model, p_features, p_X_train, p_Y_train,
                            p_hyperparams=None, p_cross_validations=5,
                            p_validation_fraction=0.25, p_seed=0, p_verbose=False):
    # TODO: add in p_regression and p_metric pieces.
    # TODO: since we're doing cross-validation here, make X_train a larger
    # portion of the overall dataset (80-90%).
    scores = []
    rs = ShuffleSplit(n_splits=p_cross_validations, test_size=p_validation_fraction,
                      random_state=p_seed)
    # Cross-validate the scores on a number of different random splits of the data.
    for j, (train_index, test_index) in enumerate(rs.split(p_X_train)):
        # Loop through the n_splits train/test splits.
        if p_verbose:
            print('Iteration number {} for the train test split'.format(j + 1))
        # Set X and Y for train and test.
        Xt_train, Xt_test = p_X_train.iloc[train_index, :], p_X_train.iloc[test_index, :]
        Yt_train, Yt_test = p_Y_train.iloc[train_index], p_Y_train.iloc[test_index]
        p_model.fit(Xt_train, Yt_train, p_hyperparams)  # fit the model on the training set
        score = p_model.score(Xt_test, Yt_test)
        for i, (feature_name, index) in enumerate(p_features):
            # Shuffle the ith predictor variable to break any relationship with the target.
            X_t = Xt_test.copy()  # copy the test data so as not to disturb it
            random.seed(p_seed)  # set seed for reproducibility
            X_t.iloc[:, index] = random.permutation(X_t.iloc[:, index])  # permute the ith variable
            shuff_score = p_model.score(X_t, Yt_test)
            scores.append([feature_name, index, (shuff_score - score) / shuff_score])
            if p_verbose:
                if i == 0:
                    print('{:*^65}'.format('Looping Through Remaining Variables'))
                print('Testing {}. {}: {:.5f}'.format(
                    i + 1, feature_name, (shuff_score - score) / shuff_score))
    print('{:*^65}'.format("Features sorted by their score:"))
    df = pd.DataFrame(columns=['Variable', 'Var_Index', 'Score'], data=scores)
    # Change the index list to a tuple to get a hashable type for groupby.
    df['Var_Index'] = df['Var_Index'].apply(lambda x: tuple(x))
    df_agg = df.groupby(['Variable', 'Var_Index'], as_index=False)['Score'].mean()  # average the scores
    df_agg['Var_Index'] = df_agg['Var_Index'].apply(lambda x: list(x))  # probably don't need to convert back to list
    df_sorted = df_agg.sort_values(by='Score', ascending=False).reset_index(drop=True)
    for i in range(len(df_sorted)):
        print('{}: {:.5f}'.format(df_sorted['Variable'][i], df_sorted['Score'][i]))
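# Hypothetical wrapper illustrating the model interface the function above
# expects: a fit() that accepts a hyperparameter argument (inferred from the
# call site; not a scikit-learn signature) plus a standard score().
from sklearn.linear_model import Ridge


class RidgeWrapper:
    def fit(self, X, y, hyperparams=None):
        self._model = Ridge(**(hyperparams or {})).fit(X, y)
        return self

    def score(self, X, y):
        return self._model.score(X, y)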
# Assumes: import cv2; import numpy as np; from numpy import random
def __call__(self, sample):
    # Parse and copy; work in float32 so the arithmetic below doesn't clip.
    image = sample['image'].copy()
    image = image.astype(np.float32)
    # Random brightness (random.randint(2) is a coin flip: 0 or 1).
    if random.randint(2):
        delta = random.uniform(-self.brightness_delta, self.brightness_delta)
        image += delta
    # mode == 0 means do random contrast first, mode == 1 means do it last.
    mode = random.randint(2)
    if mode == 1:
        if random.randint(2):
            alpha = random.uniform(self.contrast_lower, self.contrast_upper)
            image *= alpha
    # Convert color from BGR to HSV.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Random saturation.
    if random.randint(2):
        image[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper)
    # Random hue (float32 HSV hue lives in [0, 360)).
    if random.randint(2):
        image[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
        image[..., 0][image[..., 0] > 360] -= 360
        image[..., 0][image[..., 0] < 0] += 360
    # Convert color from HSV back to BGR.
    image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
    # Random contrast.
    if mode == 0:
        if random.randint(2):
            alpha = random.uniform(self.contrast_lower, self.contrast_upper)
            image *= alpha
    # Randomly swap channels.
    if random.randint(2):
        image = image[..., random.permutation(3)]
    # Update and return the sample.
    sample['image'] = image
    return sample
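# Hypothetical usage of the distortion above; the class name
# PhotoMetricDistortion and its constructor are assumptions.
import numpy as np

transform = PhotoMetricDistortion()  # assumed to set the *_delta/*_lower/*_upper attributes
sample = {'image': np.random.randint(0, 256, (64, 64, 3)).astype(np.uint8)}
out = transform(sample)
print(out['image'].dtype, out['image'].shape)  # float32 (64, 64, 3)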
def getSmallTrainingInput(self, inputSize):
    smallTrainingSet = []
    smallTrainingLabels = []
    imageCountDict = {
        'butler': 0,
        'radcliffe': 0,
        'bartan': 0,
        'bracco': 0,
        'gilpin': 0,
        'harmon': 0
    }
    indexToLastNameDict = {
        0: 'butler',
        1: 'radcliffe',
        2: 'bartan',
        3: 'bracco',
        4: 'gilpin',
        5: 'harmon'
    }
    trainingSize = len(self.trainingSet)
    randomIndexArray = random.permutation(trainingSize)
    # Integer division so the per-person cap keeps the total within inputSize.
    maxSizePerPerson = inputSize // 6
    for i in randomIndexArray:
        # One-hot labels: the index of the 1 identifies the person.
        lastNameIndex = self.trainingLabels[i].tolist().index(1)
        lastName = indexToLastNameDict[lastNameIndex]
        if imageCountDict[lastName] < maxSizePerPerson:
            smallTrainingSet.append(self.trainingSet[i])
            smallTrainingLabels.append(self.trainingLabels[i])
            imageCountDict[lastName] += 1
    smallTrainingSet = np.asarray(smallTrainingSet)
    smallTrainingLabels = np.asarray(smallTrainingLabels)
    return smallTrainingSet, smallTrainingLabels
def __init__(self, dataset):
    self.dataset = dataset
    random = np.random.RandomState(seed=12345)
    self.perm = random.permutation(len(dataset))[:500]
def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table":
    random = np.random.RandomState(random_seed)
    return self.take(random.permutation(self.num_rows()))
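# The pattern above in plain NumPy terms (illustration only): permutation(n)
# produces a shuffled index order, which take() then applies row-wise.
import numpy as np

rng = np.random.RandomState(7)
rows = np.array(['a', 'b', 'c', 'd'])
print(rows[rng.permutation(len(rows))])  # e.g. ['d' 'a' 'b' 'c']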
# Assumes NumPy names imported at module level, e.g.:
# from numpy import (log2, histogram, percentile, where, ones, around,
#                    r_, tile, linspace, newaxis, random)
def fit_CV(mu, cv, fit_method='Exp', svr_gamma=0.06, x0=[0.5, 0.5], verbose=False):
    '''Fits a noise model (CV vs mean)

    Parameters
    ----------
    mu: 1-D array
        mean of the genes (raw counts)
    cv: 1-D array
        coefficient of variation for each gene
    fit_method: string
        allowed: 'SVR', 'Exp', 'binSVR', 'binExp'
        default: 'Exp' ('SVR' requires scikit-learn)
        SVR: uses support vector regression to fit the noise model
        Exp: parametric fit to cv = mu^(-a) + b
        bin: before fitting, the distribution of the means is normalized to be
        uniform by downsampling and resampling

    Returns
    -------
    score: 1-D array
        the relative position with respect to the fitted curve
    mu_linspace: 1-D array
        x coordinates to plot (min(log2(mu)) -> max(log2(mu)))
    cv_fit: 1-D array
        y = f(x) coordinates to plot
    params: tuple or None
    '''
    log2_m = log2(mu)
    log2_cv = log2(cv)

    if len(mu) > 1000 and 'bin' in fit_method:
        # Histogram with 30 bins.
        n, xi = histogram(log2_m, 30)
        med_n = percentile(n, 50)
        for i in range(0, len(n)):
            # Indices of the genes within the ith bin.
            ind = where((log2_m >= xi[i]) & (log2_m < xi[i + 1]))[0]
            if len(ind) > med_n:
                # Downsample if the count is more than the median.
                ind = ind[random.permutation(len(ind))]
                ind = ind[:len(ind) - int(med_n)]  # cast: percentile returns a float
                mask = ones(len(log2_m), dtype=bool)
                mask[ind] = False
                log2_m = log2_m[mask]
                log2_cv = log2_cv[mask]
            # Check len(ind) first to avoid dividing by zero on empty bins.
            elif len(ind) > 5 and around(med_n / len(ind)) > 1:
                # Duplicate if the count is less than the median.
                reps = int(around(med_n / len(ind))) - 1  # tile needs an int count
                log2_m = r_[log2_m, tile(log2_m[ind], reps)]
                log2_cv = r_[log2_cv, tile(log2_cv[ind], reps)]
    else:
        if 'bin' in fit_method:
            print('More than 1000 input features are needed for bin correction.')

    if 'SVR' in fit_method:
        try:
            from sklearn.svm import SVR
            if svr_gamma == 'auto':
                svr_gamma = 1000. / len(mu)
            # Fit the support vector regression.
            clf = SVR(gamma=svr_gamma)
            clf.fit(log2_m[:, newaxis], log2_cv)
            fitted_fun = clf.predict
            score = log2(cv) - fitted_fun(log2(mu)[:, newaxis])
            params = None
            # The coordinates of the fitted curve.
            mu_linspace = linspace(min(log2_m), max(log2_m))
            cv_fit = fitted_fun(mu_linspace[:, newaxis])
            return score, mu_linspace, cv_fit, params
        except ImportError:
            if verbose:
                print('SVR fit requires the scikit-learn library. '
                      'Using the exponential fit instead.')
            if 'bin' in fit_method:
                return fit_CV(mu, cv, fit_method='binExp', x0=x0)
            else:
                return fit_CV(mu, cv, fit_method='Exp', x0=x0)
    elif 'Exp' in fit_method:
        from scipy.optimize import minimize
        # Objective function to fit (least absolute deviations, not least squares).
        fun = lambda x, log2_m, log2_cv: sum(
            abs(log2((2. ** log2_m) ** (-x[0]) + x[1]) - log2_cv))
        # Fit using the Nelder-Mead algorithm.
        optimization = minimize(fun, x0, args=(log2_m, log2_cv), method='Nelder-Mead')
        params = optimization.x
        # The fitted function.
        fitted_fun = lambda log_mu: log2((2. ** log_mu) ** (-params[0]) + params[1])
        # Score is the relative position with respect to the fitted curve.
        score = log2(cv) - fitted_fun(log2(mu))
        # The coordinates of the fitted curve.
        mu_linspace = linspace(min(log2_m), max(log2_m))
        cv_fit = fitted_fun(mu_linspace)
        return score, mu_linspace, cv_fit, params
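# Illustrative call to fit_CV with synthetic data; the values are arbitrary
# and only meant to exercise the 'Exp' branch.
import numpy as np

mu = np.random.gamma(2.0, 10.0, size=300) + 1e-3  # stand-in mean counts
cv = mu ** -0.5 + 0.1 * np.random.rand(300)       # stand-in CVs with noise
score, mu_lin, cv_fit, params = fit_CV(mu, cv, fit_method='Exp')
print(params)  # fitted (a, b) for cv = mu**(-a) + b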
def random_shuffle(self, random_seed: Optional[int]) -> List[T]:
    random = np.random.RandomState(random_seed)
    return self._table.take(random.permutation(self.num_rows()))
training_op = optimizer.minimize(loss, var_list=train_vars)
# -> layers 1 and 2 are now frozen
# The more data available, the more layers can be unfrozen.
# Frozen layers won't change -> cache the output of the topmost frozen layer
# for each training instance
# -> speed boost, since training goes through the whole dataset many times
#    (-> go through the frozen layers 1x/training instead of 1x/epoch)
# e.g., run the whole training set through the lower layers:
hidden2_outputs = sess.run(hidden2, feed_dict={X: X_train})

# During training, build batches of hidden2_outputs instead of batches of
# training instances. Note: the permutation must come from NumPy; the stdlib
# random module has no permutation().
import numpy as np

n_epochs = 100
n_batches = 500
for epoch in range(n_epochs):
    shuffled_idx = np.random.permutation(len(hidden2_outputs))
    hidden2_batches = np.array_split(hidden2_outputs[shuffled_idx], n_batches)
    y_batches = np.array_split(y_train[shuffled_idx], n_batches)
    for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
        sess.run(training_op, feed_dict={hidden2: hidden2_batch, y: y_batch})

# If no existing model is available and there isn't a lot of labeled training data:
# -> unsupervised pretraining: train each layer one by one (starting from the
#    lowest) using an unsupervised feature-detection algorithm
# -> pretraining on an auxiliary task (for which you can easily get enough
#    labeled training data)

# 5 ways to speed up training:
# 1) good initialization of the weights
# 2) good activation function
# 3) Batch Normalization
# 4) reuse of a pretrained network
# 5) good optimizer (-> use Adam!)
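# Standalone illustration of the shuffle-and-split batching pattern used above
# (array sizes and batch count are arbitrary):
import numpy as np

outputs = np.random.rand(10, 3)           # stand-in for cached hidden2 outputs
idx = np.random.permutation(len(outputs))
batches = np.array_split(outputs[idx], 4)
print([b.shape for b in batches])         # [(3, 3), (3, 3), (2, 3), (2, 3)]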
def rchoose(k, n):
    """Choose k distinct values from range(n)."""
    assert k <= n
    return random.permutation(range(n))[:k]
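# Usage sketch for rchoose above; numpy.random.choice with replace=False is an
# equivalent built-in.
from numpy import random

print(rchoose(3, 10))                       # three distinct values from range(10)
print(random.choice(10, 3, replace=False))  # same effect with the built-in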
'''----------------------------------------------------------------'''
from numpy import random
import numpy as np

arr = np.array([1, 2, 3, 4, 5])
random.shuffle(arr)  # shuffles the array in place
print(arr)
'''----------------------------------------------------------------'''
from numpy import random
import numpy as np

arr = np.array([1, 2, 3, 4, 5])
print(random.permutation(arr))  # returns a shuffled copy; arr is unchanged
'''----------------------------------------------------------------'''
import random
from time import sleep

while True:
    random_num = random.randint(1, 10)  # stdlib randint: both endpoints inclusive
    if random_num == 10:  # compare with ==, not `is` (identity is unreliable for ints)
        print('is equal to 10. The loop breaks')
        break
    else:
        print('is not equal to 10. The loop repeats')
        sleep(1)
'''----------------------------------------------------------------'''