def RandomBootstrap(X_pool, y_pool, size, balance, seed=0):
    '''Assume the task is binary classification.'''
    print('-' * 50)
    print('Starting bootstrap...')
    print('Initial training set size = %d' % size)
    start = time()
    random_state = RandomState(seed=seed)
    poolsize = y_pool.shape[0]
    pool_set = np.arange(poolsize)
    if balance:
        # select 1/2 * size from each class
        class0_size = size // 2
        class1_size = size - class0_size
        class0_indices = np.nonzero(y_pool == 0)[0]
        class1_indices = np.nonzero(y_pool == 1)[0]
        class0_docs = random_state.permutation(class0_indices)[:class0_size]
        class1_docs = random_state.permutation(class1_indices)[:class1_size]
        training_set = np.hstack((class0_docs, class1_docs))
    else:
        # otherwise, pick 'size' documents randomly
        training_set = random_state.permutation(pool_set)[:size]
    pool_set = np.setdiff1d(pool_set, training_set)
    print('Bootstrapping took %0.2fs.' % (time() - start))
    return (training_set.tolist(), pool_set.tolist())
def run_lhi_informed_analysis(self, max_curves=26, center_size=20, index=None):
    # Toggle: recompute the LHI map and cache it to disk (the cached-load branch below is disabled).
    if True:
        self.lhi = compute_local_homogeneity_index(self.OR * pi, __main__.__dict__.get('LHI', 2.0))
        f = open(normalize_path('lhi' + str(__main__.__dict__.get('LHI', 2.0)) + '.pickle'), 'wb')
        pickle.dump(self.lhi, f)
        f.close()
    else:
        f = open(normalize_path('lhi' + str(__main__.__dict__.get('LHI', 2.0)) + '.pickle'), 'rb')
        self.lhi = pickle.load(f)

    lhi_center = self.lhi[self.center_r - center_size:self.center_r + center_size,
                          self.center_c - center_size:self.center_c + center_size]

    steps = []
    r = RandomState(1023)

    if not __main__.__dict__.get('uniform', False):
        pinwheels = r.permutation(numpy.nonzero(numpy.ravel(lhi_center) < __main__.__dict__.get('cutoff', 0.3))[0])
        domains = r.permutation(numpy.nonzero(numpy.ravel(lhi_center) > (1 - __main__.__dict__.get('cutoff', 0.3)))[0])
        assert len(pinwheels) > max_curves // 2
        #s = numpy.argsort(numpy.ravel(lhi_center))
        if index is None:
            for i in range(max_curves // 2):
                (x, y) = numpy.unravel_index(pinwheels[i], lhi_center.shape)
                steps.append((x + self.center_r - center_size, y + self.center_c - center_size))
                (x, y) = numpy.unravel_index(domains[i], lhi_center.shape)
                steps.append((x + self.center_r - center_size, y + self.center_c - center_size))
        else:
            if (index % 2) == 0:
                (x, y) = numpy.unravel_index(pinwheels[index // 2], lhi_center.shape)
            else:
                (x, y) = numpy.unravel_index(domains[index // 2], lhi_center.shape)
            steps = [(x + self.center_r - center_size, y + self.center_c - center_size)]
    else:
        bins = []
        for i in range(10):
            a = numpy.ravel(lhi_center) >= i * 0.1
            b = numpy.ravel(lhi_center) < (i + 1) * 0.1
            bins.append(r.permutation(numpy.nonzero(numpy.multiply(a, b))[0]))
        (x, y) = numpy.unravel_index(bins[index % 10][index // 10], lhi_center.shape)
        steps = [(x + self.center_r - center_size, y + self.center_c - center_size)]
        #places = r.permutation(numpy.arange(0, len(numpy.ravel(lhi_center)), 1))
        #(x, y) = numpy.unravel_index(places[index], lhi_center.shape)
        #steps.append((x + self.center_r - center_size, y + self.center_c - center_size))

    self.analyse(steps, ns=__main__.__dict__.get('number_sizes', 10))
def get_usps_split(seed,digits=range(num_classes)): from numpy.random import RandomState rnd = RandomState(seed) num_train = 200 num_test = 500 X_train = [] Y_train = [] X_test = [] Y_test = [] for t in digits: I = rnd.permutation(num_digits_per_class) ## note: num_train + num_test < len(I); ## The NCA paper used 200 train and 500 test. ## This gives 700 per class. I_train = I[:num_train] I_test = I[-num_test:] X_t_train = raw[:,I_train,t].T X_t_test = raw[:,I_test,t].T X_train.extend(X_t_train) X_test.extend(X_t_test) Y_train.extend([t] * num_train) Y_test.extend([t] * num_test) assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test) # note: we only permute the training cases, since we don't do # sgd of any kind on the test cases. import numpy as np I = rnd.permutation(len(X_train)) X_train = np.array(X_train)[I] Y_train = np.array(Y_train)[I] I = rnd.permutation(len(X_test)) X_test = np.array(X_test)[I] Y_test = np.array(Y_test)[I] X_train = usps_resizer(X_train,8) X_test = usps_resizer(X_test,8) X_train /= 255. X_test /= 255. return (X_train,X_test),(Y_train,Y_test)
def pool_entropy_h(X, y, candidate_mask, train_mask, classifier, n_candidates, pool_n, n_jobs=-1, random_state=None, **kwargs): """ Return the candidate that will minimise the expected entropy of the predictions. Parameters ---------- X_training_candidates : array The feature matrix of the potential training candidates. classes : int The name of classes. pool_n : int The size of the sampel pool used in estimating the entropy n_jobs : int The number of parallel jobs (-1 if want to use all cores) Returns ------- best_candidate : int The index of the best candidate. """ classes = classifier.classes_ # sorted lexicographically n_classes = len(classes) n_features = X.shape[1] entropy = np.empty(len(candidate_mask)) entropy[:] = np.inf rng = RandomState(random_state) # the probabilities used to calculate expected value of pool probs = classifier.predict_proba(X[candidate_mask]) # copy the classifier (avoid modifying the original classifier) classifier_plus = clone(classifier) # construct the sample pool (used to estimate the entropy) unlabelled_indices = np.where(-train_mask)[0] pool_indices = rng.permutation(unlabelled_indices)[:pool_n] pool_mask = np.zeros(len(candidate_mask), dtype=bool) pool_mask[pool_indices] = True # let's look at each candidate candidate_indices = np.where(candidate_mask)[0] results = Parallel(n_jobs=n_jobs)(delayed(_parallel_entropy_estimate)( X, y.copy(), train_mask.copy(), pool_mask, clone(classifier_plus), classes, n_classes, probs, i, index) for i, index in enumerate(candidate_indices)) indices, expected = zip(*results) indices, expected = np.asarray(indices), np.asarray(expected) assert not np.isnan(expected).any(), 'Some expected values are undefined.' entropy[indices] = expected # pick the candidate with the smallest expected entropy best_candidates = np.argsort(entropy)[:n_candidates] return best_candidates
def view_voltages(data,title=None, shuffle=False, shuffle_seed=1, vmin=-70, vmax=35, s_per_step=None): """ Show a complete simulation run in an (M*N)xT trace image. Args: data: MxNxT array of voltage traces title: figure title shuffle: If true, shuffle the order of cells in the trace image. s_per_step: seconds per step - if given, display a proper time axis """ plt.figure(figsize=fig_size) vtraces = data.reshape(-1,data.shape[2])[:] if shuffle: rng = RandomState(shuffle_seed) vtraces = rng.permutation(vtraces) if s_per_step is None: T = data.shape[-1] else: T = data.shape[-1] * s_per_step plt.imshow(vtraces, cmap='bone', vmin=vmin, vmax=vmax, aspect='auto', interpolation='nearest', extent=[0, T, vtraces.shape[0], 0]) plt.colorbar() if title: plt.title(title) plt.show()
def permute_within_groups(x, group, prng=None):
    """
    Permutation of condition within each group.

    Parameters
    ----------
    x : array-like
        A 1-d array indicating treatment.
    group : array-like
        A 1-d array indicating group membership
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    permuted : array-like
        The within group permutation of x.
    """
    permuted = x.copy()
    if prng is None:
        prng = RandomState()

    # (avoid additional flops) -- maybe memoize
    for g in np.unique(group):
        gg = group == g
        permuted[gg] = prng.permutation(permuted[gg])
    return permuted
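A minimal usage sketch for `permute_within_groups` above; the treatment vector, group labels and seed are illustrative, and `numpy` is assumed to be imported as `np`, as the snippet itself expects.

import numpy as np
from numpy.random import RandomState

x = np.array([1, 0, 0, 1, 1, 0])                  # toy treatment assignments
group = np.array(['a', 'a', 'a', 'b', 'b', 'b'])  # toy group labels

shuffled = permute_within_groups(x, group, RandomState(42))

# each group keeps its own multiset of treatment values; only the order changes
assert sorted(shuffled[group == 'a']) == sorted(x[group == 'a'])
assert sorted(shuffled[group == 'b']) == sorted(x[group == 'b'])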
def corr(x, y, reps=10**4, prng=None):
    """
    Simulate permutation p-value for Spearman correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    reps : int
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    tuple
        Returns test statistic, left-sided p-value,
        right-sided p-value, two-sided p-value, simulated distribution
    """
    if prng is None:
        prng = RandomState()
    # note: np.corrcoef computes the Pearson correlation of x and y
    tst = np.corrcoef(x, y)[0, 1]
    # collect the simulated statistics in an ndarray (not a list) so the
    # element-wise comparisons below are valid
    sims = np.array([np.corrcoef(prng.permutation(x), y)[0, 1]
                     for i in range(reps)])
    left_pv = np.sum(sims <= tst) / reps
    right_pv = np.sum(sims >= tst) / reps
    two_sided_pv = np.sum(np.abs(sims) >= np.abs(tst)) / reps
    return tst, left_pv, right_pv, two_sided_pv, sims
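A short usage sketch for `corr` above, on synthetic correlated data; the seeds and `reps` value are arbitrary.

import numpy as np
from numpy.random import RandomState

rng = RandomState(0)
x = rng.normal(size=100)
y = 0.5 * x + rng.normal(scale=0.5, size=100)   # correlated with x by construction

tst, left_pv, right_pv, two_sided_pv, sims = corr(x, y, reps=1000, prng=RandomState(1))
print('corr = %.3f, two-sided p = %.4f' % (tst, two_sided_pv))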
def load_dataset(params, path='datasets'): download_dataset(path) # training data data = [ np.load(os.path.join(path, 'cifar-10-batches-py', 'data_batch_%d' % (i + 1)), encoding='latin1') for i in range(5) ] X_train = np.vstack([d['data'] for d in data]) y_train = np.hstack([np.asarray(d['labels'], np.int8) for d in data]) # test data data = np.load(os.path.join(path, 'cifar-10-batches-py', 'test_batch'), encoding='latin1') X_test = data['data'] y_test = np.asarray(data['labels'], np.int8) # reshape X_train = X_train.reshape(-1, 3, 32, 32) X_test = X_test.reshape(-1, 3, 32, 32) # permute rndSeed = RandomState(params.seed) permute = rndSeed.permutation(len(y_train)) X_train = X_train[permute] y_train = y_train[permute] permute = rndSeed.permutation(len(y_test)) X_test = X_test[permute] y_test = y_test[permute] # normalize try: mean_std = np.load(os.path.join(path, 'cifar-10-mean_std.npz'), encoding='latin1') mean = mean_std['mean'] std = mean_std['std'] except IOError: mean = X_train.mean(axis=(0, 2, 3), keepdims=True).astype(np.float32) std = X_train.std(axis=(0, 2, 3), keepdims=True).astype(np.float32) np.savez(os.path.join(path, 'cifar-10-mean_std.npz'), mean=mean, std=std) X_train = (X_train - mean) / std X_test = (X_test - mean) / std return X_train, y_train, X_test, y_test
def shuffle_list(seed, *data):
    from numpy.random import RandomState
    np_rng = RandomState(seed)
    idxs = np_rng.permutation(np.arange(len(data[0])))
    if len(data) == 1:
        return [data[0][idx] for idx in idxs]
    else:
        return [[d[idx] for idx in idxs] for d in data]
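A small usage sketch for `shuffle_list` above; the lists are illustrative and `numpy` is assumed to be available as `np`, as the snippet expects.

import numpy as np

features = [[0.1], [0.2], [0.3], [0.4]]
labels = ['a', 'b', 'c', 'd']

shuffled_features, shuffled_labels = shuffle_list(7, features, labels)

# the same permutation is applied to every list, so pairs stay aligned
for f, l in zip(shuffled_features, shuffled_labels):
    assert labels[features.index(f)] == l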
def execute(self):
    """Execute the link."""
    ds = process_manager.service(DataStore)

    # basic checks on contents of the data frame
    assert self.read_key in ds, 'Key "{key}" not in DataStore.'.format(
        key=self.read_key)
    df = ds[self.read_key]
    if not isinstance(df, pd.DataFrame):
        raise Exception('Retrieved object not of type pandas DataFrame.')
    ndf = len(df.index)
    assert ndf > 0, 'dataframe {} is empty.'.format(self.read_key)

    if self.store_key is None:
        if self.column in df.columns:
            raise Exception(
                'Column name <{}> already used: <{!s}>. Will not overwrite.'
                .format(self.column, df.columns))
        df[self.column] = 0

    # fix final number of events assigned per random class
    # ... each class gets at least one event
    if self.nevents is not None:
        if len(self.nevents) == self.nclasses - 1:
            self.nevents.append(ndf - sum(n for n in self.nevents))
    else:
        self.nevents = [int(ndf * f) for f in self.fractions]
    for i in range(self.nclasses):
        nsum = sum(n for n in self.nevents[:i + 1])
        ndiff = 0 if nsum - ndf < 0 else nsum - ndf
        self.nevents[i] -= ndiff
        if self.nevents[i] < 0:
            self.nevents[i] = 0
        self.logger.info(
            'Random class <{index:d}> assigned <{n:d}> events.',
            index=i, n=self.nevents[i])

    # random reshuffling of dataframe indices
    RNG = RandomState(self._seed)
    permute = RNG.permutation(df.index)

    # apply the random reshuffling, and assign records to the n datasets
    # (label-based .loc replaces the deprecated .ix accessor)
    for i in range(self.nclasses):
        ib = sum(n for n in self.nevents[:i])
        ie = sum(n for n in self.nevents[:i + 1])
        if self.store_key is None:
            df.loc[permute[ib:ie], self.column] = i
        else:
            ds[self.store_key[i]] = df.loc[permute[ib:ie]]
            self.logger.info(
                'Stored output collection <{key}> with <{n:d}> records in datastore.',
                key=self.store_key[i], n=len(ds[self.store_key[i]].index))

    # increase seed in case of next iteration
    self._seed += 1

    return StatusCode.Success
def _get_shuffled_mini_batches(self, m: int):
    """
    Get mini batches of indices

    :param m: total number of samples
    """
    rng = RandomState()
    shuffled_indices = rng.permutation(range(m))
    for b in range(0, m, self.batch_size):
        yield shuffled_indices[b:b + self.batch_size]
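The same shuffled-minibatch idea as a self-contained sketch; `shuffled_minibatch_indices` is a hypothetical stand-in that mirrors the generator above without its enclosing class or `self.batch_size` attribute.

import numpy as np
from numpy.random import RandomState

def shuffled_minibatch_indices(m, batch_size, seed=None):
    # permute all sample indices once, then slice them into consecutive batches
    rng = RandomState(seed)
    shuffled = rng.permutation(m)
    for b in range(0, m, batch_size):
        yield shuffled[b:b + batch_size]

batches = list(shuffled_minibatch_indices(m=10, batch_size=4, seed=0))
assert sum(len(b) for b in batches) == 10                            # every sample covered
assert sorted(np.concatenate(batches).tolist()) == list(range(10))   # each exactly once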
def generatePermutation(numbersamples, randomSeedOrState):
    import sys
    from numpy.random import RandomState
    if isinstance(randomSeedOrState, RandomState):
        randomstate = randomSeedOrState
    else:
        # sys.maxint exists only under Python 2; on Python 3 use a fixed bound
        # such as 2147483647 instead (see the variant further down).
        randomstate = RandomState(int(randomSeedOrState % sys.maxint))
    perm = randomstate.permutation(numbersamples)
    return perm
def generatePermutation(numbersamples, randomSeedOrState):
    from numpy.random import RandomState
    if isinstance(randomSeedOrState, RandomState):
        randomstate = randomSeedOrState
    else:
        randomstate = RandomState(randomSeedOrState)
    perm = randomstate.permutation(numbersamples)
    return perm
def _get_sequence(self): if self.sequence is None: if self.sampling.order == self.sampling.RANDOM: rs = RandomState(seed=self.state['list_seed']) self.sequence = rs.permutation(self.state['power']) elif self.sampling.order == self.sampling.DIRECT: self.sequence = list(range(self.state['power'])) elif self.sampling.order == self.sampling.REVERSED: self.sequence = list(range(self.state['power']))[::-1] return self.sequence
def generate_random_permutation_transform(seed, challenge_length, puf_count, atf=False): """ Returns an input transformation that uses k pseudorandomly generated permutations :param seed: int Seed for the pseudorandom generation :param challenge_length: int Challenge length (must equal LTFArray.n) :param puf_count: int Number of permutations to be used (must equal LTFArray.k) :param atf: boolean Perform ATF transform after permuting :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n) A function that can perform the desired transformation. """ prng = RandomState(seed) permutations = [prng.permutation(challenge_length) for _ in range(puf_count)] def transform(challenges, k): """ Method as described in generate_concatenated_transform doc string. :param challenges: array of int shape(N,n) Array of challenges which should be evaluated by the simulation. :param k: int Number of LTFArray PUFs :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n) A function that can perform the desired transformation. """ (_, n) = challenges.shape assert k == puf_count and n == challenge_length, \ 'Permutations Input Transform cannot be used for LTFArrays with size other than defined' result = swapaxes( array([ challenges[:, permutations[i]] for i in range(puf_count) ]), 0, 1 ) if atf: # Perform atf transform result = transpose( array([ prod(result[:, :, i:], 2) for i in range(n) ]), (1, 2, 0) ) return result transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed return transform
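A numpy-only sketch of what the permutation transform above produces, using toy sizes for N, n and k; it avoids the LTFArray dependency, and the seed and shape values are illustrative.

import numpy as np
from numpy.random import RandomState

N, n, k = 4, 6, 3                               # 4 challenges of length 6, 3 PUFs
prng = RandomState(0x5eed)                      # arbitrary seed
challenges = RandomState(1).choice([-1, 1], size=(N, n))

# one pseudorandom column permutation per PUF, all derived from the same seed
permutations = [prng.permutation(n) for _ in range(k)]

# shape (N, k, n): for each challenge, k permuted copies
transformed = np.swapaxes(
    np.array([challenges[:, permutations[i]] for i in range(k)]), 0, 1)

assert transformed.shape == (N, k, n)
# every sub-challenge is just a reordering of the original challenge bits
assert sorted(transformed[0, 0].tolist()) == sorted(challenges[0].tolist())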
def generate_permutation(numbersamples, randomSeedOrState):
    from numpy.random import RandomState
    if isinstance(randomSeedOrState, RandomState):
        randomstate = randomSeedOrState
    else:
        randomstate = RandomState(int(randomSeedOrState % 2147483647))  # old maxint
    perm = randomstate.permutation(numbersamples)
    return perm
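A quick usage sketch for the seed-or-state helper directly above; the seed and sample count are arbitrary.

from numpy.random import RandomState

perm_from_seed = generate_permutation(10, 12345)
perm_from_state = generate_permutation(10, RandomState(12345))
# the integer path reduces the seed modulo 2147483647 before seeding, which
# leaves a small seed like 12345 unchanged, so both calls agree
assert (perm_from_seed == perm_from_state).all()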
def permute(data, label, params):
    ''' Permute data. '''
    rndSeed = RandomState(params.seed)
    permute = rndSeed.permutation(data.shape[0])
    data = data[permute]
    label = label[permute]
    return (data, label)
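A tiny usage sketch for `permute` above; `SimpleNamespace` stands in for whatever `params` object normally carries the `seed` attribute, and `RandomState` is assumed to be imported at module level as the snippet expects.

import numpy as np
from types import SimpleNamespace

data = np.arange(12).reshape(6, 2)    # row i is [2*i, 2*i + 1]
label = np.arange(6)                  # label i belongs to row i

shuffled_data, shuffled_label = permute(data, label, SimpleNamespace(seed=3))

# rows and labels are shuffled by the same permutation, so they stay paired
assert (shuffled_data[:, 0] == shuffled_label * 2).all()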
def execute(self): """ Execute AssignRandomClass """ ds = ProcessManager().service(DataStore) # basic checks on contensts of the data frame assert self.readKey in list( ds.keys()), 'Key %s not in DataStore.' % self.readKey df = ds[self.readKey] if not isinstance(df, DataFrame): raise Exception('Retrieved object not of type pandas DataFrame.') ndf = len(df.index) assert ndf > 0, 'dataframe %s is empty.' % self.readKey if self.column in df.columns: raise Exception( 'Column name <%s> already used: <%s>. Will not overwrite.' % (self.column, str(df.columns))) # fix final number of events assigned per random class # ... each class gets at least one event if self.nevents is not None: if len(self.nevents) == self.nclasses - 1: self.nevents.append(ndf - sum(n for n in self.nevents)) if self.nevents is None: self.nevents = [int(ndf * f) for f in self.fractions] pass for i in range(self.nclasses): nsum = sum(n for n in self.nevents[:i + 1]) ndiff = 0 if (nsum - ndf < 0) else (nsum - ndf) self.nevents[i] -= ndiff if self.nevents[i] < 0: self.nevents[i] = 0 pass for i, n in enumerate(self.nevents): assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be greater than zero. %s' % \ (i, n, str(self.nevents)) self.log().info('Random class <%d> assigned n events <%d>.' % (i, n)) # random reshuffling of dataframe indices settings = ProcessManager().service(ConfigObject) RNG = RandomState(settings['seed']) permute = RNG.permutation(df.index) # apply the random reshuffling, and assign records to the n classes df[self.column] = 0 for i in range(self.nclasses): ib = sum(n for n in self.nevents[:i]) ie = sum(n for n in self.nevents[:i + 1]) df.ix[permute[ib:ie], self.column] = i pass return StatusCode.Success
def view_spikes(data, title=None, shuffle=False, shuffle_seed=1): spikesflat = data.reshape(-1, data.shape[2])[:] if shuffle: rng = RandomState(shuffle_seed) spikesflat = rng.permutation(spikesflat) idxs, spiketimes = np.nonzero(spikesflat) plt.figure(figsize=fig_size) plt.scatter(spiketimes, idxs, marker='|', s=50, alpha=0.7, color='k') if title: plt.title(title) plt.show()
def data_to_csv(prefix: str, path: str, dataset_name: str, seeds, items_to_use, only_items_to_use): masks = glob.glob(path + "*.png") if only_items_to_use: masks = list( filter( lambda x: any( [True if i in x else False for i in items_to_use]), masks)) else: masks = list( filter( lambda x: any( [True if i not in x else False for i in items_to_use]), masks)) names = list(map(lambda x: x.split(path)[1], masks)) items = list(map(lambda x: x.split("_"), names)) codes = list( map(lambda x: os.path.join(prefix, "ISIC_" + x[1] + ".jpg"), items)) types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items)) diseases = np.array(sorted(list(set(types)))) print(diseases) print(types) print(codes) assert len(types) == len(codes) assert len(types) == len(masks) result_dict = {} for path, code, typ in tqdm(list(zip(masks, codes, types))): if code in result_dict: labels = result_dict[code] else: labels = np.zeros(len(diseases)) result_dict[code] = labels idx = np.where(diseases == typ) labels[idx] = load_mask(path) use_format = True if len(seeds) > 1 else False for seed in seeds: rs = RandomState(seed) result = list(result_dict.items()) result.sort(key=lambda x: x[0]) result = rs.permutation(result) indices = list(map(lambda x: x[0], result)) result = list(map(lambda x: x[1], result)) frame = pd.DataFrame(result, index=indices, columns=diseases, dtype='int64') if use_format: frame.to_csv(dataset_name.format(seed), index_label="images") else: frame.to_csv(dataset_name, index_label="images")
def randomise_dataframe_rows(self, df):
    """ Randomise the row ordering of a DataFrame.

    Build a NumPy array of shuffled index values using `permutation`,
    then return a new DataFrame in that order using `loc[]`.
    A fixed seed makes the shuffle reproducible when others run the
    same code.
    """
    if df is None:
        return None
    # np.random.seed(0)
    # return df.loc[np.random.permutation(len(df))]
    prng = RandomState(1234567890)
    # assumes the default RangeIndex (0..n-1), so positions double as labels
    return df.loc[prng.permutation(len(df))]
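A short check of the row-shuffling helper above; since `self` is unused it is passed as `None` here (illustrative only), and the DataFrame is assumed to have the default integer index.

import pandas as pd

df = pd.DataFrame({'value': list('abcde')})
shuffled = randomise_dataframe_rows(None, df)

# same rows, fixed pseudorandom order (seed 1234567890 above)
assert sorted(shuffled['value']) == sorted(df['value'])
assert len(shuffled) == len(df)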
def data_to_csv(masks_data_path: str, save_to_file_path: str, real_image_path: str, fake_image_path: str, masks_path: str, seeds, replace_first_percents, extend): masks = glob.glob(masks_data_path + "*.png") names = list(map(lambda x: x.split(masks_data_path)[1], masks)) items = list(map(lambda x: x.split("_"), names)) codes = list(map(lambda x: x[1], items)) types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items)) diseases = np.array(sorted(list(set(types)))) print(diseases) print(types) print(codes) assert len(types) == len(codes) assert len(types) == len(masks) codes = list(set(codes)) full = len(codes) part = int(full / 100 * replace_first_percents) filled = [(c, diseases) for c in codes] for seed in seeds: rs = RandomState(seed) result = list(filled) result.sort(key=lambda x: x[0]) result = rs.permutation(result) indexes = list(map(lambda x: x[0], result)) indexes_a = indexes[:part] indexes_b = indexes[:part] indexes_c = indexes[part:] indexes_a = list( map( lambda i: create_record(i, diseases, fake_image_path, masks_path, "_semantic_synthesized_image"), indexes_a)) indexes_b = list( map( lambda i: create_record(i, diseases, real_image_path, masks_path), indexes_b)) indexes_c = list( map( lambda i: create_record(i, diseases, real_image_path, masks_path), indexes_c)) if extend: result = indexes_a + indexes_b + indexes_c else: result = indexes_a + indexes_c frame = pd.DataFrame(result, columns=['images'] + list(diseases)) frame.to_csv(save_to_file_path.format(replace_first_percents, seed), index_label="images", index=False)
def data_to_csv(real_prefix: str, generated_prefix: str, path: str, dataset_name: str, seeds, replace_first_percents, extend): masks = glob.glob(path + "*.png") names = list(map(lambda x: x.split(path)[1], masks)) items = list(map(lambda x: x.split("_"), names)) # codes = list(map(lambda x: os.path.join(prefix, "ISIC_" + x[1] + ".jpg"), items)) codes = list(map(lambda x: x[1], items)) types = list(map(lambda x: "_".join(x[3:]).split(".")[0], items)) diseases = np.array(sorted(list(set(types)))) print(diseases) print(types) print(codes) assert len(types) == len(codes) assert len(types) == len(masks) result_dict = {} for path, code, typ in tqdm(list(zip(masks, codes, types))): if code in result_dict: labels = result_dict[code] else: labels = np.zeros(len(diseases)) result_dict[code] = labels idx = np.where(diseases == typ) labels[idx] = load_mask(path) full = len(result_dict) part = int(full / 100 * replace_first_percents) for seed in seeds: rs = RandomState(seed) result = list(result_dict.items()) result.sort(key=lambda x: x[0]) result = rs.permutation(result) indices = list(map(lambda x: x[0], result)) result = list(map(lambda x: x[1], result)) if extend: indices_a = list(map(lambda x: os.path.join(generated_prefix, "ISIC_" + x + "_semantic_synthesized_image.jpg"), indices[:part])) indices_b = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"), indices[:part])) indices_c = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"), indices[part:])) indices = indices_a + indices_b + indices_c result = result[:part] + result[:part] + result[part:] else: indices_a = list(map(lambda x: os.path.join(generated_prefix, "ISIC_" + x + "_semantic_synthesized_image.jpg"), indices[:part])) indices_b = list(map(lambda x: os.path.join(real_prefix, "ISIC_" + x + ".jpg"), indices[part:])) indices = indices_a + indices_b frame = pd.DataFrame(result, index=indices, columns=diseases, dtype='int64') frame.to_csv(dataset_name.format(replace_first_percents, seed), index_label="images")
def _subsample(counts, n, replace=False, seed=0): """Randomly subsample from a vector of counts. Parameters ---------- counts : 1-D array_like Vector of counts. n : int Number of element to subsample (<= the total number of counts). replace : bool, optional Subsample with or without replacement. seed : int, optional Random seed. Returns ------- subcounts : 1-D ndarray Subsampled vector of counts Raises ------ ValueError, TypeError """ if n < 0: raise ValueError("'n' must be > 0 ") counts = np.asarray(counts) if counts.ndim != 1: raise ValueError("counts must be an 1-D array_like object") counts = counts.astype(int, casting='safe') counts_sum = counts.sum() if n > counts_sum: raise ValueError("'n' must be <= the total number of counts") prng = RandomState(seed) if replace: p = counts / counts_sum subcounts = prng.multinomial(n, p) else: nonzero = np.flatnonzero(counts) expanded = np.concatenate([np.repeat(i, counts[i]) for i in nonzero]) permuted = prng.permutation(expanded)[:n] subcounts = np.bincount(permuted, minlength=counts.size) return subcounts
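A usage sketch for `_subsample` above (rarefaction-style subsampling of a count vector); the counts and seed are illustrative, and `numpy` is assumed to be available as `np`.

import numpy as np

counts = np.array([4, 0, 3, 1])       # e.g. per-feature counts for one sample
sub = _subsample(counts, n=5, replace=False, seed=0)

assert sub.sum() == 5                 # exactly n counts kept
assert (sub <= counts).all()          # without replacement, never exceeds the original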
def mix_crop(s1, s2, max_nr_samples=40, random_state=None):
    assert len(s1) == len(s2)
    if random_state is not None:
        prng = RandomState(random_state)
        indices_mixed = prng.permutation(np.arange(0, len(s1)))
        s1 = s1[indices_mixed]
        s2 = s2[indices_mixed]
    max_nr_samples = max_nr_samples if max_nr_samples < len(s1) else len(s1)
    s1 = s1[0:max_nr_samples]
    s2 = s2[0:max_nr_samples]
    return s1, s2
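A quick usage sketch for `mix_crop` above; the arrays are synthetic, the `random_state` value is arbitrary, and `RandomState`/`np` are assumed to be imported at module level as the snippet expects.

import numpy as np

s1 = np.arange(10)
s2 = np.arange(10) * 10               # paired with s1 element-wise

a, b = mix_crop(s1, s2, max_nr_samples=4, random_state=7)

assert len(a) == len(b) == 4          # cropped to max_nr_samples
assert (b == a * 10).all()            # the same shuffle was applied to both arrays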
def _sample_next_goal_positions( self, random_state: RandomState) -> Tuple[np.ndarray, bool]: # Set all the goals to the initial rotation first. self.mujoco_simulation.set_target_quat(np.array([[1, 0, 0, 0]] * 8)) self.mujoco_simulation.forward() # Set position of blocks block_size = self.mujoco_simulation.simulation_params.object_size width, height, _ = self.mujoco_simulation.get_placement_area().size # Note that block_size and rel_w, rel_h are all half of the block size rel_w, rel_h = block_size / width, block_size / height # offset for making blocks to be attached to each other offset_w, offset_h = rel_w * 2, rel_h * 2 # Expected configuration # [ ][ ] # [ ][ ][ ][ ] # [ ][ ] block_config = random_state.permutation([ [offset_w, 0], [offset_w * 2, 0], [0, offset_h], [offset_w, offset_h], [offset_w * 2, offset_h], [offset_w * 3, offset_h], [offset_w, offset_h * 2], [offset_w * 2, offset_h * 2], ]) # Now randomly place the overall config in the placement area config_w, config_h = block_config.max(axis=0) margin_w, margin_h = 1.0 - config_w - rel_w, 1.0 - config_h - rel_h ori_x, ori_y = random_state.uniform(low=(rel_w, rel_h), high=(margin_w, margin_h)) # Randomize the position of the entire block configuration. block_config += np.array([[ori_x, ori_y]]) # Then place the objects as designed. return place_targets_with_fixed_position( self.mujoco_simulation.get_object_bounding_boxes(), self.mujoco_simulation.get_table_dimensions(), self.mujoco_simulation.get_placement_area(), block_config, )
def _iter_fast(self, ds, batch_size, start=None, end=None, shuffle=True, seed=None): # craete random seed prng1 = None prng2 = _dummy_shuffle if shuffle: if seed is None: seed = get_random_magic_seed() prng1 = RandomState(seed) prng2 = RandomState(seed) batches = create_batch(ds.shape[0], batch_size, start, end, prng1) prng2.shuffle(batches) for i, j in batches: data = ds[i:j] yield self._normalizer(data[prng2.permutation(data.shape[0])])
def generate_random_permutation_transform(cls, seed, nn, kk, atf=False): """ Returns an input transformation that uses k pseudorandomly generated permutations :param seed: int Seed for the pseudorandom generation :param nn: int Challenge length (must equal LTFArray.n) :param kk: int Number of permutations to be used (must equal LTFArray.k) :param atf: boolean Perform ATF transform after permuting :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n) A function that can perform the desired transformation. """ prng = RandomState(seed) permutations = [prng.permutation(nn) for _ in range(kk)] def transform(challenges, k): """ Method as described in generate_concatenated_transform doc string. :param challenges: array of shape(N,n) Array of challenges which should be evaluated by the simulation. :param k: int Number of LTFArray PUFs :return: A function: array of int with shape(N,n), int number of PUFs k -> shape(N,k,n) A function that can perform the desired transformation. """ (_, n) = challenges.shape assert k == kk and n == nn, \ 'Permutations Input Transform cannot be used for LTFArrays with size other than defined' sub_challenges = swapaxes( array([ challenges[:, permutations[i]] for i in range(kk) ]), 0, 1 ) if atf: # Perform atf transform sub_challenges = cls.att(sub_challenges) return sub_challenges transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed return transform
def create_dataset(opt, mode):
    convert = tnt.transform.compose([
        lambda x: x.astype(np.float32),
        lambda x: x / 255.0,
        # cvtransforms.Normalize([125.3, 123.0, 113.9], [63.0, 62.1, 66.7]),
        lambda x: x.transpose(2, 0, 1).astype(np.float32),
        torch.from_numpy,
    ])

    train_transform = tnt.transform.compose([
        cvtransforms.RandomHorizontalFlip(),
        cvtransforms.Pad(opt.randomcrop_pad, cv2.BORDER_REFLECT),
        cvtransforms.RandomCrop(32),
        convert,
    ])

    ds = getattr(datasets, opt.dataset)('.', train=mode, download=True)
    smode = 'train' if mode else 'test'
    if mode:
        from numpy.random import RandomState
        prng = RandomState(opt.seed)

        assert (opt.sampleSize % 10 == 0)
        # integer division so the slice bound stays an int under Python 3
        random_permute = prng.permutation(np.arange(0, 5000))[0:opt.sampleSize // 10]

        labels = np.array(getattr(ds, 'train_labels'))
        data = getattr(ds, 'train_data')

        classes = np.unique(labels)
        inds_all = np.array([], dtype='int32')
        for cl in classes:
            inds = np.where(np.array(labels) == cl)[0][random_permute]
            inds_all = np.r_[inds, inds_all]

        ds = tnt.dataset.TensorDataset([
            data[inds_all, :].transpose(0, 2, 3, 1),
            labels[inds_all].tolist()
        ])
    else:
        ds = tnt.dataset.TensorDataset([
            getattr(ds, smode + '_data').transpose(0, 2, 3, 1),
            getattr(ds, smode + '_labels')
        ])
    return ds.transform({0: train_transform if mode else convert})
def generate_random_permutation_transform(seed, nn, kk, atf=False): """ Returns an input transformation that uses k pseudorandomly generated permutations :param seed: Seed for the pseudorandom generation :param nn: challenge length (must equal LTFArray.n) :param kk: Number of permutations to be used (must equal LTFArray.k) :param atf: Perform ATF transform after permuting :return: The desired input transform """ r = RandomState(seed) permutations = [r.permutation(nn) for x in range(kk)] def transform(cs, k): (N, n) = cs.shape assert k == kk and n == nn, \ 'Permutations Input Transform cannot be used for LTFArrays with size other than defined' result = swapaxes( array([ cs[:, permutations[i]] for i in range(kk) ]), 0, 1 ) if atf: """ Perform atf transform """ result = transpose( array([ prod(result[:, :, i:], 2) for i in range(n) ]), (1, 2, 0) ) return result transform.__name__ = 'transform_permutations' + ('_plus_atf_' if atf else '') + '_%x' % seed return transform
def _find_fixed_permutations(cls, n, k): """ Finds permutations suitable to use in LTFArray.transform_fixed_permutation. Permutations are chosen such that no permutation has a fix point and no two permutations share at least one point. (See `permutation_okay` below.) Note that the run time of this method increases drastically with k. On an Intel i7, n=64, k=10 takes a couple of seconds. :return: list of seeds for `RandomState`. Obtain the permutation with `RandomState(seed).permutation(n)`. """ def permutation_okay(new_p, ps): # 1. check that p has no fix point if any([i == new_p[i] for i in range(len(new_p))]): return False # 2. check that it does not share a point if any old_p in ps: if any([ any([old_p[i] == new_p[i] for i in range(len(new_p))]) for old_p in ps ]): return False return True seed = 0xbad permutation_seeds = [] permutations = [] while len(permutations) < k: prng = RandomState(seed) p = prng.permutation(n) if permutation_okay(p, permutations): permutation_seeds.append(seed) permutations.append(p) seed += 1 return permutation_seeds
def score(self, prediction: Prediction, actual: DataTuple) -> float: """We add the ability to take the average of hsic score. As for larger datasets it will kill your machine """ preds = prediction.hard.to_numpy()[:, np.newaxis] s_cols = actual.s.columns sens_labels = np.array(actual.s[s_cols].to_numpy()) batchs_size = 5000 together = np.hstack((preds, sens_labels)).transpose() random = RandomState(seed=888) col_idx = random.permutation(together.shape[1]) together = np.take(together, col_idx, axis=1) prediction_shuffled = together[0] label_shuffled = together[1] num_batches_float = preds.shape[0] / batchs_size num_batches: int = int(math.ceil(num_batches_float)) batches = [] start = 0 for _ in range(num_batches): end = start + batchs_size preds_to_test = prediction_shuffled[start:end] labels_to_test = label_shuffled[start:end] batches.append(hsic(preds_to_test, labels_to_test, 0.7, 0.5)) start += batchs_size return np.mean(np.array(batches))
def main(infile: str, outfile: IO, **splitter_kw): """Script entry point.""" logging.basicConfig( format='[%(asctime)s] [%(levelname)s] %(name)s - %(message)s', level=logging.INFO) start = time.perf_counter() with h5py.File(infile, mode="r") as h5in: _LOGGER.info("Reading labels from %r...", infile) labels = (pd.DataFrame.from_records(np.asarray( h5in["labels"])).transform(decode_column)) _LOGGER.info("Creating splitter with args %s", splitter_kw) random_state = RandomState(splitter_kw.pop("seed")) splitter = Splitter(**splitter_kw, random_state=random_state) # type:ignore for indices in splitter.split(labels): permutation = random_state.permutation( len(indices.train) + len(indices.val)) json.dump( { "train": indices.train.tolist(), "val": indices.val.tolist(), "test": indices.test.tolist(), "train-val": np.concatenate( (indices.train, indices.val))[permutation].tolist() }, outfile, indent=None, separators=(",", ":")) outfile.write("\n") _LOGGER.info("Script complete in %.2fs", (time.perf_counter() - start))
def shuffled(random: RandomState, datasets: Sequence[xr.Dataset]) -> xr.Dataset: """ Shuffles dataset along the sample dimension within chunks if chunking is present. Datasets passed will be shuffled identically. Args: dim: dimension to shuffle indices along random: Initialized random number generator state used for shuffling datasets: input data to be shuffled, must contain identical dimensionality/coordinates if multiple datasets are given """ chunks_default = (len(datasets[0][SAMPLE_DIM_NAME]), ) chunks = datasets[0].chunks.get(SAMPLE_DIM_NAME, chunks_default) chunk_indices = _get_chunk_indices(chunks) shuffled_inds = np.concatenate( [random.permutation(indices) for indices in chunk_indices]) return [ dataset.isel({SAMPLE_DIM_NAME: shuffled_inds}) for dataset in datasets ]
def two_sample(x, y, reps=10**5, stat='mean', alternative="greater", keep_dist=False, interval=False, level=0.95, seed=None): """ One-sided or two-sided, two-sample permutation test for equality of two means, with p-value estimated by simulated random sampling with reps replications. Tests the hypothesis that x and y are a random partition of x,y against the alternative that x comes from a population with mean (a) greater than that of the population from which y comes, if side = 'greater' (b) less than that of the population from which y comes, if side = 'less' (c) different from that of the population from which y comes, if side = 'two-sided' If ``keep_dist``, return the distribution of values of the test statistic; otherwise, return only the number of permutations for which the value of the test statistic and p-value. Parameters ---------- x : array-like Sample 1 y : array-like Sample 2 reps : int number of repetitions stat : {'mean', 't'} The test statistic. (a) If stat == 'mean', the test statistic is (mean(x) - mean(y)) (equivalently, sum(x), since those are monotonically related) (b) If stat == 't', the test statistic is the two-sample t-statistic-- but the p-value is still estimated by the randomization, approximating the permutation distribution. The t-statistic is computed using scipy.stats.ttest_ind (c) FIXME: Explanation or example of how to pass in a function, instead of a str keep_dist : bool flag for whether to store and return the array of values of the irr test statistic interval : {'upper', 'lower', 'two-sided'} The type of confidence interval (a) If interval == 'upper', computes an upper confidence bound on the true p-value based on the simulations by inverting Binomial tests. (b) If interval == 'lower', computes a lower confidence bound on the true p-value based on the simulations by inverting Binomial tests. (c) If interval == 'two-sided', computes lower and upper confidence bounds on the true p-value based on the simulations by inverting Binomial tests. level : float in (0, 1) the confidence limit for the confidence bounds. Returns ------- float the estimated p-value float the test statistic tuple These values are only returned if `level` == True (a) confidence bound on p-value, if interval in {'lower','upper'} (b) [lower confidence bound, upper confidence bound], if interval == 'two-sided' """ prng = RandomState(seed) z = np.concatenate([x, y]) # pooled responses # FIXME: Type check: we may want to pass in a function for argument 'stat' # FIXME: If function, use that. Otherwise, look in the dictionary stats = { 'mean': lambda u: np.mean(u[:len(x)]) - np.mean(u[len(x):]), 't': lambda u: ttest_ind( u[:len(y)], u[len(y):], equal_var=True)[0] } tst_fun = stats[stat] theStat = { 'greater': tst_fun, 'less': lambda u: -tst_fun(u), 'two-sided': lambda u: math.fabs(tst_fun(u)) } tst = theStat[alternative](z) if keep_dist: dist = [] for i in range(reps): dist.append( theStat[alternative](prng.permutation(z)) ) hits = np.sum(dist >= tst) if interval in ["upper", "lower", "two-sided"]: return (hits/reps, tst, binom_conf_interval(reps, hits, level, alternative), dist) else: return hits/reps, tst, dist else: hits = np.sum([(theStat[alternative](prng.permutation(z)) >= tst) for i in range(reps)]) if interval in ["upper", "lower", "two-sided"]: return (hits/reps, tst, binom_conf_interval(reps, hits, level, alternative)) else: return hits/reps, tst
class Relationship(object): def __init__(self, seed): self.seed = seed self.state = RandomState(self.seed) self.grouped = {} self.ops = self.RelationshipOps(self) def add_relations(self, from_ids, to_ids, weights=1): """ Add relations to this Relationships from from_ids, to_ids, weights """ self.grouped = utils.merge_2_dicts( self.grouped, Relations.from_tuples(from_ids, to_ids, weights), lambda r1, r2: r1.plus(r2)) def add_grouped_relations(self, from_ids, grouped_ids): """ Add "bulk" relationship, i.e. many "to" sides for each "from" side at once. :param from_ids: list of "from" sides of the relationships to add :param grouped_ids: list of list of "to" sides of the relationships to add Note: we assume all weights are 1 for this use (for now """ for one_from, many_tos in zip(from_ids, grouped_ids): rels = pd.DataFrame({"from": one_from, "to": many_tos}) self.add_relations(from_ids=rels["from"], to_ids=rels["to"]) def remove_relations(self, from_ids, to_ids): """ Removes all relations between those from_ids and to_ids pairs (not combinatory: if each list is 10 elements, we removed 10 pairs). If the same relation was stored several times between two ids, this removes them all """ self.grouped = utils.merge_2_dicts( self.grouped, Relations.from_tuples(from_ids, to_ids, weights=0), lambda r1, r2: r1.minus(r2)) def get_relations(self, from_ids=None): """ This returns, as a dataframe, the sub-set of the relationships whose "from" is part of specified "from_ids". If no from_ids is provided, this just returns all the relations. """ _from_ids = set(self.grouped.keys()) if from_ids is None else from_ids def _rel_arrays(): for gid in set(_from_ids): if gid in self.grouped.keys(): relations = self.grouped[gid] yield np.array([ np.array([gid] * relations.to_ids.shape[0]), relations.to_ids, relations.weights ]) rel_arrays = list(_rel_arrays()) if len(rel_arrays) == 0: return pd.DataFrame(columns=["from", "to", "weight"]) else: df = pd.DataFrame(np.hstack(rel_arrays).T, columns=["from", "to", "weight"]) df["weight"] = df["weight"].astype(float) return df def get_neighbourhood_size(self, from_ids): """ return a series indexed by "from" containing the number of "tos" for each requested from. """ def size(from_id): if from_id in self.grouped: return len(self.grouped[from_id]) else: return 0 return pd.Series({from_id: size(from_id) for from_id in from_ids}) def unique_tos(self): """ :return: the set of unique "to" parts throughout all relationships """ return { to for relations in self.grouped.values() for to in relations.to_ids } def select_one(self, from_ids=None, named_as="to", remove_selected=False, discard_empty=True, one_to_one=False, overridden_to_weights=None): """ Randomly selects one "to" part for each specified id in from_ids. An id can be specified several times in that list, in which case we simply do a selection several times. The result is aligned with from_ids by index. i.e. the row in the return value that has the same pandas index than a rom in from_ids is the selection for that row. The selection in the resulting dataframe will by default be named "to", unless this is overridden by "named_as". If remove_selected is True, the selected relations are removed from the relationship. This is handy to model stocks or any container of things. If discard_empty is True, all specified from_ids will be present in the result, even if no relation is available for them or if some selection were dropped due to one-to-one config. 
If one_to_one is True, the selection is an injective function, i.e each to_ids will at most be picked once. overridden_to_weights is an optional dictionary of {"to": weight} that can be used to override the default weights contained in this Relationship. """ if overridden_to_weights is not None: missing_keys = self.unique_tos() - set( overridden_to_weights.keys().values) assert len(missing_keys) == 0, \ "overridden_to_weights is missing those 'to' keys: {}".format( missing_keys) if from_ids is None: _from_ids = pd.Series(list(self.grouped.keys())) elif type(from_ids) == list: _from_ids = pd.Series(from_ids) else: _from_ids = from_ids def _results(): # req_index is the technical index of the table built by the Story, # => must be respect to join correctly the result of the select_one for req_index, from_id in zip(_from_ids.index, _from_ids): if from_id in self.grouped: idx, picked = self.grouped[from_id].pick_one( self.state, overridden_to_weights) if picked is None: if discard_empty: continue else: yield req_index, from_id, -1, None else: yield req_index, from_id, idx, picked elif not discard_empty: yield req_index, from_id, -1, None output = list(zip(*_results())) if len(output) == 0: return pd.DataFrame(columns=["from", named_as]) request_index, from_id, rel_idx, chosen_tos = output output = pd.DataFrame( { named_as: list(chosen_tos), "idx": list(rel_idx), "from": from_id }, index=request_index) if one_to_one and output.shape[0] > 0: # not de-duplicating the blank results blank_idx = output[named_as].isna() blanks, present = output[blank_idx], output[~blank_idx] present = present.loc[self.state.permutation(present.index)] present.drop_duplicates(subset=named_as, keep="first", inplace=True) output = pd.concat([present, blanks]) if remove_selected: # we have to remove all the relations of each from in one go since # no injective selection might have the same index several times g = output[output["idx"] != -1][["from", "idx"]].groupby(by="from") for from_id in g.groups: group = self.grouped[from_id] removed_idx = g.get_group(from_id)["idx"] group.remove_inplace(removed_idx) if len(group) == 0: del self.grouped[from_id] output.drop(["idx"], axis=1, inplace=True) return output def select_all_horizontal(self, from_ids, named_as="to"): """ Return all the "to" sides starting from each "from", as an "horizontal" list, i.e. each "from" is on one row and the set of all "to" are all on that row, in one list. Any requested from_id that has no relationship is absent is the returned dataframe (=> the corresponding rows are dropped in the result) """ rows = self.get_relations(from_ids) groups = rows.set_index("to", drop=True).groupby("from", sort=False) df = pd.DataFrame(data=list(groups.groups.items()), columns=["from", named_as]) df[named_as] = df[named_as].apply(lambda s: [el for el in s]) return df def select_many(self, from_ids, named_as, quantities, remove_selected=False, discard_empty=True): """ The result is returned in vertical format and index by the values of the index of from_ids. Since we select several values, we return several lines per index value of from_id => during the subsequent join by the Operation, the number of produced rows increases. """ req = pd.DataFrame({"from": from_ids, "qties": quantities}) req["qties"] = req["qties"].astype(np.int) # gathers all requests to the same "from" together, keeping track of # the "request index" in the original from_ids so we can merge it later def gather(df): # shuffles that set of request s.t. 
in case of capping not the same # from_id get "capped" all time df2 = df.loc[self.state.permutation(df.index)] return pd.Series({ "quantities": df2["qties"].tolist(), "req_index": df2.index.tolist() }) # the same "from" can be requested several times all_reqs = req.groupby("from", sort=False).apply(gather) def _all_picks_results(): for _, row in all_reqs.iterrows(): from_id = row.name if from_id in self.grouped: relations = self.grouped[from_id] quantities = utils.cap_to_total(row["quantities"], len(relations)) # rel_idx is the index of the picked values within the grouped values (i.e. for one from_id) rel_idx, rel_tos = relations.pick_many( self.state, np.sum(quantities)) # prepares the indices of the resulting vertical format, as a sequence # of index interval where to inject the picked values to_idx = np.cumsum(quantities).tolist() from_idx = [0] + to_idx[:-1] idx_intervals = [(lb, ub) for lb, ub in zip(from_idx, to_idx)] def _one_pick_result(): for ((lower_bound, upper_bound), req_index) in zip(idx_intervals, row["req_index"]): size = upper_bound - lower_bound if size == 0: continue yield [ req_index, from_id, rel_tos[lower_bound:upper_bound], rel_idx[lower_bound:upper_bound], ] yield list(_one_pick_result()) all_picks_results = list(_all_picks_results()) if len(all_picks_results) > 0: output = pd.DataFrame( data=functools.reduce(lambda l1, l2: l1 + l2, all_picks_results), columns=["req_idx", "from", named_as, "rel_idx"]) if remove_selected: # remove all the relations of each from in one go since # no injective selection might have the same index several times g = output[output["rel_idx"] != -1][["from", "rel_idx" ]].groupby(by="from") for from_id in g.groups: group = self.grouped[from_id] removed_idx = g.get_group(from_id)["rel_idx"].values[0] group.remove_inplace(removed_idx) if len(group) == 0: del self.grouped[from_id] else: output = pd.DataFrame( columns=["req_idx", "from", named_as, "rel_idx"]) output.set_index("req_idx", drop=True, inplace=True) output.drop(["rel_idx", "from"], axis=1, inplace=True) # "discard_empty" option: return empty result (instead of nothing) for # any non existing (i.e. 
empty) "from" relation if not discard_empty and output.shape[0] != len(from_ids): missing_index = from_ids.index.difference(output.index) missing_values = pd.DataFrame({ named_as: pd.Series([[] * missing_index.shape[0]], index=missing_index) }) output = pd.concat([output, missing_values], copy=False) return output ###################### # IO # ###################### def save_to(self, file_path): """ Saves all the relationship as well as the current status of the seed as a CSV file """ logging.info("saving relationship to {}".format(file_path)) # creating a vertical dataframe to store the inner table saved_df = pd.DataFrame(self.get_relations().stack(), columns=["value"]) # we also want to save the seed => added an index level to separate # self._table from self.seed in the end result saved_df["param"] = "relations" saved_df = saved_df.set_index("param", append=True) saved_df.index = saved_df.index.reorder_levels([2, 0, 1]) # then finally added the seed saved_df.loc[("seed", 0, 0)] = self.seed saved_df.to_csv(file_path) @staticmethod def load_from(file_path): logging.info("loading relationship from {}".format(file_path)) saved_df = pd.read_csv(file_path, index_col=[0, 1, 2]) seed = int(saved_df.loc["seed"].values[0][0]) _all = slice(None) relations = saved_df.loc[("relations", _all, _all)].unstack() relations.index = relations.index.droplevel(0) relations.columns = relations.columns.droplevel(0) relationship = Relationship(seed) relationship.add_relations( from_ids=relations["from"].values, to_ids=relations["to"].values, weights=relations["weight"].values.astype(float)) return relationship class RelationshipOps(object): def __init__(self, relationship): self.relationship = relationship class AddNeighbourhoodSize(AddColumns): def __init__(self, relationship, from_field, named_as): AddColumns.__init__(self) self.relationship = relationship self.from_field = from_field self.named_as = named_as def build_output(self, story_data): requested_froms = story_data[self.from_field] sizes = self.relationship.get_neighbourhood_size( from_ids=requested_froms) return pd.DataFrame( {self.named_as: requested_froms.map(sizes).astype(int)}) def get_neighbourhood_size(self, from_field, named_as): return self.AddNeighbourhoodSize(self.relationship, from_field, named_as) class SelectOne(AddColumns): """ """ def __init__(self, relationship, from_field, named_as, one_to_one, pop, discard_missing, weight): # inner join instead of default left to allow dropping rows # in case of duplicates and one-to-one AddColumns.__init__(self, join_kind="inner") self.relationship = relationship self.from_field = from_field self.named_as = named_as self.one_to_one = one_to_one self.pop = pop self.discard_missing = discard_missing self.weight = weight def build_output(self, story_data): selected = self.relationship.select_one( from_ids=story_data[self.from_field], named_as=self.named_as, remove_selected=self.pop, one_to_one=self.one_to_one, discard_empty=self.discard_missing, overridden_to_weights=self.weight) selected.drop("from", axis=1, inplace=True) return selected def select_one(self, from_field, named_as, one_to_one=False, pop=False, discard_empty=False, weight=None): """ :param from_field: field corresponding to the "from" side of the relationship :param named_as: field name assigned to the selected "to" side of the relationship :param one_to_one: boolean indicating that any "to" value will be selected at most once :param pop: if True, the selected relation is removed :param discard_empty: if False, any non-existing "from" 
in the relationship yields a None in the resulting selection. If true, that row is removed from the story_data. :param weight: weight to use for the "to" side of the relationship. Must be a Series whose index are the "to" values. Typical usage would be to plug an attribute of the "to" population here. :return: this operation adds a single column corresponding to a random choice from a Relationship """ return self.SelectOne(self.relationship, from_field, named_as, one_to_one, pop, discard_empty, weight) class SelectAll(Operation): def __init__(self, relationship, from_field, named_as): self.relationship = relationship self.from_field = from_field self.named_as = named_as def transform(self, story_data): from_ids = story_data[[self.from_field]].drop_duplicates() selected = self.relationship.select_all_horizontal( from_ids=from_ids[self.from_field].values, named_as=self.named_as) selected.set_index("from", drop=True, inplace=True) return pd.merge(left=story_data, right=selected, left_on=self.from_field, right_index=True) def select_all(self, from_field, named_as): """ This simply creates a new story_data field containing all the "to" values of the requested from, as a set. """ return self.SelectAll(self.relationship, from_field, named_as) class SelectMany(AddColumns): """ """ def __init__(self, relationship, from_field, named_as, quantity_field, pop, discard_missing): # inner join instead of default left to allow dropping rows # in case of duplicates and one-to-one AddColumns.__init__(self, join_kind="inner") self.relationship = relationship self.discard_missing = discard_missing self.from_field = from_field self.named_as = named_as self.quantity_field = quantity_field self.pop = pop def build_output(self, story_data): selected = self.relationship.select_many( from_ids=story_data[self.from_field], named_as=self.named_as, quantities=story_data[self.quantity_field], remove_selected=self.pop, discard_empty=self.discard_missing) return selected def select_many(self, from_field, named_as, quantity_field, pop=False, discard_missing=True): return self.SelectMany(self.relationship, from_field, named_as, quantity_field, pop, discard_missing) class Add(SideEffectOnly): def __init__(self, relationship, from_field, item_field): self.relationship = relationship self.from_field = from_field self.item_field = item_field def side_effect(self, story_data): if story_data.shape[0] > 0: self.relationship.add_relations( from_ids=story_data[self.from_field], to_ids=story_data[self.item_field]) def add(self, from_field, item_field): return self.Add(self.relationship, from_field, item_field) class AddGrouped(SideEffectOnly): def __init__(self, relationship, from_field, grouped_items_field): self.relationship = relationship self.from_field = from_field self.grouped_items_field = grouped_items_field def side_effect(self, story_data): if story_data.shape[0] > 0: self.relationship.add_grouped_relations( from_ids=story_data[self.from_field], grouped_ids=story_data[self.grouped_items_field]) def add_grouped(self, from_field, grouped_items_field): """ this is similar to add, execept that the "to" field should here contain lists of "to" values instead of single ones """ return self.AddGrouped(self.relationship, from_field, grouped_items_field) class Remove(SideEffectOnly): def __init__(self, relationship, from_field, item_field): self.relationship = relationship self.from_field = from_field self.item_field = item_field def side_effect(self, story_data): if story_data.shape[0] > 0: self.relationship.remove( 
from_ids=story_data[self.from_field], to_ids=story_data[self.item_field]) def remove(self, from_field, item_field): return self.Remove(self.relationship, from_field, item_field)
class Configurator: '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def __init__(self, NU_TRNG_DATA, NU_TRNG_LABS, verbose = False, NU_CFG_PATH = None): # DATA and LAB MEMBERS # Should be your training data and corresponding labels self.TRNG_DATA = NU_TRNG_DATA self.TRNG_LABS = NU_TRNG_LABS # Just in case you want to use a special Config file if NU_CFG_PATH is None: self.CFG_PATH = DEF_CFG_PATH else: self.CFG_PATH = NU_CFG_PATH # Different Containers for Data self.CFG = {'sections' : ['STACK', 'LAYER', 'SMAX_TUNE', 'MET_TUNE', 'LOG_TUNE', 'CPP_LIBRARY','FILEIO', 'SYSTEM' ]} self.CFG_STACK = [] # RandomState Object self.RANDO = RS() # Congfig parser self.CP = CFPR(allow_no_value=True) self.TLAY = None self.CHECKS = {'cfg_loaded' : False, 'lay_obj_initd' : False, 'verbose' : verbose} self.SWARM_SIZE = 5 self.ALL_CHECKS = self.CHECKS.keys() self.FLOAT_STEP = .00001 self.INT_STEP = 1 self.GIVE_UP_SCALE = np.linspace(-100,100,5000) self.PATIENT_LEVEL = 2500 self.SCORE_STACK = [999999.] self.START_TIME = time.time() self.STOP_AT_HOUR = 1.0 '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def CFGControl(self): for TC in [self.isPatient(), not self.isStopTime()]: print TC yield TC '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def GetPatiencesLevel(self): return self.GIVE_UP_SCALE[self.PATIENT_LEVEL] '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def isPatient(self): return self.GIVE_UP_SCALE[np.minimum(self.PATIENT_LEVEL, len(self.GIVE_UP_SCALE))] > -50. '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def TimeElapsed(self): return ((time.time() - self.START_TIME)/360./60.) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def isStopTime(self): return self.STOP_AT_HOUR < self.TimeElapsed() '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def isStackEmpty(self, STACK): return len(STACK) == 0 '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def PatienceUp(self): self.PATIENT_LEVEL+=1 '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def PatienceDown(self): self.PATIENT_LEVEL-=1 '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def BuzzEm(self, VAL, BUZZ): if BUZZ in GenInts(): NU_INT = self.BuzzInt(VAL) return NU_INT if NU_INT>0 else VAL else: NU_FLOAT = self.BuzzFloat(VAL) return NU_FLOAT if NU_FLOAT>0 else VAL '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def BuzzInt(self, VAL): RANGE = [VAL-self.INT_STEP, VAL+self.INT_STEP] SEQ = np.arange(RANGE[0], RANGE[1]+1,self.INT_STEP) NEW_INT, THROW_AWAY = SEQ[0],SEQ[1:] return NEW_INT '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def BuzzFloat(self, VAL): RANGE = [VAL-self.FLOAT_STEP, VAL+self.FLOAT_STEP] SEQ = np.arange(RANGE[0], RANGE[1]+1,self.FLOAT_STEP) NEW_FLOAT, THROW_AWAY = SEQ[0],SEQ[1:] return NEW_FLOAT '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def LoadAndConfigure(self, CFG_PATH=None): if CFG_PATH is not None: self.CFG_PATH = CFG_PATH self.LoadConfig() self.Configure() '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def Configure(self, CFG_PATH = None): self.CFG_STACK.insert(0, self.CFG) LAY = Layer(self.CFG['LAYER']) while all([C for C in self.CFGControl()]): LAY.ClearLayerParams() INT_IDX = [[i,j] for i,j in GenInts()] FLOAT_IDX =[[i,j] for i,j in GenFloats()] ALL_BUZZERS = self.RANDO.permutation([i for i in INT_IDX]+[f for f in FLOAT_IDX]) SPLIT_AT = np.minimum(self.SWARM_SIZE, ALL_BUZZERS.size) BUZZERS = 
ALL_BUZZERS[:SPLIT_AT] REJECTS = ALL_BUZZERS[SPLIT_AT+1:] for [SECT, PARAM] in BUZZERS: OLD_VAL = self.CFG[SECT][PARAM] self.CFG[SECT][PARAM] = self.BuzzEm(OLD_VAL, [SECT, PARAM]) for [SECT,PARAM] in REJECTS: self.CFG[SECT][PARAM] = self.CFG[SECT][PARAM] print self.CFG.keys() LAY.SetNewParams(self.CFG) RESULT = self.LogTrain(LAY) if RESULT['fun'][0][-1] <= self.SCORE_STACK[0]: self.CFG_STACK.insert(0, self.CFG) self.PatienceUp() else: if self.isStackEmpty(self.CFG_STACK): self.SWARM_SIZE += np.ceil(self.SWARM_SIZE/2.) self.FLOAT_STEP = .00002 self.INT_STEP = 2 self.PatienceDown() else: self.CFG_STACK.pop(0) self.PatienceDown() '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def GetCFG(self): return self.CFG '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def DeepCopy2CFG(self, NU_CFG): self.CFG = {SECT : {PARAM : NU_CFG[SECT][PARAM] for PARAM in GenParams(NU_CFG) } for SECT in GenSects(NU_CFG)} '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def DeepCopyCFG2NU(self): return {SECT : {PARAM : self.CFG[SECT][PARAM] for PARAM in GenParams(self.CFG) } for SECT in GenSects(self.CFG)} '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def PushBehave(self): self.BHAVE_STACK.insert(0, {}) self.BHAVE_STACK[0].update(self.CURR_BEHAVIOR) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def PushConfig(self, LATEST_CFG): self.CFG_STACK.insert(0, {}) for SECT, PARM in GenSectsAndParams(LATEST_CFG): self.CFG_STACK[0][SECT][PARM] = LATEST_CFG[SECT][PARM] self.NUM_CFG_STACKED+=1 '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def PopConfig(self): if self.NUM_CFG_STACKED > 0: NU_CFG = {SECT : {PARM : self.CFG_STACK[0][SECT][PARM] for PARM in GenParams(self.CFG_STACK[0][SECT]) } for SECT in GenSects(self.CFG_STACK[0])} self.CFG_STACK.pop(0) self.SCORE_STACK.pop(0) self.NUM_CFG_STACKED -= 1 return NU_CFG else: raise IndexError, "None more configurations to pop off stack." 
'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def LoadConfig(self): ''' DESCRPT: A monster, thankfully this is the only call to the config fil of all the other classes IN ARGS: PATH ; strings : config file location NOTES: ''' try: self.CP.read(self.CFG_PATH) except: print "Config Name" raise IOError, 'Config file wasn\'t able to be read' GENERIC_DICT = {} for SECT in self.CP.sections(): GENERIC_DICT[SECT] = {} for OPTS in self.CP.options(SECT): val = self.CP.get(SECT, OPTS) if val == 'True': GENERIC_DICT[SECT][OPTS] = True elif val == 'False': print 'haeeyy' GENERIC_DICT[SECT][OPTS] = False else: try: GENERIC_DICT[SECT][OPTS] = int(val) except: try: GENERIC_DICT[SECT][OPTS] = float(val) except: try: GENERIC_DICT[SECT][OPTS] = val except: GENERIC_DICT[SECT][OPTS] = None for key in GenSects(GENERIC_DICT): if key not in self.CFG['sections']: self.CFG['sections'].append(key) for key in self.CFG['sections']: self.CFG[key] = GENERIC_DICT[key] self.CFG['LAYER']['disp'] = self.CHECKS['verbose'] self.CHECKS['config_loaded'] = True self.CHECKS['lee_wants_rand_off'] = self.CFG['STACK']['lee_wants_rand_off'] if self.CFG['STACK']['lee_wants_rand_off']: CONFIG_SEED = self.CFG['STACK']['rand_seed_32bit'] self.RANDO.seed(seed=CONFIG_SEED) else: self.RANDO.seed(seed=np.int32(time.time())) self.CHECKS['cfg_loaded'] = True '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def GetChecks(self, CHECK_NAME): if self.CHECKS.has_key(CHECK_NAME): return self.CHECKS[CHECK_NAME] else: raise Warning, "Thats not a valid Check" '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def UpdateConfig(self, SECTION=None, OPTION=None): ''' DESCRPT: PREVIOUSLY: Class Parameters were loading with-in themselves and were frustratingly difficult to interact with from out side. Plus they took up a lot of space LATER: I implement parameter dictionaries for individual class that would be passed back-and-forth from StackSAE to an instantiating class Not bad overall just tedious NOW: ONE dictionary contains all the parameters and behaves exacly like the old way but with way less tedium NOTE: This function backs up the latest version of the config file before writing the updated one. 
The back up as time stamped ''' import shutil # This keeps track of the last config file backed up and is a parameter in # FILEIO LAST_CONFIG_UPDATE_PATH = os.getcwd() self.CFG['FILEIO']['last_config_backup'] = LAST_CONFIG_UPDATE_PATH + '_config_bu.ini' shutil.copy2(os.getcwd() + '/config.ini', LAST_CONFIG_UPDATE_PATH) # The config parser instantiated with SSAE is updated if SECTION is not None: sect = list(SECTION) else: sect = self.CFG['sections'] if OPTION is not None: opt = OPTION self.CP.set(sect, opt, self.CFG[sect][opt]) else: for s in sect: for opt in list(self.CFG[s]): self.CP.set(s, opt, str(self.CFG[s][opt])) # new config file is now in the cwd with open('config.ini', 'w') as write_config: self.CP.write(write_config) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def ProcessResult(self, RES): self.IntakeNuResults() for ATT in RES.keys(): if ATT in self.PREV_RESULT.keys(): self.CURR_RESULT[ATT] = RES.get(ATT) self.TLAY.ClearLayerParams() '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def StopTrain(self): self.ChangePhase('stop') self.ProcessResult({'phase_name': self.CURR_PHASE}) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def LogTrain(self, LAY): IN_SHAPE = self.TRNG_DATA.shape NUM_HIDD = self.CFG['STACK']['num_hidden'] MIN_HIDD = self.CFG['STACK']['min_hidden'] MAX_LAYER = self.CFG['STACK']['max_layer'] DEC_HIDD_BY = self.CFG['STACK']['decrement_num_hidden'] BASE_NOISE = self.CFG['STACK']['base_noise_level'] OUT_SHAPE = (IN_SHAPE[0], NUM_HIDD) [WIN, WOUT, BIN, BOUT, SHAPES] = LAY.CreateLogLayer(IN_SHAPE, NUM_HIDD, self.RANDO) THETA = LAY.TrainSparseAE(WIN, WOUT, BIN, BOUT, self.TRNG_DATA) return {'success': THETA.success, 'message': THETA.message, 'fun' : THETA.fun, 'nfev' : THETA.nfev, 'nit' : THETA.nit } '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def FlattenData(self, DATA): N,M = DATA.shape self.TRNG_DATA = DATA.reshape(N*M) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def SetTrainingData(self, DATA): if DATA.ndim > 1: self.FlattenData(DATA) else: self.TRNG_DATA = DATA '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def SetTrainingLabs(self, LABS): self.TRNG_LABS = LABS '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def ChangePhase(self, NU_PHASE): if NU_PHASE in self.ALLOWED_PHASE: self.CURR_PHASE = NU_PHASE else: raise ValueError, 'Non allowable phase passed' '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def BuildNewLay_CurrParam(self, PHASE = None): if PHASE is not None and self.CURR_PHASE != PHASE: self.ChangePhase(PHASE) self.TEST_LAY = Layer(self.CFG['LAYER']) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def BuildNewLay_NuParam(self, PHASE = None, **NU_LAY_PARAM): if PHASE is not None and self.CURR_PHASE != PHASE: self.ChangePhase(PHASE) self.TEST_LAY = Layer(MergeNu2Old(self.CFG, NU_LAY_PARAM)) '''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@''' def CFGStackIsEmpty(self): return len(self.CFG_STACK) == 0
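The Configure loop above perturbs a random subset of configuration parameters each round: it permutes every (section, parameter) index, keeps the first SWARM_SIZE entries as the "buzzers" to nudge, and leaves the rest alone (note that `REJECTS = ALL_BUZZERS[SPLIT_AT+1:]` appears to drop one entry; slicing from SPLIT_AT would keep them all). Below is a minimal standalone sketch of that selection step, using a hypothetical flat config dict instead of the class's CFG/GenInts/GenFloats machinery.

import numpy as np
from numpy.random import RandomState

# Hypothetical config: numeric (section, parameter) values only.
cfg = {'LAYER': {'num_hidden': 64, 'sparsity': 0.05},
       'STACK': {'max_layer': 3, 'base_noise_level': 0.1}}

rng = RandomState(0)
swarm_size = 2

# Shuffle all (section, parameter) pairs and split them into the subset
# that gets perturbed this round and the subset left untouched.
all_pairs = [(s, p) for s in cfg for p in cfg[s]]
order = rng.permutation(len(all_pairs))
buzzers = [all_pairs[i] for i in order[:swarm_size]]
rejects = [all_pairs[i] for i in order[swarm_size:]]

# Nudge each selected parameter by one step, in the spirit of BuzzInt/BuzzFloat.
for sect, param in buzzers:
    val = cfg[sect][param]
    step = 1 if isinstance(val, int) else 1e-5
    cfg[sect][param] = val + rng.choice([-step, step])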
import numpy as np
from numpy.random import RandomState
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

RNG = RandomState(21)

# Construct an example dataset for binary classification
n_vars = 2
n_events = 10000
signal = RNG.multivariate_normal(
    np.ones(n_vars), np.diag(np.ones(n_vars)), n_events)
background = RNG.multivariate_normal(
    np.ones(n_vars) * -1, np.diag(np.ones(n_vars)), n_events)
X = np.concatenate([signal, background])
y = np.ones(X.shape[0])
w = RNG.randint(1, 10, n_events * 2)
y[signal.shape[0]:] *= -1

# Shuffle the events, keeping features, labels and weights aligned
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]
w = w[permute]

# Use the whole dataset for training
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use AdaBoost Decision Tree
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=int(0.05 * len(X_train)))
bdt = AdaBoostClassifier(dt, algorithm='SAMME',
                         n_estimators=800, learning_rate=0.5)

# Train BDT
bdt.fit(X_train, y_train, sample_weight=w_train)
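The snippet stops right after fitting, so here is a short follow-up showing one way to score the trained bdt; it assumes the variables from the block above (bdt, X_train, y_train) and uses scikit-learn's decision_function and roc_auc_score.

from sklearn.metrics import roc_auc_score

# Ensemble scores: larger values mean "more signal-like".
scores = bdt.decision_function(X_train)

# AUC computed on the training sample is optimistic; a held-out split or
# cross-validation would be the usual choice for a real evaluation.
print('train AUC: %.3f' % roc_auc_score(y_train, scores))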
class CryoDataset: def __init__(self,imgstack,ctfstack): self.imgstack = imgstack self.ctfstack = ctfstack assert self.imgstack.get_num_images() == self.ctfstack.get_num_images() self.N = self.imgstack.get_num_pixels() self.pixel_size = self.imgstack.get_pixel_size() def compute_noise_statistics(self): self.mleDC_est = self.estimate_dc() self.noise_var = self.imgstack.estimate_noise_variance() self.data_var = self.imgstack.compute_variance() print 'Dataset noise profile' print ' Noise: {0:.3g}'.format(n.sqrt(self.noise_var)) print ' Data: {0:.3g}'.format(n.sqrt(self.data_var)) assert self.data_var > self.noise_var self.signal_var = self.data_var - self.noise_var print ' Signal: {0:.3g}'.format(n.sqrt(self.signal_var)) print ' Signal-to-Noise Ratio: {0:.1f}% ({1:.1f}dB)'.format(100*self.signal_var/self.noise_var, 10*n.log10(self.signal_var/self.noise_var)) def normalize_dataset(self): self.imgstack.scale_images(1.0/n.sqrt(self.noise_var)) self.ctfstack.scale_ctfs(1.0/n.sqrt(self.noise_var)) self.data_var = self.data_var/self.noise_var self.signal_var = self.signal_var/self.noise_var self.noise_var = 1.0 def divide_dataset(self,minibatch_size,testset_size,partition,num_partitions,seed): self.rand = RandomState(seed) self.N_D = self.imgstack.get_num_images() self.idxs = self.rand.permutation(self.N_D) print "Dividing dataset of {0} images with minisize of {1}".format(self.N_D,minibatch_size) if testset_size != None: print " Test Images: {0}".format(testset_size) self.test_idxs = self.idxs[0:testset_size] self.train_idxs = self.idxs[testset_size:] else: self.train_idxs = self.idxs self.test_idxs = [] if num_partitions > 1: print " Partition: {0} of {1}".format(partition+1,num_partitions) N_D = len(self.train_idxs) partSz = N_D/num_partitions self.train_idxs = self.train_idxs[partition*partSz:(partition+1)*partSz] self.N_D_Test = len(self.test_idxs) self.N_D_Train = len(self.train_idxs) numBatches = int(n.floor(float(self.N_D_Train)/minibatch_size)) real_minisize = int(n.floor(float(self.N_D_Train)/numBatches)) N_Rem = self.N_D_Train - real_minisize*numBatches numRegBatches = numBatches - N_Rem batchInds = [ (real_minisize*i, real_minisize*(i+1)) \ for i in xrange(numRegBatches) ] + \ [ (real_minisize*numRegBatches + (real_minisize+1)*i, min(real_minisize*numRegBatches + (real_minisize+1)*(i+1),self.N_D_Train)) \ for i in xrange(N_Rem) ] self.batch_idxs = n.array(batchInds) self.N_batches = self.batch_idxs.shape[0] self.batch_order = self.rand.permutation(self.N_batches) batch_sizes = self.batch_idxs[:,1] - self.batch_idxs[:,0] print " Train Images: {0}".format(self.N_D_Train) print " Minibatches: {0}".format(self.N_batches) print " Batch Size Range: {0} - {1}".format(batch_sizes.min(),batch_sizes.max()) self.minibatch_size = minibatch_size self.testset_size = testset_size self.partition = partition self.num_partitions = num_partitions self.reset_minibatches(True) def get_dc_estimate(self): return self.mleDC_est def estimate_dc(self,esttype='robust'): N = self.N obs = [] ctf_dcs = {} zeros = n.zeros((1,2)) for img_i,img in enumerate(self.imgstack): ctf_i = self.ctfstack.get_ctf_idx_for_image(img_i) if ctf_i not in ctf_dcs: ctf_dcs[ctf_i] = self.ctfstack.get_ctf(ctf_i).compute(zeros) obs.append(n.mean(img) * n.sqrt(float(N)) / ctf_dcs[ctf_i]) obs = n.array(obs) mleDC, mleDC_std = estimate_mean_std(obs, esttype) mleDC_est_std = mleDC_std / n.sqrt(len(obs)) return mleDC, mleDC_std, mleDC_est_std def set_datasign(self,datasign): mleDC, _, mleDC_est_std = self.get_dc_estimate() datasign_est = 1 if 
mleDC > 2*mleDC_est_std else -1 if mleDC < -2*mleDC_est_std else 0 print "Estimated DC Component: {0:.3g} +/- {1:.3g}".format(mleDC,mleDC_est_std) if datasign == 'auto': if datasign_est == 0: print " WARNING: estimated DC component has large variance, detected sign could be wrong." datasign = n.sign(mleDC) else: datasign = datasign_est else: if datasign_est*datasign < 0: print " WARNING: estimated DC component and specified datasign disagree; be sure this is correct!" if datasign != 1: print " Using negative datasign" assert datasign == -1 self.ctfstack.flip_datasign() else: print " Using positive datasign" assert datasign == 1 def reset_minibatches(self,epochReset=True): self.curr_batch = None self.epoch_frac = 0 if epochReset: self.epoch = 0 self.data_visits = 0 def get_testbatch(self): miniidx = self.test_idxs ret = {'img_idxs':miniidx, 'ctf_idxs':self.ctfstack.get_ctf_idx_for_image(miniidx), 'N_M':len(miniidx), 'test_batch':True} return ret def get_next_minibatch(self,shuffle_minibatches): if self.curr_batch == None: self.curr_batch = 1 batchInd = 0 newepoch = False else: batchInd = self.curr_batch self.curr_batch = (self.curr_batch+1)%self.N_batches newepoch = batchInd == 0 if newepoch: if shuffle_minibatches: self.batch_order = self.rand.permutation(self.N_batches) self.epoch = self.epoch + 1 self.epoch_frac = 0 batch_id = self.batch_order[batchInd] startI = self.batch_idxs[batch_id,0] endI = self.batch_idxs[batch_id,1] miniidx = self.train_idxs[startI:endI] self.data_visits += endI - startI self.epoch_frac += float(endI - startI)/self.N_D_Train ret = {'img_idxs':miniidx, 'ctf_idxs':self.ctfstack.get_ctf_idx_for_image(miniidx), 'N_M':len(miniidx), 'id':batch_id, 'epoch':self.epoch + self.epoch_frac, 'num_batches': self.N_batches, 'newepoch':newepoch, 'test_batch':False } return ret def get_epoch(self,frac=False): if self.epoch == None: # Data not yet loaded return 0 if frac: return self.epoch + self.epoch_frac else: return self.epoch
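divide_dataset derives the test set, the training set, and the minibatch visiting order from a single seeded permutation of the image indices. Here is a stripped-down sketch of that index bookkeeping, independent of the image and CTF stacks (N_D, testset_size and minibatch_size are stand-in values):

import numpy as np
from numpy.random import RandomState

N_D, testset_size, minibatch_size, seed = 1000, 100, 64, 42

rand = RandomState(seed)
idxs = rand.permutation(N_D)

# The first block of the permutation becomes the test set, the rest trains.
test_idxs = idxs[:testset_size]
train_idxs = idxs[testset_size:]

# Cut the training indices into equal minibatches and visit the batches
# themselves in a random order.
n_batches = len(train_idxs) // minibatch_size
batch_idxs = [(i * minibatch_size, (i + 1) * minibatch_size)
              for i in range(n_batches)]
batch_order = rand.permutation(n_batches)

start, end = batch_idxs[batch_order[0]]
first_batch = train_idxs[start:end]
print('%d test images, %d minibatches, first batch holds %d images'
      % (len(test_idxs), n_batches, len(first_batch)))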
def tune_classifier(clf, param_grid, avg_cycles=10, nr_training_samples=50, nr_test_samples=160, combine_scenes=False, filename=""): """ :param clf: :param param_grid: :param avg_cycles: :param nr_training_samples: :param kfold: Nr folds for uncombined scene training. :param combine_scenes: :return: """ save_csv = True randomize = True nr_iters = avg_cycles objective = 'f1' emb1 = load_embeddings("matthias_test.pkl") emb2 = load_embeddings("matthias_test2.pkl") emb_lfw = load_embeddings("embeddings_lfw.pkl") emb1 = clean_duplicates(emb1) emb2 = clean_duplicates(emb2) emb_lfw = clean_duplicates(emb_lfw) # select scenes and outlier class class_ds1 = emb1 class_ds2 = emb2 outlier_ds = emb_lfw clf_name = clf.__class__.__name__ # calculate folds if combine_scenes: nr_splits = float(nr_test_samples / (2. * nr_training_samples)) + 1 else: nr_splits = float(nr_test_samples / (4. * nr_training_samples)) + 1 if not nr_splits.is_integer(): print "Invalid number of samples. Producing {} splits.".format(nr_splits) min_nr_test = nr_training_samples*2 if combine_scenes else nr_training_samples*4 print "Adjust nr. training samples. E.g. {}, {}, {}, ...".format(min_nr_test, 2*min_nr_test, 3*min_nr_test) return nr_splits = int(nr_splits) print "Performing {}-fold cross-validation...".format(nr_splits) if objective not in {'f1', 'youden'}: raise ValueError # allocate storage iter_precision = [] iter_recall = [] iter_f1_scores = [] iter_params = [] iter_training_time = [] iter_prediction_time = [] iter_youden_indices = [] iter_auc_scores = [] prng = RandomState() for i in range(0, nr_iters): # shuffle same every time prng = RandomState(i + 1) class_ds1_mixed = prng.permutation(class_ds1) class_ds2_mixed = prng.permutation(class_ds2) outlier_ds_mixed = prng.permutation(outlier_ds) # random.seed(i) # Reset random state # random.shuffle(class_ds1) # random.shuffle(class_ds2) # random.shuffle(outlier_ds) kf = KFold(n_splits=nr_splits, shuffle=False) param_combinations = get_all_param_variants(param_grid) # allocate metrics precision_values = [] recall_values = [] youden_index = [] f1_scores = [] auc_scores = [] training_time = [] prediction_time = [] # mode selection if combine_scenes: # -------------------- Case B: Train on 1 and 2, test on 1 and 2 if (nr_training_samples/2+nr_test_samples/4) > len(class_ds1_mixed) or (nr_training_samples/2+nr_test_samples/4) > len(class_ds2_mixed): print "Too few samples!" return else: # -------------------- Case A: Train on 1, test on 1 and 2 if (nr_training_samples+nr_test_samples/4) > len(class_ds1_mixed) or nr_test_samples/4 > len(class_ds2_mixed): print "Too few samples!" 
return for i_param, clf_params in enumerate(param_combinations): # init classifiers clf.set_params(**clf_params) # build each parameter combination precision_scores_config = [] recall_scores_config = [] f1_scores_config = [] auc_scores_config = [] training_time_config = [] prediction_time_config = [] # -------------------- Case A: Train on 1, test on 1 and 2 if combine_scenes: scene1_samples = class_ds1_mixed[0:(nr_training_samples/2+nr_test_samples/4)] scene2_samples = class_ds2_mixed[0:(nr_training_samples/2+nr_test_samples/4)] # calculate precision and recall in kfold cross validation for test_indices, train_indices in kf.split(scene1_samples): training_samples = np.concatenate((scene1_samples[train_indices], scene2_samples[train_indices])) start = current_milli_time() clf.fit(training_samples) training_time_config.append(current_milli_time() - start) # build test set, add scene 2 , add outlier dataset test_with_outliers = np.concatenate((scene1_samples[test_indices], scene2_samples[test_indices], outlier_ds_mixed[0:(nr_test_samples/2)])) # 1/2 class, 1/2 outliers labels = np.concatenate((np.repeat(1, nr_test_samples/2), np.repeat(-1, nr_test_samples/2))) # predict start = current_milli_time() # scores which are thresholded scores = clf.decision_function(test_with_outliers) prediction_time_config.append(current_milli_time() - start) labels_predicted = clf.threshold(scores) if clf_name == 'L2Estimator': # invert probability scores = 3 - scores # validate if len(test_with_outliers) != nr_test_samples: print len(scene1_samples) print len(test_with_outliers) print nr_test_samples print len(test_indices) print "001: Check your code!" return # calculate metrics fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) auc_val = auc(fpr, tpr) true_nr_positives = nr_test_samples/2 true_nr_negatives = nr_test_samples/2 tp = np.count_nonzero(labels_predicted[0:true_nr_positives] == 1) fn = true_nr_positives-tp fp = np.count_nonzero(labels_predicted[true_nr_positives:] == 1) tn = true_nr_negatives-fp fpr = float(fp)/float(fp+tn) recall = float(tp) / float(tp + fn) try: precision = float(tp) / float(tp + fp) f1_score = 2 * float(precision * recall) / float(precision + recall) except ZeroDivisionError: precision = 0 f1_score = 0 # validate if (tp + fn != nr_test_samples/2) or (fp + tn != nr_test_samples/2): print "002: Check your code!" 
print "tp: {}, tn: {} || fn: {}, fp: {}, ".format(tp, fn, fp, tn) print "precision: {} || recall: {} ".format(precision, recall) return precision_scores_config.append(precision) recall_scores_config.append(recall) f1_scores_config.append(f1_score) auc_scores_config.append(auc_val) else: class_samples_s1 = class_ds1_mixed[0:(nr_training_samples+nr_test_samples/4)] # calculate precision and recall in kfold cross validation for test_indices, train_indices in kf.split(class_samples_s1): start = current_milli_time() clf.fit(class_samples_s1[train_indices]) training_time_config.append(current_milli_time() - start) # build test set, add scene 2 , add outlier dataset test_with_outliers = np.concatenate((class_samples_s1[test_indices], class_ds2_mixed[0:nr_test_samples/4], outlier_ds_mixed[0:(nr_test_samples/2)])) # 1/2 class, 1/2 outliers labels = np.concatenate((np.repeat(1, nr_test_samples/2), np.repeat(-1, nr_test_samples/2))) # predict start = current_milli_time() # scores which are thresholded scores = clf.decision_function(test_with_outliers) prediction_time_config.append(current_milli_time() - start) labels_predicted = clf.threshold(scores) if clf_name == 'L2Estimator': # invert probability scores = 3 - scores # validate if len(test_with_outliers) != nr_test_samples: print "001: Check your code!" # calculate metrics fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1) auc_val = auc(fpr, tpr) true_nr_positives = nr_test_samples/2 true_nr_negatives = nr_test_samples/2 tp = np.count_nonzero(labels_predicted[0:true_nr_positives] == 1) fn = true_nr_positives-tp fp = np.count_nonzero(labels_predicted[true_nr_positives:] == 1) tn = true_nr_negatives-fp fpr = float(fp)/float(fp+tn) recall = float(tp) / float(tp + fn) try: precision = float(tp) / float(tp + fp) f1_score = 2 * float(precision * recall) / float(precision + recall) except ZeroDivisionError: precision = 0 f1_score = 0 # validate if (tp + fn != nr_test_samples/2) or (fp + tn != nr_test_samples/2): print "002: Check your code!" 
print "tp: {}, tn: {} || fn: {}, fp: {}, ".format(tp, fn, fp, tn) print "precision: {} || recall: {} ".format(precision, recall) return precision_scores_config.append(precision) recall_scores_config.append(recall) f1_scores_config.append(f1_score) auc_scores_config.append(auc_val) # average precision and recall values precision_avg = np.mean(precision_scores_config) recall_avg = np.mean(recall_scores_config) training_time_avg = np.mean(training_time_config) prediction_time_avg = np.mean(prediction_time_config) f1_scores_avg = np.mean(f1_scores_config) auc_scores_avg = np.mean(auc_scores_config) precision_values.append(precision_avg) recall_values.append(recall_avg) youden_index.append(precision_avg+recall_avg-1) training_time.append(training_time_avg) prediction_time.append(prediction_time_avg) f1_scores.append(f1_scores_avg) auc_scores.append(auc_scores_avg) # if verbose: # print "______________________________________________________________________\n" \ # "Params: {}".format(clf_params) # print "Precision: {} || Recall: {}".format(precision_avg, recall_avg) # --------------- END RANDOMIZED EXPERIMENT # print list(precision_values) # print list(recall_values) # --------------- BEST PARAMETERS if objective == 'f1': best_index = np.argmax(f1_scores) elif objective == 'youden': best_index = np.argmax(youden_index) best_params = param_combinations[best_index] print "________________________{}/{}_______________________________".format(i+1, nr_iters) print "Best parameters (Youden-Index {:.2f}, F1: {:.2f}): {}".format(np.max(youden_index), np.max(f1_scores), best_params) print "Precision: {:.2f} || Recall: {:.2f}".format(precision_values[best_index], recall_values[best_index]) iter_precision.append(precision_values[best_index]) iter_recall.append(recall_values[best_index]) iter_f1_scores.append(f1_scores[best_index]) iter_youden_indices.append(youden_index[best_index]) iter_params.append(best_params) iter_training_time.append(training_time[best_index]) iter_prediction_time.append(prediction_time[best_index]) iter_auc_scores.append(auc_scores[best_index]) # --------------- END RANDOM SERIES print "_______________________________________________________\n\n\n" print " FINAL EVALUATION:\n" print "LEARNER: {}".format(clf_name) print "MODE: {}".format('Mixed Scene Training' if combine_scenes else 'Single Scene Training') print "K-FOLD VALIDATION: {} folds".format(nr_splits) print "-------------------------------------------------------" if combine_scenes: print "Batch size training: {} ({} S1/{} S2)".format(len(train_indices)*2, len(train_indices), len(train_indices)) else: print "Batch size training: {} ".format(len(train_indices)) print "Batch size test: {} ({} class, {} outliers)".format(len(labels), len(labels[labels==1]), len(labels[labels==-1])) print "_______________________________________________________" # print "Batch size: training: {}, prediction: {}".format(nr_training_samples, nr_training_samples * (nr_splits - 1) * 2) # print "Batch size training: {} (class)".format(len(class_training_samples)) print "Precision Avg, std: {:.4f} +- {:.4f}".format(np.mean(iter_precision), 2*np.std(iter_precision)) print "Recall Avg, std: {:.4f} +- {:.4f}".format(np.mean(iter_recall), 2 * np.std(iter_recall)) print "Precision: ", ["%0.2f" % i for i in iter_precision] print "Recall: ", ["%0.2f" % i for i in iter_recall] print "F1 score: ", ["%0.2f" % i for i in iter_f1_scores] print "AUC: ", ["%0.2f" % i for i in iter_auc_scores] print "Parameters: ", iter_params if save_csv: # keep only best if 
filename == "": filename = clf_name+'_auc_eval.csv' with open(filename, 'wb') as csvfile: # write configuration of best results over multiple random tests writer = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL) # settings if clf_name == 'OneClassSVM': writer.writerow(["LEARNER: {} ({})".format(clf_name, iter_params[0]['kernel'])]) else: writer.writerow(["LEARNER: {}".format(clf_name)]) writer.writerow(["MODE: {}".format('Mixed Scene Training' if combine_scenes else 'Single Scene Training')]) writer.writerow(["K-FOLD VALIDATION: {} folds".format(nr_splits)]) writer.writerow(["RANDOM ITERATIONS: {}".format(nr_iters)]) writer.writerow(["Batch size: training: {}, test: {}".format(nr_training_samples, nr_test_samples)]) writer.writerow(["Precision Avg, std: {} +- {}".format(np.mean(iter_precision), 2*np.std(iter_precision))]) writer.writerow(["Recall Avg, std: {} +- {}".format(np.mean(iter_recall), 2 * np.std(iter_recall))]) writer.writerow(["F1 score, std: {} +- {}".format(np.mean(iter_f1_scores), 2 * np.std(iter_f1_scores))]) writer.writerow(["Youden index, std: {} +- {}".format(np.mean(iter_youden_indices), 2 * np.std(iter_youden_indices))]) writer.writerow("") if clf_name == 'L2Estimator' or clf_name == 'ABODEstimator' or clf_name == 'ApproxABODEstimator': writer.writerow(["Train", "Test", "Folds", "T Median", "T Mean", "T Std", "P", "P std.", "R", "R std", "F1", "F1 std", "Youdens", "Youdens std", "AUC", "AUC std", "Training Time", "Trainig Time std", "Prediction Time", "Prediction Time std"]) writer.writerow([ nr_training_samples, nr_test_samples, nr_splits, np.median([tmp['T'] for tmp in iter_params]), np.mean([tmp['T'] for tmp in iter_params]), np.std([tmp['T'] for tmp in iter_params]), np.mean(iter_precision), np.std(iter_precision), np.mean(iter_recall), np.std(iter_recall), np.mean(iter_f1_scores), np.std(iter_f1_scores), np.mean(iter_youden_indices), np.std(iter_youden_indices), np.mean(iter_auc_scores), np.std(iter_auc_scores), np.mean(iter_training_time), np.std(iter_training_time), np.mean(iter_prediction_time), np.std(iter_prediction_time) ]) writer.writerow("") writer.writerow(["Precision, Recall, Training-Time (ms):"]) writer.writerow(["%0.6f" % i for i in iter_precision]) writer.writerow(["%0.6f" % i for i in iter_recall]) writer.writerow(["%0.6f" % i for i in iter_training_time]) if clf_name == 'L2Estimator' or clf_name == 'ABODEstimator': thresholds = ["%0.4f" % tmp['T'] for tmp in iter_params] writer.writerow(thresholds) else: writer.writerow(iter_params) writer.writerow("")
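tune_classifier reseeds a fresh RandomState(i + 1) at the top of every averaging iteration, so the three embedding sets are shuffled the same way on every run of the experiment ("shuffle same every time"). A tiny demonstration of that property on a toy array:

import numpy as np
from numpy.random import RandomState

data = np.arange(10)

def shuffled(iteration):
    # A generator seeded from the iteration index reproduces the same
    # permutation every time the experiment is re-run.
    prng = RandomState(iteration + 1)
    return prng.permutation(data)

assert np.array_equal(shuffled(0), shuffled(0))  # same seed, same shuffle
print(shuffled(0))
print(shuffled(1))  # a different, but equally reproducible, ordering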
def _iter_slow(self, batch_size=128, start=None, end=None, shuffle=True, seed=None, mode=0): # ====== Set random seed ====== # all_ds = self._data[:] prng1 = None prng2 = _dummy_shuffle if shuffle: if seed is None: seed = get_random_magic_seed() prng1 = RandomState(seed) prng2 = RandomState(seed) all_size = [i.shape[0] for i in all_ds] n_dataset = len(all_ds) # ====== Calculate batch_size ====== # if mode == 1: # equal s = sum(all_size) all_batch_size = [int(round(batch_size * i / s)) for i in all_size] for i in xrange(len(all_batch_size)): if all_batch_size[i] == 0: all_batch_size[i] += 1 if sum(all_batch_size) > batch_size: # 0.5% -> round up, too much for i in xrange(len(all_batch_size)): if all_batch_size[i] > 1: all_batch_size[i] -= 1 break all_upsample = [None] * len(all_size) elif mode == 2 or mode == 3: # upsampling and downsampling maxsize = int(max(all_size)) if mode == 2 else int(min(all_size)) all_batch_size = [int(batch_size / n_dataset) for i in xrange(n_dataset)] for i in xrange(batch_size - sum(all_batch_size)): # not enough all_batch_size[i] += 1 all_upsample = [maxsize for i in xrange(n_dataset)] else: # sequential all_batch_size = [batch_size] all_upsample = [None] all_size = [sum(all_size)] # ====== Create all block and batches ====== # # [ ((idx1, batch1), (idx2, batch2), ...), # batch 1 # ((idx1, batch1), (idx2, batch2), ...), # batch 2 # ... ] all_block_batch = [] # contain [block_batches1, block_batches2, ...] tmp_block_batch = [] for n, batchsize, upsample in zip(all_size, all_batch_size, all_upsample): tmp_block_batch.append( create_batch(n, batchsize, start, end, prng1, upsample)) # ====== Distribute block and batches ====== # if mode == 1 or mode == 2 or mode == 3: for i in zip_longest(*tmp_block_batch): all_block_batch.append([(k, v) for k, v in enumerate(i) if v is not None]) else: all_size = [i.shape[0] for i in all_ds] all_idx = [] for i, j in enumerate(all_size): all_idx += [(i, k) for k in xrange(j)] # (ds_idx, index) all_idx = [all_idx[i[0]:i[1]] for i in tmp_block_batch[0]] # complex algorithm to connecting the batch with different dataset for i in all_idx: tmp = [] idx = i[0][0] # i[0][0]: ds_index start = i[0][1] # i[0][1]: index end = start for j in i[1:]: # detect change in index if idx != j[0]: tmp.append((idx, (start, end + 1))) idx = j[0] start = j[1] end = j[1] tmp.append((idx, (start, end + 1))) all_block_batch.append(tmp) prng2.shuffle(all_block_batch) # print if you want debug # for _ in all_block_batch: # for i, j in _: # print('ds:', i, ' batch:', j) # print('===== End =====') # ====== return iteration ====== # for _ in all_block_batch: # each _ is a block batches = np.concatenate( [all_ds[i][j[0]:j[1]] for i, j in _], axis=0) batches = batches[prng2.permutation(batches.shape[0])] yield self._normalizer(batches)
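_iter_slow builds prng1 and prng2 from the same seed, presumably so the batch construction done with prng1 and the later shuffling done with prng2 can be kept consistent. The property it leans on is simply that two identically seeded RandomState objects produce identical draws:

import numpy as np
from numpy.random import RandomState

seed = 1208  # stand-in for get_random_magic_seed()
prng1 = RandomState(seed)
prng2 = RandomState(seed)

# Identically seeded generators emit the same sequence, so their first
# permutations match element for element.
assert np.array_equal(prng1.permutation(20), prng2.permutation(20))

# They only stay in lockstep while both consume the same number of draws;
# an extra call on one side desynchronises the two streams.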
def run_experiment(arglist): # Get the experiment paramters p = tools.Params("gape") p.set_by_cmdline(arglist) # Sequence categories cat_list = [[0, 1, 0, 1], [0, 0, 1, 1], [0, 1, 1, 0]] cat_names = ["alternated", "paired", "reflected"] # Get this run's schedule in a manner that is consistent # within and random between subjects if p.train: letter = letters[p.run - 1] p.sched_id = "train_%s" % letter sched_file = "sched/schedule_%s.csv" % p.sched_id else: state = RandomState(abs(hash(p.subject))) choices = list(letters[:p.total_schedules]) p.sched_id = state.permutation(choices)[p.run - 1] sched_file = "sched/schedule_%s.csv" % p.sched_id # Read in this run's schedule s = read_csv(sched_file) # Max the screen brightness tools.max_brightness(p.monitor_name) # Open up the stimulus window calib.monitorFolder = "./calib" mon = calib.Monitor(p.monitor_name) m = tools.WindowInfo(p, mon) win = visual.Window(**m.window_kwargs) # Set up the stimulus objects fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_color, size=p.fix_size) a_fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_antic_color, size=p.fix_size) r_fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_resp_color, size=p.fix_size) d_fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_demo_color, size=p.fix_size) c_fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_catch_color, size=p.fix_size) b_fix = visual.PatchStim(win, tex=None, mask="circle", color=p.fix_break_color, size=p.fix_size) halo = visual.PatchStim(win, tex=None, mask=p.demo_halo_mask, opacity=p.demo_halo_opacity, color=p.demo_halo_color, size=p.demo_halo_size) grate = visual.PatchStim(win, "sin", p.stim_mask, size=p.stim_size, contrast=p.stim_contrast, sf=p.stim_sf, opacity=p.stim_opacity) disk = visual.PatchStim(win, tex=None, mask=p.stim_mask, color=win.color, size=p.stim_disk_ratio) stims = [grate, disk, fix] # Set up some timing variables running_time = 0 antic_secs = p.tr demo_secs = 4 * p.demo_stim_dur + 3 * p.demo_stim_isi + p.tr seq_secs = p.tr + 4 * p.stim_dur + 3 * p.stim_isi catch_secs = p.tr rest_secs = p.rest_trs * p.tr # Draw the instructions and wait to go instruct = dedent(""" Watch the sample sequence and say if the target sequences match Blue dot: sample sequence Red dot: get ready Orange dot: relax Green dot: say if sequence matched the sample Button 1: same Button 2: different Grey dot: quick break Experimenter: Press space to prep for scan""") # TODO # Draw the instructions and wait to go tools.WaitText(win, instruct, height=.7)(check_keys=["space"]) # Possibly wait for the scanner if p.fmri: tools.wait_for_trigger(win, p) # Start a data file and write the params to it f, fname = tools.start_data_file(p.subject, p.experiment_name, p.run, train=p.train) p.to_text_header(f) # Save run params to JSON save_name = op.join("./data", op.splitext(fname)[0]) p.to_json(save_name) # Write the datafile header header = ["trial", "block", "cat_id", "cat_name", "event_type", "event_sched", "event_time", "ori_a", "ori_b", "oddball", "odd_item", "odd_orient", "iti", "response", "rt", "acc"] tools.save_data(f, *header) # Start a clock and flush the event buffer exp_clock = core.Clock() trial_clock = core.Clock() event.clearEvents() # Main experiment loop # -------------------- try: # Dummy scans fix.draw() win.flip() dummy_secs = p.dummy_trs * p.tr running_time += dummy_secs wait_check_quit(dummy_secs, p.quit_keys) for t in s.trial: cat_seq = cat_list[s.cat_id[t]] block_ori_list = 
np.array([s.ori_a[t], s.ori_b[t]])[cat_seq] # Set up some defaults for variables that aren't always set oddball_seq = [0, 0, 0 ,0] odd_item, odd_ori = -1, -1 acc, response, resp_rt = -1, -1, -1 # Possibly rest and then bail out of the rest of the loop if s.ev_type[t] == "rest": if p.train and not p.fmri: b_fix.draw() win.flip() wait_check_quit(2) before = exp_clock.getTime() msg = "Quick break! Press space to continue." tools.WaitText(win, msg, height=.7)(check_keys=["space"]) b_fix.draw() win.flip() wait_check_quit(2) after = exp_clock.getTime() rest_time = after - before running_time += rest_time continue else: b_fix.draw() win.flip() wait_check_quit(rest_secs) running_time += rest_secs continue # Otherwise, we always get an anticipation if p.antic_fix_dur <= p.tr: # possibly problematic fix.draw() win.flip() core.wait(p.tr - p.antic_fix_dur) if s.ev_type[t] == "demo": stim = d_fix else: stim = a_fix end_time = running_time + p.antic_fix_dur tools.precise_wait(win, exp_clock, end_time, stim) running_time += antic_secs # The event is about to happen so stamp that time event_sched = running_time event_time = exp_clock.getTime() # Demo sequence if s.ev_type[t] == "demo": for i, ori in enumerate(block_ori_list): # Draw each stim grate.setOri(ori) halo.draw() draw_all(*stims) d_fix.draw() win.flip() core.wait(p.demo_stim_dur) # Short isi fix if i < 3: d_fix.draw() win.flip() core.wait(p.demo_stim_isi) check_quit() # Demo always has >1 TR fixation fix.draw() win.flip() wait_check_quit(p.tr) # Update timing running_time += demo_secs # Proper test sequence if s.ev_type[t] == "seq": # If this is an oddball, figure out where if s.oddball[t]: oddball_seq = multinomial(1, [.25] * 4).tolist() odd_item = oddball_seq.index(1) # Iterate through each element in the sequence for i, ori in enumerate(block_ori_list): # Set the grating attributes if oddball_seq[i]: ori_choices = [o for o in p.stim_orients if not o == ori] odd_ori = ori_choices[randint(3)] grate.setOri(odd_ori) else: grate.setOri(ori) grate.setPhase(uniform()) # Draw the grating set draw_all(*stims) win.flip() core.wait(p.stim_dur) # ISI Fix (on all but last stim) if i < 3: fix.draw() win.flip() core.wait(p.stim_isi) check_quit() # Response fixation r_fix.draw() trial_clock.reset() event.clearEvents() win.flip() acc, response, resp_rt = wait_get_response(p, trial_clock, s.oddball[t], p.resp_dur) # Update timing running_time += seq_secs # Catch trial if s.ev_type[t] == "catch": c_fix.draw() win.flip() wait_check_quit(p.tr) running_time += catch_secs # Save data to the datafile data = [t, s.block[t], s.cat_id[t], cat_names[s.cat_id[t]], s.ev_type[t], event_sched, event_time, s.ori_a[t], s.ori_b[t], s.oddball[t], odd_item, odd_ori, s.iti[t], response, resp_rt, acc] tools.save_data(f, *data) # ITI interval # Go by screen refreshes for precise timing this_iti = s.iti[t] * p.tr end_time = running_time + this_iti tools.precise_wait(win, exp_clock, end_time, fix) running_time += this_iti finally: # Clean up f.close() win.close() # Good execution, print out some info try: data_file = op.join("data", fname) with open(data_file, "r") as fid: lines = fid.readlines() n_comments = len([l for l in lines if l.startswith("#")]) df = read_csv(data_file, skiprows=n_comments, na_values=["-1"]) info = dict() time_error = df.event_sched - df.event_time info["run"] = p.run info["acc"] = df.acc.mean() info["mean_rt"] = df.rt.mean() info["missed_resp"] = (df.response == 0).sum() info["time_error_mean"] = abs(time_error).mean() info["time_error_max"] = max(time_error) 
print dedent("""Performance summary for run %(run)d: Accuracy: %(acc).3f Mean RT: %(mean_rt).3f Missed responses: %(missed_resp)d Mean timing error: %(time_error_mean).4f Max timing error: %(time_error_max).4f """ % info) except Exception as err: print "Could not read data file for summary" print err
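The schedule lookup at the top of run_experiment seeds a RandomState from abs(hash(p.subject)) and permutes the available schedule letters, giving each subject a fixed but subject-specific ordering across runs. A minimal sketch of that selection with stand-in values; note that hashing a str is deterministic under Python 2, whereas Python 3 randomises it per process unless PYTHONHASHSEED is pinned, and RandomState seeds must fit in 32 bits, hence the modulo below.

from string import ascii_lowercase as letters
from numpy.random import RandomState

subject = 'subj01'      # stand-in for p.subject
total_schedules = 6     # stand-in for p.total_schedules
run = 2                 # stand-in for p.run (1-based)

# Same subject -> same seed -> same schedule ordering on every session.
state = RandomState(abs(hash(subject)) % (2 ** 32))
choices = list(letters[:total_schedules])
sched_id = state.permutation(choices)[run - 1]
sched_file = 'sched/schedule_%s.csv' % sched_id
print(sched_file)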
def train(train_set_x, train_set_y, hyper_parameters, symmetric_double_encoder, params, regularization_methods, print_verbose=False, top=0, validation_set_x=None, validation_set_y=None, moving_averages=None, decay=False, reduce_val=0, autoencoder_x=False, autoencoder_y=False): OutputLog().write('Using Decay = {0}'.format(decay)) # Calculating number of batches n_training_batches = int(train_set_x.shape[0] / hyper_parameters.batch_size) random_stream = RandomState() early_stop_count = 0 model_updates = [shared(p.get_value() * 0) for p in params] model_deltas = [shared(p.get_value() * 0) for p in params] eps = 1e-8 symmetric_double_encoder.set_eval(False) last_metric = 0 correlations = [] tester = TraceCorrelationTester(validation_set_x, validation_set_y, top, reduce_val) learning_rate = hyper_parameters.learning_rate # The training phase, for each epoch we train on every batch best_loss = 0 for epoch in numpy.arange(hyper_parameters.epochs): OutputLog().write('----------Starting Epoch ({0})-----------'.format(epoch), 'debug') print 'Building model' model = Trainer._build_model(hyper_parameters, learning_rate, symmetric_double_encoder, params, regularization_methods, model_updates, model_deltas, moving_averages, n_training_batches, hyper_parameters.training_strategy, 0.9, 0.999, hyper_parameters.rho, eps, 'L2', len(symmetric_double_encoder) - 1, autoencoder_x, autoencoder_y) OutputLog().write('Shuffling dataset', 'debug') indices_positive = random_stream.permutation(train_set_x.shape[0]) loss_forward = 0 loss_backward = 0 OutputLog().write('Training {0} batches'.format(n_training_batches), 'debug') for index in xrange(n_training_batches): start_tick = cv2.getTickCount() # need to convert the input into tensor variable symmetric_double_encoder.var_x.set_value( train_set_x[indices_positive[index * hyper_parameters.batch_size: (index + 1) * hyper_parameters.batch_size], :], borrow=True) symmetric_double_encoder.var_y.set_value( train_set_y[indices_positive[index * hyper_parameters.batch_size: (index + 1) * hyper_parameters.batch_size], :], borrow=True) output = model(index + 1) loss_backward += output[0] loss_forward += output[1] if math.isnan(loss_backward) or math.isnan(loss_forward): OutputLog().write('loss equals NAN, exiting') sys.exit(-1) tickFrequency = cv2.getTickFrequency() current_time = cv2.getTickCount() regularizations = [regularization_method for regularization_method in regularization_methods if not regularization_method.weight == 0] string_output = '' if len(regularizations) > 0: zipped = zip(output[8:8 + len(regularizations)], regularizations) string_output = ' ' for regularization_output, regularization_method in zipped: string_output += '{0}: {1} '.format(regularization_method.regularization_type, regularization_output) OutputLog().write( 'batch {0}/{1} ended, time: {2:.3f}, loss_x: {3}, loss_y: {4}, loss_h: ' '{7:.2f} var_x: {5} var_y: {6} mean_g: {9} var_g: {10} {8}'. 
format(index, n_training_batches, ((current_time - start_tick) / tickFrequency), output[0], output[1], output[2], output[3], calculate_reconstruction_error(output[4], output[5]), string_output, numpy.mean(output[6]), numpy.mean(output[7])), 'debug') OutputLog().write('Average loss_x: {0} loss_y: {1}'.format(loss_backward / (n_training_batches * 2), loss_forward / (n_training_batches * 2))) if print_verbose and not validation_set_y is None and not validation_set_x is None and epoch % hyper_parameters.validation_epoch == 0: OutputLog().write('----------epoch (%d)----------' % epoch, 'debug') symmetric_double_encoder.set_eval(True) correlations, best_correlation, var, x, y, layer_id = tester.test( DoubleEncoderTransformer(symmetric_double_encoder, 0), hyper_parameters) symmetric_double_encoder.set_eval(False) if math.isnan(var): sys.exit(0) current_metric = \ tester._metrics[hyper_parameters.early_stopping_layer][hyper_parameters.early_stopping_metric][-1] if last_metric > current_metric: early_stop_count += 1 if hyper_parameters.decay_factor > 0: if not hyper_parameters.decay: if last_metric - current_metric > 0.1: OutputLog().write('Decaying learning rate') learning_rate *= hyper_parameters.decay_factor else: if epoch in hyper_parameters.decay: OutputLog().write('Decaying learning rate') learning_rate *= hyper_parameters.decay_factor symmetric_double_encoder.export_encoder(OutputLog().output_path, 'epoch_{0}'.format(epoch)) last_metric = current_metric if early_stop_count == 1 and hyper_parameters.early_stopping: tester.saveResults(OutputLog().output_path) return OutputLog().write('epoch (%d) ,Loss X = %f, Loss Y = %f, learning_rate = %f\n' % (epoch, loss_backward / n_training_batches, loss_forward / n_training_batches, learning_rate), 'debug') tester.saveResults(OutputLog().output_path) del model
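Each epoch, train() draws a fresh permutation of the sample indices and slices consecutive chunks of it into minibatches before loading them into the shared variables. The same pattern in isolation, on a plain NumPy array (the data and batch size are stand-ins):

import numpy as np
from numpy.random import RandomState

rng = RandomState()
X = np.arange(100).reshape(50, 2)   # 50 samples, 2 features
batch_size = 8
n_batches = X.shape[0] // batch_size

for epoch in range(2):
    order = rng.permutation(X.shape[0])  # new sample ordering every epoch
    for b in range(n_batches):
        batch = X[order[b * batch_size:(b + 1) * batch_size]]
        # ... forward/backward pass on `batch` would go here ...
    print('epoch %d done (%d batches of %d samples)'
          % (epoch, n_batches, batch_size))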
import numpy as np
from numpy.random import RandomState


def balanced_train_test_split(X, y, test_size=None, train_size=None,
                              bootstrap=False, random_state=None):
    """
    Split the data into a balanced training set and test set of some given size.

    For a dataset with an unequal number of samples in each class, one useful
    procedure is to split the data into a training and a test set in such a
    way that the classes are balanced.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Feature matrix.
    y : array, shape = [n_samples]
        Target vector.
    test_size : float or int (default=0.3)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is automatically
        set to the complement of the train size. If train size is also None,
        test size is set to 0.3.
    train_size : float or int (default=1-test_size)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the train split. If int, represents the
        absolute number of train samples. If None, the value is automatically
        set to the complement of the test size.
    bootstrap : bool, optional (default=False)
        If True, sample each class with replacement, so a balanced split can
        be produced even when the smallest class has fewer samples than
        requested.
    random_state : int, optional (default=None)
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    X_train : array
        The feature vectors (stored as rows) in the training set.
    X_test : array
        The feature vectors (stored as rows) in the test set.
    y_train : array
        The target vector in the training set.
    y_test : array
        The target vector in the test set.
    """
    # initialise the random number generator
    rng = RandomState(random_state)

    # make sure X and y are numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)

    # get information about the class distribution
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    cls_count = np.bincount(y_indices)

    # get the training and test size
    train_size, test_size = _get_train_test_size(train_size, test_size, len(y))

    # number of samples from each class included in the training and test set
    n_train = np.round(train_size / n_classes).astype(int)
    n_test = np.round(test_size / n_classes).astype(int)
    n_total = n_train + n_test

    # make sure we have enough samples to create a balanced split
    min_count = min(cls_count)
    if min_count < (n_train + n_test) and not bootstrap:
        raise ValueError('The smallest class contains {} examples, which is not '
                         'enough to create a balanced split. Choose a smaller size '
                         'or enable bootstrapping.'.format(min_count))

    # selected indices are stored here
    train = []
    test = []

    # get the desired sample from each class
    for i, cls in enumerate(classes):
        if bootstrap:
            shuffled = rng.choice(cls_count[i], n_total, replace=True)
        else:
            shuffled = rng.permutation(cls_count[i])
        cls_i = np.where(y == cls)[0][shuffled]
        train.extend(cls_i[:n_train])
        test.extend(cls_i[n_train:n_total])

    # shuffle the selections so the classes are not grouped in the output
    train = list(rng.permutation(train))
    test = list(rng.permutation(test))

    return X[train], X[test], y[train], y[test]
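A quick usage sketch on a deliberately imbalanced toy problem; it assumes the undefined helper _get_train_test_size passes integer sizes through unchanged, and the class counts are chosen so a balanced split is possible without bootstrapping.

import numpy as np

# 30 samples of class 0, 70 of class 1, one feature each.
X = np.arange(100).reshape(-1, 1)
y = np.array([0] * 30 + [1] * 70)

X_tr, X_te, y_tr, y_te = balanced_train_test_split(
    X, y, train_size=20, test_size=20, random_state=0)

# Both splits contain the two classes in equal proportion.
print(np.bincount(y_tr))  # [10 10]
print(np.bincount(y_te))  # [10 10]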
import numpy as np
from numpy.random import RandomState


def _subsample_nonzero(counts, ns, replace=False, seed=0):
    """Randomly subsample from a vector of counts and return the number of
    nonzero values obtained for each requested subsample size.

    Parameters
    ----------
    counts : 1-D array_like of integers
        Vector of counts.
    ns : 1-D array_like of integers
        Subsample sizes to evaluate.
    replace : bool, optional
        Subsample with or without replacement.
    seed : int, optional
        Random seed.

    Returns
    -------
    nonzero : 1-D ndarray
        Number of nonzero values for each value of ns (NaN where the
        requested size exceeds the total count).

    Raises
    ------
    ValueError, TypeError
    """
    counts = np.asarray(counts)
    ns = np.asarray(ns)
    if counts.ndim != 1:
        raise ValueError("'counts' must be a 1-D array_like object")
    if (ns < 0).sum() > 0:
        raise ValueError("values in 'ns' must be >= 0")
    counts = counts.astype(int, casting='safe')
    ns = ns.astype(int, casting='safe')
    counts_sum = counts.sum()
    prng = RandomState(seed)
    nonzero = []
    if replace:
        # Sampling with replacement: draw from a multinomial with
        # probabilities proportional to the counts.
        p = counts / float(counts_sum)
        for n in ns:
            if n > counts_sum:
                nonzero.append(np.nan)
            else:
                subcounts = prng.multinomial(n, p)
                nonzero.append(np.count_nonzero(subcounts))
    else:
        # Sampling without replacement: expand the counts into individual
        # items, permute them once, and take prefixes of the permutation.
        nz = np.flatnonzero(counts)
        expanded = np.concatenate([np.repeat(i, counts[i]) for i in nz])
        permuted = prng.permutation(expanded)
        for n in ns:
            if n > counts_sum:
                nonzero.append(np.nan)
            else:
                subcounts = np.bincount(permuted[:n], minlength=counts.size)
                nonzero.append(np.count_nonzero(subcounts))
    return np.array(nonzero)
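A small worked call: subsample a toy count vector at several depths and report how many entries stay nonzero, with and without replacement. The depth 100 exceeds the 25 available counts, so it comes back as NaN; the other numbers depend on the seed and are illustrative only.

import numpy as np

counts = np.array([5, 0, 1, 0, 10, 2, 0, 7])   # 25 counts in total
ns = [1, 5, 10, 25, 100]

print(_subsample_nonzero(counts, ns, replace=False, seed=0))
print(_subsample_nonzero(counts, ns, replace=True, seed=0))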