import numpy as np
import pandas as pd

def plot_projection(P, feature_names):
    """Print the ten largest-magnitude loadings of each projection direction."""
    for i, v in enumerate(P.T):
        idxs = np.argsort(-np.abs(v))
        print('v_%d' % i)
        print(pd.Series(v, index=feature_names).iloc[idxs].iloc[0:10])
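# Hypothetical usage sketch (illustrative data, not from the original source):
# inspect the top loadings of each principal direction found by PCA.
from sklearn.decomposition import PCA

X = np.random.randn(200, 12)
names = ['f%d' % i for i in range(12)]
pca = PCA(n_components=3).fit(X)
plot_projection(pca.components_.T, names)  # columns of P are the directions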
def get_brightest(self, object_type='star', num_srcs=1, band='r', return_idx=False):
    """return brightest sources (by source type, band)"""
    fluxes = np.array([s.params.flux_dict[band] for s in self.srcs])
    type_idx = np.where(self.source_types == object_type)[0]
    type_fluxes = fluxes[type_idx]
    type_idx = type_idx[np.argsort(type_fluxes)[::-1]][:num_srcs]
    blist = [self.srcs[i] for i in type_idx]
    if return_idx:
        return blist, type_idx
    else:
        return blist
def dial_settings(self):
    self.colors = [[1, 0, 0.4], [0, 0.4, 1], [0, 1, 0.5], [1, 0.7, 0.5],
                   [0.7, 0.6, 0.5], 'mediumaquamarine']

    #### create degree set for polys ####
    e = 0
    self.degs = []
    while e < self.num_elements + 1:
        for i in range(0, self.num_elements):
            for j in range(0, i + 1):
                dg1 = i
                dg2 = j
                self.degs.append([dg1, dg2])
                e += 1

    # generate poly features
    self.F_poly = self.poly_feats(self.num_elements + 1)

    #### random weights for tanh network, tanh transform ####
    scale = 1
    self.R = scale * np.random.randn(self.num_elements + 1, 3)
    self.F_tanh = self.tanh_feats(self.num_elements + 1)

    #### initialize split points for trees ####
    splits = []
    levels = []
    dims = []
    residual = copy.deepcopy(self.y)

    ## create simple 'weak learner' between each consecutive pair of points ##
    for j in range(0, 2):
        # sort data by values of input in each dimension
        x_t = copy.deepcopy(self.x)
        y_t = copy.deepcopy(self.y)
        sorted_inds = np.argsort(x_t[:, j], axis=0)
        x_t = x_t[sorted_inds]
        y_t = y_t[sorted_inds]

        # loop over and create all stumps in this dimension of the input
        for p in range(len(self.y) - 1):
            # determine points on each side of split
            split = (x_t[p, j] + x_t[p + 1, j]) / float(2)
            splits.append(split)
            dims.append(j)

            # gather points to left and right of split
            pts_left = [t for t in x_t if t[j] <= split]
            resid_left = residual[:len(pts_left)]
            resid_right = residual[len(pts_left):]

            # compute average on each side
            ave_left = np.mean(resid_left)
            ave_right = np.mean(resid_right)
            levels.append([ave_left, ave_right])

    # randomize splits for this experiment
    self.orig_splits = splits
    self.orig_levels = levels
    r = np.random.permutation(len(self.orig_splits))
    self.orig_splits = [self.orig_splits[v] for v in r]
    self.orig_levels = [self.orig_levels[v] for v in r]
    self.orig_dims = [dims[v] for v in r]

    # generate features
    self.F_tree = self.tree_feats()
pool = multiprocessing.Pool(processes=5)
x0_list = []
for i in range(int(K / p)):
    x0_list.append(model.rand_x(p))
results = pool.map(task, x0_list)
pool.close()
pool.join()

candidate = results[0][0]
wEI_tmp = results[0][1]
for j in range(1, int(K / p)):
    candidate = np.concatenate((candidate.T, results[j][0].T)).T
    wEI_tmp = np.concatenate((wEI_tmp.T, results[j][1].T)).T

# pick the candidate with the largest weighted EI (argsort is ascending,
# so the last index is the maximizer)
idx = np.argsort(wEI_tmp)[-1:]
new_x = candidate[:, idx]
new_y = funct(new_x, bounds)
print('idx', idx)
print('x', new_x.T)
print('y', new_y.T)

dataset['train_x'] = np.concatenate((dataset['train_x'].T, new_x.T)).T
dataset['train_y'] = np.concatenate((dataset['train_y'].T, new_y.T)).T
with open('dataset.pickle', 'wb') as f:
    pickle.dump(dataset, f)

EI = model.calc_wEI(X_star)
new_x_real = new_x * (bounds[0, 1] - bounds[0, 0]) + (bounds[0, 1] + bounds[0, 0]) / 2
def sorted_r_eigs(w):
    drW, prW = np.linalg.eig(w)
    srtinds = np.argsort(drW)
    return drW[srtinds], prW[:, srtinds]
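# Quick sketch of the sorting behavior (illustrative, not from the original
# source): for a symmetric matrix the eigenvalues come back in ascending order.
A = np.random.randn(4, 4)
A = (A + A.T) / 2  # symmetrize so the spectrum is real
vals, vecs = sorted_r_eigs(A)
print(vals)  # ascending; vecs[:, j] is the eigenvector paired with vals[j]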
def argsort(a: Numeric, axis: Int = -1, descending: bool = False):
    if descending:
        return anp.argsort(-a, axis=axis)
    else:
        return anp.argsort(a, axis=axis)
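# Minimal sketch of the descending path, assuming `anp` is a numpy-compatible
# module (e.g. autograd.numpy): negating the input reverses the sort order.
import autograd.numpy as anp

a = anp.array([3., 1., 2.])
print(argsort(a))                   # [1 2 0]: ascending
print(argsort(a, descending=True))  # [0 2 1]: same as anp.argsort(-a)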
model.compute_bounds()
NSBnd = model.NSBnd
IJBnd = model.IJBnd
totalIJBnd = IJAppxBnd + IJBnd[sets]
print('NS bound holds:', np.all(np.abs(NS - exact) < NSBnd[sets]))
print('IJ bound holds:', np.all(np.abs(IJ - exact) < IJBnd[sets]))

predErrsExact = (Y[sets] - np.exp(exact))**2
predErrsNS = (Y[sets] - np.exp(NS))**2
predErrsNSUpper = (Y[sets] - np.exp(NS + NSBnd[sets]))**2
predErrsNSLower = (Y[sets] - np.exp(NS - NSBnd[sets]))**2
predErrsNSUpperBnd = np.maximum(predErrsNSUpper, predErrsNSLower)
predErrsNSLowerBnd = np.minimum(predErrsNSUpper, predErrsNSLower)
sortInds = np.argsort(predErrsNS)
predErrsNSUpperBnd[np.isinf(predErrsNSUpperBnd)] = 0.0
predErrsNSLowerBnd[np.isinf(predErrsNSLowerBnd)] = 0.0

predErrsIJ = (Y[sets] - np.exp(IJ))**2
predErrsIJUpper = (Y[sets] - np.exp(IJ + totalIJBnd))**2
predErrsIJLower = (Y[sets] - np.exp(IJ - totalIJBnd))**2
predErrsIJUpperBnd = np.maximum(predErrsIJUpper, predErrsIJLower)
predErrsIJLowerBnd = np.minimum(predErrsIJUpper, predErrsIJLower)
sortInds = np.argsort(predErrsIJ)
predErrsIJUpperBnd[predErrsIJUpperBnd > 1e5] = 0.0
predErrsIJLowerBnd[predErrsIJLowerBnd > 1e5] = 0.0

exactAss[ii] = predErrsExact.mean()
IJAss[ii] = predErrsIJ.mean()
def rl1_selection(y_bin, y_ord, y_categ, y_cont, zl1_ys, w_s):
    ''' Selects the number of factors on the first latent discrete layer
    y_bin (n x p_bin ndarray): The binary and count data matrix
    y_ord (n x p_ord ndarray): The ordinal data matrix
    y_categ (n x p_categ ndarray): The categorical data matrix
    y_cont (n x p_cont ndarray): The continuous data matrix
    zl1_ys (k_1D x r_1D ndarray): The first layer latent variables
    w_s (list): The path probabilities starting from the first layer
    ------------------------------------------------------------------
    return (list of int): The dimensions to keep for the GLLVM layer
    '''
    M0 = zl1_ys.shape[0]
    numobs = zl1_ys.shape[1]
    r0 = zl1_ys.shape[2]
    S0 = zl1_ys.shape[3]

    nb_bin = y_bin.shape[1]
    nb_ord = y_ord.shape[1]
    nb_categ = y_categ.shape[1]
    nb_cont = y_cont.shape[1]

    PROP_ZERO_THRESHOLD = 0.25
    PVALUE_THRESHOLD = 0.10

    # Determine the dimensions that are weakest for Binomial variables
    zero_coef_mask = np.zeros(r0)
    for j in range(nb_bin):
        for s in range(S0):
            Nj = int(np.max(y_bin[:, j]))  # The support of the jth binomial is [0, Nj]

            if Nj == 1:  # If the variable is Bernoulli, not Binomial
                yj = y_bin[:, j]
                z = zl1_ys[:, :, :, s]
            else:  # If not, need to convert Binomial output to Bernoulli output
                yj, z = bin_to_bern(Nj, y_bin[:, j], zl1_ys[:, :, :, s])

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs * Nj, r0), order='C')
            y_repeat = np.repeat(yj, M0).astype(int)  # Repeat rather than tile to check

            lr = LogisticRegression(penalty='l1', solver='saga')
            lr.fit(X, y_repeat)
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]

    # Determine the dimensions that are weakest for Ordinal variables
    for j in range(nb_ord):
        for s in range(S0):
            ol = OrderedLogit()
            X = zl1_ys[:, :, :, s].flatten(order='C').reshape((M0 * numobs, r0), order='C')
            y_repeat = np.repeat(y_ord[:, j], M0).astype(int)  # Repeat rather than tile to check

            ol.fit(X, y_repeat)
            zero_coef_mask += np.array(ol.summary['p'] > PVALUE_THRESHOLD) * w_s[s]

    # Determine the dimensions that are weakest for Categorical variables
    for j in range(nb_categ):
        for s in range(S0):
            z = zl1_ys[:, :, :, s]

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs, r0), order='C')
            y_repeat = np.repeat(y_categ[:, j], M0).astype(int)  # Repeat rather than tile to check

            lr = LogisticRegression(penalty='l1', solver='saga',
                                    multi_class='multinomial')
            lr.fit(X, y_repeat)
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]

    # Determine the dimensions that are weakest for Continuous variables
    for j in range(nb_cont):
        for s in range(S0):
            z = zl1_ys[:, :, :, s]

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs, r0), order='C')
            y_repeat = np.repeat(y_cont[:, j], M0)  # Repeat rather than tile to check

            linr = Lasso()
            linr.fit(X, y_repeat)
            # Lasso's coef_ is already 1-D, so compare it directly
            # (the original `linr.coef_[0] == 0` tested only a scalar)
            zero_coef_mask += (linr.coef_ == 0) * w_s[s]

    # Voting: Delete the dimensions which have been zeroed a majority of times
    zeroed_coeff_prop = zero_coef_mask / (nb_ord + nb_bin + nb_categ + nb_cont)

    # Need at least r1 = 2 for the algorithm to work
    new_rl = np.sum(zeroed_coeff_prop <= PROP_ZERO_THRESHOLD)
    if new_rl < 2:
        dims_to_keep = np.argsort(zeroed_coeff_prop)[:2]
    else:
        dims_to_keep = list(set(range(r0)) -
                            set(np.where(zeroed_coeff_prop > PROP_ZERO_THRESHOLD)[0].tolist()))

    dims_to_keep = np.sort(dims_to_keep)

    return dims_to_keep
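# Minimal sketch of the L1-zeroing test used above (illustrative data; the
# regularization strength C is a hypothetical choice): an l1-penalized logistic
# fit drives the coefficients of irrelevant inputs toward exactly zero.
from sklearn.linear_model import LogisticRegression

X = np.random.randn(500, 3)
y = (X[:, 0] > 0).astype(int)  # only dimension 0 is informative
lr = LogisticRegression(penalty='l1', solver='saga', C=0.1).fit(X, y)
print(lr.coef_[0] == 0)  # dimensions 1 and 2 are typically zeroed out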
def iterGP_rotation(x, y, yerr, period_guess, acf_1pk, num_iter=20, ax=None, n_samp=4000):
    # Here is the kernel we will use for the GP regression. It consists of a
    # sum of two stochastically driven damped harmonic oscillators. One of the
    # terms has Q fixed at 1/sqrt(2), which forces it to be non-periodic.
    # There is also a white noise term included.

    # Do some aggressive sigma clipping
    m = np.ones(len(x), dtype=bool)
    while True:
        mu = np.mean(y[m])
        sig = np.std(y[m])
        m0 = y - mu < 3 * sig
        if np.all(m0 == m):
            break
        m = m0
    x_clip, y_clip, yerr_clip = x[m], y[m], yerr[m]

    if len(x_clip) < n_samp:
        n_samp = len(x_clip)

    # Randomly select n points from the light curve for the GP fit
    x_ind_rand = np.random.choice(len(x_clip), n_samp, replace=False)
    x_ind = x_ind_rand[np.argsort(x_clip[x_ind_rand])]
    x_gp = x_clip[x_ind]
    y_gp = y_clip[x_ind]
    yerr_gp = yerr_clip[x_ind]

    fac = 0.9
    min_period = period_guess * fac
    max_period = period_guess / fac
    gp = fh.get_rotation_gp(x_gp, y_gp, yerr_gp, period_guess, min_period, max_period)

    # Now calculate the covariance matrix using the initial kernel parameters
    gp.compute(x, yerr)

    def neg_log_like(params, y, gp, m):
        gp.set_parameter_vector(params)
        return -gp.log_likelihood(y[m])

    def grad_neg_log_like(params, y, gp, m):
        gp.set_parameter_vector(params)
        return -gp.grad_log_likelihood(y[m])[1]

    bounds = gp.get_parameter_bounds()
    initial_params = gp.get_parameter_vector()

    if ax:
        ax.plot(x_gp, y_gp)

    # Find the best fit kernel parameters. We want to try to ignore the flares
    # when we do the fit. To do this, we will repeatedly find the best fit
    # solution to the kernel model, calculate the covariance matrix, predict
    # the flux and then mask out points based on how far they deviate from
    # the model. After a few passes, this should cause the model to fit mostly
    # to periodic features.
    m = np.ones(len(x_gp), dtype=bool)
    for i in range(num_iter):
        n_pts_prev = np.sum(m)
        gp.compute(x_gp[m], yerr_gp[m])
        soln = minimize(neg_log_like, initial_params, jac=grad_neg_log_like,
                        method='L-BFGS-B', bounds=bounds, args=(y_gp, gp, m))
        gp.set_parameter_vector(soln.x)
        initial_params = soln.x

        mu, var = gp.predict(y_gp[m], x_gp, return_var=True)
        sig = np.sqrt(var + yerr_gp**2)

        m0 = y_gp - mu < sig
        m[m == 1] = m0[m == 1]

        n_pts = np.sum(m)
        if n_pts <= 10:
            # (an unreachable `break` after this raise was removed)
            raise ValueError('GP iteration threw out too many points')
        if (n_pts_prev - n_pts) <= 3:
            break

    gp.compute(x_gp[m], yerr_gp[m])
    mu, var = gp.predict(y_gp[m], x_gp, return_var=True)

    return x_gp, mu, var, gp.get_parameter_vector()
arhmm_em_lls = arhmm.fit(traintrials, method="em", num_em_iters=numiters)

# Get the inferred states for train and test trials
traintrials_z = [arhmm.most_likely_states(traintrial) for traintrial in traintrials]
traintrials_z = np.asarray(traintrials_z)
testtrials_z = [arhmm.most_likely_states(testtrial) for testtrial in testtrials]
testtrials_z = np.asarray(testtrials_z)

As = [None] * K
maxvals = [None] * K
for k in np.arange(K):
    As[k] = arhmm.params[2][0][k, :, :]
    maxvals[k] = np.var(As[k])

# Tried to permute the states so that it would be 'no movement' --> 'movement',
# based on the variance of the values in the A matrix (didn't really work)
# permute the states
sortorder = np.argsort(maxvals)
sortedmaxvals = np.sort(maxvals)
print(sortorder)
print(sortedmaxvals)
As = [As[i] for i in sortorder]
traintrials_sorted = traintrials_z.copy()
testtrials_sorted = testtrials_z.copy()
for k in np.arange(K):
    traintrials_sorted[traintrials_z == sortorder[k]] = k
    testtrials_sorted[testtrials_z == sortorder[k]] = k

# Plot states of all trials
fig = plt.figure(figsize=(8, 10))
plt.imshow(traintrials_sorted, aspect="auto", vmin=0, vmax=K)
fig.savefig(os.path.join(resultsfolder, 'traintrials_K' + str(K) + '.png'))
fig = plt.figure(figsize=(8, 4))
plt.imshow(testtrials_sorted, aspect="auto", vmin=0, vmax=K)
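# Minimal illustration of the relabeling loop above: with sortorder = [2, 0, 1],
# old state 2 becomes 0, old state 0 becomes 1, and old state 1 becomes 2.
z = np.array([0, 1, 2, 2])
out = z.copy()
for k, old in enumerate([2, 0, 1]):
    out[z == old] = k
print(out)  # -> [1 2 0 0]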
def atomsDistances(positions, cell, cutoff_radius=6.0, self_interaction=False):
    """ Compute the distance of every atom to its neighbors.

    This function computes the distances of every central atom to its
    neighbors. Distances larger than the cutoff radius are discarded.
    Periodic boundary conditions are assumed along every axis.

    Parameters:
    -----------
    positions: np.ndarray
        Atomic positions. The size of this tensor will be (N_atoms, 3),
        where N_atoms is the number of atoms in the cluster.
    cell: np.ndarray
        Periodic cell, which has the size of (3, 3)
    cutoff_radius: float
        Cutoff radius, which is a hyperparameter. The default is
        6.0 Angstrom.
    self_interaction: boolean
        Default is False, which means that results will not consider the
        atom itself as its neighbor.

    Returns:
    ----------
    distances: np.ndarray
        Differentiable distances array.
    first_atoms: np.ndarray
        Atoms that we observed in the cell. The np.unique of first_atoms
        will be np.arange of the number of atoms in the cell.
    second_atoms: np.ndarray
        Atoms that are considered as the neighbor atoms of first atoms.
        The distances of first_atoms and second_atoms will be computed
        and stored in the distances array.
    cell_shift_vector: np.ndarray
        The cell shift vector of every atom.
    """
    # Compute reciprocal lattice vectors.
    inverse_cell = np.linalg.pinv(cell).T

    # Compute distances of cell faces.
    face_dist_c = 1 / np.linalg.norm(inverse_cell, axis=0)

    # We use a minimum bin size of 3 A
    bin_size = max(cutoff_radius, 3)

    # Compute number of bins; the minimum number of bins per axis is [1., 1., 1.].
    # (x - x % 1 is used as a floor here)
    nbins_c = np.maximum(
        (face_dist_c / bin_size - (face_dist_c / bin_size) % 1), [1., 1., 1.])
    nbins = np.prod(nbins_c)

    # Compute the number of neighboring cells that need to be searched
    neighbor_search_x, neighbor_search_y, neighbor_search_z = \
        np.ceil(bin_size * nbins_c / face_dist_c).astype(int)

    # Sort atoms into bins.
    scaled_positions_ic = np.dot(positions, inverse_cell) % 1
    bin_index_ic = scaled_positions_ic * nbins_c - (scaled_positions_ic * nbins_c) % 1

    # Convert Cartesian bin index to unique scalar bin index.
    bin_index_i = (bin_index_ic[:, 0] +
                   nbins_c[0] * (bin_index_ic[:, 1] +
                                 nbins_c[1] * bin_index_ic[:, 2]))

    # atom_i contains atom index in new sort order.
    atom_i = np.argsort(bin_index_i)
    bin_index_i = bin_index_i[atom_i]

    # Compute the maximum number of atoms in a bin
    max_natoms_per_bin = np.bincount(np.int_(bin_index_i)).max()

    # Sort atoms into bins. atoms_in_bin_ba contains the information about
    # where the atoms are located.
    atoms_in_bin_ba = -np.ones([np.int_(nbins), max_natoms_per_bin], dtype=int)
    for i in range(max_natoms_per_bin):
        # Create a mask array that identifies the first atom of each bin.
        mask = np.append([True], bin_index_i[:-1] != bin_index_i[1:])
        # Assign all first atoms.
        atoms_in_bin_ba[np.int_(bin_index_i[mask]), i] = atom_i[mask]

        # Remove atoms that we just sorted into atoms_in_bin_ba. The next
        # "first" atom will be the second and so on.
        mask = np.logical_not(mask)
        atom_i = atom_i[mask]
        bin_index_i = bin_index_i[mask]

    # Create the shift list that indicates where the cell might shift.
    shift = []
    for x in range(-neighbor_search_x, neighbor_search_x + 1):
        for y in range(-neighbor_search_y, neighbor_search_y + 1):
            for z in range(-neighbor_search_z, neighbor_search_z + 1):
                shift += [[x, y, z]]

    # Therefore, the possible positions of each neighborhood bin can be
    # computed as follows.
    neighborbin = (bin_index_ic[:, None] + np.array(shift)[None, :]) % nbins_c
    cell_shift = ((bin_index_ic[:, None] + np.array(shift)[None, :]) - neighborbin) / nbins_c
    neighborbin = neighborbin[:, :, 0] + nbins_c[0] * (
        neighborbin[:, :, 1] + nbins_c[1] * neighborbin[:, :, 2])

    distances = []
    first_atoms = []
    second_atoms = []
    cell_shift_vector = []
    for i in range(len(positions)):
        # Create a mask that indicates which neighborhood bins contain atoms.
        if self_interaction:
            mask = (atoms_in_bin_ba[np.int_(neighborbin[i])] != -1)
        else:
            mask = np.logical_and(
                atoms_in_bin_ba[np.int_(neighborbin[i])] != -1,
                atoms_in_bin_ba[np.int_(neighborbin[i])] != i)

        distances_vec = positions[atoms_in_bin_ba[np.int_(neighborbin[i])]] - positions[i]
        # the distance should take the cell shift into account
        distances_vec = distances_vec + np.dot(cell_shift[i], cell)[:, None]

        # make the cell shift vector for every atom instead of every bin.
        _cell_shift_vector = np.repeat(cell_shift[i][:, None], max_natoms_per_bin, axis=1)[mask]
        distances_vec = distances_vec[mask]

        temp_distances = np.sum(distances_vec * distances_vec, axis=1)
        temp_distances = temp_distances**0.5

        cutoff_mask = (temp_distances < cutoff_radius)
        _second_atoms = atoms_in_bin_ba[np.int_(neighborbin[i])][mask][cutoff_mask]
        _first_atoms = [i] * len(_second_atoms)
        _cell_shift_vector = _cell_shift_vector[cutoff_mask]

        first_atoms.extend(_first_atoms)
        second_atoms.extend(_second_atoms)
        distances.extend(temp_distances[cutoff_mask])
        cell_shift_vector.extend(_cell_shift_vector)

    distances = np.array(distances)
    cell_shift_vector = np.array(cell_shift_vector)
    first_atoms = np.array(first_atoms)
    second_atoms = np.array(second_atoms)

    return distances, first_atoms, second_atoms, cell_shift_vector
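# Hypothetical usage sketch (values illustrative, not from the original source):
# neighbor list for a two-atom cubic cell with periodic boundary conditions.
import numpy as np

cell = 5.0 * np.eye(3)
positions = np.array([[0.0, 0.0, 0.0], [2.5, 0.0, 0.0]])
d, first, second, shifts = atomsDistances(positions, cell, cutoff_radius=4.0)
# For every pair k, d[k] is the length of
# positions[second[k]] - positions[first[k]] + shifts[k] @ cell,
# so periodic images of the same neighbor show up with different shifts.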
def other_r_selection(rl1_select, z2_z1s, Lt, head=True, mode_multi=False):
    ''' Choose the meaningful dimensions from the second layer of each head/tail
    rl1_select (list): The dimensions kept over the first layer of head/tail
    z2_z1s (list of ndarrays): z^{(l + 1)} | z^{(l)}, s
    Lt (int): The number of layers on the common tail
    head (Bool): Whether to determine head (True) or tail (False) layer
        dimensions
    mode_multi (Bool): Whether the algorithm is in multi_clus mode
    --------------------------------------------------------------------------
    return (tuple): The dimensions to keep from the second layer of the
        head/tail, and the correlations associated with each dimension
    '''
    S = [zz.shape[2] for zz in z2_z1s] + [1]
    CORR_THRESHOLD = 0.20

    Lh = len(z2_z1s)
    rh = [z2_z1s[l].shape[-1] for l in range(Lh)]
    M = np.array([zz.shape[0] for zz in z2_z1s] + [z2_z1s[-1].shape[1]])

    prev_new_r = [len(rl1_select)]

    dims_to_keep = []
    dims_corr = []  # The correlations associated with the different dimensions

    for l in range(Lh):
        # Will not keep the following layers if one of the previous layers is of dim 1
        if prev_new_r[l] <= 1:
            dims_to_keep.append([])
            prev_new_r.append(0)
        else:
            old_rl = rh[l]
            corr = np.zeros(old_rl)
            for s in range(S[l]):
                for m1 in range(M[l + 1]):
                    pca = PCA(n_components=1)
                    pca.fit_transform(z2_z1s[l][m1, :, s])
                    corr += np.abs(pca.components_[0])

            average_corr = corr / (S[l] * M[l + 1])
            dims_corr.append(average_corr)

            new_rl = np.sum(average_corr > CORR_THRESHOLD)

            if new_rl < prev_new_r[l]:  # Respect r1 > r2 > r3 ...
                # If multimode keep the same number of components and layers on the tail
                if mode_multi:
                    if head:
                        min_rl_for_viable_arch = Lh + Lt - (l + 1)
                    else:
                        # builtin max: np.max(a, 0) would treat 0 as an axis
                        min_rl_for_viable_arch = max(Lt - (l + 1), 0)
                else:
                    if head:
                        # If last layer of a head
                        if (Lh >= 1) & (l == Lh - 2):
                            # If this layer is a bottleneck, we have to delete it
                            if new_rl <= 2:
                                # Empty last head layer
                                dims_to_keep.append([])
                                prev_new_r.append(0)
                                dims_corr[-1] = np.full(rh[l], 0.0)

                                # Tail layers remain unchanged
                                for l1 in range(l + 1, Lh):
                                    dims_to_keep.append(list(range(rh[l1])))
                                    prev_new_r.append(rh[l1])
                                    dims_corr.append(np.full(rh[l1], 1.0))
                                break
                            else:
                                min_rl_for_viable_arch = new_rl
                        else:
                            # To adapt
                            min_rl_for_viable_arch = 2 + Lh - (l + 1)
                    else:
                        min_rl_for_viable_arch = max(1 - l, 0)

                # Need to have an identifiable model but also a viable architecture
                if new_rl >= min_rl_for_viable_arch:
                    wanted_dims = np.where(average_corr > CORR_THRESHOLD)[0].tolist()
                else:
                    # -average_corr: sort in descending order of correlation
                    wanted_dims = np.argsort(-average_corr)[:min_rl_for_viable_arch]

                wanted_dims = np.sort(wanted_dims)
                dims_to_keep.append(deepcopy(wanted_dims))

            else:
                # Have to delete other dimensions to match r1 > r2 > r3 ...
                nb_dims_to_remove = old_rl - prev_new_r[l] + 1
                unwanted_dims = np.argpartition(average_corr, nb_dims_to_remove)[:nb_dims_to_remove]
                wanted_dims = list(set(range(old_rl)) - set(unwanted_dims))
                wanted_dims = np.sort(wanted_dims)
                dims_to_keep.append(deepcopy(wanted_dims))
                new_rl = len(wanted_dims)

            prev_new_r.append(new_rl)

    return dims_to_keep, dims_corr
day = day[:day.rfind('_')]
if area is not None and fit[0] > 0.2:
    temp_model.set_filter_params(params)
    filtDict = getFiltersFromParams(temp_model)
    bestim = 0
    peaks = []
    for filtname, filt in filtDict.items():
        filtpeak = np.max(filt)
        # fitDists[filtname].append(filtpeak)
        if 'im' in filtname and filtpeak > bestim:
            bestim = filtpeak
        peaks.append((filtname, filtpeak))
    orderOfimport = np.argsort([p[1] for p in peaks])
    for i, sortind in enumerate(orderOfimport):
        fitDists[peaks[sortind][0]].append(i)
    fitDists['bestImageWeight'].append(bestim)
    master_id.append(day + '_' + uid)
    master_fit.append(fit)
    master_area.append(area)

master_area = np.array(master_area)

filtname = 'change'
plt.figure(filtname)
for area, _ in areas:
    # normed = np.array(fitDists[area][filtname]) / np.array(fitDists[area]['bestImageWeight'])
    x, c = cumulative_dist(np.array(fitDists[filtname])[master_area == area])
def inference_graph_iht(self, x, w, relaxed=False, return_energy=False, **kwargs):
    """find argmax_y np.dot(w, joint_feature(x, y))"""
    np.set_printoptions(threshold=sys.maxsize)
    max_iter = 1000
    y_hat = x
    # y_hat = np.random.rand(len(x))
    yt = np.copy(y_hat)
    for iter in range(max_iter):
        y_prev = np.copy(yt)
        gradient = self._get_objective_grad(x, yt, w)
        normalized_grad = self._normalized_gradient(yt, gradient)

        sig_nodes = [i for i, ng in enumerate(normalized_grad) if ng == 1.0]

        # g: number of connected components
        edges = np.array(self.edges)
        costs = np.ones(len(edges))
        # head projection (the head_proj variant is kept for reference):
        # re_head = head_proj(edges=edges, weights=costs, x=normalized_grad, g=1, s=k,
        #                     budget=k - 1, delta=1. / 169., max_iter=100, err_tol=1e-8,
        #                     root=-1, pruning='strong', epsilon=1e-10, verbose=0)
        # re_nodes, re_edges, p_y = re_head
        re_head = self.algo_head_tail_bisearch(edges=edges, x=normalized_grad,
                                               costs=costs, g=1, root=-1,
                                               s_low=250, s_high=300,
                                               max_num_iter=1000, verbose=0)
        re_nodes, p_y = re_head
        omega_yt = set(re_nodes)
        indicator_yt = np.zeros_like(yt)
        indicator_yt[list(omega_yt)] = 1.0

        # gradient ascent step restricted to the head support
        by = (yt + 0.001 * gradient) * indicator_yt
        sorted_indices = np.argsort(by)[::-1]
        by[by <= 0.0] = 0.0
        # count the zeroed entries (the original len(np.where(...)) was always 1)
        num_non_posi = np.sum(by == 0.0)
        by[by > 1.0] = 1.0
        if num_non_posi == len(x):
            print("sigma-1 is too large and all values in the gradient are non-positive")
            for i in range(5):
                by[sorted_indices[i]] = 1.0

        # tail projection (tail_proj variant kept for reference):
        # re_tail = tail_proj(edges=edges, weights=costs, x=by, g=1, s=k, budget=k - 1,
        #                     nu=2.5, max_iter=100, err_tol=1e-8, root=-1,
        #                     pruning='strong', verbose=0)
        # re_nodes, re_edges, p_y = re_tail
        re_tail = self.algo_head_tail_bisearch(edges=edges, x=by, costs=costs, g=1,
                                               root=-1, s_low=240, s_high=260,
                                               max_num_iter=1000, verbose=0)
        re_nodes, p_y = re_tail
        psi_y = re_nodes
        yt = np.zeros_like(yt)
        yt[list(psi_y)] = by[list(psi_y)]
        # TODO: note the non-zero entries of yt[list(psi_y)] may not be connected

        gap_y = np.linalg.norm(yt - y_prev) ** 2
        if gap_y < 1e-6:
            break
    value = np.dot(w, self.joint_feature(x, yt))
    return yt
def nn_sgd(x_train, x_valid, x_test, y_train, y_valid, y_test, M, B, l, rates,
           t=False, v=False, d=False):
    # Xavier Initialization for Weights
    # Zero Initialization for Biases
    w_1 = np.random.randn(M, 784) / np.sqrt(784)
    w_2 = np.random.randn(M, M) / np.sqrt(M)
    w_3 = np.random.randn(10, M) / np.sqrt(M)
    b_1 = np.zeros((M, 1))
    b_2 = np.zeros((M, 1))
    b_3 = np.zeros((10, 1))

    # Neg Log-Likelihood for Training and Validation Sets
    nll_t = {}
    nll_v = {}
    for r in rates:
        nll_t[r] = list()
        nll_v[r] = list()

        # Minimum Validation Log-Likelihood
        ll_v_min = np.inf
        ll_v_it_min = 0
        for i in range(0, l):
            if not v:
                # Compute Full-Batch Neg Log-Likelihood for the Validation Set
                nll_v_fb = negative_log_likelihood(w_1, w_2, w_3, b_1, b_2, b_3,
                                                   x_valid, y_valid)
                nll_v[r].append(nll_v_fb)

                # Track the Minimum Validation Log-Likelihood and the Corresponding Weights
                if nll_v_fb < ll_v_min:
                    ll_v_min = nll_v_fb
                    ll_v_it_min = i
                    w_1_opt = w_1
                    w_2_opt = w_2
                    w_3_opt = w_3
                    b_1_opt = b_1
                    b_2_opt = b_2
                    b_3_opt = b_3

            # Compute a list of B random integers as the Mini-Batch Indices
            ind = np.random.choice(x_train.shape[0], size=B, replace=False)
            mini_b_x = x_train[ind, :]
            mini_b_y = y_train[ind, :]

            # Compute Log-Likelihood and Corresponding Gradients
            (nll, (w_1_g, w_2_g, w_3_g, b_1_g, b_2_g, b_3_g)) = nll_gradients(
                w_1, w_2, w_3, b_1, b_2, b_3, mini_b_x, mini_b_y)
            if not t and not v:
                nll_t[r].append(nll / 250 * 10000)

            # Update Weights
            w_1 = up_weight(w_1, w_1_g, r, 1)
            w_2 = up_weight(w_2, w_2_g, r, 1)
            w_3 = up_weight(w_3, w_3_g, r, 1)
            b_1 = up_weight(b_1, b_1_g, r, 1)
            b_2 = up_weight(b_2, b_2_g, r, 1)
            b_3 = up_weight(b_3, b_3_g, r, 1)

        if not v and not d:
            # Print Results
            print('Results of ' + str(M) + ' Neuron w/ Learning Rate ' + str(r) + ':' + '\n')
            if not t:
                print('Train Neg Log-Likelihood: ' + str(nll_t[r][-1]) + '\n')
            print('Valid Neg Log-Likelihood: ' + str(nll_v[r][-1]) + '\n')
            print('Minimum Valid Neg Log-Likelihood: ' + str(ll_v_min) +
                  ' at Iteration ' + str(ll_v_it_min + 1) + '\n')

        # Compute Optimal Validation and Test Set Log-Likelihood and Accuracy Ratios
        if t:
            ratio_v = acc_ratio_2(w_1_opt, w_2_opt, w_3_opt, b_1_opt, b_2_opt, b_3_opt,
                                  x_valid, y_valid)
            ratio_test = acc_ratio_2(w_1_opt, w_2_opt, w_3_opt, b_1_opt, b_2_opt, b_3_opt,
                                     x_test, y_test)
            nll_test = negative_log_likelihood(w_1_opt, w_2_opt, w_3_opt,
                                               b_1_opt, b_2_opt, b_3_opt, x_test, y_test)
            print('Optimal Validation Ratio: ' + str(ratio_v) + '\n')
            print('Optimal Test Ratio: ' + str(ratio_test) + '\n')
            print('Optimal Test Neg Log-Likelihood: ' + str(nll_test) +
                  ' at Iteration ' + str(ll_v_it_min + 1))
        if d:
            # Sort the test digits by the network's maximum output probability
            F = np.max(np.exp(forward_pass(w_1_opt, w_2_opt, w_3_opt,
                                           b_1_opt, b_2_opt, b_3_opt, x_test)), axis=1)
            ind_sorted = np.argsort(F)
            test_sorted = x_test[ind_sorted]
            print('\n')

    if d:
        return ind_sorted, test_sorted
    if not v:
        if not t:
            return nll_t, nll_v
        else:
            return nll_v
    else:
        # Plot 17 distinct random first-layer neurons. (The original retried
        # duplicates with `i -= 1` inside a for-loop, which has no effect in
        # Python, so a while-loop is used instead.)
        seen = list()
        while len(seen) < 17:
            j = np.random.randint(M)
            if j not in seen:
                seen.append(j)
                plot_digit(w_1[j], j, 0, neuron=True)
# Save the parameters of the variational posterior
pickle.dump(var_par, open('./data/var_par.p', 'wb'))
var_par = pickle.load(open('./data/var_par.p', 'rb'))

pars = get_pars(var_par[:l], k, d)
pi = stick_backward(pars['pi'])
mus = pars['mu'].reshape([k, d])
taus = np.exp(pars['tau'])

colors = ['red', 'blue', 'yellow', 'orange', 'turquoise']

from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages('mog_advi.pdf')

# keep the five components with the largest mixture weights
fracs = np.argsort(pi)[-5:]
mus = mus[fracs, :]
taus = taus[fracs]

circle = []
true_circle = []
for n, color in enumerate(colors):
    v, w = np.linalg.eigh(taus[n] * np.eye(k))
    v_true, w_true = np.linalg.eigh(ts[n] * np.eye(k))
    u = w[0] / np.linalg.norm(w[0])
    u_true = w_true[0] / np.linalg.norm(w_true[0])
    angle = np.arctan2(u[1], u[0])
    angle_true = np.arctan2(u_true[1], u_true[0])
    angle = 180 * angle / np.pi
    angle_true = 180 * angle_true / np.pi
    v = 2. * np.sqrt(2.) * np.sqrt(v)
w, u, b = W[k], U[k], B[k]
u_hat = (m(np.dot(w, u)) - np.dot(w, u)) * (w / np.linalg.norm(w)) + u
z_prev = z_prev + np.outer(h(np.matmul(z_prev, w) + b), u_hat)
z_K = z_prev

plt.figure(figsize=(10, 8))
plt.plot(objectives)
plt.show()

# fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# nbins = 100
# x, y = z0[:, 0], z0[:, 1]
# xi, yi = numpy.mgrid[-4:4:nbins*1j, -4:4:nbins*1j]
# zi = np.array([func(np.vstack([xi.flatten(), yi.flatten()])[:, i].reshape(-1, 2))
#                for i in range(nbins**2)])
# ax.pcolormesh(xi, yi, zi.reshape(xi.shape))
# ax.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=plt.cm.Reds_r)
# plt.scatter(z_K[:, 0], z_K[:, 1], alpha=0.2)
# plt.xlim([-4, 4])
# plt.ylim([-4, 4])
# plt.savefig('results/'+func.__name__+'/'+func.__name__+'_'+str(K)+'_'+str(num_iter)+'.png')
# plt.show()

plt.figure(figsize=(10, 8))
samples = np.linspace(-3, 3, 601)
z = np.array([gmm(samples[i]) for i in range(samples.shape[0])])
idx = np.argsort(samples)
plt.plot(samples[idx], z[idx], label='p')
plt.hist(z_K, 100, label='q', density=True)
plt.legend()
plt.show()
def illustrate_gradients(g, pts, **kwargs):
    # user defined args
    pts_max = np.max(np.max(pts)) + 3
    viewmax = max(3, pts_max)
    colors = ['lime', 'magenta', 'orangered']
    if 'viewmax' in kwargs:
        viewmax = kwargs['viewmax']
    num_contours = 15
    if 'num_contours' in kwargs:
        num_contours = kwargs['num_contours']

    ##### setup figure to plot #####
    # initialize figure
    fig = plt.figure(figsize=(8, 4))

    # create subplot with 3 panels, plot input function in center plot
    gs = gridspec.GridSpec(1, 3, width_ratios=[1, 5, 1])
    ax1 = plt.subplot(gs[0])
    ax1.axis('off')
    ax2 = plt.subplot(gs[1])
    ax2.set_aspect('equal')
    ax3 = plt.subplot(gs[2])
    ax3.axis('off')

    ### compute gradient of input function ###
    nabla_g = grad(g)

    # loop over points and determine levels
    num_pts = pts.shape[1]
    levels = []
    for t in range(num_pts):
        pt = pts[:, t]
        g_val = g(pt)
        levels.append(g_val)
    levels = np.array(levels)
    inds = np.argsort(levels, axis=None)
    pts = pts[:, inds]
    levels = levels[inds]

    # evaluate all input points through gradient function
    grad_pts = []
    num_pts = pts.shape[1]
    for t in range(num_pts):
        # point
        color = colors[t]
        pt = pts[:, t]
        nabla_pt = nabla_g(pt)
        nabla_pt /= np.linalg.norm(nabla_pt)

        # plot original points
        ax2.scatter(pt[0], pt[1], s=80, c=color, edgecolor='k', linewidth=2, zorder=3)

        ### draw 2d arrow in right plot ###
        # create gradient vector
        grad_pt = pt - nabla_pt

        # plot gradient direction
        scale = 0.3
        arrow_pt = (grad_pt - pt) * 0.78 * viewmax * scale
        ax2.arrow(pt[0], pt[1], arrow_pt[0], arrow_pt[1], head_width=0.1,
                  head_length=0.1, fc='k', ec='k', linewidth=4, zorder=2,
                  length_includes_head=True)
        ax2.arrow(pt[0], pt[1], arrow_pt[0], arrow_pt[1], head_width=0.1,
                  head_length=0.1, fc=color, ec=color, linewidth=2.75, zorder=2,
                  length_includes_head=True)

        ### compute orthogonal line to contour ###
        # compute slope of gradient direction
        slope = float(arrow_pt[1]) / float(arrow_pt[0])
        perp_slope = -1 / slope
        perp_inter = pt[1] - perp_slope * pt[0]

        # find points on orthog line approx 'scale' away in both directions (lazy quadratic formula)
        scale = 1.5
        s = np.linspace(pt[0] - 5, pt[0] + 5, 1000)
        y2 = perp_slope * s + perp_inter
        dists = np.abs(((s - pt[0])**2 + (y2 - pt[1])**2)**0.5 - scale)
        ind = np.argmin(dists)
        x2 = s[ind]

        # plot tangent line to contour
        if x2 < pt[0]:
            s = np.linspace(x2, pt[0] + abs(x2 - pt[0]), 200)
        else:
            s = np.linspace(pt[0] - abs(x2 - pt[0]), x2, 200)
        v = perp_slope * s + perp_inter
        ax2.plot(s, v, zorder=2, c='k', linewidth=3)
        ax2.plot(s, v, zorder=2, c=colors[t], linewidth=1)

    # generate viewing range
    contour_plot(ax2, g, pts, viewmax, num_contours, colors, levels)
    plt.show()
def __init__(self, *args, **kwargs):
    super(TestNeuralNetworkHingeSynthetic, self).__init__(*args, **kwargs)
    self.train_size = 250
    self.test_size = 10
    self.noise_factor = 0.0

    np.random.seed(1)
    features_train = np.asarray(
        onp.random.randint(low=0, high=30, size=(self.train_size, 4)))
    features_test = np.asarray(
        onp.random.randint(low=0, high=30, size=(self.test_size, 4)))

    def create_performances(feature_list):
        performances = []
        for features in feature_list:
            # generate performances as functions linear in the features
            performance_1 = 5 * features[0] + 2 * features[1] + 7 * features[2] + 42
            performance_2 = 3 * features[1] + 5 * features[3] + 14
            performance_3 = 2 * features[0] + 4 * features[1] + 11 * features[3] + 77
            performance_4 = 7 * features[1] + 4 * features[0] + 11 * features[2] + features[3]
            performance_5 = 2 * features[1] + 9 * features[2] + 7 * features[3] + 12 + features[0]
            performances.append([performance_1, performance_2, performance_3,
                                 performance_4, performance_5])
            # performances.append([performance_1, performance_5])
        return performances

    performances_train = np.asarray(create_performances(features_train), dtype=np.float64)
    performances_test = np.asarray(create_performances(features_test), dtype=np.float64)

    features_train = np.asarray(features_train, dtype=np.float64)
    features_test = np.asarray(features_test, dtype=np.float64)

    rankings_train = np.argsort(np.argsort(np.asarray(performances_train))) + 1
    rankings_test = np.argsort(np.argsort(np.asarray(performances_test))) + 1

    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    self.train_inst = pd.DataFrame(data=features_train, columns=["a", "b", "c", "d"])
    self.test_inst = pd.DataFrame(data=features_test, columns=["a", "b", "c", "d"])
    self.train_performances = pd.DataFrame(
        data=performances_train, columns=["alg1", "alg2", "alg3", "alg4", "alg5"])
    self.test_performances = pd.DataFrame(
        data=performances_test, columns=["alg1", "alg2", "alg3", "alg4", "alg5"])
    self.train_ranking = pd.DataFrame(
        data=rankings_train, columns=["alg1", "alg2", "alg3", "alg4", "alg5"])
    self.test_ranking = pd.DataFrame(
        data=rankings_test, columns=["alg1", "alg2", "alg3", "alg4", "alg5"])

    print("train instances", self.train_inst)
    print("test instances", self.test_inst)
    print("train performances", self.train_performances)
    print("train rankings", self.train_ranking)
    print("test performances", self.test_performances)
    print("test rankings", self.test_ranking)
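# Why np.argsort(np.argsort(x)) gives ranks, as used for the rankings above:
x = np.array([10., 30., 20.])
print(np.argsort(x))                  # [0 2 1]: the sorting order of the elements
print(np.argsort(np.argsort(x)))      # [0 2 1]: the 0-based rank of each element
print(np.argsort(np.argsort(x)) + 1)  # [1 3 2]: 1-based ranks, smallest first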
def fda(X, y, p=2, reg=1e-16):
    """
    Fisher Discriminant Analysis

    Parameters
    ----------
    X : numpy.ndarray (n,d)
        Training samples
    y : np.ndarray (n,)
        Labels for training samples
    p : int, optional
        Size of the dimensionality reduction
    reg : float, optional
        Regularization term > 0 (ridge regularization)

    Returns
    -------
    P : (d x p) ndarray
        Projection matrix for the given parameters
    proj : fun
        Projection function including mean centering
    """
    mx = np.mean(X, axis=0)  # per-feature mean so the centering broadcasts correctly
    X -= mx.reshape((1, -1))

    # data split between classes
    d = X.shape[1]
    xc = split_classes(X, y)
    nc = len(xc)
    p = min(nc - 1, p)

    Cw = 0
    for x in xc:
        Cw += np.cov(x, rowvar=False)
    Cw /= nc

    mxc = np.zeros((d, nc))
    for i in range(nc):
        mxc[:, i] = np.mean(xc[i], axis=0)
    mx0 = np.mean(mxc, 1)

    Cb = 0
    for i in range(nc):
        Cb += (mxc[:, i] - mx0).reshape((-1, 1)) * \
              (mxc[:, i] - mx0).reshape((1, -1))

    w, V = linalg.eig(Cb, Cw + reg * np.eye(d))

    idx = np.argsort(w.real)

    Popt = V[:, idx[-p:]]

    def proj(X):
        return (X - mx.reshape((1, -1))).dot(Popt)

    return Popt, proj
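# Hypothetical usage sketch (assumes `split_classes` groups the rows of X by
# label, as the function above requires; the data is illustrative):
# X = np.random.randn(100, 5)
# y = np.repeat([0, 1], 50)
# Popt, proj = fda(X, y, p=1)
# X_low = proj(X)  # (100, 1): projection onto the discriminant direction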
def _read_kurucz_spec(f):
    """
    Read Kurucz spectra that have been precomputed

    Args:
        f (string) : path to the file to be read

    Returns:
        new_vel (real array) : velocity axis in km/s
        wavelength (real array) : wavelength axis in cm
        stokes (real array) : spectrum for each velocity bin
    """
    f = open(f, "rb")
    res = f.read()

    n_chunk = struct.unpack('i', res[0:4])

    freq = []
    stokes = []
    cont = []

    left = 4
    for i in range(n_chunk[0]):
        right = left + 4
        n = struct.unpack('i', res[left:right])

        left = right
        right = left + 4
        nmus = struct.unpack('i', res[left:right])

        left = right
        right = left + 8 * n[0]
        t1 = np.asarray(struct.unpack('d' * n[0], res[left:right]))
        freq.append(t1)

        left = right
        right = left + 8 * n[0] * nmus[0]
        t2 = np.asarray(struct.unpack('d' * n[0] * nmus[0], res[left:right])).reshape((n[0], nmus[0]))
        stokes.append(t2)

        left = right
        right = left + 8 * n[0] * nmus[0]
        t2 = np.asarray(struct.unpack('d' * n[0] * nmus[0], res[left:right])).reshape((n[0], nmus[0]))
        cont.append(t2)

        left = right

    freq = np.concatenate(freq)
    stokes = np.concatenate(stokes)
    cont = np.concatenate(cont)

    ind = np.argsort(freq)
    freq = freq[ind]
    stokes = stokes[ind]
    cont = cont[ind]

    wavelength = const.c.to('cm/s').value / freq
    mean_wavelength = np.mean(wavelength)

    vel = (wavelength - mean_wavelength) / mean_wavelength * const.c.to('km/s').value

    nl, nmus = stokes.shape

    # Reinterpolate on an equidistant velocity axis
    new_vel = np.linspace(np.min(vel), np.max(vel), nl)
    for i in range(nmus):
        interpolator = scipy.interpolate.interp1d(vel, stokes[:, i], kind='linear')
        stokes[:, i] = interpolator(new_vel)

    return new_vel, wavelength, stokes
def fo_ess_compute_newton(diagonal, num_peds, robot_mu_x, robot_mu_y,
                          ped_mu_x, ped_mu_y, cov_robot_x, cov_robot_y,
                          inv_cov_robot_x, inv_cov_robot_y, cov_ped_x, cov_ped_y,
                          inv_cov_ped_x, inv_cov_ped_y,
                          one_over_cov_sum_x, one_over_cov_sum_y, normalize):
    delta0 = [0. for _ in range(num_peds)]
    norm_delta0 = [0. for _ in range(num_peds)]
    norm_delta0_normalized = [0. for _ in range(num_peds)]

    T = np.size(robot_mu_x)

    for ped in range(num_peds):
        x0 = robot_mu_x
        x0 = np.concatenate((x0, robot_mu_y))
        x0 = np.concatenate((x0, ped_mu_x[ped]))
        x0 = np.concatenate((x0, ped_mu_y[ped]))

        if diagonal:
            g_ll = fo_diag_ess.d_ll(x0, T,
                                    robot_mu_x, robot_mu_y,
                                    ped_mu_x[ped], ped_mu_y[ped],
                                    cov_robot_x, cov_robot_y,
                                    inv_cov_robot_x, inv_cov_robot_y,
                                    cov_ped_x[ped], cov_ped_y[ped],
                                    inv_cov_ped_x[ped], inv_cov_ped_y[ped],
                                    one_over_cov_sum_x[ped], one_over_cov_sum_y[ped],
                                    normalize)
            h_ll = fo_diag_ess.dd_ll(x0, T,
                                     robot_mu_x, robot_mu_y,
                                     ped_mu_x[ped], ped_mu_y[ped],
                                     cov_robot_x, cov_robot_y,
                                     inv_cov_robot_x, inv_cov_robot_y,
                                     cov_ped_x[ped], cov_ped_y[ped],
                                     inv_cov_ped_x[ped], inv_cov_ped_y[ped],
                                     one_over_cov_sum_x[ped], one_over_cov_sum_y[ped],
                                     normalize)
        else:
            g_ll = fo_dense_ess.d_ll(x0, T,
                                     robot_mu_x, robot_mu_y,
                                     ped_mu_x[ped], ped_mu_y[ped],
                                     cov_robot_x, cov_robot_y,
                                     inv_cov_robot_x, inv_cov_robot_y,
                                     cov_ped_x[ped], cov_ped_y[ped],
                                     inv_cov_ped_x[ped], inv_cov_ped_y[ped],
                                     one_over_cov_sum_x[ped], one_over_cov_sum_y[ped],
                                     normalize)
            h_ll = fo_dense_ess.dd_ll(x0, T,
                                      robot_mu_x, robot_mu_y,
                                      ped_mu_x[ped], ped_mu_y[ped],
                                      cov_robot_x, cov_robot_y,
                                      inv_cov_robot_x, inv_cov_robot_y,
                                      cov_ped_x[ped], cov_ped_y[ped],
                                      inv_cov_ped_x[ped], inv_cov_ped_y[ped],
                                      one_over_cov_sum_x[ped], one_over_cov_sum_y[ped],
                                      normalize)

        delta0[ped] = np.linalg.solve(h_ll, -g_ll)
        norm_delta0[ped] = np.linalg.norm(delta0[ped])

        ############################# MINIMIZE ON EACH AGENT
        # f = sp.optimize.minimize(diag_ll_ess, x0,
        #         args=(T, robot_mu_x, robot_mu_y,
        #               ped_mu_x[ped], ped_mu_y[ped],
        #               inv_cov_robot_x, inv_cov_robot_y,
        #               inv_cov_ped_x[ped], inv_cov_ped_y[ped],
        #               one_over_cov_sum_x[ped], one_over_cov_sum_y[ped],
        #               one_over_std_sum_x[ped], one_over_std_sum_y[ped]),
        #         method='trust-krylov',
        #         jac=fo_diag_ess.d_ll, hess=so_diag_ess.dd_ll)
        # norm_delta0[ped] = np.linalg.norm(f.x[:T] - robot_mu_x) + \
        #                    np.linalg.norm(f.x[T:2*T] - robot_mu_y)

    norm_delta0_normalized = norm_delta0 / (np.sum(norm_delta0))
    ess = np.power(np.sum(np.power(norm_delta0_normalized, 2)), -1)
    if np.isnan(ess):
        ess = 1.
        print("ESS is NaN (all Newton steps have zero norm); falling back to 1")
    else:
        ess = int(ess)  # np.int was removed in NumPy 1.24; use the builtin

    top_Z_indices = np.argsort(norm_delta0_normalized)[::-1]

    return ess, top_Z_indices
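# Sanity check for the effective-sample-size formula used above,
# ESS = 1 / sum(p_i^2) for normalized weights p_i: uniform weights over n
# agents give ESS = n, while a single dominant weight gives ESS close to 1.
p = np.ones(4) / 4.0
assert abs(1.0 / np.sum(p ** 2) - 4.0) < 1e-12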
def neural_network(x_train, x_valid, x_test, y_train, y_valid, y_test, weights, bias,
                   batch_size, test=False, acc=False, visual=False, digit=False,
                   iterations=1000, rates=[0.0001, 0.001]):
    '''
    trains a neural network for the data in x_train, and computes the training
    and validation error per iteration

    param data: from the data_utils.py script
    param weights: list of W, initial weight vector guesses
    param bias: list of b, initial bias guesses
    param batch_size: int, number of random elements to include in the mini-batch
    param test: bool, True if we should calculate the test error for the set
    param acc: bool, True if we want to calculate the accuracy of the model
    param visual: bool, True if we want to visualize 16 of the weights
    param iterations: int, number of iterations of GD to execute
    param rates: list of floats, learning rates to train the model with GD
    '''
    W1 = weights[0]
    W2 = weights[1]
    W3 = weights[2]
    b1 = bias[0]
    b2 = bias[1]
    b3 = bias[2]

    train_nll = {}
    valid_nll = {}
    valid_acc = {}
    test_nll = {}
    test_acc = {}
    min_valid_nll = {}
    min_nll_it = {}

    for rate in rates:
        train_nll[rate] = []
        valid_nll[rate] = []
        min_nll = np.inf
        min_nll_weights = None
        min_nll_bias = None

        for i in range(iterations):
            # compute mini-batch gradient descent to estimate the new weights
            batch_indices = np.random.choice(np.shape(x_train)[0], size=batch_size,
                                             replace=False)
            x_mini_batch = x_train[batch_indices, :]
            y_mini_batch = y_train[batch_indices, :]

            # use the autograd nll function
            (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) = \
                nll_gradients(W1, W2, W3, b1, b2, b3, x_mini_batch, y_mini_batch)

            # calculate the full-batch validation neg. ll at every iteration
            cur_valid_nll = negative_log_likelihood(W1, W2, W3, b1, b2, b3,
                                                    x_valid, y_valid)
            valid_nll[rate].append(cur_valid_nll)

            # use the mini-batch neg. log likelihood but scaled to the size of
            # x_train (NORMALIZED)
            train_nll[rate].append(nll * len(x_train) / batch_size)

            # compute the minimum neg. ll to use this number of iterations on the test set
            if cur_valid_nll < min_nll:
                min_nll = cur_valid_nll
                min_nll_weights = [W1, W2, W3]
                min_nll_bias = [b1, b2, b3]
                # dictionaries we need to return
                min_valid_nll[rate] = cur_valid_nll
                min_nll_it[rate] = i + 1

            # calculate new weights
            [W1, W2, W3] = update_weights([W1, W2, W3], [W1_grad, W2_grad, W3_grad], rate)
            [b1, b2, b3] = update_bias([b1, b2, b3], [b1_grad, b2_grad, b3_grad], rate)

        if test:
            # use early stopping for the test set based on the minimum validation error
            test_nll[rate] = negative_log_likelihood(
                min_nll_weights[0], min_nll_weights[1], min_nll_weights[2],
                min_nll_bias[0], min_nll_bias[1], min_nll_bias[2], x_test, y_test)

            # use the min validation error iteration to compute accuracy (test and validation)
            valid_acc[rate] = calculate_accuracy(min_nll_weights, min_nll_bias,
                                                 x_valid, y_valid)
            test_acc[rate] = calculate_accuracy(min_nll_weights, min_nll_bias,
                                                x_test, y_test)

        if digit:
            Fhat = np.max(np.exp(forward_pass(
                min_nll_weights[0], min_nll_weights[1], min_nll_weights[2],
                min_nll_bias[0], min_nll_bias[1], min_nll_bias[2], x_test)), axis=1)
            sorted_ind = np.argsort(Fhat)
            sorted_test_set = x_test[sorted_ind]

    if visual:
        # visualize 16 random weights for the first layer of the network
        M = len(W1)
        for i in range(17):
            j = np.random.randint(M)
            plot_digit_mod(W1[j], i + 10, "weight_vis_" + str(j))
    elif digit:
        # sorted_ind and sorted_test_set exist, so we will plot the figures
        for i in range(10):
            plot_digit_mod(sorted_test_set[i], i + 26,
                           "test_" + str(sorted_ind[i]) + "rank_" + str(i))

    return train_nll, valid_nll, valid_acc, test_nll, test_acc, min_valid_nll, min_nll_it
def fit(
    self,
    train_sample: tp.Tuple[np.ndarray],
    validation_sample: tp.Tuple[np.ndarray],
    max_gmdh_layers: int,
    n_best_to_take: int,
    batch_size: tp.Optional[int] = None,
    minimize_metric: bool = True,
    verbose: tp.Optional[bool] = None,
):
    """
    Fit network on given input

    Parameters
    ----------
    :param train_sample: Train pair (X, y).
    :type train_sample: tuple
    :param validation_sample: Validation pair (X, y).
    :type validation_sample: tuple
    :param max_gmdh_layers: Maximum number of GMDH layers.
    :type max_gmdh_layers: int
    :param n_best_to_take: Number of best GMDH outputs that go to the next layer.
    :type n_best_to_take: int
    :param batch_size: If we have `long` data we can optimize it by batches.
    :type batch_size: int
    :param minimize_metric: Whether we minimize the target metric.
    :type minimize_metric: bool
    :param verbose: Whether we turn on verbosity.
    :type verbose: bool

    Returns
    -------
    :returns: Tuple (trained_model, loss_history)
    :rtype: tuple[FFN, dict]
    """
    verbose = verbose if verbose else False
    is_fuzzy = self._method_type == "fuzzy"
    all_possible_pairs = list(combinations(range(train_sample[0].shape[1]), 2))
    overall_best_metric = np.inf if minimize_metric else -np.inf
    best_test_pred = None
    best_train_pred = None
    history = dict(layer=[], train_loss=[], validation_loss=[])

    for r in tqdm(range(max_gmdh_layers), desc="Training "):
        layer_metrics = []
        layer_metrics_train = []
        layer_val_preds = []
        layer_train_preds = []
        history_weights = []
        history_pairs = []

        for pair in tqdm(all_possible_pairs, desc="One fit"):
            if batch_size is not None and train_sample[0].shape[1] < batch_size:
                for (X, y) in gen_batch(train_sample, batch_size):
                    (
                        metric_val,
                        metric_train,
                        prediction_val,
                        prediction_train,
                        stop_outer_loop,
                    ) = self.one_fit(
                        is_fuzzy=is_fuzzy,
                        train_sample=(X, y),
                        validation_sample=validation_sample,
                        pair=pair,
                    )
                    if stop_outer_loop:
                        break
                    # Exclude a batch that is not full
                    if prediction_train.shape[0] != batch_size:
                        break
                    layer_metrics.append(metric_val)
                    layer_metrics_train.append(metric_train)
                    layer_val_preds.append(prediction_val)
                    layer_train_preds.append(prediction_train)
                    history_weights.append(self.W_vect)
                    history_pairs.append(pair)
                if stop_outer_loop:
                    break
            else:
                (
                    metric_val,
                    metric_train,
                    prediction_val,
                    prediction_train,
                    stop_outer_loop,
                ) = self.one_fit(
                    is_fuzzy=is_fuzzy,
                    train_sample=train_sample,
                    validation_sample=validation_sample,
                    pair=pair,
                )
                if stop_outer_loop:
                    break
                layer_metrics.append(metric_val)
                layer_metrics_train.append(metric_train)
                layer_val_preds.append(prediction_val)
                layer_train_preds.append(prediction_train)
                history_weights.append(self.W_vect)
                history_pairs.append(pair)

        if stop_outer_loop:
            warnings.warn("Something went wrong in optimization")
            break

        layer_metrics = np.array(layer_metrics)
        layer_metrics_train = np.array(layer_metrics_train)
        layer_val_preds = np.concatenate(layer_val_preds, axis=-1)
        layer_train_preds = np.concatenate(layer_train_preds, axis=-1)

        # argsort of the (negated) metric ranks the candidate models best-first
        if minimize_metric:
            sorted_indices = np.argsort(layer_metrics)
        else:
            sorted_indices = np.argsort(-layer_metrics)

        best_metric = layer_metrics[sorted_indices[0]]
        history["validation_loss"].append(best_metric)
        history["train_loss"].append(layer_metrics_train[sorted_indices[0]])

        layer_val_preds = layer_val_preds[:, sorted_indices]
        validation_sample = (layer_val_preds[:, :n_best_to_take], validation_sample[1])
        layer_train_preds = layer_train_preds[:, sorted_indices]
        train_sample = (layer_train_preds[:, :n_best_to_take], train_sample[1])
        all_possible_pairs = list(combinations(range(train_sample[0].shape[1]), 2))

        if verbose:
            print(f"Layer: {r}. Metric: {best_metric}")

        if minimize_metric and best_metric < overall_best_metric:
            overall_best_metric = best_metric
            best_test_pred = layer_val_preds[:, 0][..., np.newaxis]
            best_train_pred = layer_train_preds[:, 0][..., np.newaxis]
            self.predict_history["pairs"].append(
                [history_pairs[i] for i in sorted_indices[:n_best_to_take]])
            self.predict_history["weights"].append(
                [history_weights[i] for i in sorted_indices[:n_best_to_take]])
        elif (not minimize_metric) and best_metric > overall_best_metric:
            overall_best_metric = best_metric
            best_test_pred = layer_val_preds[:, 0][..., np.newaxis]
            best_train_pred = layer_train_preds[:, 0][..., np.newaxis]
            self.predict_history["pairs"].append(
                [history_pairs[i] for i in sorted_indices[:n_best_to_take]])
            self.predict_history["weights"].append(
                [history_weights[i] for i in sorted_indices[:n_best_to_take]])
        else:
            break

    return self, history
def compute_gsw(self, X, sw_mu, sw_sigma, theta=None, proj=True, requires_grad=False):
    """
    Computes the Sliced-Wasserstein distance of order 2 between two empirical
    distributions. The second distribution is Gaussian and is sampled
    internally with mean `sw_mu` and covariance `sw_sigma * I`. Note that the
    number of samples is assumed to be equal (this is not strictly necessary
    and could easily be extended to empirical distributions with different
    numbers of samples).
    :param X: stacks of samples from the first distribution (M x N x d matrix,
        with M the number of stacks, N the number of samples and d the dimension)
    :param sw_mu: mean of the comparison Gaussian
    :param sw_sigma: scale of the (isotropic) comparison Gaussian
    :param theta: stacks of projection directions (M x L x d matrix, with L the
        number of directions); drawn at random when None
    :return: the sliced-Wasserstein distance between X[i] and Y[i] for
        i in 1,...,M (vector of size M), plus gradients w.r.t. mu and sigma
        when `requires_grad` is True
    """
    M, N, d = X.shape
    n_montecarlo = M
    n_generated_samples = N
    gamma = sw_sigma * np.eye(d)
    U = np.random.normal(size=(n_montecarlo, n_generated_samples, d))
    Y = sw_mu + np.einsum('nij,njk->nik', U, gamma.T[np.newaxis, :])
    M_Y, N_Y, d_Y = Y.shape
    assert d == d_Y and M == M_Y
    order = self.order

    if proj:
        if theta is None:
            theta = self.random_slice(M, d)
        Xslices = self.get_slice(X, theta)
        Yslices = self.get_slice(Y, theta)
    else:
        Xslices, Yslices = X, Y

    # sort each 1-D projection; matching order statistics index-to-index
    # realizes the optimal 1-D coupling
    Xslices_sorted = np.sort(Xslices, axis=1)
    indices_sorted = np.argsort(Yslices, axis=1)
    Yslices_sorted = np.take_along_axis(Yslices, indices_sorted, axis=1)

    if N == N_Y:
        diff = Xslices_sorted - Yslices_sorted
        sw_dist = np.sum(np.abs(diff)**order, (1, 2)) / (self.n_projections * N)
        sw_dist = sw_dist.mean()
        if requires_grad:
            theta_U = self.get_slice(U, theta)
            theta_U = np.take_along_axis(theta_U, indices_sorted, axis=1)
            replicate_theta = (np.stack([theta] * Xslices.shape[1])).transpose((1, 0, 2, 3))
            sw_grad_mu = -order * (diff[:, :, :, np.newaxis])**(order - 1) * replicate_theta
            sw_grad_sigma = -order * diff**(order - 1) * theta_U
            if order % 2 == 1:
                sw_grad_mu = -order * (diff[:, :, :, np.newaxis])**(order - 1) * (
                    diff[:, :, :, np.newaxis] / np.abs(diff[:, :, :, np.newaxis])) * replicate_theta
                sw_grad_sigma = -order * diff**(order - 1) * (diff / np.abs(diff)) * theta_U
            sw_grad_mu = (sw_grad_mu.reshape(-1, sw_grad_mu.shape[-1])).mean(axis=0)
            sw_grad_sigma = sw_grad_sigma.mean()
            return sw_dist, sw_grad_mu, sw_grad_sigma
        else:
            return sw_dist
    else:
        n_quantiles = 100
        discretization_quantiles = np.linspace(0, 1, n_quantiles + 2)
        discretization_quantiles = discretization_quantiles[1:-1]

        # With linear interpolation
        positions = (N - 1) * discretization_quantiles
        floored = np.floor(positions).astype(int)
        ceiled = floored + 1
        ceiled[ceiled > N - 1] = N - 1
        weight_ceiled = positions - floored
        weight_floored = 1.0 - weight_ceiled
        d0 = Xslices_sorted[:, :, floored] * weight_floored[np.newaxis, np.newaxis, :]
        d1 = Xslices_sorted[:, :, ceiled] * weight_ceiled[np.newaxis, np.newaxis, :]
        X_empirical_qf = d0 + d1

        positions = (N_Y - 1) * discretization_quantiles
        floored = np.floor(positions).astype(int)
        ceiled = floored + 1
        ceiled[ceiled > N_Y - 1] = N_Y - 1
        weight_ceiled = positions - floored
        weight_floored = 1.0 - weight_ceiled
        d0 = Yslices_sorted[:, :, floored] * weight_floored[np.newaxis, np.newaxis, :]
        d1 = Yslices_sorted[:, :, ceiled] * weight_ceiled[np.newaxis, np.newaxis, :]
        Y_empirical_qf = d0 + d1

        return (np.sum(np.abs(X_empirical_qf - Y_empirical_qf)**self.order, (1, 2))
                / (self.n_projections * n_quantiles))**(1 / self.order)
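# Standalone illustration of the 1-D core used above (data illustrative):
# for equal sample counts, the order-2 distance on one projection is the mean
# squared difference of the sorted samples.
import numpy as np

x = np.random.randn(10000)
y = 2.0 + np.random.randn(10000)
w2_sq = np.mean((np.sort(x) - np.sort(y)) ** 2)
print(w2_sq)  # ~ 4.0, the squared mean shift between the two samples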