def get_dataset_files_fs(dataset_path):
    '''
    Scales ALL data for feature selection.
    Scaled files are saved in the normalized/ subfolders of the normal and anomalous folders.
    '''
    print('\t[...] Retrieving datasets')
    # Get paths to data
    path_normal = dataset_path + '/normal/not_normalized/'
    path_anomalous = dataset_path + '/anomalous/not_normalized/'
    dest_normal = dataset_path + '/normal/normalized/'
    dest_anomalous = dataset_path + '/anomalous/normalized/'
    # Remove any previously normalized files
    clean_dir(dest_normal)
    clean_dir(dest_anomalous)
    # Get lists of files
    normal_files = [path_normal + x for x in os.listdir(path_normal)]
    anomal_files = [path_anomalous + x for x in os.listdir(path_anomalous)]
    all_files = normal_files + anomal_files
    # Fit the scaler on all data
    print('\t[...] Fitting scaler')
    scaler = utils.fit_scaler(all_files)
    # Apply standardization and save files @ dest_...
    print('\t[...] Scaling data')
    utils.standardize(normal_files, scaler, dest_normal)
    utils.standardize(anomal_files, scaler, dest_anomalous)
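# The helpers used above (clean_dir, utils.fit_scaler, utils.standardize) are not
# shown in this snippet. A minimal sketch of what they might look like, assuming
# scikit-learn's StandardScaler and one 2-D .npy array per file -- an assumption,
# not the project's actual utils module:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler


def clean_dir(path):
    """Create `path` if needed and remove every file inside it."""
    os.makedirs(path, exist_ok=True)
    for name in os.listdir(path):
        os.remove(os.path.join(path, name))


def fit_scaler(files):
    """Fit a StandardScaler incrementally over all files."""
    scaler = StandardScaler()
    for f in files:
        scaler.partial_fit(np.load(f))  # assumes (samples, features) arrays
    return scaler


def standardize(files, scaler, dest):
    """Apply a fitted scaler to each file and save the result under `dest`."""
    for f in files:
        np.save(os.path.join(dest, os.path.basename(f)), scaler.transform(np.load(f)))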
def dimensionality_reduction(data, labels, method=1):
    '''
    :param data: feature matrix
    :param labels: class labels
    :param method: 1 = PCA, otherwise LDA
    :return: standardized DataFrame of reduced components with a 'labels' column
    '''
    # PCA
    if method == 1:
        reductedPCAData = mpl.mlab.PCA(data)
        # We create a dataFrame matching the reduced data with their label.
        dataFramePCA = pds.DataFrame(reductedPCAData.Y)
        dataFramePCA['labels'] = pds.DataFrame(labels)
        dataFramePCA = ut.standardize(dataFramePCA)
        reduction = dataFramePCA
    # LDA
    else:
        clf = skl.LDA(n_components=3)
        # clf.fit_transform(data, labels)
        # We create a dataFrame matching the reduced data with their label.
        dataFrameLDA = pds.DataFrame(clf.fit_transform(data, labels))
        dataFrameLDA['labels'] = pds.DataFrame(labels)
        dataFrameLDA = ut.standardize(dataFrameLDA)
        reduction = dataFrameLDA
    return reduction
def preprocess(no_wells_marmousi, no_wells_seam):
    """Function initializes data, performs standardization, and train test split

    Parameters
    ----------
    no_wells_marmousi : int
        number of evenly spaced wells and seismic samples to be evenly sampled
        from marmousi section.
    no_wells_seam : int
        number of evenly spaced wells and seismic samples to be evenly sampled
        from SEAM

    Returns
    -------
    seismic_marmousi : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for marmousi
    seismic_seam : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for SEAM
    model_marmousi : array_like, shape(num_wells, depth samples)
        2-D array containing model section from marmousi 2
    model_seam : array_like, shape(num_wells, depth samples)
        2-D array containing model section from SEAM
    """

    # get project root directory
    project_root = os.getcwd()

    # if the data directory does not exist, extract it
    # (note: `not` is required here; `~os.path.isdir(...)` is a bitwise negation
    # of a bool and is always truthy)
    if not os.path.isdir('data'):
        extract('data.zip', project_root)

    # Load data
    seismic_marmousi = np.load(join('data', 'marmousi_synthetic_seismic.npy')).squeeze()
    seismic_seam = np.load(join('data', 'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic_seam = seismic_seam[::2, :]

    # Load targets and standardize data
    model_marmousi = np.load(join('data', 'marmousi_Ip_model.npy')).squeeze()[::5, ::4]
    model_seam = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :, 50:]
    model_seam = model_seam[:, 0, :] * model_seam[:, 2, :]

    # standardize
    seismic_marmousi, model_marmousi = standardize(seismic_marmousi, model_marmousi, no_wells_marmousi)
    seismic_seam, model_seam = standardize(seismic_seam, model_seam, no_wells_seam)

    return seismic_marmousi, seismic_seam, model_marmousi, model_seam
def resolve(query, kb): threshold = kb.size tbu = indexed_kb() query = cleanup_query(query) query[0]["truth"] = not query[0]["truth"] tbu.add(query, occur_check=True) iter = 0 #print query start = time.time() while not tbu.empty(): iter += 1 x, parent = tbu.pop() if not sanity_check(parent, kb, threshold): continue #print "Popped: ", x for x_pred in x: if not x_pred["truth"]: indices = get_indices(kb.true, x_pred["name"]) else: indices = get_indices(kb.false, x_pred["name"]) #print "X_pred is ", x_pred, indices for index in indices: y = kb.all[index] for y_pred in y: sub = unify(x_pred, y_pred) if sub is not None: resolved_sentence = get_resolved_sentence( copy.deepcopy(x), copy.deepcopy(y), copy.deepcopy(x_pred), copy.deepcopy(y_pred), sub) if resolved_sentence == []: return True if isTrue(resolved_sentence): return False resolved_sentence = standardize(resolved_sentence) new_parent = copy.deepcopy(parent) if index not in new_parent: new_parent[index] = 0 new_parent[index] += 1 tbu.add(resolved_sentence, new_parent, occur_check=True, verbose=False) print tbu.size, iter if len(resolved_sentence) > 10000: print "x: ", x print "y: ", y print "sub: ", sub print "Resolved: ", resolved_sentence print tbu.size print '\n' xxx = input() end = time.time() if (end - start) > 10: print "Breaking out in 10 seconds" break x = standardize(x) kb.add(x, occur_check=True) return False
def fit(self, path, print_after=1, plot=False): """Wrapper method for training and saving the model""" X_train, X_test, Y_train, Y_test = self._load(path) Y_train = Y_train.reshape((1, -1)) Y_test = Y_test.reshape((1, -1)) X_train = standardize(X_train) X_test = standardize(X_test) _, n_feature = X_train.shape accuracy_to_plot = [] error_to_plot = [] curr_best = -1 for iter_ in range(self.n_init): print("Running Model {}".format(iter_ + 1)) self._init_weight(n_feature) cost, accuracy, error = self._train(X_train, Y_train, print_after, plot) if iter_ == 0 or cost < curr_best: self._save() curr_best = cost accuracy_to_plot = accuracy error_to_plot = error print("Loading the best model ...") dict_ = self.load_state_dict() self.w = dict_['w'] self.b = dict_['b'] if plot: plt.figure(1) plt.plot(range(self.n_epoch + 1), error_to_plot, c='b') plt.xlabel('Number of Epochs') plt.ylabel('Logistic Loss') plt.title('Loss Function vs Epochs') plt.savefig('./regr_error_plot.png') plt.figure(2) plt.plot(range(self.n_epoch + 1), accuracy_to_plot, c='r') plt.xlabel('Number of Epochs') plt.ylabel('Accuracy %') plt.title('Accuracy vs Epochs') plt.savefig('./regr_accuracy_plot.png') Y_pred = self.classify(X_test) dict_ = metrics(Y_test.reshape(-1), Y_pred.reshape(-1)) print("Validation Accuracy: {:4}".format(dict_['accuracy'])) print("F-Score: {:4}".format(100 * dict_['f1-score']))
def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' % (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item (article) content data
    # load_svmlight_file(file) reads data in svmlight format, stored as
    #   <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # With zero_based=False, all indices would be decremented by 1.
    # It returns (X, y), where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    # tf-idf text features
    item_content = tfidf(item_content)
    # dimensionality reduction via truncated SVD
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # feature standardization
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat
def fit(self, X, y): """Transforms dataset using computed S_w & S_b matrices to form new discriminants""" # if number of discriminants is not specified, make it equal to # of columns in dataset if self.n_discriminants is None: self.n_discriminants = X.shape[1] # standardize data if specified if self.centered: X_fit = standardize(X) else: X_fit = X # calculate S_w and S_b, to be used for eigen decomposition S_b = self.between_class_matrix(X_fit, y) S_w = self.within_class_matrix(X_fit, y) inv_Sw = np.linalg.inv(S_w) # get eigen values and eigen vectors to be used for data transformation eigen_vals, eigen_vecs = np.linalg.eig(inv_Sw @ S_b) # pair each eigen value with its eigen vector eigen_pairs = [(eigen_vals[i], eigen_vecs[:, i]) for i in range(len(eigen_vals))] # sort from high to low sorted_pairs = sorted(eigen_pairs, key=lambda x: x[0], reverse=True) # stack discriminants in appropriate order self.discriminants_ = np.hstack((sorted_pairs[i][1][:, np.newaxis].real for i in range(self.n_discriminants))) # calculated total explained variance for included discriminants self.variance_ratios_ = [np.abs(pair[0].real)/np.sum(eigen_vals.real) for pair in sorted_pairs[:self.n_discriminants]] return self
def fit(self, X): """ Determine the eigenvalues and eigenvectors of the feature matrix. Returns itself to be chained w/ the fit_transform() method """ # if no value for n_components is specified, create one for each column in dataset if self.n_components is None: self.n_components = X.shape[1] # standardize dataset, if specified if self.centered: X = standardize(X) # create covariance matrix, perform eigen decomposition # return the eigenvalues and eigen vectors from decomposition cov_mat = np.cov(X.T) eigen_vals, eigen_vecs = np.linalg.eig(cov_mat) # pair each eigen value with its eigen vector eigen_pairs = [(eigen_vals[i], eigen_vecs[:, i]) for i in range(len(eigen_vals))] # sort from high to low sorted_pairs = sorted(eigen_pairs, reverse=True) # stack components in appropriate order self.components_ = np.hstack((sorted_pairs[i][1][:, np.newaxis] for i in range(self.n_components))) self.variance_ratios_ = [ np.abs(pair[0].real) / np.sum(eigen_vals.real) for pair in sorted_pairs[:self.n_components] ] return self
def load_data(files, vnet, batch_size, num_gpus, norm):
    """Loads and preprocesses data."""
    # Optionally standardizes data.
    if norm:
        arr = [standardize(np.load(file)) for file in files]
    else:
        arr = [np.load(file) for file in files]
    if len(arr) == 1:
        arr = arr[0]
    # If all the same shape, concat.
    elif len(set([sub_arr.shape for sub_arr in arr])) == 1:
        arr = np.concatenate(arr)
    # 2D case with different shapes not implemented
    else:
        raise NotImplementedError()

    # Ensure dimensionality is correct.
    if arr.ndim == 4 and arr.shape[3] == 2:
        arr = arr[:, :, :, 1]
    elif arr.ndim == 4:
        arr = arr[:, :, :, 0]
    arr = np.expand_dims(arr, axis=3)

    # coords and orig_shape were previously undefined here (NameError);
    # mirror the fuller variant of this loader and return placeholders.
    coords = None
    orig_shape = arr.shape
    return arr, coords, orig_shape
def fit(self, X, y): """ Determine statistical relationship between columns in X and target variable y """ # standardize feature matrix if needed X_fit = np.zeros(X.shape) if self.centered: X_fit = standardize(X) else: X_fit = X # if gradient descent, then solve w/ closed form solution if not self.gd: # add bias unit X_fit = np.c_[np.ones(len(X_fit)), X_fit] self.coef_ = np.linalg.inv(X_fit.T @ X_fit + self.alpha * np.eye(X_fit.shape[1])) @ X_fit.T @ y # otherwise, use gradient descent else: # initialize weights, adding an extra for the intercept self.coef_ = np.random.normal(loc=0, scale=0.1, size=X.shape[1] + 1) self.cost_ = [] for i in range(self.n_iter): l2_grad = self.alpha * self.coef_[1:] # update l2 gradient l2_penalty = self.alpha * np.sum(self.coef_[1:]**2) # update l2 loss term output = self.predict(X_fit) # make prediction - linear output errors = y - output # get error column gradient = (X_fit.T @ errors + l2_grad) * 1/len(X) # get error wrt to each column, add l2, scale by 1/m self.coef_[1:] += gradient * self.eta # update the weights by gradients * learning rate self.coef_[0] += errors.sum() * self.eta * 1/len(X) # update intercept by error column * learning rate * 1/m cost = (np.sum(errors**2) + l2_penalty) / 2 # compute the cost self.cost_.append(cost) # log it
def fit(self, X, y): """ Determine statistical relationship between columns in X and target variable y """ # standardize feature matrix if needed if self.centered: X_fit = standardize(X) else: X_fit = X # if gradient descent, then solve w/ closed form solution if not self.gd: # add bias unit X_fit = np.c_[np.ones(len(X_fit)), X_fit] self.coef_ = np.linalg.inv(X_fit.T @ X_fit) @ X_fit.T @ y # otherwise, use gradient descent else: rgen = np.random.RandomState() # initialize weights, adding an extra for the intercept self.coef_ = rgen.normal(loc=0, scale=0.1, size=X_fit.shape[1] + 1) self.cost_ = [] for i in range(self.n_iter): output = self.predict(X_fit) # create prediction errors = y - output # get errors gradient = X_fit.T @ errors * 1 / len( X ) # get gradient w.r.t. each column, scale by # of samples self.coef_[1:] += gradient * self.eta # update weights self.coef_[0] += errors.sum() * self.eta * 1 / len( X) # update intercept -- no regularization cost = np.sum(errors**2) / 2 # calculate cost self.cost_.append(cost) # log it
def _corrupt(self, data, corruption): if type(corruption) == float: cdata = np.random.binomial(size=data.shape, n=1, p=1.-corruption) * data elif np.shape(np.asarray(corruption).T) == np.shape(data): cdata = corruption.T else: if self.layers[0].data_std is not None and self.layers[0].data_norm is not None: scales = np.random.uniform(low=corruption[0], high=corruption[1], size=data.shape[1]) data = u.unnormalise(data, self.layers[0].data_norm[0], self.layers[0].data_norm[1]) data = u.unstandardize(data, self.layers[0].data_std[0], self.layers[0].data_std[1]) p = np.random.binomial noise_maps = [np.random.normal(scale=sig, size=data.shape[0]) for sig in scales] #* p(1, 0.5) noise_maps = np.asarray(noise_maps) cdata = data + noise_maps.T cdata, _, _ = u.standardize(cdata, self.layers[0].data_std[0], self.layers[0].data_std[1]) cdata, _, _ = u.normalise(cdata, self.layers[0].data_norm[0], self.layers[0].data_norm[1]) # Just making sure we're not out of bounds: min_thr = 1e-6 max_thr = 0.99999 #if ((cdata < min_thr).sum() > 0 or (cdata > max_thr).sum() > 0) and False: # print np.amin(data), np.amax(data), np.mean(data), np.std(data) # print 'N/C:', (cdata < min_thr).sum(), (cdata > max_thr).sum() # print np.amin(cdata), np.amax(cdata), np.mean(cdata), np.std(cdata) # print cdata[cdata < min_thr] = min_thr cdata[cdata > max_thr] = max_thr return cdata
def predict(model, x, x_params_list, y_params, is_many=True):
    """
    Supports both single-sample and multi-sample prediction.
    For multi-sample input, x = [zz_inputs, xx_inputs, ly_inputs, xc_inputs, decoder_inputs],
        where each .._inputs has shape (samples, units, features).
    For single-sample input, x = [zz_input, xx_input, ly_input, xc_input, decoder_input],
        where each .._input has shape (units, features).
    :param model: AQPredict model
    :param x:
    :param x_params_list:
    :param y_params:
    :param is_many: whether the input contains multiple samples
    :return: y_pred = (samples, features) or (1, features)
    """
    x_stded = standardize(x, x_params_list, return_params_list=False)
    if is_many:
        return _predict_many(model, x_stded, y_params)
    else:
        return _predict_one(model, x_stded, y_params)
def main(args): proj_path = os.getcwd() data_path = 'data' test_path = data_path + '/test/preprocessed' model_save_path = 'model' save_freq = 10 max_epoch = 5000 max_patience = 30 window_size = 7 num_features = 264 batch_size = 16 net = torch.load(args[1]) test_x_list, test_y_list = utils.data_load('data/final/preprocessed') train_piece_lens = [] test_piece_lens = [] for i in range(len(test_x_list)): # Add 1 to train data for log computability. # It can be inversed at post-processing phase. test_x_list[i] = utils.standardize(test_x_list[i] + 1, log=True).T test_y_list[i] = test_y_list[i].T test_piece_lens.append(test_x_list[i].shape[0]) print('test loaded {}/{}'.format(i + 1, len(test_x_list))) test_x = np.vstack(test_x_list) del test_x_list test_y = np.vstack(test_y_list) del test_y_list # For GPU computing. dtype = torch.cuda.FloatTensor test_x = Variable(torch.Tensor(test_x).type(dtype)) test_x.volatile = True test_y = Variable(torch.Tensor(test_y).type(dtype)) test_y.volatile = True min_valid_loss = float('inf') patience = 0 # criterion = nn.BCEWithLogitsLoss() criterion = nn.MSELoss() optimizer = optim.Adam(net.parameters()) print('Preprocessing Completed.') # Train and calculate loss value. prec, recall, acc = run_test(net, test_x, test_y, criterion, test_piece_lens, batch_size, window_size) f_score = 2 * prec * recall / (prec + recall) print('Precision: {}\tRecall: {}\tAccuracy: {}'.format(prec, recall, acc)) print('F-score: {}'.format(f_score))
def load_data(files, vnet, batch_size, num_gpus, norm): """Loads and preprocesses data.""" # Optionally standardizes data. if norm: arr = [standardize(np.load(file)) for file in files] else: arr = [np.load(file) for file in files] if len(arr) == 1: arr = arr[0] # If all the same shape, concat. elif len(set([sub_arr.shape for sub_arr in arr])) == 1: arr = np.concatenate(arr) # If different shapes and 3D, chunk then concat. elif vnet: # TODO: Somehow save coords and orig_shape for each sub_arr. # Low priority because this block only used for training data right now. if arr[0].ndim == 4 and arr[0].shape[3] == 2: arr = [sub_arr[:, :, :, 1] for sub_arr in arr] elif arr[0].ndim == 4: arr = [sub_arr[:, :, :, 0] for sub_arr in arr] arr = [np.expand_dims(sub_arr, axis=3) for sub_arr in arr] chunked = [chunks(sub_arr, trim=False) for sub_arr in arr] arr = np.concatenate([chunk[0] for chunk in chunked]) # Avoids https://github.com/keras-team/keras/issues/11434 last_batch_gpus = (arr.shape[0] % batch_size) % num_gpus if last_batch_gpus != 0: arr = arr[:-last_batch_gpus, :, :, :, :] return arr, None, None # 2D case with different shapes not implemented else: raise NotImplementedError() # Ensure dimensionality is correct. if arr.ndim == 4 and arr.shape[3] == 2: arr = arr[:, :, :, 1] elif arr.ndim == 4: arr = arr[:, :, :, 0] arr = np.expand_dims(arr, axis=3) # Chunks data if necessary. if vnet: arr, coords, orig_shape = chunks(arr) else: # Avoids https://github.com/keras-team/keras/issues/11434 last_batch_gpus = (arr.shape[0] % batch_size) % num_gpus if last_batch_gpus != 0: arr = arr[:-last_batch_gpus, :, :, :] coords = None orig_shape = arr.shape return arr, coords, orig_shape
def _compute_spatial_kernels(self, train_paths, test_paths): for fn_train, fn_test in zip(train_paths, test_paths): # Process train set. ss = np.fromfile(fn_train, dtype=np.float32) xx = self.spatial_sstats_to_spatial_features(ss, self.gmm) xx, mu, sigma = standardize(xx) xx = power_normalize(xx, 0.5) self.Zx += compute_L2_normalization(xx) self.Kxx += dot(xx, xx.T) # Process test set. ss = np.fromfile(fn_test, dtype=np.float32) yy = self.spatial_sstats_to_spatial_features(ss, self.gmm) yy = standardize(yy, mu, sigma)[0] yy = power_normalize(yy, 0.5) self.Zy += compute_L2_normalization(yy) self.Kyx += dot(yy, xx.T)
def _compute_kernels(self, train_paths, test_paths): for fn_train, fn_test in zip(train_paths, test_paths): # Process train set. ss = np.fromfile(fn_train, dtype=np.float32) xx = self.sstats_to_features(ss, self.gmm) xx, mu, sigma = standardize(xx) xx = power_normalize(xx, 0.5) self.Zx += compute_L2_normalization(xx) self.Kxx += dot(xx, xx.T) # Process test set. ss = np.fromfile(fn_test, dtype=np.float32) yy = self.sstats_to_features(ss, self.gmm) yy = standardize(yy, mu, sigma)[0] yy = power_normalize(yy, 0.5) self.Zy += compute_L2_normalization(yy) self.Kyx += dot(yy, xx.T)
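# Both kernel loops post-process the features with power_normalize() and
# compute_L2_normalization(), which are not shown. A hedged sketch assuming the
# usual Fisher-vector conventions (signed power, per-row squared L2 norms) --
# the project's own definitions may differ:
import numpy as np


def power_normalize(xx, alpha=0.5):
    """Signed power normalization: sign(x) * |x|**alpha (assumed definition)."""
    return np.sign(xx) * np.abs(xx) ** alpha


def compute_L2_normalization(xx):
    """Per-row squared L2 norms, accumulated into Zx/Zy above (assumed definition)."""
    return np.sum(xx ** 2, axis=1)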
def read_data(self, index):
    """This function is used to read the data with the index

    :param index: the index of the data you want to get.
    """
    # if this is for training, load the data from the training lists
    if self.training:
        x1 = self.train_images[index]  # the first list of images (ADC)
        x2 = self.train_images[index]  # the second list of images (T2WI)
        y = self.train_labels[index]   # the list of labels
    else:
        # if this is for testing, load the data from the testing lists
        x1 = self.test_images[index]  # the first list of images (ADC)
        x2 = self.test_images[index]  # the second list of images (T2WI)
        y = self.test_labels[index]   # the list of labels

    height, width = x1.shape  # get the size of the image
    x1 = normalize(x1.reshape(height, width, 1))  # apply the normalization (norm to range [0, 1])
    x1 = standardize(x1)  # apply the standardization (reshape the data)
    x2 = normalize(x2.reshape(height, width, 1))  # apply the normalization (norm to range [0, 1])
    x2 = standardize(x2)  # apply the standardization (reshape the data)

    # apply data augmentation
    augmented_data = data_augmentation(np.concatenate([x1, x2], axis=2),
                                       use_rigid=self.use_rigid,
                                       use_non_rigid=self.use_non_rigid)

    # NOTE: the data used here has multiple classes, so it is modified a bit.
    # Remove the following line (just one line) for plain binary labels.
    y = (y != 1).astype(np.uint8)  # remove this

    return (augmented_data[:, :, :, :3],
            augmented_data[:, :, :, 3:],
            tf.keras.utils.to_categorical(y, num_classes=2, dtype='float32'))
def _load_data(dataset, is_training=False): """Load input data, target values and file names for a dataset. The input data is assumed to be a dataset of feature vectors. These feature vectors are standardized using a scaler that is either loaded from disk (if it exists) or computed on-the-fly. The latter is only possible if the input data is training data, which is indicated by the `is_training` parameter. Target values and file names are read from the metadata file. Args: dataset: Structure encapsulating dataset information. training (bool): Whether the input data is training data. Returns: x (np.ndarray): The input data. y (np.ndarray): The target values. names (list): The associated file names. """ import data_augmentation as aug import features features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5') x = utils.timeit(lambda: features.load_features(features_path), 'Loaded features of %s dataset' % dataset.name) # Clip dynamic range to 90 dB x = np.maximum(x, x.max() - 90.0) # Load scaler from file if cached, or else compute it. scaler_path = cfg.scaler_path if os.path.exists(scaler_path) or not is_training: with open(scaler_path, 'rb') as f: scaler = pickle.load(f) else: scaler = utils.timeit(lambda: utils.compute_scaler(x), 'Computed standard scaler') with open(scaler_path, 'wb') as f: pickle.dump(scaler, f) x = utils.timeit(lambda: utils.standardize(x, scaler), 'Standardized %s features' % dataset.name) names, y = utils.timeit(lambda: utils.read_metadata(dataset.metadata_path), 'Loaded %s metadata' % dataset.name) if dataset == cfg.training_set and cfg.enable_augmentation: names, y = aug.expand_metadata((names, y)) return x, y, names
def load_data(data_name): timer = utils.timer(name='main').tic() data_path = './data/' + data_name u_file = data_path + '/U_BPR.npy' v_file = data_path + '/V_BPR.npy' user_content_file = data_path + '/user_content.npz' train_file = data_path + '/train.csv' test_file = data_path + '/test.csv' vali_file = data_path + '/vali.csv' dat = {} # load preference data timer.tic() u_pref = np.load(u_file) v_pref = np.load(v_file) dat['u_pref'] = u_pref dat['v_pref'] = v_pref timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic() # pre-process _, dat['u_pref'] = utils.standardize_2(u_pref) _, dat['v_pref'] = utils.standardize(v_pref) timer.toc('standardized U,V').tic() # load content data timer.tic() user_content = scipy.sparse.load_npz(user_content_file) dat['user_content'] = user_content.tolil(copy=False) timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic() # load split timer.tic() train = pd.read_csv(train_file, dtype=np.int32) dat['user_list'] = train['uid'].values dat['item_list'] = train['iid'].values dat['warm_item'] = np.unique(train['iid'].values) timer.toc('read train triplets %s' % str(train.shape)).tic() dat['vali_eval'] = data.load_eval_data(vali_file, cold_user=True, test_item_ids=dat['warm_item']) dat['test_eval'] = data.load_eval_data(test_file, cold_user=True, test_item_ids=dat['warm_item']) return dat
def predict(file): data = pd.read_csv(file) data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[]) # pre-process data try: modify = pd.read_csv(file.replace('.csv', '_modified.csv')) except FileNotFoundError: modify = RacingPredictor.pre_process(file, persistent=True) # perform standardization modify = standardize(modify) # slice data x_test, y_test = slice_classification_data(modify) # prediction clf = lgb.Booster(model_file='lgb_classifier.txt') winprob = clf.predict(x_test) data['winprob'] = 0 i = 0 groups = data.groupby(['rdate', 'rid']) for name, group in groups: total = np.sum(winprob[i, 0:len(group)]) j = 0 for index, row in group.iterrows(): row['winprob'] = winprob[i, j] / total data.iloc[index] = row j += 1 i += 1 data['plaprob'] = WinP2PlaP(data, wpcol='winprob') fixratio = 1 / 10000 mthresh = 9 print("Getting win stake...") data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] > mthresh) print("Getting place stake...") data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] > mthresh) data.to_csv('test_result.csv')
def train(self): # pre-process data try: modify = pd.read_csv(self.file.replace('.csv', '_modified.csv')) except FileNotFoundError: modify = RacingPredictor.pre_process(self.file, persistent=True) # shuffle among groups groups = [ df.transform(np.random.permutation) for _, df in modify.groupby(['rdate', 'rid']) ] modify = pd.concat(groups).reset_index(drop=True) # drop outdated data # modify = modify[:][[val > '2017' for val in modify['rdate']]] # perform standardization modify = standardize(modify) # slice data x_train, y_train = slice_classification_data(modify) # convert training data into LightGBM dataset format d_train = lgb.Dataset(x_train, label=y_train) params = dict() params['learning_rate'] = 3e-4 params['boosting_type'] = 'rf' params['objective'] = 'multiclass' params['metric'] = 'multi_logloss' params['num_class'] = 16 params['bagging_freq'] = 1 params['bagging_fraction'] = 0.8 # params['lambda_l1'] = 10 # params['lambda_l2'] = 1 # params['max_depth'] = 10 # params['cat_smooth'] = 10 # params['feature_fraction'] = 0.8 # params['num_leaves'] = 128 # params['min_data_in_leaf'] = 32 self.lgb_model = lgb.train(params, d_train, 400) self.lgb_model.save_model('lgb_classifier.txt', num_iteration=self.lgb_model.best_iteration)
def joint_scores(query_features, query_cams, query_frames,
                 gallery_features, gallery_cams, gallery_frames,
                 distribution, alpha=5, interval=100):
    query_features, gallery_features = standardize(query_features, gallery_features)
    scores = torch.Tensor()

    for feature, cam, frame in zip(query_features, query_cams, query_frames):
        # n: Number of Gallery instances
        # Visual Feature Stream: (n, d) gallery matrix times a length-d query feature -> n
        feature_score = torch.matmul(gallery_features, feature)  # Size: n

        # Spatial-Temporal Stream
        diff = torch.abs(gallery_frames - frame)
        hist_ = (diff / interval).type(torch.int16)  # Size: n
        st_score = distribution[cam.type(torch.int16).tolist() - 1][
            (gallery_cams - 1).type(torch.int16).tolist(), hist_.tolist()]
        st_score = torch.tensor(st_score)

        # score -> probabilities; this must be the joint-metric formula from the paper
        # Size: n
        score = 1 / (1 + torch.exp(-alpha * feature_score)) * \
                1 / (1 + 2 * torch.exp(-alpha * st_score))

        scores = torch.cat([scores, torch.unsqueeze(score, dim=0)])

    # all_scores
    # Size: k * n; k -> Num. of Query Instances
    return scores
def test_regression(model):
    Regression = models[model]
    print("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('data/TempLinkoping2016.txt', sep="\t")

    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    X = standardize(time)  # Time. Fraction of the year [0, 1]
    y = temp[:, 0]         # Temperature. Reduce to one-dim
    print(X.shape, y.shape)

    X_train, y_train, X_test, y_test = split_train_test(X, y)

    model = Regression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
def main():
    # Load temperature data
    data = pd.read_csv('./TempLinkoping2016.txt', sep="\t")

    # e.g. [[0.00273224] [0.00546448] [0.00819672] ...]
    time = np.atleast_2d(data["time"].values).T
    # e.g. [[0.1] [-4.5] [-6.3] ...]
    temp = np.atleast_2d(data["temp"].values).T

    # X: [[-1.72732488], [-1.71786008], ... [-1.72732488]]
    X = standardize(time)  # Time. Fraction of the year [0, 1], standardized
    # [[0.1] [-4.5] [-6.3] ...] -> [0.1, -4.5, -6.3, ...]
    y = temp[:, 0]  # Temperature. Reduce to one-dim

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    model = RegressionTree()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model.print_tree(indent=' ')

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
def init_data(nsamples, dx, dy): Xold = np.linspace(0, 1000, nsamples * dx).reshape([nsamples, dx]) X = utils.standardize(Xold) invertible = False while not invertible: W = np.random.randint(1, 10, size=(dy, dx)) if linalg.cond(W) < 1 / sys.float_info.epsilon: invertible = True print('W invertible') Y = W.dot(X.T) # target # for i in range(Y.shape[1]): # Y[:, i] = utils.add_noise(Y[:, i]) print('shapes Y = {}, X: {}, W: {}'.format(Y.shape, X.shape, W.shape)) x = Variable(torch.from_numpy(X), requires_grad=True).type(torch.FloatTensor) y = Variable(torch.from_numpy(Y), requires_grad=True).type(torch.FloatTensor) w = Variable(torch.from_numpy(W), requires_grad=True).type(torch.FloatTensor) return x, y, w
def preprocess(no_wells):
    """Function initializes data, performs standardization, and train test split

    Parameters
    ----------
    no_wells : int
        number of evenly spaced wells and seismic samples to be evenly sampled
        from seismic section.

    Returns
    -------
    seismic : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section
    model : array_like, shape(num_wells, depth samples)
        2-D array containing model section
    """

    # get project root directory
    project_root = os.getcwd()

    # if the data directory does not exist, extract it
    # (`not` is required; `~os.path.isdir(...)` is always truthy)
    if not os.path.isdir('data'):
        extract('data.zip', project_root)

    # Load data
    seismic = np.load(join('data', 'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic = seismic[::2, :]

    # Load targets and standardize data
    model = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :, 50:]
    model = model[:, 0, :] * model[:, 2, :]

    # standardize
    seismic, model = standardize(seismic, model, no_wells)

    return seismic, model
def __init__(self, sess, state_dim, action_dim, learning_rate): self.sess = sess self.s_dim = state_dim self.a_dim = action_dim self.learning_rate = learning_rate # Actor Network self.inputs, self.out = self.create_actor_network() # This returns will be provided by the Discount Reward self.returns = tf.placeholder("float", [None,1], name='returns') self.actions = tf.placeholder("float", [None,self.a_dim], name='actions') # tf reward processing self.tf_discounted_epr = self.tf_discount_rewards(self.returns) self.tf_discounted_epr = utils.standardize(self.tf_discounted_epr) self.loss = tf.nn.l2_loss(self.actions-self.out) optimizer = tf.train.AdamOptimizer(self.learning_rate) grads = optimizer.compute_gradients(self.loss, var_list=tf.trainable_variables(), grad_loss=self.tf_discounted_epr) self.optimize = optimizer.apply_gradients(grads)
def fit(self): episode_length = len(self.states) # These targets are used for optimization step. discounted_rewards = self.discount_rewards(self.rewards) # Standardized discounted rewards discounted_rewards = standardize(discounted_rewards) advantages = np.zeros((episode_length, self.action_size)) # Create inputs for our model (not crucial but it helps # to keep track of input dimension) update_input = np.zeros(((episode_length, ) + self.state_size)) for i in range(episode_length): update_input[i, :] = self.states[i] # We predict on batch using list of states values = self.critic.predict(update_input) for i in range(episode_length): advantages[i][self.actions[i]] = discounted_rewards[i] - values[i] # Refer to "https://medium.freecodecamp.org/an-intro-to-advantage-actor-critic-methods-lets-play-sonic-the-hedgehog-86d6240171d" # Actor use Cross-entropy with critic q value actor_loss = self.actor.fit(update_input, advantages, batch_size=self.batch_size, epochs=1, verbose=0) # Critic use MSE its predicted value (value) critic_loss = self.critic.fit(update_input, discounted_rewards, batch_size=self.batch_size, epochs=1, verbose=0) self.states, self.actions, self.rewards = [], [], [] return values, actor_loss.history['loss'], critic_loss.history['loss']
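# discount_rewards() and standardize() turn the raw episode rewards into the
# critic targets used above. A minimal sketch under the usual definitions (the
# discount factor name `gamma` and the exact helpers are assumptions):
import numpy as np


def discount_rewards(rewards, gamma=0.99):
    """Discounted returns: G_t = r_t + gamma * G_{t+1}."""
    discounted = np.zeros(len(rewards), dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted


def standardize(x):
    """Zero-mean, unit-variance rewards -- a common variance-reduction step."""
    x = np.asarray(x, dtype=float)
    return (x - x.mean()) / (x.std() + 1e-8)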
def main():
    print("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('../TempLinkoping2016.txt', sep="\t")

    # .values instead of the removed DataFrame.as_matrix()
    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    X = standardize(time)  # Time. Fraction of the year [0, 1]
    y = temp[:, 0]         # Temperature. Reduce to one-dim

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    model = RegressionTree()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
def _corrupt(self, data): if type(self.corruption) == float: cdata = np.random.binomial(size=data.shape, n=1, p=1.-self.corruption) * data elif np.shape(np.asarray(self.corruption).T) == np.shape(data): cdata = self.corruption.T else: if self.data_std is not None and self.data_norm is not None: scales = np.random.uniform(low=self.corruption[0], high=self.corruption[1], size=data.shape[1]) data = u.unnormalise(data, self.data_norm[0], self.data_norm[1]) data = u.unstandardize(data, self.data_std[0], self.data_std[1]) p = np.random.binomial noise_maps = [np.random.normal(scale=sig, size=data.shape[0]) for sig in scales] # * p(1, 0.5) noise_maps = np.asarray(noise_maps) cdata = data + noise_maps.T cdata, _, _ = u.standardize(cdata, self.data_std[0], self.data_std[1]) cdata, _, _ = u.normalise(cdata, self.data_norm[0], self.data_norm[1]) # Just making sure we're not out of bounds: min_thr = 1e-6 max_thr = 0.99999 #if ((cdata < min_thr).sum() > 0 or (cdata > max_thr).sum() > 0) and False: # print np.amin(data), np.amax(data), np.mean(data), np.std(data) # print 'N/C:', (cdata < min_thr).sum(), (cdata > max_thr).sum() cdata[cdata < min_thr] = min_thr cdata[cdata > max_thr] = max_thr #print np.amin(cdata), np.amax(cdata), np.mean(cdata), np.std(cdata) else: raise RuntimeError("Can't normalise the data (%s, %s). You must provide the normalisation and standardisation values. Giving up." % (self.data_std, self.data_norm)) #print np.amin(data), np.amax(data) #print np.amin(cdata), np.amax(cdata) return cdata
def predict(file): data = pd.read_csv(file) data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[]) # pre-process data try: modify = pd.read_csv(file.replace('.csv', '_modified.csv')) except FileNotFoundError: modify = RacingPredictor.pre_process(file, persistent=True) # perform standardization modify = standardize(modify) # slice data x_test, y_test = slice_naive_data(modify) # prediction clf = lgb.Booster(model_file='lgb_classifier.txt') winprob = clf.predict(x_test) data['winprob'] = winprob[:, 1] data['plaprob'] = winprob[:, 1] + winprob[:, 2] + winprob[:, 3] fixratio = 5e-3 mthresh = 1.6 print("Getting win stake...") data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] > mthresh) print("Getting place stake...") data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] > mthresh) data.to_csv('test_result.csv') return data
do_regularize = False

y_, song_id, nb_of_songs = load_y(DATADIR)
X_ = load_X(DATADIR, song_id)

# Now let's mix everything so that we can take test_set and train_set independently.
# We need to separate PER SONG.
X_train, y_train, X_test, y_test, song_id_tst = mix(X_, y_, PURCENT, NUM_FRAMES, song_id, nb_of_songs)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
# print X_train[0:3,0:3]

# standardize data
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

X_train = X_train[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103,
                      140, 142, 146, 148, 212, 214, 218, 220]]
X_test = X_test[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103,
                    140, 142, 146, 148, 212, 214, 218, 220]]
# X_train = X_train[:,[13,85,103,142,214]]
# X_test = X_test[:,[13,85,103,142,214]]

# one dimension at a time
# 0: arousal, 1: valence
def main(POWER, INPUT_NOISE, C, REG, ATTENTION_HIDDEN, HIDDEN_SIZE, N_LAYERS): directory = sys.argv[1]#'data/mat/fox_100x100_matlab.mat' D = io.loadmat(directory) features0 = D['features'].todense() ## remove identical features uniid = [] for i in range(features0.shape[1]): if len(np.unique(np.array(features0[:,i]))) == 1: uniid.append(i) features = np.delete(features0,uniid,axis = 1) ## standardize all data (maybe flawed) all_mean,all_std = utils.standardize(features) features = (features - all_mean)/all_std from sklearn.decomposition import PCA pca = PCA() pca.fit(features) loading = pca.explained_variance_ratio_ n_components = len(loading) for p in range(len(loading)): if sum(loading[:p])>POWER: n_components = p break features = pca.transform(features)[:,:p] #pdb.set_trace() #features = features0 #pdb.set_trace() labels = np.array(D['labels'].todense())[0] bag_ids = D['bag_ids'][0] MAX_LENGTH = max([list(bag_ids).count(iBag) for iBag in set(bag_ids)]) N_FEATURE_DIM = features.shape[1] X = np.zeros((len(set(bag_ids)),MAX_LENGTH,N_FEATURE_DIM)) Y = np.zeros((len(set(bag_ids)),)) M = np.zeros((len(set(bag_ids)),MAX_LENGTH)) for iBag in set(bag_ids): instance_index = np.where(bag_ids == iBag)[0] # print instance_index[0] # print np.concatenate((features[instance_index],np.zeros((MAX_LENGTH-len(instance_index[0]),N_FEATURE_DIM))),axis = 0).shape # break X[iBag-1] = np.concatenate((features[instance_index],np.zeros((MAX_LENGTH-len(instance_index),N_FEATURE_DIM))),axis = 0).astype(theano.config.floatX) assert(len(set(labels[instance_index])) == 1) Y[iBag -1] = labels[instance_index[0]].astype(theano.config.floatX) Y[Y == -1] = 0 M[iBag-1] = np.concatenate((np.ones(len(instance_index)) ,np.zeros((MAX_LENGTH-len(instance_index)))),axis = 0).astype(theano.config.floatX) import csv ### train val test set DROPOUT_RATIO = 0 learning_rate = 0.001 R = 3 s = '%s'%REG s = s.split()[1] ## s is the name of regularization expDir = os.path.join('exp_spearmint/',os.path.basename(directory),'PCA%.1f_innoise_%f_%snorm_%f_attention_%d_hidden_%d_layers_%d'%(POWER, INPUT_NOISE, s, C, ATTENTION_HIDDEN, HIDDEN_SIZE, N_LAYERS)+os.path.sep) if not os.path.isdir(expDir): os.makedirs(expDir) with open(os.path.join(expDir,'README'),'w') as fid: fid.write('learning rate = %f\n'%learning_rate) fid.write('dropout ratio = %f\n'%learning_rate) fid.write('penalty factor = %f\n'%C) result = np.zeros(shape=(3,10)) for r in range(R): k=0 kf = KFold(X.shape[0],10,True) for train_index,test_index in kf: input_shape = (None, MAX_LENGTH, N_FEATURE_DIM) # Construct network layer = lasagne.layers.InputLayer(shape=input_shape, name='Input') n_batch, n_seq, n_features = layer.input_var.shape # Store a dictionary which conveniently maps names to layers we will # need to access later layers = {'in': layer} # Add dense input layer layer = lasagne.layers.GaussianNoiseLayer(layer,INPUT_NOISE) layer = lasagne.layers.ReshapeLayer( layer, (n_batch*n_seq, input_shape[-1]), name='Reshape 1') layer = lasagne.layers.DenseLayer( layer, HIDDEN_SIZE, W=lasagne.init.HeNormal(), name='Input dense', nonlinearity=lasagne.nonlinearities.leaky_rectify) layer = lasagne.layers.ReshapeLayer( layer, (n_batch, n_seq, HIDDEN_SIZE), name='Reshape 2') # Add the layer to aggregate over time steps # We must force He initialization because Lasagne doesn't like # 1-dim shapes in He and Glorot initializers layer = utils.AttentionLayer( layer,ATTENTION_HIDDEN, W=lasagne.init.Normal(1./np.sqrt(layer.output_shape[-1])), name='Attention') for _ in range(N_LAYERS): layer = 
lasagne.layers.DenseLayer( layer, HIDDEN_SIZE, W=lasagne.init.HeNormal(), name='Out dense 1', nonlinearity=lasagne.nonlinearities.leaky_rectify) layer = lasagne.layers.DropoutLayer(layer, p=DROPOUT_RATIO) # Add final dense layer, whose bias is initialized to the target mean layer = lasagne.layers.DenseLayer( layer, 1, W=lasagne.init.HeNormal(), name='Out dense 3', nonlinearity=lasagne.nonlinearities.sigmoid) layer = lasagne.layers.ReshapeLayer( layer, (-1,)) # Keep track of the final layer layers['out'] = layer #l_norm = regularize_layer_params(layer,l1) l_norm = regularize_layer_params(lasagne.layers.get_all_layers(layers['out']),REG) # Symbolic variable for target values target = T.vector('target') # Retrive the symbolic expression for the network network_output = lasagne.layers.get_output(layers['out'],deterministic=True) # Create a symbolic function for the network cost cost = T.mean(lasagne.objectives.binary_crossentropy(network_output,target)) # try Hinge loss #cost = T.mean(lasagne.objectives.binary_hinge_loss(network_output,target)) cost = cost + C*l_norm #cost = T.mean((network_output - target)**2) # Retrieve all network parameters all_params = lasagne.layers.get_all_params(layers['out']) # Compute updates updates = lasagne.updates.rmsprop(cost, all_params, learning_rate ) # Compile training function train = theano.function([layers['in'].input_var, target], cost, updates=updates) # Accuracy is defined as binary accuracy compute_cost = theano.function([layers['in'].input_var, target], cost,) accuracy = T.sum(lasagne.objectives.binary_accuracy(network_output, target)) compute_accuracy = theano.function( [layers['in'].input_var, target], accuracy) #print 'Model built.' X_train_all, X_test = X[train_index], X[test_index] y_train_all, y_test = Y[train_index], Y[test_index] m_train_all, m_test = M[train_index], M[test_index] kf_val = KFold(X_train_all.shape[0],10,True) for train_ind,val_ind in kf_val: X_train, X_val = X_train_all[train_ind], X_train_all[val_ind] y_train, y_val = y_train_all[train_ind], y_train_all[val_ind] m_train, m_val = m_train_all[train_ind], m_train_all[val_ind] break ## standardize three sets # x_tr_mean,x_tr_std = utils.standardize(X_train) # # X_train = (X_train-x_tr_mean)/x_tr_std # X_val = (X_val-x_tr_mean)/x_tr_std # X_test = (X_test-x_tr_mean)/x_tr_std # print X_train # pdb.set_trace() MAX_EPOCH = 500 NO_BEST = 10 train_acc = np.array([]) train_cost = np.array([]) test_acc = np.array([]) test_cost = np.array([]) val_acc = np.array([]) val_cost = np.array([]) early_stop = False for iEpoch in range(MAX_EPOCH): b = batch_generator(X_train,y_train,m_train) trac = 0 trco = 0 for x_b,y_b in b: #print x_b.shape,y_b.shape train(x_b,y_b) b_cost = compute_cost(x_b,y_b) trco += b_cost trac += compute_accuracy(x_b,y_b) if any([not np.isfinite(b_cost), any([not np.all(np.isfinite(p.get_value())) for p in all_params])]): # logger.info('####### Non-finite values found, aborting') print '####### Non-finite values found, aborting' break train_acc = np.append(train_acc, trac/X_train.shape[0]) #compute_accuracy(x_b,y_b) train_cost = np.append(train_cost,trco/X_train.shape[0]) vaco = 0 vaac = 0 bv = batch_generator(X_val,y_val,m_val) for xv,yv in bv: vaco += compute_cost(xv,yv) vaac += compute_accuracy(xv,yv) val_cost = np.append(val_cost,vaco) val_acc = np.append(val_acc,vaac) teac = 0 teco = 0 bt = batch_generator(X_test,y_test,m_test) for xt,yt in bt: teac+= compute_accuracy(xt,yt) teco+= compute_cost(xt,yt) test_acc = np.append(test_acc,teac) test_cost = 
np.append(test_cost,teac) if iEpoch > NO_BEST: early_stop = True last_val = val_cost[-NO_BEST: ] for i,v in enumerate(last_val[:-2]): if last_val[i] >= last_val[i+1]: early_stop = False break if early_stop: #print "early stoping, last %s validation costs are: "%NO_BEST + ','.join([str(tmp) for tmp in last_val]) break best_model = np.argmin(val_cost) #print train_acc #print train_cost #print val_cost #print test_acc #print 'Reach maxmal validation acc at %dth iteration'%best_model #print 'train_cost = %f'%train_cost[best_model] #print 'val_cost = %f'%val_cost[best_model] #print 'test_acc = %f'%test_acc[best_model] result[r][k] = test_acc[best_model]/X_test.shape[0] print "%d times, %d folder finished, test acc is %f"%(r,k,test_acc[best_model]/X_test.shape[0]) #pdb.set_trace() with open(os.path.join( expDir, 'val_cost_r%d_k%d.csv'%(r,k) ),'w') as fid: writer = csv.writer(fid) writer.writerows([val_cost]) with open(os.path.join( expDir, 'val_acc_r%d_k%d.csv'%(r,k) ),'w') as fid: writer = csv.writer(fid) writer.writerows([val_acc]) with open(os.path.join( expDir, 'test_cost_r%d_k%d.csv'%(r,k) ),'w') as fid: writer = csv.writer(fid) writer.writerows([test_cost]) with open(os.path.join( expDir, 'test_acc_r%d_k%d.csv'%(r,k) ),'w') as fid: writer = csv.writer(fid) writer.writerows([test_acc]) k=k+1 print np.mean(result[:]) with open(os.path.join( expDir, 'result_%f_%f.csv'%(np.mean(result[:]),np.std(result[:])) ),'w') as fid: writer = csv.writer(fid) writer.writerows(result)
MODEL_PATH = 'best_model' BASE_DATA_PATH = 'data' if __name__ == '__main__': # Load in training files X_train = [] Y_train = [] for filename in glob.glob(os.path.join(BASE_DATA_PATH, 'train', '*.npz')): data = np.load(filename) # Convert to floatX with correct column order X_train.append(np.array( data['X'], dtype=theano.config.floatX, order='C')) Y_train.append(np.array( data['Y'], dtype=theano.config.floatX, order='C')) # Stack to compute training mean and std X_mean, X_std = utils.standardize(np.concatenate(X_train, axis=0)) Y_mean, Y_std = utils.standardize(np.concatenate(Y_train, axis=0)) # Compute max length as median of lengths max_length_X = int(np.median([len(X) for X in X_train])) max_length_Y = int(np.median([len(Y) for Y in Y_train])) # Retrieve the hyperparameters which achivieved the lowest objective best_params, _ = train_best_network.get_best_trial(RESULTS_PATH) # Convert parameters to layer specifications (conv_layer_specs, dense_layer_specs) = train_network.layer_specs_from_params(best_params) # Build networks layers = { 'X': utils.build_network( (None, None, X_train[0].shape[-1]), conv_layer_specs, dense_layer_specs),
# We need to separate PER SONG X_train, y_train, X_test, y_test, song_id_tst = mix(X_, y_, PURCENT, NUM_FRAMES, song_id, nb_of_songs) print X_train.shape, y_train.shape, X_test.shape, y_test.shape # print X_train[0:3,0:3] # print np.mean(X_train[:,0:3], axis=0), np.std(X_train[:,0:3], axis=0) # print np.mean(X_test[:,0:3], axis=0), np.std(X_test[:,0:3], axis=0) # with(open('train_dummy.txt', mode='w')) as infile: # for i in range(X_train.shape[0]): # s='' # for feat in range(3): # s = s + '%g '%X_train[i,feat] # infile.write('%s\n'%s) # standardize data X_train, scaler = standardize(X_train) X_test, _ = standardize(X_test, scaler) # print np.mean(X_train[:,0:3], axis=0), np.std(X_train[:,0:3], axis=0) # print np.mean(X_test[:,0:3], axis=0), np.std(X_test[:,0:3], axis=0) # with(open('train_dummy_normed.txt', mode='w')) as infile: # for i in range(X_train.shape[0]): # s='' # for feat in range(3): # s = s + '%g '%X_train[i,feat] # infile.write('%s\n'%s) # one dimension at a time y_train = y_train[:, 0] y_test = y_test[:, 0]
yptr = yp_tr.drop(yp_tr.columns.difference(['GPPp', 'ETp', 'SWp']), axis=1) ypte = yp_te.drop(yp_te.columns.difference(['GPPp', 'ETp', 'SWp']), axis=1) #yp = yptr.merge(ypte, how="outer") #print(len(yptr), len(ypte)) #print(yptr, ypte) #yp = pd.concat([yptr, ypte]) #print(yp) n = [1,1] x_tr, n = utils.add_history(yptr, n, 1) x_te, n = utils.add_history(ypte, n, 1) x_tr = utils.standardize(x_tr) x_te = utils.standardize(x_te) y = y.to_frame() train_x = x_tr[~x_tr.index.year.isin([2007,2008])] train_y = y[~y.index.year.isin([2007,2008])] splits = len(train_x.index.year.unique()) test_x = x_te[x_te.index.year == 2008] test_y = y[y.index.year == 2008][1:] print(train_x, train_y) #print(len(x), len(y)) splits = len(train_x.index.year.unique())
import scipy.io.wavfile as wavfile
import numpy as np
from numpy import inf
import utils
import matplotlib.pyplot as plt
import pdb

sr, wav = wavfile.read('example2.wav')
wav = np.mean(wav, axis=1)
cqt = utils.cqt(wav)
print(cqt.min())

std_cqt = utils.standardize(cqt)
log_std_cqt = utils.standardize(cqt + 1, log=True)
# log_std_cqt[log_std_cqt == -inf] = 0
pdb.set_trace()

plt.pcolormesh(std_cqt, cmap='jet')
plt.show()
plt.pcolormesh(log_std_cqt, cmap='jet')
plt.show()
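# utils.standardize(cqt + 1, log=True) above (and the similar call elsewhere in
# this collection) suggests an optional log transform before z-scoring. A
# hypothetical sketch consistent with that calling convention -- not the
# project's actual implementation:
import numpy as np


def standardize(x, log=False):
    """Optionally log-transform, then z-score the whole array."""
    x = np.asarray(x, dtype=float)
    if log:
        x = np.log(x)
    return (x - x.mean()) / (x.std() + 1e-12)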
if concatenate: df = pd.concat([data_red, data_white]) else: if red: df = data_red else: df = data_white # Filter Nans df = ut.filter_nans(df) # Normalize or standardize df = ut.standardize(df) df_no_pca = df label_name = df_no_pca.columns[-1] df_no_pca = df_no_pca.rename(columns={label_name: 'labels'}) # Change name of the last column to data ################################################################################################################### df_rfe = ut.reorder_by_RFE(df_no_pca) pd.options.display.mpl_style = 'default' utils_plots.plot_principal_component_2D(df_rfe, display=True) pd.options.display.mpl_style = 'default' utils_plots.plot_principal_component_3D(df_rfe, display=True) # with PCA # Separate data from labels
T = tetrahedra.shape[0] print 'Reading INRIA .mesh file',meshfile print '\tFound', V, 'vertices' print '\tFound', T, 'tetrahedra' bbox = np.empty((3,2)) for i in xrange(3): bbox[i,0] = np.min(vertices[:,i]) bbox[i,1] = np.max(vertices[:,i]) kernel = stats.gaussian_kde(vertices.T,0.1) G = 40 grids = [np.linspace(bbox[i,0],bbox[i,1],G) for i in xrange(3)] P = make_points(grids) P += 0.1*np.random.randn(*P.shape) Z = kernel(P.T) stdZ = standardize(Z) cmap = plt.get_cmap('spectral') C = cmap(stdZ) C[:,3] = (stdZ)**1.5 mask = (C[:,3] > 0.025) fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(P[mask,0],P[mask,1],P[mask,2],c=C[mask,:], s=125,lw=0) plt.show()
def rnn_cv(folds, n_hidden=10, n_epochs=50, lr=0.001, lrd=0.999, reg_coef=0.01, doSmoothing=False, useEssentia=False):
    # NB: nb_features, DATADIR, NUM_OUTPUT, useMelodyFeatures, useTempoFeatures,
    # all_song_melody_features, all_song_tempo_features and EMO are module-level globals.
    doSaveModel = False

    if doSmoothing:
        dir_name = 'nfeat%d_nh%d_ne%d_lr%g_reg%g_smoothed'%(nb_features, n_hidden, n_epochs, lr, reg_coef)
    else:
        dir_name = 'nfeat%d_nh%d_ne%d_lr%g_reg%g'%(nb_features, n_hidden, n_epochs, lr, reg_coef)

    MODELDIR = 'rnn/' + dir_name + '/'
    LOGDIR = MODELDIR
    if not path.exists(MODELDIR):
        makedirs(MODELDIR)
    print '... output dir: %s'%(MODELDIR)

    # smoothing params: moving-average window of length taille+1 with half-weight end points
    taille = 12
    wts = np.ones(taille-1)*1./taille
    wts = np.hstack((np.array([1./(2*taille)]), wts, np.array([1./(2*taille)])))
    delay = (wts.shape[0]-1) / 2

    # # initialize global logger variable
    # print '... initializing global logger variable'
    # logger = logging.getLogger(__name__)
    # withFile = False
    # logger = settings.init(MODELDIR + 'train.log', withFile)

    # perf_file_name = LOGDIR + 'rnn_nh%d_ne%d_lr%g_reg%g.log'%(n_hidden, n_epochs, lr, reg_coef)
    perf_file_name = LOGDIR + 'performance.log'
    log_f = open(perf_file_name, 'w')

    all_fold_pred = list()
    all_fold_y_test = list()
    all_fold_id_test = list()

    # 10-fold cross-validation over the pre-computed folds
    for fold_id in range(10):
        # fold_id = 0
        fold = folds[fold_id]
        t0 = time.time()
        # print '... loading FOLD %d'%fold_id
        # if useEssentia:
        #     fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed_essentia.pkl'%(fold_id), "rb" ) )

        if useEssentia:
            X_train = fold['train']['X']
            y_train = fold['train']['y']
            id_train = fold['train']['song_id']
            X_test = fold['test']['X']
            y_test = fold['test']['y']
            id_test = fold['test']['song_id']
        else:
            fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed.pkl'%(fold_id), "rb" ) )
            X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
            X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)

        print X_train.shape, y_train.shape, X_test.shape, y_test.shape

        if useMelodyFeatures:
            # first feature = slope, other two = mean, std
            melody_train, melody_test = subset_features(all_song_melody_features, id_train, id_test)
            # melody_train = melody_train[:,:,1:]
            # melody_test = melody_test[:,:,1:]

            # standardize train data
            melody_concat_train = np.reshape(melody_train, (melody_train.shape[0]*melody_train.shape[1], melody_train.shape[2]), order='C')
            melody_concat_train_normed, scaler = standardize(melody_concat_train)
            # print concat_train_normed.shape
            melody_train_normed = np.reshape(melody_concat_train_normed, (melody_train.shape[0], melody_train.shape[1], melody_train.shape[2]), order='C')
            del melody_concat_train, melody_concat_train_normed

            # standardize test data with the scaler fitted on the train subset
            melody_concat_test = np.reshape(melody_test, (melody_test.shape[0]*melody_test.shape[1], melody_test.shape[2]), order='C')
            melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
            # print concat_test_normed.shape
            melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
            del melody_concat_test, melody_concat_test_normed

            # concat with the other features
            X_train = np.concatenate((X_train, melody_train_normed), axis=2)
            X_test = np.concatenate((X_test, melody_test_normed), axis=2)

        if useTempoFeatures:
            tempo_train, tempo_test = subset_features(all_song_tempo_features, id_train, id_test)

            # standardize train data
            tempo_concat_train = np.reshape(tempo_train, (tempo_train.shape[0]*tempo_train.shape[1], tempo_train.shape[2]), order='C')
            tempo_concat_train_normed, scaler = standardize(tempo_concat_train)
            # print concat_train_normed.shape
            tempo_train_normed = np.reshape(tempo_concat_train_normed, (tempo_train.shape[0], tempo_train.shape[1], tempo_train.shape[2]), order='C')
            del tempo_concat_train, tempo_concat_train_normed

            # standardize test data with the scaler fitted on the train subset
            tempo_concat_test = np.reshape(tempo_test, (tempo_test.shape[0]*tempo_test.shape[1], tempo_test.shape[2]), order='C')
            tempo_concat_test_normed, _ = standardize(tempo_concat_test, scaler)
            # print concat_test_normed.shape
            tempo_test_normed = np.reshape(tempo_concat_test_normed, (tempo_test.shape[0], tempo_test.shape[1], tempo_test.shape[2]), order='C')
            del tempo_concat_test, tempo_concat_test_normed

            # concat with the other features
            X_train = np.concatenate((X_train, tempo_train_normed), axis=2)
            X_test = np.concatenate((X_test, tempo_test_normed), axis=2)

        # print id_test.shape
        # X_train = X_train[0:100,:,:]
        # y_train = y_train[0:100,:,:]
        # X_train = X_train[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
        # X_test = X_test[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
        # X_train = X_train[:,[13,85,103,142,214]]
        # X_test = X_test[:,[13,85,103,142,214]]
        # X_test = X_train[119:119+y_test.shape[0],:]
        # y_test = y_train[119:119+y_test.shape[0]]

        print X_train.shape, y_train.shape, X_test.shape, y_test.shape

        nb_seq_train, nb_frames_train, nb_features_train = X_train.shape
        nb_seq_test, nb_frames_test, nb_features_test = X_test.shape
        assert nb_frames_train == nb_frames_test, 'ERROR: nb of frames differ from TRAIN to TEST'
        assert nb_features_train == nb_features_test, 'ERROR: nb of features differ from TRAIN to TEST'

        dim_output_train = y_train.shape[2]
        dim_output_test = y_test.shape[2]
        assert dim_output_test == dim_output_train, 'ERROR: nb of targets differ from TRAIN to TEST'

        n_in = nb_features_train
        n_out = dim_output_train
        n_steps = nb_frames_train
        validation_frequency = nb_seq_train * 2  # for logging during training: every 2 epochs

        model = rnn_model.MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out,
                                  learning_rate=lr, learning_rate_decay=lrd,
                                  L1_reg=reg_coef, L2_reg=reg_coef,
                                  n_epochs=n_epochs, activation='tanh')
        model.fit(X_train, y_train, validation_frequency=validation_frequency)

        if doSaveModel:
            # model_name = MODELDIR + 'rnn_fold%d_nh%d_nepochs%d_lr%g_reg%g.pkl'%(fold_id, n_hidden, n_epochs, lr, reg_coef)
            model_name = MODELDIR + 'model_fold%d.pkl'%(fold_id)
            model.save(fpath=model_name)

        # predict on the test subset, one sequence at a time
        pred = list()
        for ind_seq_test in xrange(nb_seq_test):
            pred.append(model.predict(X_test[ind_seq_test]))
        y_hat = np.array(pred, dtype=float)
        print y_hat.shape

        if doSmoothing:
            # smooth each predicted sequence (valence and arousal) with the moving-average window
            y_hat_smooth = np.zeros_like(y_hat, dtype=float)
            for i in xrange(y_hat.shape[0]):
                y_hat_smooth[i, :, 0] = np.convolve(y_hat[i, :, 0], wts, mode='same')
                y_hat_smooth[i, :delay, 0] = y_hat[i, :delay, 0]
                y_hat_smooth[i, -delay:, 0] = y_hat[i, -delay:, 0]
                y_hat_smooth[i, :, 1] = np.convolve(y_hat[i, :, 1], wts, mode='same')
                y_hat_smooth[i, :delay, 1] = y_hat[i, :delay, 1]
                y_hat_smooth[i, -delay:, 1] = y_hat[i, -delay:, 1]

        # save predictions on the test subset, before reshaping to 2-d arrays (3-d arrays are needed)
        if doSmoothing:
            # fold_pred = [item for sublist in fold_pred for item in sublist]
            # fold_pred = np.array(fold_pred, dtype=float)
            pred_file = LOGDIR + 'fold%d_test_predictions.pkl'%(fold_id)
            pickle.dump( y_hat_smooth, open( pred_file, "wb" ) )
            print ' ... predictions y_hat_smooth saved in: %s'%(pred_file)
        else:
            # fold_pred = [item for sublist in fold_pred for item in sublist]
            # fold_pred = np.array(fold_pred, dtype=float)
            pred_file = LOGDIR + 'fold%d_test_predictions.pkl'%(fold_id)
            pickle.dump( y_hat, open( pred_file, "wb" ) )
            print ' ... predictions y_hat saved in: %s'%(pred_file)

        if doSmoothing:
            y_hat_smooth = np.reshape(y_hat_smooth, (y_hat_smooth.shape[0]*y_hat_smooth.shape[1], y_hat_smooth.shape[2]))
        y_hat = np.reshape(y_hat, (y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        y_test_concat = np.reshape(y_test, (y_test.shape[0]*y_test.shape[1], y_test.shape[2]))

        print y_hat.shape, y_test_concat.shape
        assert y_hat.shape == y_test_concat.shape, 'ERROR: pred and ref shapes are different!'

        # concat hyp labels:
        if doSmoothing:
            all_fold_pred.append(y_hat_smooth.tolist())
        else:
            all_fold_pred.append(y_hat.tolist())
        # concat ref labels:
        all_fold_y_test.append(y_test_concat.tolist())

        if doSmoothing:
            RMSE, pcorr, error_per_song, mean_per_song = evaluate(y_test_concat, y_hat_smooth, id_test.shape[0])
        else:
            RMSE, pcorr, error_per_song, mean_per_song = evaluate(y_test_concat, y_hat, id_test.shape[0])

        s = (
            'fold: %d valence: %.4f %.4f arousal: %.4f %.4f\n'
            % (fold_id, RMSE[0], pcorr[0][0], RMSE[1], pcorr[1][0])
        )
        print s
        log_f.write(s)

        # predict on the train set and save predictions (useful to train rnn2)
        if doSmoothing:
            pred = list()
            for ind_seq_train in xrange(nb_seq_train):
                pred.append(model.predict(X_train[ind_seq_train]))
            train_y_hat = np.array(pred, dtype=float)
            print train_y_hat.shape

            train_y_hat_smooth = np.zeros_like(train_y_hat, dtype=float)
            for i in xrange(train_y_hat.shape[0]):
                train_y_hat_smooth[i, :, 0] = np.convolve(train_y_hat[i, :, 0], wts, mode='same')
                train_y_hat_smooth[i, :delay, 0] = train_y_hat[i, :delay, 0]
                train_y_hat_smooth[i, -delay:, 0] = train_y_hat[i, -delay:, 0]
                train_y_hat_smooth[i, :, 1] = np.convolve(train_y_hat[i, :, 1], wts, mode='same')
                train_y_hat_smooth[i, :delay, 1] = train_y_hat[i, :delay, 1]
                train_y_hat_smooth[i, -delay:, 1] = train_y_hat[i, -delay:, 1]

            # no reshape, 3-d arrays are needed
            # train_y_hat_smooth = np.reshape(train_y_hat_smooth, (train_y_hat_smooth.shape[0]*train_y_hat_smooth.shape[1], train_y_hat_smooth.shape[2]))
            pred_file = LOGDIR + 'fold%d_train_predictions.pkl'%(fold_id)
            pickle.dump( train_y_hat_smooth, open( pred_file, "wb" ) )
            print ' ... predictions y_hat_smooth saved in: %s'%(pred_file)
        else:
            pred = list()
            for ind_seq_train in xrange(nb_seq_train):
                pred.append(model.predict(X_train[ind_seq_train]))
            train_y_hat = np.array(pred, dtype=float)

            pred_file = LOGDIR + 'fold%d_train_predictions.pkl'%(fold_id)
            pickle.dump( train_y_hat, open( pred_file, "wb" ) )
            print ' ... predictions y_hat saved in: %s'%(pred_file)

        doPlot = False
        if doPlot:
            fig, ax = plt.subplots()
            x1 = np.linspace(1, y_test_concat.shape[0], y_test_concat.shape[0])
            if EMO == 'valence':
                ax.plot(x1, y_test_concat[:, 0], 'o', label="Data")
                # ax.plot(x1, y_hat[:,0], 'r-', label="OLS prediction")
                ax.plot(x1, y_hat[:,0], 'ro', label="OLS prediction")
            else:
                ax.plot(x1, y_test_concat[:, 1], 'o', label="Data")
                ax.plot(x1, y_hat[:,1], 'ro', label="OLS prediction")
            plt.title(EMO + ' on Test subset')
            ax.legend(loc="best")
            plt.show()
            # plt.savefig('figures/rnn_%s_fold%d.png'%(EMO, fold_id), format='png')

        doPlotTrain = False
        if doPlotTrain:
            # plt.close('all')
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(X_train[0])
            ax1.set_title('input')
            ax2 = plt.subplot(212)
            true_targets = plt.plot(y_train[0])
            guess = model.predict(X_train[0])
            guessed_targets = plt.plot(guess, linestyle='--')
            for i, x in enumerate(guessed_targets):
                x.set_color(true_targets[i].get_color())
            ax2.set_title('solid: true output, dashed: model output')
            plt.show()

        doPlotTest = False
        if doPlotTest:
            # plt.close('all')
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(X_test[0])
            ax1.set_title('input')
            ax2 = plt.subplot(212)
            true_targets = plt.plot(y_test[0])
            # guess = model.predict(X_test[0])
            guess = y_hat[0]
            guessed_targets = plt.plot(guess, linestyle='--')
            for i, x in enumerate(guessed_targets):
                x.set_color(true_targets[i].get_color())
            ax2.set_title('solid: true output, dashed: model output')
            plt.show()

        print "... Elapsed time: %f" % (time.time() - t0)

    # gather predictions and references across the 10 folds
    all_fold_pred = [item for sublist in all_fold_pred for item in sublist]
    all_fold_y_test = [item for sublist in all_fold_y_test for item in sublist]
    all_fold_pred = np.array(all_fold_pred, dtype=float)
    all_fold_y_test = np.array(all_fold_y_test, dtype=float)
    print all_fold_pred.shape, all_fold_y_test.shape

    # save predictions
    pred_file = LOGDIR + 'all_predictions.pkl'
    pickle.dump( all_fold_pred, open( pred_file, "wb" ) )
    print ' ... all predictions saved in: %s'%(pred_file)
    # ref_file = 'rnn/all_groundtruth.pkl'
    # pickle.dump( all_fold_y_test, open( ref_file, "wb" ) )

    # compute t-test p-values against the baseline predictions
    baseline_prediction_file = 'rnn/all_baseline_predictions_260feat.pkl'
    baseline_preds = pickle.load(open( baseline_prediction_file, 'r' ))
    pvalue_val = stats.ttest_ind(baseline_preds[:,0], all_fold_pred[:,0])[1]
    pvalue_ar = stats.ttest_ind(baseline_preds[:,1], all_fold_pred[:,1])[1]
    pvalues = (pvalue_val, pvalue_ar)

    RMSE, pcorr, error_per_song, mean_per_song = evaluate(all_fold_y_test, all_fold_pred, 0)
    # print(
    #     'sklearn --> valence: %.4f, arousal: %.4f\n'
    #     'Pearson Corr --> valence: %.4f, arousal: %.4f \n'
    #     # % (RMSE[0], -1. , pcorr[0][0], -1)
    #     % (RMSE[0],RMSE[1],pcorr[0][0], pcorr[1][0])
    #     )

    s = (
        'allfolds valence: %.4f %.4f arousal: %.4f %.4f p-values: %.4f, %.4f\n'
        % (RMSE[0], pcorr[0][0], RMSE[1], pcorr[1][0], pvalue_val, pvalue_ar)
    )
    print s
    log_f.write(s)
    log_f.close()

    return RMSE, pcorr, pvalues
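# ----------------------------------------------------------------------------
# Illustration (not part of the original script): the smoothing step in
# rnn_cv() builds a (taille+1)-tap moving-average kernel with half-weight end
# points, convolves each predicted sequence per output dimension, and restores
# the first/last `delay` frames. The sketch below factors that step into a
# standalone helper; `smooth_predictions` is a hypothetical name, and the loop
# is generalized to any number of output dimensions.
# ----------------------------------------------------------------------------
import numpy as np

def smooth_predictions(y_hat, taille=12):
    # kernel: (taille-1) central taps of 1/taille plus two end taps of 1/(2*taille)
    wts = np.ones(taille - 1) * 1. / taille
    wts = np.hstack((np.array([1. / (2 * taille)]), wts, np.array([1. / (2 * taille)])))
    delay = (wts.shape[0] - 1) // 2
    y_smooth = np.zeros_like(y_hat, dtype=float)
    for i in range(y_hat.shape[0]):        # sequences
        for d in range(y_hat.shape[2]):    # output dims (e.g. valence, arousal)
            y_smooth[i, :, d] = np.convolve(y_hat[i, :, d], wts, mode='same')
            # keep the un-smoothed values at the borders, as rnn_cv() does
            y_smooth[i, :delay, d] = y_hat[i, :delay, d]
            y_smooth[i, -delay:, d] = y_hat[i, -delay:, d]
    return y_smooth

# toy check: 3 sequences, 60 frames, 2 outputs
print(smooth_predictions(np.random.randn(3, 60, 2)).shape)  # (3, 60, 2)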
# Single-fold run (fold 0): same loading and melody-feature standardization as in rnn_cv()
fold_id = 0
t0 = time.time()
print '... loading FOLD %d'%fold_id
fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed.pkl'%(fold_id), "rb" ) )

X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)

if useMelodyFeatures:
    # first feature = slope, other two = mean, std
    melody_train, melody_test = subset_features(all_song_melody_features, id_train, id_test)

    # standardize train data
    melody_concat_train = np.reshape(melody_train, (melody_train.shape[0]*melody_train.shape[1], melody_train.shape[2]), order='C')
    melody_concat_train_normed, scaler = standardize(melody_concat_train)
    # print concat_train_normed.shape
    melody_train_normed = np.reshape(melody_concat_train_normed, (melody_train.shape[0], melody_train.shape[1], melody_train.shape[2]), order='C')
    del melody_concat_train, melody_concat_train_normed

    # standardize test data with the scaler fitted on the train subset
    melody_concat_test = np.reshape(melody_test, (melody_test.shape[0]*melody_test.shape[1], melody_test.shape[2]), order='C')
    melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
    # print concat_test_normed.shape
    melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
    del melody_concat_test, melody_concat_test_normed

    # concat with the other features
    X_train = np.concatenate((X_train, melody_train_normed), axis=2)
    X_test = np.concatenate((X_test, melody_test_normed), axis=2)
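# ----------------------------------------------------------------------------
# Illustration (not part of the original script): the melody/tempo features are
# standardized with a reshape -> scale -> reshape round trip, fitting the scaler
# on the train subset only. The sketch below assumes standardize() behaves like
# a (values, scaler) wrapper around sklearn's StandardScaler, which is only
# inferred from how it is called here; standardize_sketch and standardize_3d
# are hypothetical names.
# ----------------------------------------------------------------------------
import numpy as np
from sklearn.preprocessing import StandardScaler

def standardize_sketch(X, scaler=None):
    # fit a new scaler when none is given, otherwise reuse the fitted one
    if scaler is None:
        scaler = StandardScaler()
        return scaler.fit_transform(X), scaler
    return scaler.transform(X), scaler

def standardize_3d(train_3d, test_3d):
    # flatten (n_seq, n_frames, n_feat) to 2-D, scale with train statistics,
    # then restore the original 3-D shapes
    n_seq, n_frames, n_feat = train_3d.shape
    flat_normed, scaler = standardize_sketch(train_3d.reshape(n_seq * n_frames, n_feat))
    train_normed = flat_normed.reshape(n_seq, n_frames, n_feat)

    n_seq_te, n_frames_te, _ = test_3d.shape
    flat_test_normed, _ = standardize_sketch(test_3d.reshape(n_seq_te * n_frames_te, n_feat), scaler)
    test_normed = flat_test_normed.reshape(n_seq_te, n_frames_te, n_feat)
    return train_normed, test_normed

# toy check
tr_n, te_n = standardize_3d(np.random.rand(4, 10, 3), np.random.rand(2, 10, 3))
print(tr_n.shape)  # (4, 10, 3)
print(te_n.shape)  # (2, 10, 3)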
def plot_mesh(F, vertices, edges, triangles, tetrahedra, **kwargs):
    no_function = (F is None)
    if not no_function:
        std_F = standardize(F)  # Between 0 and 1
        print 'function size', F.shape
        print 'vertices', vertices.shape
        assert F.size == vertices.shape[0]
    else:
        F = 'k'
    V = vertices.shape[0]

    cmap = plt.get_cmap(kwargs.get('cmap', 'jet'))
    no_nodes = kwargs.get('no_nodes', False)
    no_mesh = kwargs.get('no_mesh', False)
    alpha_fn = kwargs.get('alpha_fn', lambda x: 0.1)

    # Plot points
    fig = plt.gcf()
    ax = plt.gca()
    p = ax.scatter(vertices[:, 0], vertices[:, 1], vertices[:, 2],
                   s=25, c=F, alpha=0.25, lw=0, cmap=cmap)
    if not no_function:
        fig.colorbar(p)

    # Build line collection
    if not no_mesh:
        segs = []
        seg_set = set()
        obj_groups = [np.array(x, dtype=np.integer)
                      for x in [edges, triangles, tetrahedra]]
        for objs in obj_groups:
            if 0 == objs.size:
                continue
            (N, D) = objs.shape
            for i in xrange(N):
                for verts in itertools.combinations(objs[i, :], 2):
                    verts = [int(v) - 1 for v in verts]
                    for v in verts:
                        assert 0 <= v < V
                    key = tuple(verts)
                    if key in seg_set:
                        continue
                    seg_set.add(key)
                    segs.append([vertices[x, :] for x in verts])
        S = len(segs)
        linecolors = [0.5, 0.5, 0.5, 0.1]  # Dark gray
        print 'Plotting {0} line segments'.format(S)
        seg_collection = Line3DCollection(segs, colors=linecolors)
        ax.add_collection3d(seg_collection)

    # Build a poly collection of faces
    # This makes for a "stained glass" look
    if not no_function:
        poly = []
        poly_set = set()
        obj_groups = [x.astype(np.integer) for x in [triangles, tetrahedra]]
        facecolors = []
        for (I, objs) in enumerate(obj_groups):
            if objs is None or no_function:
                continue
            (N, D) = objs.shape
            for i in xrange(N):
                for verts in itertools.combinations(objs[i, :], 3):
                    verts = [int(v) - 1 for v in verts]
                    for v in verts:
                        assert 0 <= v < V
                    key = tuple(verts)
                    if key in poly_set:
                        continue
                    poly_set.add(key)
                    if np.any(np.isnan(std_F[verts])):
                        continue
                    mean_F = np.mean(std_F[verts])
                    alpha = alpha_fn(mean_F)
                    if alpha < 0.025:
                        # Skip if all vertices are greater than cutoff
                        continue
                    triangle = [vertices[x, :] for x in verts]
                    poly.append(triangle)
                    # Color with the mean vertex color
                    color = list(cmap(mean_F))
                    color[3] = alpha
                    facecolors.append(color)
        P = len(poly)
        print 'Plotting {0} triangles'.format(P)
        edgecolors = np.zeros((P, 4))
        poly_collection = Poly3DCollection(poly, facecolors=facecolors,
                                           edgecolors=edgecolors)
        ax.add_collection3d(poly_collection)
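# ----------------------------------------------------------------------------
# Illustration (not part of the original script): plot_mesh() relies on a
# standardize(F) that, per its inline comment, maps the vertex function to
# [0, 1] before it is fed to the colormap. The helper below is one plausible
# reading of that call (a NaN-tolerant min-max rescale); rescale_unit is a
# hypothetical name, not the project's actual implementation.
# ----------------------------------------------------------------------------
import numpy as np

def rescale_unit(F):
    # min-max rescale to [0, 1]; NaNs are ignored for the range and preserved
    lo, hi = np.nanmin(F), np.nanmax(F)
    if hi == lo:
        return np.zeros_like(F, dtype=float)
    return (F - lo) / (hi - lo)

print(rescale_unit(np.array([2.0, 4.0, np.nan, 6.0])))  # [0.  0.5  nan  1.]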
        return self.get_batch()  # tail of a method defined above (enclosing class not shown in this excerpt)


## data formatting
# directory = 'data/mat/fox_100x100_matlab.mat'
directory = sys.argv[1]  # e.g. 'data/mat/fox_100x100_matlab.mat'
D = io.loadmat(directory)
features0 = D["features"].todense()

## remove constant features (columns with a single unique value)
uniid = []
for i in range(features0.shape[1]):
    if len(np.unique(np.array(features0[:, i]))) == 1:
        uniid.append(i)
features = np.delete(features0, uniid, axis=1)

## standardize all data (maybe flawed: statistics are computed on the full set)
all_mean, all_std = utils.standardize(features)
features = (features - all_mean) / all_std

from sklearn.decomposition import PCA
pca = PCA()
features = pca.fit_transform(features)
# features = features0
# pdb.set_trace()

labels = np.array(D["labels"].todense())[0]
bag_ids = D["bag_ids"][0]
MAX_LENGTH = max([list(bag_ids).count(iBag) for iBag in set(bag_ids)])
N_FEATURE_DIM = features.shape[1]
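# ----------------------------------------------------------------------------
# Illustration (not part of the original script): the same preprocessing chain
# (drop constant columns, z-score, PCA) can be expressed with sklearn building
# blocks. This is a sketch of an equivalent pipeline, not a drop-in replacement;
# note that, as the "maybe flawed" comment above concedes, fitting the scaler on
# all bags leaks test statistics; fitting only on training bags would avoid it.
# ----------------------------------------------------------------------------
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(VarianceThreshold(threshold=0.0),  # drop constant features
                         StandardScaler(),                  # z-score the rest
                         PCA())                             # decorrelate / reduce

X = np.random.rand(50, 20)
X[:, 3] = 1.0  # constant column, removed by VarianceThreshold
print(pipeline.fit_transform(X).shape)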
def plot_mesh_slice(f, bound, meshfile, **kwargs):
    G = kwargs.get('grid_points', 64)
    flat = kwargs.get('flat', True)

    assert (3, 2) == bound.shape
    idx = np.where(bound[:, 0] == bound[:, 1])[0]
    nidx = np.where(bound[:, 0] != bound[:, 1])[0]
    if 2 != nidx.size:
        print "Check slice bounds, need exactly 2 non-trivial dimensions"
    assert 1 == idx.size

    bound = np.hstack([bound, G * np.ones((3, 1))])
    bound[idx, 2] = 1
    grids = [np.linspace(*list(bound[i, :])) for i in xrange(3)]
    (points, meshes) = make_points(grids, True)

    timestamp = str(time.time())
    point_file = "/tmp/points." + timestamp
    value_file = "/tmp/value." + timestamp
    out_file = "/tmp/out." + timestamp

    arch = Archiver(points=points)
    arch.write(point_file)
    arch.clear()
    arch.add(values=f)
    arch.write(value_file)

    (base, ext) = os.path.splitext(meshfile)
    assert '.mesh' == ext
    cmd = ['cdiscrete/tet_interp',
           '--mesh', base + '.ctri',
           '--points', point_file,
           '--values', value_file,
           '--out', out_file]
    cmd = ' '.join(cmd)
    print cmd
    try:
        subprocess.check_call(cmd, shell=True)
    except Exception:
        print "Interpolation failed; check .ctri file?"
        quit()

    unarch = Unarchiver(out_file)
    F = np.reshape(unarch.interp, (G, G))
    Fm = np.ma.masked_where(np.isnan(F), F)

    if flat:
        plt.gcf()
        [X, Y] = [meshes[i].squeeze() for i in nidx]
        plt.pcolormesh(X, Y, Fm)
    else:
        Fm = standardize(Fm)
        [X, Y, Z] = [mesh.squeeze() for mesh in meshes]
        fig = plt.gcf()
        ax = fig.gca(projection='3d')
        cmap = plt.get_cmap('jet')
        colors = cmap(Fm)
        colors[..., 3] = 0.25 * (1 - Fm) ** 1.5
        p = ax.plot_surface(X, Y, Z,
                            rstride=1, cstride=1,
                            facecolors=colors,
                            shade=False)
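# ----------------------------------------------------------------------------
# Illustration (not part of the original function): plot_mesh_slice() relies on
# a make_points() helper that is not shown here. The sketch below is a guess at
# its behaviour, inferred from how its outputs are used (an (N, 3) array of
# query points plus the per-dimension meshes); it is hypothetical, not the
# project's actual implementation.
# ----------------------------------------------------------------------------
import numpy as np

def make_points_sketch(grids, return_meshes=False):
    # build a dense grid from per-dimension 1-D arrays and flatten it into
    # an (N, D) array of interpolation query points
    meshes = np.meshgrid(*grids, indexing='ij')
    points = np.column_stack([m.ravel() for m in meshes])
    if return_meshes:
        return points, meshes
    return points

# toy check mirroring the slice setup: two free dimensions, one held fixed
grids = [np.linspace(0., 1., 4), np.linspace(0., 1., 4), np.linspace(0.5, 0.5, 1)]
points, meshes = make_points_sketch(grids, True)
print(points.shape)  # (16, 3)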