def fetch(args, dataset="", number=0):
    if not dataset:
        dataset = args.dataset.upper()
    names = ""
    gt = []
    if dataset in ["TURBOFAN", "MILL", "IGBT"]:
        data, gt, explanations = nasa.main(args)
    elif dataset == "BACKBLAZE":
        data, gt, explanations, names = backblaze.main(args)
    elif dataset == "OCCUPANCY":
        data, gt, explanations, names = occupancy.main(args)
    elif dataset == "DODGERS":
        data, gt, explanations, names = dodgers.main(args)
    elif dataset == "EYE":
        data, gt, explanations, names = eye.main(args)
    # elif dataset == "ARMA_SIM":
    #     data = sim.arma_sim(np.array([1]), np.array([1, 0.5, -0.2]), 1000, num=5)
    elif dataset == "VARMA_SIM":
        if args.filename:
            data, gt = sim.read(args.filename, args.elemsep, args.linesep)
            data = [pp.normalize(dat) for dat in data]
        else:
            num_timepoints = args.settings["num_timepoints"]
            num_samples = args.settings["num_samples"]
            case = args.settings["case"]
            data = [sim.mixed_varma(num_timepoints, case) for i in range(num_samples)]
            data = [pp.normalize(dat) for dat in data]
            sim.write(data, gt, "VARMA", args)
def test_sine_regression(self):
    errors = []
    for iteration in range(10):
        input_data = np.ones((1, 40)) * np.linspace(0, 1, 40)
        target_data = (np.sin(2 * np.pi * input_data)
                       + np.cos(4 * np.pi * input_data)
                       + np.random.randn(40) * 0.2)
        input_data = np.transpose(input_data)
        target_data = np.transpose(target_data)
        input_data = normalize(input_data)
        target_data = normalize(target_data)

        training_inputs = input_data[0::2, :]
        testing_inputs = input_data[1::4, :]
        validation_inputs = input_data[3::4, :]
        training_targets = target_data[0::2, :]
        testing_targets = target_data[1::4, :]
        validation_targets = target_data[3::4, :]

        neural_net = mlp.MultilayerPerceptron(
            (1, 5, 4, 3, 1),
            Backpropagation(800),
            learner_type=mlp.LearnerType.REGRESSION)
        neural_net.train_with_early_stopping(training_inputs, training_targets,
                                             validation_inputs, validation_targets)

        testing_outputs = neural_net.recall(testing_inputs)
        errors.append(0.5 * np.sum((testing_targets - testing_outputs) ** 2))
    average_error = np.median(errors)
    self.assertLessEqual(average_error, 0.5)
def oct_target(alpha):
    print('Solving ILP for hyperparameter tuning...')
    all_results = []
    tree_depth = tree_depths[0]
    for r in range(val_repeat):
        train_df, val_df = preprocessing.train_test_split(
            train_val_df, split=train_val_ratio, random_state=random_state)
        preprocessing.normalize(train_df, norm_cols=norm_cols)
        preprocessing.normalize(val_df, norm_cols=norm_cols)
        all_results.append(
            get_results(train_df=train_df, test_df=val_df, alpha=alpha,
                        tree_depth=tree_depth, max_time_per_run=max_time_per_run,
                        threads=threads, print_status=print_status,
                        warm_start=warm_start))
    results_df = pd.concat(all_results)
    all_results_df.append(results_df)
    aggregated = calc_mean_accuracy_per_alpha(results_df)
    all_aggregated_df.append(aggregated)
    best_alpha_acc = aggregated.max()['testing_accuracy']
    return best_alpha_acc
def transform(self, X):
    tfidf = np.multiply(X, self.idf_)
    if self.norm == 'l2':
        tfidf = tfidf / normalize(tfidf, p=2, axis=1).reshape((-1, 1))
    elif self.norm == 'l1':
        tfidf = tfidf / normalize(tfidf, p=1, axis=1).reshape((-1, 1))
    return tfidf
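# A minimal sketch of the row-norm helper the transform above appears to
# assume (an assumption, not the project's actual code): `normalize` is taken
# to return the per-row p-norm as a 1-D array, which is why the caller
# reshapes it to a column vector before dividing.
import numpy as np

def normalize(X, p=2, axis=1):
    # Per-row p-norm; the epsilon guards against division by zero on all-zero rows.
    return np.linalg.norm(X, ord=p, axis=axis) + 1e-12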
def __solve__(self):
    if self.normalize:
        # Assumes normalize() mutates self.X in place; the return value is unused.
        normalize(self.X)
    if self.method == 'gd':
        return gradient_descent(self.X, self.Y, self.reg_score)
    if self.method == 'exact':
        return exact_solution(self.X, self.Y)
    if self.method == 'evolution':
        return de(self.X, self.Y)
    if self.method == 'conj':
        return conjugate_gradients(self.X, self.Y)
def write_results_file_2(results, test_data, test_measures):
    INFOFILE = open("results2.txt", "w", encoding='utf-8')
    # print(len(zip(results, test_data['Tweet text'].values, test_measures["M2"])))
    for result, text, M2 in zip(results, test_data['Tweet text'].values,
                                test_measures["M2"]):
        predicted = "0" if result < 0.5 else "1"
        output = (predicted + ", prob:" + str(result) + " M2:" + str(M2)
                  + " " + pre.normalize(text) + "\n")
        INFOFILE.write(output)
    INFOFILE.close()
def normalize(input_dict):
    """Normalizes all numeric values in the given dataset"""
    instances = input_dict['instances']
    output_dict = {}
    # -S 2.0 -T -1.0: scale by 2.0, then translate by -1.0, mapping the
    # default [0, 1] normalization range onto [-1, 1].
    output_dict['normalized'] = preprocessing.normalize(instances, '-S 2.0 -T -1.0')
    return output_dict
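# Worked check of the option string above, assuming Weka Normalize-filter
# semantics (first rescale to [0, 1], then apply x * scale + translation):
# with -S 2.0 -T -1.0, 0.0 maps to -1.0 and 1.0 maps to 1.0, i.e. the unit
# interval is stretched onto [-1, 1].
def rescale(x, x_min, x_max, scale=2.0, translation=-1.0):
    return (x - x_min) / (x_max - x_min) * scale + translation

assert rescale(0.0, 0.0, 1.0) == -1.0
assert rescale(0.5, 0.0, 1.0) == 0.0
assert rescale(1.0, 0.0, 1.0) == 1.0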
def embed(wavform_slice, rate):
    norm_wavform_slice = preprocessing.normalize(wavform_slice)
    examples_batch = vggish_input.waveform_to_examples(norm_wavform_slice, rate)
    # print('examples_batch:')
    # print(examples_batch)
    print('examples_batch len: ' + str(len(examples_batch)))

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        # print('embedding_batch: ')
        # print(embedding_batch)
        # print(embedding_batch.shape)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print('postprocessed_batch: ')
        print(postprocessed_batch)
        print(postprocessed_batch.shape)
        return postprocessed_batch
def create_edges(images, outputs, sigma, upt, lwt,
                 kernel=np.ones((3, 3), np.uint8), thresh=0.1):
    # 2 ways: edges of (raw images) * outputs, or edges of [raw images * outputs]
    im = images[0].permute(1, 2, 0).data.cpu().numpy()
    out = outputs[0, 1].permute(1, 2, 0).data.cpu().numpy()
    vol_norm_raw = normalize(im)
    vol_norm_raw = np.uint8(vol_norm_raw)
    sitk_img_raw = sitk.GetImageFromArray(vol_norm_raw)
    sitk_img_raw_float = sitk.Cast(sitk_img_raw, sitk.sitkFloat32)
    edges_raw = sitk.CannyEdgeDetection(sitk_img_raw_float,
                                        lowerThreshold=lwt,
                                        upperThreshold=upt,
                                        variance=[sigma, sigma, sigma])
    edges_array_raw = sitk.GetArrayFromImage(edges_raw)
    edges_array_final = ((edges_array_raw * out) > thresh).astype(np.uint8)
    edges_closing = cv2.morphologyEx(edges_array_final, cv2.MORPH_CLOSE,
                                     kernel, iterations=1)
    return edges_closing
def write_results_file(results, data, measures):
    PREDICTIONSFILE = open("results1.txt", "w", encoding='utf-8')
    for result, label, text, M1, M2 in zip(results, data['Label'].values,
                                           data['Tweet text'].values,
                                           measures["M1"], measures["M2"]):
        predicted = "0" if result < 0.5 else "1"
        output = ("label:" + str(label) + "/" + predicted + " prob:" + str(result)
                  + " M1:" + str(M1) + " M2:" + str(M2)
                  + " " + pre.normalize(text) + "\n")
        PREDICTIONSFILE.write(output)
    PREDICTIONSFILE.close()
def main():
    parser = argparse.ArgumentParser(description='Generate an ingredient-ID mapping file')
    parser.add_argument('json', help='Input data file')
    parser.add_argument('--threshold', type=int, default=5,
                        help='Cutoff of how many times an ingredient should occur in recipes')
    parser.add_argument('pkl', help='Output pickle file')
    args = parser.parse_args()

    ingredients_counter = Counter()
    with open(args.json, 'r') as f:
        for line in f:
            recipe = json.loads(line.strip())
            ingredients = recipe['ingredients']
            for ingredient in ingredients:
                for normalized_ingredient in preprocessing.normalize(ingredient):
                    ingredients_counter[normalized_ingredient] += 1

    ingredient_id = 0
    ingredient2id = {}
    for ingredient in sorted(ingredients_counter):
        count = ingredients_counter[ingredient]
        if count < args.threshold:
            continue
        ingredient2id[ingredient] = ingredient_id
        ingredient_id += 1
    # dict.iteritems() is Python 2 only; items() works everywhere.
    id2ingredient = dict((v, k) for k, v in ingredient2id.items())

    # Pickle requires a binary file handle.
    with open(args.pkl, 'wb') as f:
        pickle.dump({
            'ingredient2id': ingredient2id,
            'id2ingredient': id2ingredient,
        }, f)
def create_captions(classes, texts, category2idx, verbose=True, save=True):
    '''helper function to create text_c10 folder'''
    cls2count = {k.replace(" ", "_"): 1 for k in category2idx}
    filenames = []
    for index, (cls, text) in enumerate(zip(classes, texts)):
        category = cls.replace(" ", "_").replace("&", 'AND')
        cls = cls.replace("&", 'AND')
        dirname = "%.3i.%s" % (category2idx[cls], category)
        filename = "%s_%i.txt" % (category, cls2count[category])
        directory = os.path.join(DATA_PATH, "text_c10/%s" % dirname)
        if not os.path.exists(directory):
            os.makedirs(directory)
        if verbose and (index % 5000) == 0:
            print("%i - %s" % (index, filename))
        if save:
            with open(os.path.join(directory, filename), 'wt') as f:
                f.write("%s\n" % normalize(text))
        filenames.append(os.path.join(dirname, filename))
        cls2count[category] += 1
    return filenames
def create_numpy_arrays(self, preprocess_function):
    # always normalize before doing any other preprocessing
    self.np_values = preprocessing.normalize(np.array(self.values))
    self.np_time_stamps = np.array(self.time_stamps)
    # print("Before pre processing:", self.np_values)
    if preprocess_function:
        self.np_values = preprocess_function(self.np_values, self.np_time_stamps)
def transform(self, txts):
    res = []
    for txt in txts:
        # Re-seed before every call so infer_vector is deterministic,
        # see https://github.com/RaRe-Technologies/gensim/issues/447
        self.d2v.random.seed(conf.SEED)
        v = self.d2v.infer_vector(micro_tokenize(normalize(txt)))
        res.append(v)
    return numpy.vstack(res)
def fit(d, l, kn):
    global data
    global label
    global k
    data = prepro.normalize(d)
    label = l
    k = kn
def load_csv_data_meta(csv_file):
    from sklearn import preprocessing
    df = pd.read_csv(csv_file, delimiter=',', header=None, skiprows=1,
                     names=['name', 'author', 'score', 'body', 'Class',
                            'response_count', 'ARI_value', 'polarity', 'BadWords'])
    comments_data = df.drop(['name', 'author', 'body', 'Class'], axis=1)
    comments_data = preprocessing.normalize(comments_data, norm='l2')
    return comments_data
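# Quick sanity check on the sklearn call above: preprocessing.normalize
# scales each *row* (i.e. each comment's feature vector) to unit L2 norm by
# default, not each column.
import numpy as np
from sklearn.preprocessing import normalize

assert np.allclose(normalize(np.array([[3.0, 4.0]]), norm='l2'), [[0.6, 0.8]])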
def save_edges_output(edges, dir_path, patient_id, object_name):
    edges = normalize(edges)
    edges = edges.astype(np.uint8)
    obj_path = dir_path + str(patient_id) + '/' + object_name + '/'
    if not os.path.exists(obj_path):
        os.makedirs(obj_path)
    for j in range(edges.shape[2]):
        # for cv2.imwrite, {0,1} should be mapped to {0,255}
        cv2.imwrite(obj_path + pad_zerro(j + 1) + ".tiff", edges[:, :, j])
def _parse_function(example):
    features = tf.io.parse_single_example(example, feature_description)
    image = tf.image.decode_jpeg(features['image/encoded'])
    image = normalize(image)
    label = tf.one_hot(features['label'], depth=one_hot_depth, dtype=tf.float32)
    return image, label
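# A minimal usage sketch (the file name and batch size are placeholders, and
# feature_description/one_hot_depth are assumed to be defined as in the
# surrounding module): mapping the parser over a TFRecord file yields
# (image, one-hot label) pairs ready for training.
import tensorflow as tf

dataset = (tf.data.TFRecordDataset("train.tfrecord")
           .map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))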
def determine_threshold(signal, freq, heart_rate):
    signal = pr.normalize(signal)
    i = 0
    thresholds = np.arange(threshold_start, threshold_stop, threshold_step)
    thr = 0
    begginings = []
    endings = []
    for threshold in thresholds:
        under = 0
        is_above = False
        begginings = []
        endings = []
        i = 0
        for x in signal:
            if x < threshold:
                under = under + 1
                if is_above:
                    is_above = False
                    endings.append(i)
            else:
                if not is_above:
                    is_above = True
                    begginings.append(i)
            i = i + 1
        begginings, endings = investigate_tone_boundaries(begginings, endings)
        n = len(begginings)
        rate = n / (len(signal) * 1.0 / freq) * 30
        print(str(threshold) + ': ' + str(under * 1.0 / len(signal))
              + ' ' + str(n) + ' ' + str(rate))
        if ((1 - rate_confidence) * heart_rate <= rate
                and (1 + rate_confidence) * heart_rate >= rate):
            print(str(threshold) + ' - HERE!')
            thr = threshold
            # wo.plot_wave_signal(signal, freq)
            # plt.axhline(y=threshold, xmin=0, xmax=3, c="red", linewidth=0.5, zorder=0)
            break
        elif ((2 - rate_confidence) * heart_rate <= rate
                and (2 + rate_confidence) * heart_rate >= rate):
            heart_rate = 2 * heart_rate
            print(str(threshold) + ' - HERE!')
            thr = threshold
            # wo.plot_wave_signal(signal, freq)
            # plt.axhline(y=threshold, xmin=0, xmax=3, c="red", linewidth=0.5, zorder=0)
            break
    n = len(begginings)
    peaks_energy = np.zeros(n)
    for index in range(0, n - 1):
        peaks_energy[index] = sum(signal[begginings[index]:endings[index]])
    return thr, begginings, endings, heart_rate, peaks_energy
def threshold_with_custom_threshold(signal, freq, heart_rate, threshold):
    thr = max(signal) * threshold
    signal = pr.normalize(signal)
    i = 0
    signal_type = 0  # 1 - s1 & s2; 2 - only s1
    begginings = []
    endings = []
    under = 0
    is_above = False
    begginings = []
    endings = []
    i = 0
    for x in signal:
        if x < threshold:
            under = under + 1
            if is_above:
                is_above = False
                if (i - begginings[len(begginings) - 1]) > (freq * 0.02):
                    endings.append(i)
                else:
                    del begginings[-1]
        else:
            if not is_above:
                is_above = True
                begginings.append(i)
        i = i + 1
    begginings, endings = investigate_tone_boundaries(begginings, endings)
    n = len(begginings)
    rate = n / (len(signal) * 1.0 / freq) * 30
    print(str(threshold) + ': ' + str(under * 1.0 / len(signal))
          + ' ' + str(n) + ' ' + str(rate))
    if ((1 - rate_confidence) * heart_rate <= rate
            and (1 + rate_confidence) * heart_rate >= rate):
        heart_rate = (heart_rate + rate) / 2
        signal_type = 1
    elif ((2 - rate_confidence) * heart_rate <= rate
            and (2 + rate_confidence) * heart_rate >= rate):
        heart_rate = (2 * heart_rate + rate) / 2
        signal_type = 1
    elif ((1 - 3 * rate_confidence) * heart_rate <= rate
            and (1 - rate_confidence) * heart_rate >= rate):
        heart_rate = (heart_rate + 2 * rate) / 2
        signal_type = 2
    else:
        signal_type = 3
    # wo.plot_wave_signal(signal, freq)
    # plt.axhline(y=threshold, xmin=0, xmax=3, c="red", linewidth=0.5, zorder=0)
    peaks_energy = np.zeros(n)
    for index in range(0, n - 1):
        peaks_energy[index] = sum(signal[begginings[index]:endings[index]])
    return thr, begginings, endings, heart_rate, peaks_energy, signal_type
def preprocess(file_path, args):
    # Get sound and sample rate from file using librosa
    try:
        sound, sample_rate = librosa.load(file_path)
    except ZeroDivisionError as e:
        raise ZeroDivisionError("File for error above:", file_path) from e

    # Resampling
    if sample_rate != universal_sample_rate:
        sound = resample(
            sound, int(universal_sample_rate * (len(sound) / sample_rate)))

    # If argument for noise addition is set, adds random white or background
    # noise, or removes noise
    if args.noise_aug:
        if args.noise_aug == "white_noise":
            if args.n_steps:
                sound = sound_shuffling.add_white_noise(
                    sound,
                    target_snr=np.random.normal(args.n_steps[0], args.n_steps[1]))
            else:
                sound = sound_shuffling.add_white_noise(
                    sound, target_snr=np.random.normal(4.5, 2.0))
        if args.noise_aug == "background_noise":
            sound = sound_shuffling.add_random_background_noise(sound, sample_rate)
        if args.noise_aug == "no_noise":
            sound = preprocessing.extract_noise(sound, sample_rate,
                                                window_width=2048,
                                                step_size=512,
                                                verbose=False)

    # If argument for shifting is set, shifts amplitude, frequency or time randomly
    if args.shift_aug:
        if args.shift_aug == "amplitude_shift":
            n_steps = random.randint(0, 5)
            sound = sound_shuffling.amplitude_shift(sound, n_steps)
        if args.shift_aug == "frequency_shift":
            n_steps = random.randint(-5, 5)
            sound = sound_shuffling.frequency_shift(sound, sample_rate, n_steps)
        if args.shift_aug == "time_stretch":
            n_steps = random.randint(1, 5)
            sound = sound_shuffling.time_stretch(sound, n_steps)

    # Normalize
    sound = preprocessing.normalize(sound)

    # Cut sound up in frames of 5 seconds
    window_width = universal_sample_rate * 5
    step_size = window_width  # TODO: parameterize step size
    nr_of_frames, frames = get_frames(sound, window_width, step_size)
    return np.array(frames)
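# A plausible shape for the get_frames helper used above (an assumption, not
# necessarily the project's implementation): slice the signal into fixed-width
# windows stepped by step_size, dropping any trailing partial window.
def get_frames(sound, window_width, step_size):
    starts = range(0, len(sound) - window_width + 1, step_size)
    frames = [sound[start:start + window_width] for start in starts]
    return len(frames), frames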
def preprocess(row):
    if row[0] and row[1]:
        txt = row[0] + ' ' + row[1]
    elif row[0]:
        txt = row[0]
    elif row[1]:
        txt = row[1]
    else:
        txt = ''
    return micro_tokenize(normalize(txt))
def _preprocess(self, inputs):
    """Preprocess the input images.

    Args:
        inputs: a batch of raw images.

    Returns:
        a batch of processed images as tensors.
    """
    return normalize(inputs)
def load_ingredient2recipes(filename):
    ingredient2recipes = defaultdict(set)
    with open(filename, 'r') as f:
        for line in f:
            recipe = json.loads(line.strip())
            recipe_id = recipe['id']
            ingredients = recipe['ingredients']
            for ingredient in ingredients:
                for normalized_ingredient in preprocessing.normalize(ingredient):
                    ingredient2recipes[normalized_ingredient].add(recipe_id)
    return ingredient2recipes
def process_frame(pose_scores, keypoint_scores, keypoint_coords,
                  frame_num, fps, call_cnt):
    seconds = frame_num / fps
    normalized = normalize(pose_scores, keypoint_scores, keypoint_coords)
    if not normalized:
        return True
    print(f'Inserting #{call_cnt}')
    cursor.execute(insert_sql, (seconds, vid_id, ujson.dumps(normalized)))
    if frame_num % 1000 == 0:
        print('Committing...')
        mydb.commit()
def predict(self, X, ntree_limit=-1):
    X = np.array(X)
    if self.params["normalize"]:
        X = normalize(X)
    if ntree_limit == -1:
        ntree_limit = len(self.model)
    preds = self.model[0].predict(X)
    for ntree in np.arange(1, ntree_limit):
        preds += self.params["learning_rate"] * self.model[ntree].predict(X)
    return preds
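# Worked check of the staged sum in predict above (numbers hypothetical):
# the ensemble output is tree_0(x) plus learning_rate times every later
# tree's output, and ntree_limit simply truncates that sum.
stage_outputs = [1.0, 0.5, -0.2]   # tree_0(x), tree_1(x), tree_2(x)
learning_rate = 0.1
pred = stage_outputs[0] + sum(learning_rate * o for o in stage_outputs[1:])
assert abs(pred - 1.03) < 1e-12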
def load_data(path):
    my_dir = sorted(os.listdir(path))
    data = []
    gt = []
    for p in tqdm(my_dir):
        data_list = sorted(os.listdir(path + p))
        # e.g. ['Brats18_2013_0_1_flair.nii.gz', 'Brats18_2013_0_1_seg.nii.gz',
        #       'Brats18_2013_0_1_t1.nii.gz', 'Brats18_2013_0_1_t1ce.nii.gz',
        #       'Brats18_2013_0_1_t2.nii.gz']
        img_itk = sitk.ReadImage(path + p + '/' + data_list[0])
        # e.g. Data/Brats2018/LGG/Brats18_2013_0_1/Brats18_2013_0_1_flair.nii.gz
        flair = sitk.GetArrayFromImage(img_itk)
        # flair shape (155, 240, 240), dtype int16
        flair = normalize(flair)
        img_itk = sitk.ReadImage(path + p + '/' + data_list[1])
        seg = sitk.GetArrayFromImage(img_itk)
        # seg shape (155, 240, 240), dtype uint8 / int16
        img_itk = sitk.ReadImage(path + p + '/' + data_list[2])
        t1 = sitk.GetArrayFromImage(img_itk)
        t1 = normalize(t1)
        img_itk = sitk.ReadImage(path + p + '/' + data_list[3])
        t1ce = sitk.GetArrayFromImage(img_itk)
        t1ce = normalize(t1ce)
        img_itk = sitk.ReadImage(path + p + '/' + data_list[4])
        t2 = sitk.GetArrayFromImage(img_itk)
        t2 = normalize(t2)
        data.append([flair, t1, t1ce, t2])
        gt.append(seg)
    data = np.asarray(data, dtype=np.float32)
    gt = np.asarray(gt, dtype=np.uint8)
    return data, gt
def prepare_embeddings(texts, model, limit=None, batch_size=128):
    # normalize texts
    texts_ = [normalize(text) for text in texts[:limit]]
    hs = []
    for index, batch in enumerate(get_batch(texts_, batch_size)):
        if index and index % 100 == 0:
            print("Processing batch number %i" % index)
        hs.extend([h.reshape(1, -1) for h in model.embed(batch)])
    return hs
def process_frame(pose_scores, keypoint_scores, keypoint_coords, frame_num, fps):
    global cnt
    seconds = frame_num / fps
    normalized = normalize(pose_scores, keypoint_scores, keypoint_coords)
    # print(normalized, seconds)
    print('Inserting...')
    cursor.execute(sql, (seconds, 'test_video3', ujson.dumps(normalized)))
    cnt += 1
    if cnt % 500 == 0:
        print('Committing...')
        mydb.commit()
def filter_sound(samples, sampling_rate, window_width=2048, stepsize=512, verbose=False):
    noise = get_noise_frames(samples=samples, sampling_rate=sampling_rate,
                             window_width=window_width, stepsize=stepsize,
                             verbose=verbose)
    if len(noise) > 0:
        reduced_noise = nr.reduce_noise(audio_clip=samples, noise_clip=noise,
                                        verbose=verbose)
        return preprocessing.normalize(reduced_noise)
    else:
        return samples
def data_generator(data_dir, name, image_size, number_marks, training):
    """A generator function used to make TensorFlow dataset.

    Currently only the `universal` dataset (image + json) of FMD is supported.

    Args:
        data_dir: the directory of the raw image and json files.
        name: the name of the dataset.
        image_size: the width and height of the input images for the network.
        number_marks: how many marks/points one sample contains.
        training: whether the generated data will be used for training.

    Yields:
        preprocessed image and heatmaps.
    """
    # Initialize the dataset with files.
    dataset = Universal(name.decode("utf-8"))
    dataset.populate_dataset(data_dir.decode("utf-8"), key_marks_indices=None)
    dataset.meta.update({"num_marks": number_marks})

    image_size = tuple(image_size)
    width, _ = image_size
    for sample in dataset:
        # Follow the official preprocessing implementation.
        image = sample.read_image("RGB")
        marks = sample.marks

        if training:
            # Rotate the image randomly.
            image, marks = rotate_randomly(image, marks, (-30, 30))
            # Scale the image randomly.
            image, marks = scale_randomly(image, marks, output_size=image_size)
            # Flip the image randomly.
            image, marks = flip_randomly(image, marks)
        else:
            # Scale the image to output size.
            marks = marks / image.shape[0] * width
            image = cv2.resize(image, image_size)

        # Normalize the image.
        image_float = normalize(image.astype(float))

        # Generate heatmaps.
        heatmaps = generate_heatmaps(marks, width, (64, 64))
        heatmaps = np.transpose(heatmaps, (1, 2, 0))

        yield image_float, heatmaps
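# A plausible way to wire the generator into tf.data (an assumption: the
# .decode("utf-8") calls above suggest string args arrive as bytes, which is
# what Dataset.from_generator does to its args). Output shapes follow the
# 64x64 heatmaps; the path, image size, and mark count here are placeholders.
import tensorflow as tf

dataset = tf.data.Dataset.from_generator(
    data_generator,
    args=("path/to/fmd", "my_dataset", (256, 256), 98, True),
    output_signature=(
        tf.TensorSpec(shape=(256, 256, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(64, 64, 98), dtype=tf.float32)))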
def add_white_noise(samples, target_snr=2):
    # Calculate the root mean square of the samples
    RMS_samples = np.sqrt(np.mean(samples ** 2))
    # Calculate the root mean square of the noise given a target SNR
    RMS_noise = np.sqrt((RMS_samples ** 2) / 10 ** (target_snr / 10))
    # Generate Additive White Gaussian Noise
    noise = np.random.normal(0, RMS_noise, samples.shape[0])
    # Add noise to samples
    samples += noise
    return preprocessing.normalize(samples)
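# Worked check of the SNR arithmetic above: with SNR_dB = 10*log10(P_s/P_n)
# and P = RMS^2, solving for the noise gives
# RMS_noise = sqrt(RMS_samples^2 / 10^(SNR_dB/10)). For RMS_samples = 1 and
# target_snr = 10 dB that is sqrt(1/10), about 0.316.
import numpy as np

rms_samples, target_snr = 1.0, 10.0
rms_noise = np.sqrt(rms_samples ** 2 / 10 ** (target_snr / 10))
assert np.isclose(rms_noise, 0.31622776601683794)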
def generate_and_save_images(model, epoch, test_input):
    predictions = model(test_input, training=False)
    fig = plt.figure(figsize=(4, 4))
    for i in range(predictions.shape[0]):
        plt.subplot(4, 4, i + 1)
        plt.imshow(normalize(predictions[i, :, :, 0],
                             input_range=(-1, 1),
                             output_range=(0, 255)),
                   cmap='gray')
        plt.axis('off')
    plt.savefig('./images/epoch_{:04d}.png'.format(epoch))
    plt.close()
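# A minimal sketch of the range-mapping normalize assumed above (the real
# helper may differ): affinely map input_range, here the generator's tanh
# output in (-1, 1), onto output_range (0, 255) for display.
def normalize(x, input_range=(-1, 1), output_range=(0, 255)):
    (in_lo, in_hi), (out_lo, out_hi) = input_range, output_range
    return (x - in_lo) / (in_hi - in_lo) * (out_hi - out_lo) + out_lo

assert normalize(-1) == 0 and normalize(0) == 127.5 and normalize(1) == 255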
def detectWakewords():
    threading.Timer(0.10, detectWakewords).start()
    global count
    global buff
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=CHUNKSIZE)
    data = stream.read(CHUNKSIZE)
    # np.fromstring is deprecated for binary input; frombuffer is the
    # equivalent (read-only) replacement.
    raw_speech = np.frombuffer(data, dtype=np.int16)
    if modelname.startswith('cnn'):
        X = normalize(mfcc(raw_speech)).reshape(1, 99, 13, 1)
    else:
        X = normalize(mfcc(raw_speech)).reshape(1, 99 * 13)
    pred = [int(round(x[0])) for x in model.predict(X)]
    if np.sum(np.abs(raw_speech)) < len(buff) * 100000:
        pred = [0]
    buff.pop(0)
    buff.append(pred[0])
    if (buff[-min_positives:] == [1] * min_positives
            and buff[-(min_positives + 1)] == 0):
        print("Wake word detected #", count)
        winsound.Beep(1000, 300)
        count += 1
    stream.stop_stream()
    stream.close()
    p.terminate()
def get_training_sample(train, sample_type, normalize):
    if normalize:
        df_norm = preprocessing.normalize(train)[0]
        train = df_norm
    if sample_type == 'uniform':
        uni_sample = uniform_sampling(train, 17000)
        uni_sample.index = np.arange(0, len(uni_sample))
        return uni_sample
    else:
        choice_sample = choice_sampling(train, 1.6)
        # Selecting samples from the choice sample
        choice_sample = uniform_sampling(choice_sample, 17000)
        choice_sample.index = np.arange(0, len(choice_sample))
        return choice_sample
def main():
    parser = argparse.ArgumentParser(description='Inspect top n ingredients')
    parser.add_argument('json', help='Input data file')
    parser.add_argument('--n', help='Number of ingredients to print',
                        type=int, default=1000)
    args = parser.parse_args()

    ingredients_counter = Counter()
    with open(args.json, 'r') as f:
        for line in f:
            recipe = json.loads(line.strip())
            ingredients = recipe['ingredients']
            for ingredient in ingredients:
                for normalized_ingredient in preprocessing.normalize(ingredient):
                    ingredients_counter[normalized_ingredient] += 1

    for ingredient, count in ingredients_counter.most_common(args.n):
        print('{}\t{}'.format(ingredient.encode('utf8'), count))
def ingredients(recipe):
    ingredientz = recipe['ingredients']
    for ingredient in ingredientz:
        normalized_ingredients = preprocessing.normalize(ingredient)
        for normalized_ingredient in normalized_ingredients:
            yield ('meta', 'ingr', normalized_ingredient)
def main(n_z, n_hidden, dataset, seed, comment, gfx=True):
    # Initialize logdir
    import time
    logdir = ('results/gpulearn_z_x_' + dataset + '_' + str(n_z) + '-'
              + str(n_hidden) + '_' + comment + '_' + str(int(time.time())) + '/')
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    print('logdir:', logdir)
    print('gpulearn_z_x', n_z, n_hidden, dataset, seed)
    with open(logdir + 'hook.txt', 'a') as f:
        print('learn_z_x', n_z, n_hidden, dataset, seed, file=f)

    np.random.seed(seed)

    gfx_freq = 1
    weight_decay = 0
    f_enc, f_dec = lambda x: x, lambda x: x

    # Init data
    if dataset == 'mnist':
        import anglepy.data.mnist as mnist
        # MNIST
        size = 28
        train_x, train_y, valid_x, valid_y, test_x, test_y = mnist.load_numpy(size)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': valid_x.astype(np.float32)}
        x_test = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (size, size)
        n_x = size * size
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        nonlinear = 'softplus'
        type_px = 'bernoulli'
        n_train = 50000
        n_batch = 1000
        colorImg = False
        bernoulli_x = True
        byteToFloat = False
        weight_decay = float(n_batch) / n_train
    if dataset == 'mnist_binarized':
        import anglepy.data.mnist_binarized as mnist_binarized
        # MNIST
        train_x, valid_x, test_x = mnist_binarized.load_numpy(28)
        x = {'x': np.hstack((train_x, valid_x)).astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (28, 28)
        n_x = 28 * 28
        n_y = 10
        type_qz = 'gaussianmarg'
        type_pz = 'mog'
        nonlinear = 'rectlin'
        type_px = 'bernoulli'
        n_train = 60000
        n_batch = 1000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train
    elif dataset == 'freyface':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy()
        np.random.shuffle(train_x)
        x = {'x': train_x.T[:, 0:n_train]}
        x_valid = {'x': train_x.T[:, n_train:]}
        L_valid = 1
        dim_input = (28, 20)
        n_x = 20 * 28
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'bounded01'
        nonlinear = 'tanh'  # tanh works better with freyface than 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train
    elif dataset == 'freyface_pca':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)
        f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        train_x = f_enc(train_x)
        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
    elif dataset == 'freyface_bernoulli':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)
        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_pz = 'gaussianmarg'
        type_px = 'bernoulli'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
    elif dataset == 'norb':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  # 24300/900 = 27
        colorImg = False
        # binarize = False
        byteToFloat = False
        bernoulli_x = False
        weight_decay = float(n_batch) / train_x.shape[1]
    elif dataset == 'norb_pca':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True)
        f_enc, f_dec, _ = pp.PCA(train_x, 0.999)
        # f_enc, f_dec, _ = pp.normalize_random(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  # 24300/900 = 27
        colorImg = False
        # binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]
    elif dataset == 'norb_normalized':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True)
        # f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        # f_enc, f_dec, _ = pp.normalize_random(train_x)
        f_enc, f_dec, _ = pp.normalize(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  # 24300/900 = 27
        colorImg = False
        # binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]
    elif dataset == 'svhn':
        # SVHN dataset
        import anglepy.data.svhn as svhn
        size = 32
        train_x, train_y, test_x, test_y = svhn.load_numpy(False, binarize_y=True)
        extra_x, extra_y = svhn.load_numpy_extra(False, binarize_y=True)
        x = {'x': np.hstack((train_x, extra_x)),
             'y': np.hstack((train_y, extra_y))}
        ndict.shuffleCols(x)
        print('Performing PCA, can take a few minutes... ')
        f_enc, f_dec, pca_params = pp.PCA(x['x'][:, :10000], cutoff=600, toFloat=True)
        ndict.savez(pca_params, logdir + 'pca_params')
        print('Done.')
        n_y = 10
        x = {'x': f_enc(x['x']).astype(np.float32)}
        x_valid = {'x': f_enc(test_x).astype(np.float32)}
        L_valid = 1
        n_x = x['x'].shape[0]
        dim_input = (size, size)
        n_batch = 5000
        colorImg = True
        bernoulli_x = False
        byteToFloat = False
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'

    # Construct model
    from anglepy.models import GPUVAE_Z_X
    updates = get_adam_optimizer(learning_rate=3e-4, weight_decay=weight_decay)
    model = GPUVAE_Z_X(updates, n_x, n_hidden, n_z, n_hidden[::-1],
                       nonlinear, nonlinear, type_px,
                       type_qz=type_qz, type_pz=type_pz,
                       prior_sd=100, init_sd=1e-3)

    if False:
        # dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412689061/'
        # dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412676966/'
        # dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412695481/'
        # dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412695455/'
        # dir = '/Users/dpkingma/results/gpulearn_z_x_svhn_pca_300-(500, 500)__1413904756/'
        dir = '/home/ubuntu/results/gpulearn_z_x_mnist_50-[500, 500]__1414259423/'
        w = ndict.loadz(dir + 'w_best.ndict.tar.gz')
        v = ndict.loadz(dir + 'v_best.ndict.tar.gz')
        ndict.set_value(model.w, w)
        ndict.set_value(model.v, v)

    # Some statistics for optimization
    ll_valid_stats = [-1e99, 0]

    # Progress hook
    def hook(epoch, t, ll):
        if epoch % 10 != 0:
            return
        ll_valid, _ = model.est_loglik(x_valid, n_samples=L_valid,
                                       n_batch=n_batch, byteToFloat=byteToFloat)
        # Log
        ndict.savez(ndict.get_value(model.v), logdir + 'v')
        ndict.savez(ndict.get_value(model.w), logdir + 'w')
        if ll_valid > ll_valid_stats[0]:
            ll_valid_stats[0] = ll_valid
            ll_valid_stats[1] = 0
            ndict.savez(ndict.get_value(model.v), logdir + 'v_best')
            ndict.savez(ndict.get_value(model.w), logdir + 'w_best')
        else:
            ll_valid_stats[1] += 1
            # Stop when validation performance has not improved for a long time
            if ll_valid_stats[1] > 1000:
                print("Finished")
                with open(logdir + 'hook.txt', 'a') as f:
                    print("Finished", file=f)
                exit()
        print(epoch, t, ll, ll_valid, ll_valid_stats)
        with open(logdir + 'hook.txt', 'a') as f:
            print(epoch, t, ll, ll_valid, ll_valid_stats, file=f)

        # Graphics
        if gfx and epoch % gfx_freq == 0:
            # tail = '.png'
            tail = '-' + str(epoch) + '.png'
            v = {i: model.v[i].get_value() for i in model.v}
            w = {i: model.w[i].get_value() for i in model.w}
            if ('pca' not in dataset and 'random' not in dataset
                    and 'normalized' not in dataset):
                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T), dim_input,
                                                     True, colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')
                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]), dim_input,
                                                 True, colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')
                if 'out_unif' in w:
                    image = paramgraphics.mat_to_img(f_dec(w['out_unif'].reshape((-1, 1))),
                                                     dim_input, True, colorImg=colorImg)
                    image.save(logdir + 'out_unif' + tail, 'PNG')
                if n_z == 2:
                    n_width = 10
                    import scipy.stats
                    z = {'z': np.zeros((2, n_width**2))}
                    for i in range(0, n_width):
                        for j in range(0, n_width):
                            z['z'][0, n_width * i + j] = scipy.stats.norm.ppf(
                                float(i) / n_width + 0.5 / n_width)
                            z['z'][1, n_width * i + j] = scipy.stats.norm.ppf(
                                float(j) / n_width + 0.5 / n_width)
                    x, _, _z = model.gen_xz({}, z, n_width**2)
                    if dataset == 'mnist':
                        x = 1 - _z['x']
                    image = paramgraphics.mat_to_img(f_dec(_z['x']), dim_input)
                    image.save(logdir + '2dmanifold' + tail, 'PNG')
                else:
                    _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                    x_samples = _z_confab['x']
                    image = paramgraphics.mat_to_img(f_dec(x_samples), dim_input,
                                                     colorImg=colorImg)
                    image.save(logdir + 'samples' + tail, 'PNG')
                    # x_samples = _x['x']
                    # image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                    # image.save(logdir+'samples2'+tail, 'PNG')
            else:
                # Model with preprocessing
                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T), dim_input,
                                                     True, colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')
                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]), dim_input,
                                                 True, colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')
                _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                x_samples = f_dec(_z_confab['x'])
                x_samples = np.minimum(np.maximum(x_samples, 0), 1)
                image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                image.save(logdir + 'samples' + tail, 'PNG')

    # Optimize
    # SFO
    dostep = epoch_vae_adam(model, x, n_batch=n_batch,
                            bernoulli_x=bernoulli_x, byteToFloat=byteToFloat)
    loop_va(dostep, hook)
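# Why scipy.stats.norm.ppf in the 2-D manifold plot above: pushing an evenly
# spaced grid on (0, 1) through the standard-normal quantile function places
# the latent codes at equal probability mass under the prior p(z) = N(0, I).
# A minimal standalone check of the same grid construction:
import numpy as np
import scipy.stats

n_width = 10
u = (np.arange(n_width) + 0.5) / n_width    # bin midpoints, same as i/n + 0.5/n
z_grid = scipy.stats.norm.ppf(u)            # equal-mass latent coordinates
assert np.allclose(z_grid, -z_grid[::-1])   # symmetric around 0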
_model = "./output/5"
_name_filter = ["KK201617T1", "KK201617T2"]
_words = []
_norm_dict = None
pca_components = None
model = None

with open(_model + "/preprocess.json", "r") as f:
    preprocess_dict = json.load(f)
    _words = preprocess_dict["words"]
    if "norm_info" in preprocess_dict:
        _norm_dict = preprocess_dict["norm_info"]
    if preprocess_dict["pca"]:
        pca_components = np.load(_model + '/pca.npy')

def get_label(sample):
    # return sample.think + sample.understand + sample.lang + sample.pres
    return sample.think + sample.understand

samples = preprocessing.tp_sample.get_samples(_sample_folder)
texts = [sample.comment for sample in samples if sample.batch_name in _name_filter]
test_matrix, _, _ = preprocessing.preprocess(texts, words_src=_words)
if pca_components is not None:
    test_matrix = np.matmul(test_matrix, pca_components.T)
if _norm_dict is not None:
    test_matrix, _, _ = preprocessing.normalize(test_matrix, norm_info=_norm_dict)

model = models.SVR.load(_model)
result = model.predict(test_matrix)
print([get_label(sample) for sample in samples])
print(result)
def main(n_z, n_hidden, dataset, seed, comment, gfx=True): # Initialize logdir import time pre_dir = 'models/gpulearn_z_x_mnist_96-(500, 500)' if os.environ.has_key('pretrain') and bool(int(os.environ['pretrain'])) == True: comment+='_pre-train' if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True: comment+='_prior' pre_dir+='_prior' if os.environ.has_key('cutoff'): comment+=('_'+str(int(os.environ['cutoff']))) if os.environ.has_key('train_residual') and bool(int(os.environ['train_residual'])) == True: comment+='_train-residual' pre_dir+='_train-residual' if os.environ.has_key('sigma_square'): comment+=('_'+str(float(os.environ['sigma_square']))) pre_dir+=('_'+str(float(os.environ['sigma_square']))) pre_dir+='/' logdir = 'results/gpulearn_z_x_'+dataset+'_'+str(n_z)+'-'+str(n_hidden)+comment+'_'+str(int(time.time()))+'/' if not os.path.exists(logdir): os.makedirs(logdir) print 'logdir:', logdir print 'gpulearn_z_x', n_z, n_hidden, dataset, seed with open(logdir+'hook.txt', 'a') as f: print >>f, 'learn_z_x', n_z, n_hidden, dataset, seed np.random.seed(seed) gfx_freq = 1 weight_decay = 0 # Init data if dataset == 'mnist': import anglepy.data.mnist as mnist # MNIST size = 28 train_x, train_y, valid_x, valid_y, test_x, test_y = mnist.load_numpy(size) f_enc, f_dec = pp.Identity() if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True: color.printBlue('Loading prior') mnist_prior = sio.loadmat('data/mnist_prior/mnist_prior.mat') train_mean_prior = mnist_prior['z_train'] test_mean_prior = mnist_prior['z_test'] valid_mean_prior = mnist_prior['z_valid'] else: train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) print '---------------------', type(train_x) x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} print '---------------------', type(x_train) L_valid = 1 dim_input = (size,size) n_x = size*size type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 50000 n_test = 10000 n_valid = 10000 n_batch = 1000 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'higgs': size = 28 f_enc, f_dec = pp.Identity() inputfile = 'data/higgs/HIGGS.csv' print 'loading file.' x = np.loadtxt(inputfile, dtype='f4', delimiter=',') print 'done.' y = x[:,0].reshape((-1,1)) x = x[:,1:] x = np.array(x, dtype='float32') y = np.array(y, dtype='float32') n_train = 10000000 n_valid = 500000 n_test = 500000 n_batch = 1000 derived_feat = 'all' if os.environ.has_key('derived_feat'): derived_feat = os.environ['derived_feat'] color.printBlue(derived_feat) if derived_feat == 'high': # Only the 7 high level features. x = x[:, 21:28] elif derived_feat == 'low': # Only the 21 raw features. 
x = x[:, 0:21] else: pass train_x = x[0:n_train, :].T y_train = y[0:n_train, :] valid_x = x[n_train:n_train+n_valid, :].T y_valid = y[n_train:n_train+n_valid, :] test_x = x[n_train+n_valid:n_train+n_valid+n_test, :].T y_test = y[n_train+n_valid:n_train+n_valid+n_test, :] n_y = 2 n_x = train_x.shape[0] train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'tanh' if os.environ.has_key('nonlinear'): nonlinear = os.environ['nonlinear'] color.printBlue(nonlinear) L_valid = 1 dim_input = (1,size) type_px = 'gaussian' colorImg = False bernoulli_x = False byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'cifar10': import anglepy.data.cifar10 as cifar10 size = 32 train_x, train_y, test_x, test_y = cifar10.load_numpy() train_x = train_x.astype(np.float32).T test_x = test_x.astype(np.float32).T ## f_enc, f_dec = pp.Identity() if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True: color.printBlue('Loading prior') cifar_prior = sio.loadmat('data/cifar10_prior/cifar10_prior.mat') train_mean_prior = cifar_prior['z_train'] test_mean_prior = cifar_prior['z_test'] else: train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} x_valid = x_test L_valid = 1 n_y = 10 dim_input = (size,size) n_x = x['x'].shape[0] type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'gaussian' if os.environ.has_key('type_px'): type_px = os.environ['type_px'] color.printBlue('Generative type: '+type_px) n_train = 50000 n_test = 10000 n_batch = 5000 colorImg = True bernoulli_x = False byteToFloat = False #weight_decay = float(n_batch)/n_train elif dataset == 'cifar10_zca': import anglepy.data.cifar10 as cifar10 size = 32 train_x, train_y, test_x, test_y = cifar10.load_numpy() train_x = train_x.astype(np.float32).T test_x = test_x.astype(np.float32).T ## f_enc, f_dec = pp.Identity() zca_mean, zca_w, zca_winv = cifar10.zca(train_x) train_x = zca_w.dot(train_x-zca_mean) test_x = zca_w.dot(test_x-zca_mean) if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True: color.printBlue('Loading prior') cifar_prior = sio.loadmat('data/cifar10_prior/cifar10_prior.mat') train_mean_prior = cifar_prior['z_train'] test_mean_prior = cifar_prior['z_test'] else: train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} x_valid = x_test L_valid = 1 dim_input = (size,size) n_y = 10 n_x = x['x'].shape[0] type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'gaussian' n_train = 50000 n_test = 10000 n_batch = 5000 colorImg = True bernoulli_x = False byteToFloat = False if os.environ.has_key('type_px'): type_px = 
os.environ['type_px'] color.printBlue('Generative type: '+type_px) nonlinear = 'softplus' elif dataset == 'mnist_basic': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'mnist_' tmp = sio.loadmat(data_dir+'train.mat') #color.printRed(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) ''' x = {'x': train_x.astype(np.float32), 'y': labelToMat(train_y).astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'y': labelToMat(valid_y).astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'y': labelToMat(test_y).astype(np.float32)} ''' x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'rectangle': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'rectangles_' tmp = sio.loadmat(data_dir+'train.mat') color.printRed(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,1000:] valid_y = train_y[1000:] train_x = train_x[:,:1000] train_y = train_y[:1000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) ''' x = {'x': train_x.astype(np.float32), 'y': labelToMat(train_y).astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'y': labelToMat(valid_y).astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'y': labelToMat(test_y).astype(np.float32)} ''' x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 2 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 1000 n_valid = 200 n_test = 50000 n_batch = 500 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train #print '3', n_x elif dataset == 'convex': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'convex_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T 
train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,6000:] valid_y = train_y[6000:] train_x = train_x[:,:6000] train_y = train_y[:6000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) ''' x = {'x': train_x.astype(np.float32), 'y': labelToMat(train_y).astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'y': labelToMat(valid_y).astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'y': labelToMat(test_y).astype(np.float32)} ''' x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 2 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 6000 n_valid = 2000 n_test = 50000 n_batch = 120 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'rectangle_image': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'rectangles_im_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) ''' x = {'x': train_x.astype(np.float32), 'y': labelToMat(train_y).astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'y': labelToMat(valid_y).astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'y': labelToMat(test_y).astype(np.float32)} ''' x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 2 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_rot': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'mnist_all_rotation_normalized_float_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print 
test_x.shape print test_y.shape train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) f_enc, f_dec = pp.Identity() x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_back_rand': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'mnist_background_random_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) f_enc, f_dec = pp.Identity() x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_back_image': # MNIST size = 28 data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'mnist_background_images_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) f_enc, f_dec = pp.Identity() x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_back_image_rot': # MNIST size = 28 
data_dir = os.environ['ML_DATA_PATH']+'/mnist_variations/'+'mnist_all_background_images_rotation_normalized_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T train_y = tmp['t_train'].T.astype(np.int32) # validation 2000 valid_x = train_x[:,10000:] valid_y = train_y[10000:] train_x = train_x[:,:10000] train_y = train_y[:10000] tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T test_y = tmp['t_test'].T.astype(np.int32) print train_x.shape print train_y.shape print test_x.shape print test_y.shape train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) f_enc, f_dec = pp.Identity() x = {'x': train_x.astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': valid_x.astype(np.float32), 'mean_prior': valid_mean_prior.astype(np.float32)} x_test = {'x': test_x.astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)} L_valid = 1 dim_input = (size,size) n_x = size*size n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 10000 n_valid = 2000 n_test = 50000 n_batch = 200 colorImg = False bernoulli_x = True byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_binarized': #import anglepy.data.mnist_binarized as mnist_binarized # MNIST import anglepy.data.mnist as mnist size = 28 data_dir = '/home/lichongxuan/regbayes2/data/mat_data/'+'binarized_mnist_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['x_train'].T #train_y = tmp['t_train'].T.astype(np.int32) tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['x_test'].T tmp = sio.loadmat(data_dir+'valid.mat') #print tmp.keys() valid_x = tmp['x_valid'].T #test_y = tmp['t_test'].T.astype(np.int32) f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) train_x = np.hstack((train_x, valid_x)).astype(np.float32) train_mean_prior = np.hstack((train_mean_prior,valid_mean_prior)).astype(np.float32) print train_mean_prior.shape print train_x.shape x = {'x': train_x.astype(np.float32), 'mean_prior':train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': test_x.astype(np.float32),'mean_prior':test_mean_prior.astype(np.float32)} x_test = x_valid L_valid = 1 dim_input = (28,28) n_x = 28*28 n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 60000 n_valid = 10000 n_batch = 1000 colorImg = False bernoulli_x = False byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'mnist_binarized_own': #import anglepy.data.mnist_binarized as mnist_binarized # MNIST import anglepy.data.mnist as mnist size = 28 data_dir = 'data/mnist_binarized_own/'+'binarized_mnist_' tmp = sio.loadmat(data_dir+'train.mat') train_x = tmp['train_x'].T #train_y = tmp['t_train'].T.astype(np.int32) tmp = sio.loadmat(data_dir+'test.mat') test_x = tmp['test_x'].T tmp = sio.loadmat(data_dir+'valid.mat') #print tmp.keys() valid_x = tmp['valid_x'].T #test_y = tmp['t_test'].T.astype(np.int32) f_enc, f_dec = pp.Identity() train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) valid_mean_prior = np.zeros((n_z,valid_x.shape[1])) train_x = np.hstack((train_x, valid_x)).astype(np.float32) train_mean_prior = np.hstack((train_mean_prior,valid_mean_prior)).astype(np.float32) print 
train_mean_prior.shape print train_x.shape x = {'x': train_x.astype(np.float32), 'mean_prior':train_mean_prior.astype(np.float32)} x_train = x x_valid = {'x': test_x.astype(np.float32),'mean_prior':test_mean_prior.astype(np.float32)} x_test = x_valid L_valid = 1 dim_input = (28,28) n_x = 28*28 n_y = 10 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' nonlinear = 'softplus' type_px = 'bernoulli' n_train = 60000 n_valid = 10000 n_batch = 1000 colorImg = False bernoulli_x = False byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'freyface': # Frey's face import anglepy.data.freyface as freyface n_train = 1600 train_x = freyface.load_numpy() np.random.shuffle(train_x) x = {'x': train_x.T[:,0:n_train]} x_valid = {'x': train_x.T[:,n_train:]} L_valid = 1 dim_input = (28,20) n_x = 20*28 type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' type_px = 'bounded01' nonlinear = 'tanh' #tanh works better with freyface #'softplus' n_batch = 100 colorImg = False bernoulli_x = False byteToFloat = False weight_decay = float(n_batch)/n_train elif dataset == 'freyface_pca': # Frey's face import anglepy.data.freyface as freyface n_train = 1600 train_x = freyface.load_numpy().T np.random.shuffle(train_x.T) f_enc, f_dec, _ = pp.PCA(train_x, 0.99) train_x = f_enc(train_x) x = {'x': train_x[:,0:n_train].astype(np.float32)} x_valid = {'x': train_x[:,n_train:].astype(np.float32)} L_valid = 1 dim_input = (28,20) n_x = train_x.shape[0] type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' type_px = 'gaussian' nonlinear = 'softplus' n_batch = 100 colorImg = False bernoulli_x = False byteToFloat = False elif dataset == 'freyface_bernoulli': # Frey's face import anglepy.data.freyface as freyface n_train = 1600 train_x = freyface.load_numpy().T np.random.shuffle(train_x.T) x = {'x': train_x[:,0:n_train].astype(np.float32)} x_valid = {'x': train_x[:,n_train:].astype(np.float32)} L_valid = 1 dim_input = (28,20) n_x = train_x.shape[0] type_pz = 'gaussianmarg' type_px = 'bernoulli' nonlinear = 'softplus' n_batch = 100 colorImg = False bernoulli_x = False byteToFloat = False elif dataset == 'norb_48_24300_pca': size = 48 train_x, train_y, test_x, test_y = np.load('data/norb/norb_48_24300.npy') _x = {'x': train_x, 'y': train_y} #ndict.shuffleCols(_x) #train_x = _x['x'] #train_y = _x['y'] #print _x['x'][:,:10000].shape # Do PCA print 'pca' f_enc, f_dec, pca_params = pp.PCA(_x['x'][:,:10000], cutoff=500, toFloat=False) ndict.savez(pca_params, logdir+'pca_params') print 'done' train_mean_prior = np.zeros((n_z,train_x.shape[1])) test_mean_prior = np.zeros((n_z,test_x.shape[1])) x = {'x': f_enc(train_x).astype(np.float32), 'mean_prior' : train_mean_prior.astype(np.float32)} x_valid = {'x': f_enc(test_x).astype(np.float32), 'mean_prior' : test_mean_prior.astype(np.float32)} x_test = {'x': f_enc(test_x).astype(np.float32), 'mean_prior' : test_mean_prior.astype(np.float32)} x_train = x print x['x'].shape print x['mean_prior'].shape L_valid = 1 n_y = 5 n_x = x['x'].shape[0] dim_input = (size,size) type_qz = 'gaussianmarg' type_pz = 'gaussianmarg' type_px = 'gaussian' nonlinear = 'softplus' n_batch = 900 #23400/900 = 27 colorImg = False #binarize = False bernoulli_x = False byteToFloat = False weight_decay= float(n_batch)/train_x.shape[1] elif dataset == 'norb': # small NORB dataset import anglepy.data.norb as norb size = 48 train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True) x = {'x': train_x.astype(np.float32)} x_valid = {'x': test_x.astype(np.float32)} L_valid = 1 n_x = train_x.shape[0] 
    dim_input = (size,size)
    type_qz = 'gaussianmarg'
    type_pz = 'gaussianmarg'
    type_px = 'gaussian'
    nonlinear = 'softplus'
    n_batch = 900  # 24300/900 = 27 minibatches per epoch
    colorImg = False
    byteToFloat = False
    bernoulli_x = False
    weight_decay = float(n_batch)/train_x.shape[1]

elif dataset == 'norb_pca':
    # small NORB dataset, PCA-preprocessed
    import anglepy.data.norb as norb
    size = 48
    train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True)
    f_enc, f_dec, _ = pp.PCA(train_x, 0.999)
    #f_enc, f_dec, _ = pp.normalize_random(train_x)
    train_x = f_enc(train_x)
    test_x = f_enc(test_x)
    x = {'x': train_x.astype(np.float32)}
    x_valid = {'x': test_x.astype(np.float32)}
    L_valid = 1
    n_x = train_x.shape[0]
    dim_input = (size,size)
    type_qz = 'gaussianmarg'
    type_pz = 'gaussianmarg'
    type_px = 'gaussian'
    nonlinear = 'softplus'
    n_batch = 900  # 24300/900 = 27 minibatches per epoch
    colorImg = False
    bernoulli_x = False
    byteToFloat = False
    weight_decay = float(n_batch)/train_x.shape[1]

elif dataset == 'norb_normalized':
    # small NORB dataset, normalized features
    import anglepy.data.norb as norb
    size = 48
    train_x, train_y, test_x, test_y = norb.load_resized(size, binarize_y=True)
    #f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
    #f_enc, f_dec, _ = pp.normalize_random(train_x)
    f_enc, f_dec, _ = pp.normalize(train_x)
    train_x = f_enc(train_x)
    test_x = f_enc(test_x)
    x = {'x': train_x.astype(np.float32)}
    x_valid = {'x': test_x.astype(np.float32)}
    L_valid = 1
    n_x = train_x.shape[0]
    dim_input = (size,size)
    type_qz = 'gaussianmarg'
    type_pz = 'gaussianmarg'
    type_px = 'gaussian'
    nonlinear = 'softplus'
    n_batch = 900  # 24300/900 = 27 minibatches per epoch
    colorImg = False
    bernoulli_x = False
    byteToFloat = False
    weight_decay = float(n_batch)/train_x.shape[1]

elif dataset == 'svhn':
    # SVHN dataset
    #import anglepy.data.svhn as svhn
    size = 32
    train_x, train_y, test_x, test_y = np.load('data/svhn/svhn.npy')
    #extra_x, extra_y = svhn.load_numpy_extra(False, binarize_y=True)
    #x = {'x': np.hstack((train_x, extra_x)), 'y': np.hstack((train_y, extra_y))}
    #ndict.shuffleCols(x)
    x = {'x': train_x, 'y': train_y}
    print 'Performing PCA, can take a few minutes...'
    cutoff = 300
    if os.environ.has_key('cutoff'):
        cutoff = int(os.environ['cutoff'])
    color.printBlue('cutoff: '+str(cutoff))
    f_enc, f_dec, pca_params = pp.PCA(x['x'][:,:10000], cutoff=cutoff, toFloat=True)
    ndict.savez(pca_params, logdir+'pca_params')
    print 'Done.'
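    # Hedged aside (not in the original script): the has_key lookups above are
    # Python 2 environment-variable overrides; an equivalent, more compact form
    # (same variable name assumed):
    #
    #   import os
    #   cutoff = int(os.environ.get('cutoff', 300))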
    n_y = 10
    if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True:
        color.printBlue('Loading prior')
        train_mean_prior, train_y1, test_mean_prior, test_y1 = np.load('data/svhn/svhn_prior.npy')
        # sanity check: the prior file should carry the same labels
        print np.sum((train_y1 == train_y).astype(np.int32))
        print np.sum((test_y1 == test_y).astype(np.int32))
    else:
        train_mean_prior = np.zeros((n_z,train_x.shape[1]))
        test_mean_prior = np.zeros((n_z,test_x.shape[1]))
    x = {'x': f_enc(x['x']).astype(np.float32), 'mean_prior': train_mean_prior.astype(np.float32)}
    x_train = x
    x_test = {'x': f_enc(test_x).astype(np.float32), 'mean_prior': test_mean_prior.astype(np.float32)}
    x_valid = x_test
    print x_train['x'].shape
    print x_test['x'].shape
    print train_y.shape
    print test_y.shape
    print x_train['mean_prior'].shape
    print x_test['mean_prior'].shape
    L_valid = 1
    n_x = x['x'].shape[0]
    dim_input = (size,size)
    n_batch = 5000
    n_train = 604388
    n_valid = 26032
    n_test = 26032
    colorImg = True
    bernoulli_x = False
    byteToFloat = False
    type_qz = 'gaussianmarg'
    type_pz = 'gaussianmarg'
    type_px = 'gaussian'
    nonlinear = 'softplus'

else:
    print 'invalid dataset'
    exit()

# Construct the model
from anglepy.models import GPUVAE_Z_X
learning_rate1 = 3e-4
if os.environ.has_key('stepsize'):
    learning_rate1 = float(os.environ['stepsize'])
color.printBlue(str(learning_rate1))
if os.environ.has_key('preoption'):
    pre = int(os.environ['preoption'])
    if pre == 1:
        updates = get_adam_optimizer(learning_rate=3e-4, decay1=0.9, decay2=0.999, weight_decay=0)
    elif pre == 2:
        updates = get_adam_optimizer(learning_rate=3e-4, decay1=0.9, decay2=0.999, weight_decay=weight_decay)
    else:
        raise Exception('unknown preoption')
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'preoption ' + str(pre)
else:
    updates = get_adam_optimizer(learning_rate=learning_rate1, weight_decay=weight_decay)
model = GPUVAE_Z_X(updates, n_x, n_hidden, n_z, n_hidden[::-1], nonlinear, nonlinear, type_px,
                   type_qz=type_qz, type_pz=type_pz, prior_sd=100, init_sd=1e-3)

if os.environ.has_key('pretrain') and bool(int(os.environ['pretrain'])) == True:
    # earlier pretrained-model directories, kept for reference:
    #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412689061/'
    #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412676966/'
    #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412695481/'
    #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412695455/'
    #dir = '/Users/dpkingma/results/gpulearn_z_x_svhn_pca_300-(500, 500)__1413904756/'
    if len(n_hidden) == 1:
        color.printBlue('pre-training-1-layer')
        layer_str = '-500'
    elif len(n_hidden) == 2:
        color.printBlue('pre-training-2-layers')
        layer_str = '-(500, 500)'
    else:
        raise Exception()
    pre_str = 'models/gpulearn_z_x_'
    if dataset == 'mnist':
        #dir = pre_str + 'mnist_'+str(n_z)+layer_str+'_longrun/'
        dir = 'models/mnist_z_x_50-500-500_longrun/'
    elif dataset == 'mnist_rot':
        dir = pre_str + 'mnist_rot_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'mnist_back_rand':
        dir = pre_str + 'mnist_back_rand_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'mnist_back_image':
        dir = pre_str + 'mnist_back_image_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'mnist_back_image_rot':
        dir = pre_str + 'mnist_back_image_rot_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'rectangle':
        dir = pre_str + 'rectangle_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'rectangle_image':
        dir = pre_str + 'rectangle_image_'+str(n_z)+layer_str+'_longrun/'
    elif dataset == 'convex':
        dir = pre_str + 'convex_'+str(n_z)+layer_str+'_longrun/'
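    # Hedged aside (not in the original script): get_adam_optimizer earlier in
    # this section is assumed to build Theano updates implementing Adam, with
    # decay1/decay2 as the moment decay rates. A plain-numpy sketch of a single
    # Adam step (illustrative only; the real optimizer works on shared variables):
    #
    #   import numpy as np
    #   def adam_step(theta, grad, m, v, t, lr=3e-4, b1=0.9, b2=0.999, eps=1e-8):
    #       m = b1*m + (1-b1)*grad                 # first-moment estimate
    #       v = b2*v + (1-b2)*grad**2              # second-moment estimate
    #       m_hat = m / (1 - b1**t)                # bias corrections
    #       v_hat = v / (1 - b2**t)
    #       return theta - lr * m_hat / (np.sqrt(v_hat) + eps), m, v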
    elif dataset == 'mnist_basic':
        dir = pre_str + 'mnist_basic_'+str(n_z)+layer_str+'_longrun/'

    if dataset == 'svhn':
        if os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True:
            print 'prior-------------------'
            pre_dir = 'results/gpulearn_z_x_svhn_'+str(n_z)+'-500-500_prior_'+str(cutoff)+'_longrun/'
        else:
            pre_dir = 'results/gpulearn_z_x_svhn_'+str(n_z)+'-500-500_'+str(cutoff)+'_longrun/'
        color.printBlue(pre_dir)
        w = ndict.loadz(pre_dir+'w_best.ndict.tar.gz')
        v = ndict.loadz(pre_dir+'v_best.ndict.tar.gz')
    elif n_z == 50:
        print 'n_z = 50', dir
        w = ndict.loadz(dir+'w_best.ndict.tar.gz')
        v = ndict.loadz(dir+'v_best.ndict.tar.gz')
    else:
        print 'n_z != 50'
        w = ndict.loadz(pre_dir+'w_best.ndict.tar.gz')
        v = ndict.loadz(pre_dir+'v_best.ndict.tar.gz')
    ndict.set_value2(model.w, w)
    ndict.set_value2(model.v, v)

# Some statistics for optimization
ll_valid_stats = [-1e99, 0]

# Progress hook, called once per epoch; does real work every 10 epochs
def hook(epoch, t, ll):
    if epoch%10 != 0:
        return
    n_batch_n = n_batch
    if n_batch_n > n_valid:
        n_batch_n = n_valid
    ll_valid, _ = model.est_loglik(x_valid, n_samples=L_valid, n_batch=n_batch_n, byteToFloat=byteToFloat)
    ll_test = ll_valid
    #if not dataset == 'mnist_binarized':
    if not dataset == 'svhn':
        ll_test, _ = model.est_loglik(x_test, n_samples=L_valid, n_batch=n_batch, byteToFloat=byteToFloat)

    # Log the current parameters
    ndict.savez(ndict.get_value(model.v), logdir+'v')
    ndict.savez(ndict.get_value(model.w), logdir+'w')

    def infer(data, n_batch=1000):
        # run the recognition model over `data` in minibatches, collecting all
        # hidden-layer activations plus the posterior means and log-variances
        size = data['x'].shape[1]
        res = np.zeros((sum(n_hidden), size))
        res1 = np.zeros((n_z,size))
        res2 = np.zeros((n_hidden[-1],size))
        res3 = np.zeros((n_z,size))
        for i in range(0, size, n_batch):
            idx_to = min(size, i+n_batch)
            x_batch = ndict.getCols(data, i, idx_to)  # may have bugs
            nn_batch = idx_to - i
            _x, _z, _z_confab = model.gen_xz(x_batch, {}, nn_batch)
            x_samples = _z_confab['x']
            for (hi, hidden) in enumerate(_z_confab['hidden']):
                res[sum(n_hidden[:hi]):sum(n_hidden[:hi+1]),i:i+nn_batch] = hidden
            res1[:,i:i+nn_batch] = _z_confab['mean']
            res2[:,i:i+nn_batch] = _z_confab['hidden'][-1]
            res3[:,i:i+nn_batch] = _z_confab['logvar']
        return res, res1, res2, res3

    #if not dataset == 'mnist_binarized':
    if not dataset == 'svhn':
        z_test, z_test1, z_test2, vv_test = infer(x_test)
        z_train, z_train1, z_train2, vv_train = infer(x_train)

    if ll_valid > ll_valid_stats[0]:
        ll_valid_stats[0] = ll_valid
        ll_valid_stats[1] = 0
        ndict.savez(ndict.get_value(model.v), logdir+'v_best')
        ndict.savez(ndict.get_value(model.w), logdir+'w_best')
        #if not dataset == 'mnist_binarized':
        if dataset == 'svhn':
            pass
            #np.save(logdir+'full_latent', {'z_test': z_test, 'train_y': train_y, 'test_y': test_y, 'z_train': z_train})
            #np.save(logdir+'last_latent', {'z_test': z_test2, 'train_y': train_y, 'test_y': test_y, 'z_train': z_train2})
        else:
            sio.savemat(logdir+'full_latent.mat', {'z_test': z_test, 'train_y': train_y, 'test_y': test_y, 'z_train': z_train})
            sio.savemat(logdir+'mean_latent.mat', {'z_test': z_test1, 'train_y': train_y, 'test_y': test_y, 'z_train': z_train1})
            sio.savemat(logdir+'last_latent.mat', {'z_test': z_test2, 'train_y': train_y, 'test_y': test_y, 'z_train': z_train2})
    else:
        ll_valid_stats[1] += 1
        # Stop after 1000 consecutive validation checks without improvement
        if ll_valid_stats[1] > 1000:
            print "Finished"
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, "Finished"
            exit()

    print epoch, t, ll, ll_valid, ll_test, ll_valid_stats
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, epoch, t, ll, ll_valid, ll_test, ll_valid_stats
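    # Hedged aside (not in the original script): ll_valid_stats is a simple
    # patience counter: [best validation log-likelihood, checks since it last
    # improved], with the hook doing real work every 10 epochs. The same
    # pattern in miniature:
    #
    #   best, stale = -1e99, 0
    #   for ll_valid in validation_scores:          # hypothetical iterable
    #       if ll_valid > best:
    #           best, stale = ll_valid, 0           # save a checkpoint here
    #       else:
    #           stale += 1
    #           if stale > 1000:
    #               break                           # stop training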
    '''
    if dataset != 'svhn':
        l_t, px_t, pz_t, qz_t = model.test(x_train, n_samples=1, n_batch=n_batch, byteToFloat=byteToFloat)
        print 'Elogpx', px_t, 'Elogpz', pz_t, '-Elogqz', qz_t
        #sigma_square = float(os.environ['sigma_square'])
        print 'var', np.mean(np.exp(vv_train)), 'q', np.mean(np.abs(z_train1)), 'p', np.mean(np.abs(train_mean_prior)), 'd', np.mean(np.abs(z_train1-train_mean_prior))
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, 'Elogpx', px_t, 'Elogpz', pz_t, '-Elogqz', qz_t
            print >>f, 'var', np.mean(np.exp(vv_train)), 'q', np.mean(np.abs(z_train1)), 'p', np.mean(np.abs(train_mean_prior)), 'd', np.mean(np.abs(z_train1-train_mean_prior))
    '''

    # Graphics
    if gfx and epoch%gfx_freq == 0:
        #tail = '.png'
        tail = '-'+str(epoch)+'.png'
        v = {i: model.v[i].get_value() for i in model.v}
        w = {i: model.w[i].get_value() for i in model.w}
        if 'pca' not in dataset and 'random' not in dataset and 'normalized' not in dataset and 'zca' not in dataset:
            # raw-pixel models: visualize the weights directly
            if 'w0' in v:
                image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T), dim_input, True, colorImg=colorImg)
                image.save(logdir+'q_w0'+tail, 'PNG')
            image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]), dim_input, True, colorImg=colorImg)
            image.save(logdir+'out_w'+tail, 'PNG')
            if 'out_unif' in w:
                image = paramgraphics.mat_to_img(f_dec(w['out_unif'].reshape((-1,1))), dim_input, True, colorImg=colorImg)
                image.save(logdir+'out_unif'+tail, 'PNG')
            if n_z == 2:
                # lay out a grid over the 2D latent space via Gaussian quantiles
                n_width = 10
                import scipy.stats
                z = {'z': np.zeros((2,n_width**2))}
                for i in range(0,n_width):
                    for j in range(0,n_width):
                        z['z'][0,n_width*i+j] = scipy.stats.norm.ppf(float(i)/n_width+0.5/n_width)
                        z['z'][1,n_width*i+j] = scipy.stats.norm.ppf(float(j)/n_width+0.5/n_width)
                x, _, _z = model.gen_xz({}, z, n_width**2)
                if dataset == 'mnist':
                    x = 1 - _z['x']
                image = paramgraphics.mat_to_img(f_dec(_z['x']), dim_input)
                image.save(logdir+'2dmanifold'+tail, 'PNG')
            else:
                if 'norb' in dataset or dataset=='svhn':
                    nn_batch_nn = 64
                else:
                    nn_batch_nn = 144
                if not(os.environ.has_key('train_residual') and bool(int(os.environ['train_residual'])) == True) and (os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True):
                    # sample using loaded cluster means as the prior means
                    mp_in = np.random.randint(0,x_train['mean_prior'].shape[1],nn_batch_nn)
                    m_p = x_train['mean_prior'][:,mp_in]
                    s_s = 1
                    if os.environ.has_key('sigma_square'):
                        s_s = float(os.environ['sigma_square'])
                    x_samples = model.gen_xz_prior({}, {}, m_p, s_s, n_batch=nn_batch_nn)
                    x_samples = x_samples['x']
                    m_p1 = (np.ones((n_z, nn_batch_nn)).T * np.mean(x_train['mean_prior'], axis=1)).T
                    x_samples1 = model.gen_xz_prior({}, {}, m_p1.astype(np.float32), s_s, n_batch=nn_batch_nn)
                    image = paramgraphics.mat_to_img(f_dec(x_samples1['x']), dim_input, colorImg=colorImg)
                    image.save(logdir+'mean_samples-prior'+tail, 'PNG')
                    x_samples11 = model.gen_xz_prior11({}, {}, m_p, s_s, n_batch=nn_batch_nn)
                    image = paramgraphics.mat_to_img(f_dec(x_samples11['x']), dim_input, colorImg=colorImg)
                    image.save(logdir+'prior-image'+tail, 'PNG')
                else:
                    _x, _, _z_confab = model.gen_xz({}, {}, n_batch=nn_batch_nn)
                    x_samples = _z_confab['x']
                    image = paramgraphics.mat_to_img(f_dec(x_samples), dim_input, colorImg=colorImg)
                    image.save(logdir+'samples-prior'+tail, 'PNG')
                    #x_samples = _x['x']
                    #image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                    #image.save(logdir+'samples2'+tail, 'PNG')
        else:
            # models with preprocessing (PCA/ZCA/normalization): decode first
            if 'w0' in v:
                tmp = f_dec(v['w0'][:].T)
                if 'zca' in dataset or dataset=='svhn':
                    tmp = zca_dec(zca_mean, zca_winv, tmp)
                image = paramgraphics.mat_to_img(tmp, dim_input, True, colorImg=colorImg)
                image.save(logdir+'q_w0'+tail, 'PNG')
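            # Hedged aside (not in the original script): the n_z == 2 branch
            # above builds its manifold grid by pushing evenly spaced quantiles
            # through the inverse standard-normal CDF, so the grid covers the
            # prior's probability mass uniformly. Standalone sketch:
            #
            #   import numpy as np
            #   import scipy.stats
            #   n_width = 10
            #   q = (np.arange(n_width) + 0.5) / n_width   # quantiles in (0,1)
            #   grid = scipy.stats.norm.ppf(q)             # latent coordinates
            #   zz = np.array([[a, b] for a in grid for b in grid]).T  # (2, 100)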
            tmp = f_dec(w['out_w'][:])
            if 'zca' in dataset:
                tmp = zca_dec(zca_mean, zca_winv, tmp)
            image = paramgraphics.mat_to_img(tmp, dim_input, True, colorImg=colorImg)
            image.save(logdir+'out_w'+tail, 'PNG')
            if dataset == 'svhn':
                nn_batch_nn = 64
            else:
                nn_batch_nn = 144
            if not(os.environ.has_key('train_residual') and bool(int(os.environ['train_residual'])) == True) and (os.environ.has_key('prior') and bool(int(os.environ['prior'])) == True):
                mp_in = np.random.randint(0,x_train['mean_prior'].shape[1],nn_batch_nn)
                m_p = x_train['mean_prior'][:,mp_in]
                s_s = 1
                if os.environ.has_key('sigma_square'):
                    s_s = float(os.environ['sigma_square'])
                x_samples = model.gen_xz_prior({}, {}, m_p, s_s, n_batch=nn_batch_nn)
                x_samples = zca_dec(zca_mean, zca_winv, x_samples['x'])
                x_samples = np.minimum(np.maximum(x_samples, 0), 1)  # clip to [0,1]
                x_samples11 = model.gen_xz_prior11({}, {}, m_p, s_s, n_batch=nn_batch_nn)
                x_samples11 = zca_dec(zca_mean, zca_winv, x_samples11['x'])
                x_samples11 = np.minimum(np.maximum(x_samples11, 0), 1)
                image = paramgraphics.mat_to_img(x_samples11, dim_input, colorImg=colorImg)
                image.save(logdir+'prior-image'+tail, 'PNG')
            else:
                _x, _z, _z_confab = model.gen_xz({}, {}, n_batch=nn_batch_nn)
                x_samples = f_dec(_z_confab['x'])
                x_samples = np.minimum(np.maximum(x_samples, 0), 1)  # clip to [0,1]
                image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                image.save(logdir+'samples'+tail, 'PNG')

# Optimize
dostep = epoch_vae_adam(model, x, n_batch=n_batch, bernoulli_x=bernoulli_x, byteToFloat=byteToFloat)
loop_va(dostep, hook)
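# Hedged aside (not in the original script): epoch_vae_adam and loop_va are
# assumed to follow the usual driver split: dostep() runs one epoch of
# minibatch updates and returns the training objective, and the loop times
# each epoch and reports to hook(epoch, t, ll). A generic sketch of that
# pattern:
#
#   import time
#   def loop(dostep, hook, max_epochs=int(1e7)):
#       t0 = time.time()
#       for epoch in xrange(1, max_epochs+1):
#           ll = dostep()
#           hook(epoch, time.time()-t0, ll)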