def convert_pool_matrices(pool_input, word2ind):
    """Converts a dictionary of pooled captions/questions into matrices.

    Args:
        pool_input: Dictionary of pooled captions/questions
        word2ind: Dictionary of word -> vocabulary index conversion.

    Returns:
        item_tokens: Items in the pool tokenized and converted into a matrix.
        item_lens: Length of items in the matrix.
    """
    unk_token = word2ind["<unk>"]

    def tokenizer(x):
        return [word2ind.get(ii, unk_token) for ii in word_tokenize(x.lower())]

    if isinstance(pool_input, dict):
        pool_list = sorted(pool_input, key=lambda x: pool_input[x])
    else:
        pool_list = pool_input
    tokenized_items = [tokenizer(item) for item in progressbar(pool_list)]
    max_item_len = max(len(ii) for ii in tokenized_items)
    item_tokens = np.zeros((len(tokenized_items), max_item_len)).astype("int32")
    item_tokens.fill(word2ind["<pad>"])
    item_lens = np.zeros(len(tokenized_items)).astype("int32")
    for item_id, tokens in progressbar(enumerate(tokenized_items)):
        item_lens[item_id] = len(tokens)
        item_tokens[item_id, :item_lens[item_id]] = np.array(tokens)
    return item_tokens, item_lens
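# Hypothetical usage sketch (not from the original module): exercises
# convert_pool_matrices with a toy vocabulary. Assumes nltk's word_tokenize,
# numpy, and a tqdm-style progressbar are imported as in the function above.
def _demo_convert_pool_matrices():
    word2ind = {"<pad>": 0, "<unk>": 1, "what": 2, "color": 3,
                "is": 4, "it": 5, "?": 6}
    pool = ["What color is it ?", "Is it red ?"]
    tokens, lens = convert_pool_matrices(pool, word2ind)
    # "red" is out of vocabulary, so it maps to <unk>; the shorter row is
    # right-padded with <pad>.
    print(tokens.shape, lens)  # -> (2, 5) [5 4]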
def convert_pool_matrices_pretrained_tokenizer(pool_input, pretrained_tokenizer):
    """Converts a dictionary of pooled captions/questions into matrices.

    Args:
        pool_input: Dictionary of pooled captions/questions
        pretrained_tokenizer: Huggingface tokenizer for pretrained models.

    Returns:
        item_tokens: Items in the pool tokenized and converted into a matrix.
        item_lens: Length of items in the matrix.
    """
    def tokenizer(x):
        return pretrained_tokenizer.encode(x, add_special_tokens=True)

    if isinstance(pool_input, dict):
        pool_list = sorted(pool_input, key=lambda x: pool_input[x])
    else:
        pool_list = pool_input
    tokenized_items = [tokenizer(item) for item in progressbar(pool_list)]
    max_item_len = max(len(ii) for ii in tokenized_items)
    item_tokens = np.zeros((len(tokenized_items), max_item_len)).astype("int32")
    item_tokens.fill(pretrained_tokenizer.pad_token_id)
    item_lens = np.zeros(len(tokenized_items)).astype("int32")
    for item_id, tokens in progressbar(enumerate(tokenized_items)):
        item_lens[item_id] = len(tokens)
        item_tokens[item_id, :item_lens[item_id]] = np.array(tokens)
    return item_tokens, item_lens
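# Hypothetical usage sketch (an assumption, not part of the original module):
# the same conversion driven by a Hugging Face tokenizer. encode(...,
# add_special_tokens=True) and pad_token_id are standard parts of the
# `transformers` tokenizer API.
def _demo_convert_pool_matrices_pretrained():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    pool = ["What color is it?", "Is it red?"]
    tokens, lens = convert_pool_matrices_pretrained_tokenizer(pool, tokenizer)
    # Rows now include [CLS]/[SEP] ids and are padded with tokenizer.pad_token_id.
    print(tokens.shape, lens)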
def check_progress(self):
    """Helper metrics to check overall progression."""
    self._step_progressbar = None
    self._sub_progressbars = dict()
    if self._queues is not None:
        total_bar = len(self._succeeded_steps) + len(self._failed_steps)
        sub_bars = dict()
        self._step_progressbar = progressbar(total=self._num_steps,
                                             desc='__Total__',
                                             initial=total_bar,
                                             postfix=None, position=0)
        for i, priority in enumerate(self._queues.keys()):
            if len(self._queues_labels) != 0:
                label = 'Step::{}-{}'.format(self._queues_labels[priority],
                                             priority)
            else:
                label = 'priority::{}'.format(str(priority + 1).zfill(3))
            if priority in self._succeeded_workers.keys():
                sub_bar = len(self._succeeded_workers[priority])
            else:
                sub_bar = 0
            self._sub_progressbars[priority] = progressbar(
                total=len(self._queues[priority]), desc=label,
                initial=sub_bar, position=1 + i)
            sub_bars[priority] = sub_bar

        def workon(n_finished_steps, n_sub_tasks):
            # Poll the step/worker counters and push deltas to the bars
            # until every step has finished.
            while n_finished_steps < self._num_steps:
                cur_finished_steps = (len(self._succeeded_steps) +
                                      len(self._failed_steps))
                step_delta = cur_finished_steps - n_finished_steps
                if step_delta > 0:
                    n_finished_steps += step_delta
                    self._step_progressbar.update(step_delta)
                for p in self._queues.keys():
                    if p in self._succeeded_workers.keys():
                        sub_delta = len(self._succeeded_workers[p]) - n_sub_tasks[p]
                        if sub_delta > 0:
                            n_sub_tasks[p] += sub_delta
                            self._sub_progressbars[p].update(sub_delta)
                time.sleep(0.2)
            self._step_progressbar.close()
            for p in self._queues.keys():
                self._sub_progressbars[p].close()

        import threading
        thread = threading.Thread(target=workon, args=(total_bar, sub_bars))
        thread.daemon = True
        if notebook_env:
            display(self._step_progressbar)
            for p in self._queues.keys():
                display(self._sub_progressbars[p])
            thread.start()
        else:
            thread.start()
    else:
        print('[No scheduled jobs]')
def random_search(self, iterations, config, train_epochs, transformations,
                  verbose=True, result_path="../data/results.csv"):
    self._searching = True
    all_scores = []
    for _ in progressbar(range(iterations)):
        if self._kill:
            print("Will exit now because of signal!")
            break
        current_config = self._get_random_config(config)
        current_score = self.train_one_configuration(
            current_config, train_epochs, transformations)
        current_score.extend(
            [str(current_config), str(hash(str(current_config)))])
        all_scores.append(current_score)
        if verbose:
            print(all_scores)
        self._save_results(all_scores, result_path)
    self._searching = False
    return all_scores
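# A minimal sketch of what _get_random_config could look like (hypothetical;
# the real helper is defined elsewhere in this class): draw one value per
# hyperparameter from a dict of candidate lists.
import random

def _get_random_config_sketch(config_space):
    """config_space: dict of param name -> list of candidate values."""
    return {name: random.choice(values) for name, values in config_space.items()}

# Example: _get_random_config_sketch({"lr": [1e-2, 1e-3], "dropout": [0.1, 0.5]})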
def evalRandomModelAllEigenvalues():
    SIZE = 10
    N_MODELS = 100
    N_STEPS = 500
    rmse = []
    # np.int was removed in modern NumPy; plain int is equivalent here
    n_evals_list = np.arange(SIZE, 0, -1, dtype=int)
    for n_model in progressbar(range(N_MODELS)):
        # a = np.random.normal(0.0, 1.0, (SIZE, SIZE))
        a = np.diag(np.random.normal(0, 1, SIZE))
        rmse.append(evalModelAllEigenvalues(a, N_STEPS))

    mpl.style.use('seaborn')
    stats = [[np.abs(model[n_eval]) for n_eval in model] for model in rmse]
    # print(rmse)
    mean_error = np.mean(np.mean(stats, axis=2), axis=0)
    std_error = np.mean(np.std(stats, axis=2), axis=0)

    fig, ax = plt.subplots()
    ax.plot(n_evals_list, mean_error)
    ax.fill_between(n_evals_list, mean_error - std_error,
                    mean_error + std_error, facecolor='#a9cce3')
    ax.set_xlabel("# Eigenvalues used")
    ax.set_ylabel("RMSE to original model")
    plt.show()
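# Hypothetical sketch of the per-model evaluation used above (the real
# evalModelAllEigenvalues is defined elsewhere): reconstruct `a` from its
# k largest-magnitude eigenvalues and record, for each k, the per-step RMSE
# between trajectories of the truncated and full linear systems x_{t+1} = A x_t.
def eval_model_all_eigenvalues_sketch(a, n_steps):
    size = a.shape[0]
    evals, evecs = np.linalg.eig(a)
    order = np.argsort(-np.abs(evals))  # largest magnitude first
    x0 = np.random.normal(0.0, 1.0, size)
    rmse_per_k = {}
    for k in range(size, 0, -1):
        keep = order[:k]
        # rank-k approximation: V_k diag(lambda_k) pinv(V_k)
        a_k = (evecs[:, keep] * evals[keep]) @ np.linalg.pinv(evecs[:, keep])
        x_full, x_trunc, errs = x0.copy(), x0.copy(), []
        for _ in range(n_steps):
            x_full, x_trunc = a @ x_full, a_k @ x_trunc
            errs.append(np.sqrt(np.mean(np.abs(x_full - x_trunc) ** 2)))
        rmse_per_k[k] = np.array(errs)
    return rmse_per_k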
def fit(self, optim, loss_fn, data_loader, validation_data_loader,
        num_epochs, logger):
    best_loss = float("inf")
    for e in progressbar(range(num_epochs)):
        self._epoch = e
        iter_per_epoch = len(data_loader)
        data_iter = iter(data_loader)
        for i in range(iter_per_epoch):
            inputs, labels = self._get_inputs(data_iter)
            predictions, classes = self.predict(inputs, return_classes=True)
            optim.zero_grad()
            loss = loss_fn(predictions, labels)
            loss.backward()
            optim.step()
            self._accumulate_results(
                self.to_np(labels).squeeze(), classes,
                loss=loss.item(),  # loss.data[0] was removed after PyTorch 0.3
                probs=self.to_np(predictions).squeeze())
        stats = self.evaluate(logger, validation_data_loader, loss_fn,
                              switch_to_eval=True)
        is_best = stats["val_loss"] < best_loss
        best_loss = min(best_loss, stats["val_loss"])
        model_path = ProjectConfig.combine(
            ProjectConfig.model_directory,
            "%s_%s_fold_%s.mdl" % (self.model_name, str(e + 1),
                                   self.fold_number))
        self.save(model_path, optim, is_best, scores=stats)
    return best_loss
def get_neuron_ordering_granular(model, class_to_idx, granularity=50,
                                 search_stride=100):
    weights = list(model.parameters())[0].data.cpu()
    num_neurons = weights.numpy().shape[1]
    neuron_orderings = [
        get_top_neurons(model, p / search_stride, class_to_idx)[0]
        for p in progressbar(range(search_stride + 1))
    ]
    sliding_idx = 0
    considered_neurons = set()
    ordering = []
    cutoffs = []
    for i in range(0, num_neurons + 1, granularity):
        while len(neuron_orderings[sliding_idx]) < i:
            sliding_idx = sliding_idx + 1
        new_neurons = set(neuron_orderings[sliding_idx]).difference(
            considered_neurons)
        if len(new_neurons) != 0:
            ordering = ordering + list(new_neurons)
            considered_neurons = considered_neurons.union(new_neurons)
            cutoffs.append(len(ordering))
    return ordering, cutoffs
def save_mean_std_image(FLAGS):
    """Compute and save mean and std image from train images.

    Args:
        FLAGS: Commandline arguments
    """
    image_list = os.listdir(os.path.join(FLAGS.image_root, 'train'))

    # compute the mean of the train images and save
    mean_img = None
    std_img = None
    for image_name in progressbar(image_list):
        image_path = os.path.join(FLAGS.image_root, 'train', image_name)
        image = support.load_image(image_path)
        if mean_img is None:
            mean_img = image.copy()  # copy so += below cannot alias the image
            std_img = image ** 2
        else:
            mean_img += image
            std_img += image ** 2
    mean_img = mean_img / len(image_list)
    std_img = std_img / len(image_list)
    mean_img = np.mean(np.mean(mean_img, 0), 0)
    std_img = np.mean(np.mean(std_img, 0), 0)
    std_img = np.sqrt(std_img - mean_img ** 2)
    print('Saving mean and std at: %s' % FLAGS.mean_save_path)
    np.save(FLAGS.mean_save_path, {'mean_img': mean_img, 'std_img': std_img})
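# Hypothetical counterpart (not in the original file): load the saved stats
# and normalize an image with them, mirroring how the dataloader below
# applies (image - mean) / std. allow_pickle=True is needed in modern NumPy
# because the stats were saved as a dict.
def normalize_with_saved_stats(image, mean_save_path):
    stats = np.load(mean_save_path, allow_pickle=True)[()]
    # per-channel stats broadcast over an H x W x C image
    return (image - stats['mean_img']) / stats['std_img']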
def map_embarrassingly_parallel(input_list, mapper, project, n_jobs=-1,
                                batch_size=-1, checkpoint=False, cleanup=True,
                                **kwargs):
    """
    Process items in a list in parallel (optionally, one smaller batch at a time).

    Args:
        input_list: An input object that has a list-like interface (indexing and slicing).
        mapper: A function to apply to each item of the input list.
        project: An instance of a pygoose project.
        n_jobs: The number of parallel processing jobs. -1 will use the number of CPUs on the system.
        batch_size: The maximum number of input items in each batch. -1 will store all data as a single batch.
        checkpoint: Whether to save each batch and its corresponding output to disk.
        cleanup: Whether to remove the batch checkpoints from the disk after all batches are processed.
        **kwargs: Additional keyword arguments to joblib.Parallel.

    Returns:
        A list representing the combined output from the mapper function called on all input items.
    """
    if batch_size < 0:
        batch_size = len(input_list)

    # Partition the data.
    job_id = _create_job_id()
    print('Creating job ID:', job_id)
    batch_storage_dir = os.path.join(project.temp_dir, job_id)
    batches = split_into_batches(input_list, batch_size, batch_storage_dir,
                                 checkpoint)

    # The results will be collected here.
    # TODO: collecting lists like this may be memory-inefficient.
    # Perhaps we could use another callback function.
    combined_results = []

    # Process data one batch at a time.
    for batch in batches:
        description = 'Batch {}/{}'.format(batch['index'] + 1, len(batches))

        # Process each item in the batch in parallel.
        batch_result = Parallel(n_jobs=n_jobs, **kwargs)(
            delayed(mapper)(input_item)
            for input_item in progressbar(
                batch['data'],
                desc=description,
                total=len(batch['data']),
                file=sys.stdout,
            )
        )

        if checkpoint:
            save(batch_result, batch['result_filename'])
        combined_results.extend(batch_result)

    # Remove the temporary files.
    if checkpoint and cleanup:
        shutil.rmtree(batch_storage_dir)

    return combined_results
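# Hypothetical usage sketch (not from the original module): run a toy mapper
# over 10k ints in batches of 2k. types.SimpleNamespace stands in for a
# pygoose project, which only needs a temp_dir attribute here. The mapper
# must be a module-level (picklable) function for joblib.
def _square(x):
    return x * x

def _demo_map_embarrassingly_parallel():
    import types
    project = types.SimpleNamespace(temp_dir='/tmp')
    results = map_embarrassingly_parallel(
        list(range(10000)), _square, project, n_jobs=2, batch_size=2000)
    assert results == [x * x for x in range(10000)]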
def _test_set():
    ds = IcebergDataset("../data/orig/test.json", im_dir="../data/vis/test",
                        inference_only=True, mu_sigma=None,
                        colormap="inferno", add_feature_planes="complex")
    for i in progressbar(range(len(ds))):
        # print(i, ds[i]["inputs"].size(), ds[i]["id"])
        ds.vis(i, average=False, prefix="pure_")
        if i == 3:
            break
    loader = DataLoader(ds, batch_size=6, shuffle=False, num_workers=1)
    for i, batch in enumerate(loader):
        print(i, batch["inputs"].size(), batch["id"])
        if i == 3:
            break
def get_neuron_ordering(model, class_to_idx, search_stride=100):
    neuron_orderings = [
        get_top_neurons(model, p / search_stride, class_to_idx)[0]
        for p in progressbar(range(search_stride + 1))
    ]
    considered_neurons = set()
    ordering = []
    cutoffs = []
    for local_ordering in neuron_orderings:
        local_ordering = list(local_ordering)
        new_neurons = set(local_ordering).difference(considered_neurons)
        ordering = ordering + list(new_neurons)
        considered_neurons = considered_neurons.union(new_neurons)
        cutoffs.append(len(ordering))
    return ordering, cutoffs
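# A hypothetical sketch of what get_top_neurons might do (the real function
# is defined elsewhere in this repo, and may differ): for each class, keep
# the smallest set of neurons whose absolute classifier weights account for
# a fraction `percentage` of that class's total weight mass.
def get_top_neurons_sketch(model, percentage, class_to_idx):
    weights = np.abs(list(model.parameters())[0].data.cpu().numpy())
    top_neurons_per_class = {}
    for cls, idx in class_to_idx.items():
        w = weights[idx]
        order = np.argsort(-w)                      # heaviest neurons first
        cum = np.cumsum(w[order]) / w.sum()         # cumulative weight mass
        cutoff = np.searchsorted(cum, percentage) + 1
        top_neurons_per_class[cls] = order[:cutoff]
    all_top = np.unique(np.concatenate(list(top_neurons_per_class.values())))
    return all_top, top_neurons_per_class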
def map_batch_parallel(input_list, batch_size, item_mapper=None,
                       batch_mapper=None, flatten=True, n_jobs=-1, **kwargs):
    """
    Split the data into batches and process each batch in its own thread.

    Args:
        input_list: An input object that has a list-like interface (indexing and slicing).
        batch_size: The maximum number of input items in each batch. -1 will store all data as a single batch.
        item_mapper: (optional) A function to apply to each item in the batch.
        batch_mapper: (optional) A function to apply to each batch. Either item_mapper or batch_mapper must be set.
        flatten: Whether to unwrap individual batch results or keep them grouped by batch.
        n_jobs: The number of parallel processing jobs. -1 will use the number of CPUs on the system.
        **kwargs: Additional keyword arguments to joblib.Parallel.

    Returns:
        A list representing the combined output from the mapper function called on all input items of each batch.
    """
    # We must specify either how to process each batch or how to process each item.
    if item_mapper is None and batch_mapper is None:
        raise ValueError('You should specify either batch_mapper or item_mapper.')
    if batch_mapper is None:
        batch_mapper = _default_batch_mapper

    batches = split_into_batches(input_list, batch_size, batch_storage_dir='')
    all_batch_results = Parallel(n_jobs=n_jobs, **kwargs)(
        delayed(batch_mapper)(batch['data'], item_mapper)
        for batch in progressbar(
            batches,
            desc='Batches',
            total=len(batches),
            file=sys.stdout,
        )
    )

    # Unwrap the individual batch results if necessary.
    if flatten:
        final_result = []
        for batch_result in all_batch_results:
            final_result.extend(batch_result)
    else:
        final_result = all_batch_results

    return final_result
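# Hypothetical usage sketch: add one to 10k ints, 1k per batch, two worker
# processes. As above, the item mapper must be module-level so joblib can
# pickle it.
def _add_one(x):
    return x + 1

def _demo_map_batch_parallel():
    results = map_batch_parallel(
        list(range(10000)), batch_size=1000, item_mapper=_add_one, n_jobs=2)
    assert results == [x + 1 for x in range(10000)]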
def extract_from_indices_file():
    base_output_dir = (
        "/mnt/data/tiny_images/py-tiny-image-access/loaded_images/cifar100"
    )
    # the first 50,000 indices are for training
    train_dir = base_output_dir + "/train"
    # the last 10,000 indices are for testing
    test_dir = base_output_dir + "/test"
    tinyimage.openTinyImage()
    cifar100_indices = get_indices()
    for i, index in enumerate(progressbar(cifar100_indices)):
        if i < 50000:
            output_dir = train_dir
        else:
            output_dir = test_dir
        meta = tinyimage.getMetaData(index)
        tinyimage.sliceToImage(tinyimage.sliceToBin(index),
                               output_dir + "/" + meta[1])
    tinyimage.closeTinyImage()
def main(args):
    # reading data
    print('Reading from: ' + args.data_file)
    with open(args.data_file, 'r') as file_id:
        data = json.load(file_id)

    # open a text file to write the questions
    save_path = args.data_file.replace('.json', '_ques_flat.txt')
    print('Saving to: ' + save_path)
    with open(save_path, 'w') as file_id:
        for ques in progressbar(data['data']['questions']):
            file_id.write(clean_non_ascii(ques) + ' ?\n')

    # open a text file to write the captions
    save_path = args.data_file.replace('.json', '_cap_flat.txt')
    print('Saving to: ' + save_path)
    with open(save_path, 'w') as file_id:
        captions = [ii['caption'] for ii in data['data']['dialogs']]
        for cap in captions:
            file_id.write(clean_non_ascii(cap) + ' .\n')
def save_vocabularies(train_examples, FLAGS):
    """Extract and save vocabularies for questions and answers.

    Args:
        train_examples: Training examples
        FLAGS: Commandline arguments with vocab_save_path and answers_save_path

    Saves:
        words: Vocabulary (dictionary) extracted from the questions
        ans_list: List of possible answers, extracted from train set
    """
    words = {}
    ans_list = {}
    for datum in progressbar(train_examples):
        for ques_datum in datum['qa']:
            token = ques_datum['answer'].lower()
            words[token] = words.get(token, 0) + 1
            ans_list[token] = 1
            for token in word_tokenize(ques_datum['question']):
                token = token.lower()
                words[token] = words.get(token, 0) + 1

    # additional tokens
    words['<pad>'] = 1
    words['<start>'] = 1
    words['<end>'] = 1
    words['<unk>'] = 1
    print('Saving to: ' + FLAGS.vocab_save_path)
    with open(FLAGS.vocab_save_path, 'w') as file_id:
        file_id.write('\n'.join(sorted(words.keys())))

    # answer lists
    ans_list = list(ans_list.keys())
    ans_list.append('<unk>')
    print('Saving to: ' + FLAGS.answers_save_path)
    with open(FLAGS.answers_save_path, 'w') as file_id:
        file_id.write('\n'.join(ans_list))
def train_set():
    t1 = ToTensor()
    t2 = transforms.Compose([Flip(axis=2), ToTensor()])
    t3 = transforms.Compose([Flip(axis=1), ToTensor()])
    t4 = transforms.Compose([Flip(axis=2), Flip(axis=1), ToTensor()])
    t5 = transforms.Compose([Flip(axis=1), Flip(axis=2), ToTensor()])
    t6 = transforms.Compose([Rotate(90), ToTensor()])
    ds1 = IcebergDataset("../data/all.npy", transform=None,
                         im_dir="../data/vis/train", colormap="inferno",
                         add_feature_planes="complex")
    for i in progressbar(range(len(ds1))):
        sample = ds1[i]
        ds1.vis(i, average=False, prefix="")
        # print(i, sample['inputs'].size(), sample['targets'].size(),
        #       sample["targets"].numpy()[0])
        # if i == 10:
        #     break
    dataloader = DataLoader(ds1, batch_size=4, shuffle=True, num_workers=1,
                            pin_memory=True)
    for i_batch, sample_batched in enumerate(dataloader):
        print(i_batch, sample_batched['inputs'].size(),
              sample_batched['targets'].size())
        if i_batch == 3:
            break
def fit(self, optim, loss_fn, data_loader, validation_data_loader,
        num_epochs, logger):
    best_loss = float("inf")
    start_point = random.randint(0, 32)
    for e in progressbar(range(num_epochs)):
        self._epoch = e
        iter_per_epoch = len(data_loader)
        data_iter = iter(data_loader)
        inputs, targets, predictions = None, None, None
        for i in range(iter_per_epoch):
            inputs, targets = self._get_inputs(data_iter)
            predictions, mu, logvar = self.predict(targets)
            optim.zero_grad()
            loss = loss_fn(predictions, targets, mu, logvar)
            loss.backward()
            optim.step()
            # loss.data[0] was removed after PyTorch 0.3
            self._accumulate_results(None, None, loss=loss.item())
        self._log_images(inputs, targets, predictions, logger,
                         start=start_point, prefix="train_",
                         reshape=(2, 75, 75))
        stats = self.evaluate(logger, validation_data_loader, loss_fn,
                              switch_to_eval=True)
        is_best = stats["val_loss"] < best_loss
        best_loss = min(best_loss, stats["val_loss"])
        model_path = ProjectConfig.combine(
            ProjectConfig.model_directory,
            "%s_%s_fold_%s.mdl" % (self.model_name, str(e + 1),
                                   self.fold_number))
        self.save(model_path, optim, is_best, scores=stats)
    return best_loss
def infer(path, num_folds, average=True):
    ds = IcebergDataset(path, inference_only=True, transform=ToTensor(),
                        add_feature_planes="no")
    loader = DataLoader(ds, 64)
    predictions = defaultdict(list)
    for fold in range(num_folds):
        # NOTE: the same checkpoint is restored on every fold iteration here.
        model = LeNet.restore("../models/LeNet_78_fold_None.mdl")
        if torch.cuda.is_available():
            model.cuda()
        iterator = iter(loader)
        iter_per_epoch = len(loader)
        for _ in progressbar(range(iter_per_epoch)):
            next_batch = next(iterator)
            inputs_tensor, ids = next_batch["inputs"], next_batch["id"]
            inputs = model.to_var(inputs_tensor)
            probs, _ = model.predict(inputs, return_classes=False)
            probs = model.to_np(probs).squeeze()
            probs = probs.tolist()
            chunk = dict(zip(ids, probs))
            for k, v in chunk.items():
                predictions[k].append(v)
    if average:
        result = {k: sum(v) / len(v) for k, v in predictions.items()}
    else:
        # round confident mean probabilities to hard 0/1 labels
        result = {}
        for k, v in predictions.items():
            prob = np.mean(np.array(v))
            if prob <= 0.1:
                prob = 0
            elif prob >= 0.9:
                prob = 1
            result[k] = prob
    return result
def hyperscreen(self, softening=1.0):
    """Apply the HyperScreen test: threshold every tap-specific boomerang
    histogram (Otsu's method) on both the U and V axes, and keep only the
    events that survive both axes.

    Returns:
        dict -- survivals, failures, and rejection percentages per axis/tap.
    """
    data = self.data[self.data['Hyperbola test passed']]

    # taprange = range(data['crsu'].min(), data['crsu'].max() + 1)
    taprange_u = range(data['crsu'].min() - 1, data['crsu'].max() + 1)
    taprange_v = range(data['crsv'].min() - 1, data['crsv'].max() + 1)

    if self.numevents < 100000:
        bins = [50, 50]  # number of bins
    else:
        bins = [200, 200]

    # Instantiate these empty dictionaries to hold our results
    u_axis_survivals = {}
    v_axis_survivals = {}

    progressbar_disable = not self.verbose

    if self.verbose:
        print(colorama.Fore.YELLOW +
              "\nApplying Otsu's Method to every Tap-specific boomerang "
              "across U-axis taps {} through {}".format(
                  taprange_u[0] + 1, taprange_u[-1] + 1))

    skiptaps_u = []
    skiptaps_v = []

    for tap in progressbar(taprange_u, disable=progressbar_disable,
                           ascii=False):
        # Do the U axis
        tapmask_u = data[data['crsu'] == tap].index.values
        if len(tapmask_u) < 20:
            skiptaps_u.append((tap + 1, len(tapmask_u)))
            continue
        keep_u = np.isfinite(data['fb_u'][tapmask_u])
        hist_u, xbounds_u, ybounds_u = np.histogram2d(
            data['fb_u'][tapmask_u][keep_u],
            data['fp_u'][tapmask_u][keep_u], bins=bins)
        thresh_hist_u = self.threshold(hist_u, bins=bins, softening=softening)

        posx_u = np.digitize(data['fb_u'][tapmask_u], xbounds_u)
        posy_u = np.digitize(data['fp_u'][tapmask_u], ybounds_u)
        hist_mask_u = ((posx_u > 0) & (posx_u <= bins[0]) &
                       (posy_u > -1) & (posy_u <= bins[1]))

        # Values of the histogram where the points are
        hhsub_u = thresh_hist_u[posx_u[hist_mask_u] - 1,
                                posy_u[hist_mask_u] - 1]
        pass_fb_u = data['fb_u'][tapmask_u][hist_mask_u][np.isfinite(hhsub_u)]
        u_axis_survivals["U Axis Tap {:02d}".format(tap)] = \
            pass_fb_u.index.values

    if self.verbose:
        print("\nThe following {} U-axis taps were skipped due to a (very) "
              "low number of counts: ".format(len(skiptaps_u)))
        for skipped_tap in skiptaps_u:
            tapnum, counts = skipped_tap
            print("Skipped U-axis Tap {}, which had {} count(s)".format(
                tapnum, counts))
        print(colorama.Fore.MAGENTA +
              "\n... doing the same for the V axis taps {} through {}".format(
                  taprange_v[0] + 1, taprange_v[-1] + 1))

    for tap in progressbar(taprange_v, disable=progressbar_disable,
                           ascii=False):
        # Now do the V axis:
        tapmask_v = data[data['crsv'] == tap].index.values
        if len(tapmask_v) < 20:
            skiptaps_v.append((tap + 1, len(tapmask_v)))
            continue
        keep_v = np.isfinite(data['fb_v'][tapmask_v])
        hist_v, xbounds_v, ybounds_v = np.histogram2d(
            data['fb_v'][tapmask_v][keep_v],
            data['fp_v'][tapmask_v][keep_v], bins=bins)
        thresh_hist_v = self.threshold(hist_v, bins=bins, softening=softening)

        posx_v = np.digitize(data['fb_v'][tapmask_v], xbounds_v)
        posy_v = np.digitize(data['fp_v'][tapmask_v], ybounds_v)
        hist_mask_v = ((posx_v > 0) & (posx_v <= bins[0]) &
                       (posy_v > -1) & (posy_v <= bins[1]))

        # Values of the histogram where the points are
        hhsub_v = thresh_hist_v[posx_v[hist_mask_v] - 1,
                                posy_v[hist_mask_v] - 1]
        pass_fb_v = data['fb_v'][tapmask_v][hist_mask_v][np.isfinite(hhsub_v)]
        v_axis_survivals["V Axis Tap {:02d}".format(tap)] = \
            pass_fb_v.index.values

    if self.verbose:
        print("\nThe following {} V-axis taps were skipped due to a (very) "
              "low number of counts: ".format(len(skiptaps_v)))
        for skipped_tap in skiptaps_v:
            tapnum, counts = skipped_tap
            print("Skipped V-axis Tap {}, which had {} count(s)".format(
                tapnum, counts))

    # Done looping over taps
    if self.verbose:
        print(colorama.Fore.BLUE +
              "\nCollecting events that pass both U- and V-axis "
              "HyperScreen tests...", end=" ")

    u_all_survivals = np.concatenate([x for x in u_axis_survivals.values()])
    v_all_survivals = np.concatenate([x for x in v_axis_survivals.values()])

    # If the event passes both U- and V-axis tests, it survives
    all_survivals = np.intersect1d(u_all_survivals, v_all_survivals)
    survival_mask = np.isin(self.data.index.values, all_survivals)
    failure_mask = np.logical_not(survival_mask)

    num_survivals = sum(survival_mask)
    num_failures = sum(failure_mask)
    percent_hyperscreen_rejected = round(
        ((num_failures / self.numevents) * 100), 2)

    # Do a sanity check to look for lost events. Shouldn't be any.
    if num_survivals + num_failures != self.numevents:
        print("WARNING: Total number of survivals and failures does not "
              "equal total events in the EVT1 file. Something is wrong!")

    legacy_hyperbola_test_failures = sum(self.data['Hyperbola test failed'])
    percent_legacy_hyperbola_test_rejected = round(
        ((legacy_hyperbola_test_failures / self.numevents) * 100), 2)
    percent_improvement_over_legacy_test = round(
        (percent_hyperscreen_rejected -
         percent_legacy_hyperbola_test_rejected), 2)

    if self.verbose:
        print("Done")
        print(colorama.Fore.GREEN + "HyperScreen rejected" +
              colorama.Fore.YELLOW +
              " {}% of all events ({:,} bad events / {:,} total events)".format(
                  percent_hyperscreen_rejected, sum(failure_mask),
                  self.numevents) +
              colorama.Fore.GREEN + "\nThe Murray+ algorithm rejects" +
              colorama.Fore.MAGENTA +
              " {}% of all events ({:,} bad events / {:,} total events)".format(
                  percent_legacy_hyperbola_test_rejected,
                  legacy_hyperbola_test_failures, self.numevents))
        print(colorama.Fore.GREEN +
              "As long as the results pass sanity checks, this is a "
              "POTENTIAL improvement of \n" +
              colorama.Fore.BLUE +
              "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ POTENTIAL Improvement "
              "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
              colorama.Fore.WHITE +
              "                                      {}%\n".format(
                  percent_improvement_over_legacy_test) +
              colorama.Fore.BLUE +
              "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
              "~~~~~~~~~~~~~~~~~~~~~~~~\n")

    hyperscreen_results_dict = {
        "ObsID": self.obsid,
        "Target": self.target,
        "Exposure Time": self.exptime,
        "Detector": self.detector,
        "Number of Events": self.numevents,
        "Number of Good Time Events": self.goodtimeevents,
        "U Axis Survivals by Tap": u_axis_survivals,
        "V Axis Survivals by Tap": v_axis_survivals,
        "U Axis All Survivals": u_all_survivals,
        "V Axis All Survivals": v_all_survivals,
        "All Survivals (event indices)": all_survivals,
        "All Survivals (boolean mask)": survival_mask,
        "All Failures (boolean mask)": failure_mask,
        "Percent rejected by Tapscreen": percent_hyperscreen_rejected,
        "Percent rejected by Hyperbola": percent_legacy_hyperbola_test_rejected,
        "Percent improvement": percent_improvement_over_legacy_test
    }

    return hyperscreen_results_dict
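# A minimal sketch of the per-tap thresholding idea above (an assumption,
# not HyperScreen's actual self.threshold implementation): apply Otsu's
# method to a 2D boomerang histogram with skimage.filters.threshold_otsu,
# relax the cutoff by `softening`, and NaN out low-count bins so that
# np.isfinite(...) can reject the events that land in them.
from skimage.filters import threshold_otsu

def otsu_threshold_sketch(hist_2d, softening=1.0):
    cutoff = threshold_otsu(hist_2d) * softening
    thresholded = hist_2d.astype(float).copy()
    thresholded[thresholded < cutoff] = np.nan  # NaN bins -> rejected events
    return thresholded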
        for label in f:
            labels.append(label.rstrip())
    return labels


def get_indices():
    indices = []
    with open('./indices_cifar100', 'r') as f:
        for index in f:
            indices.append(int(index.rstrip()))
    return indices


if __name__ == "__main__":
    keywords = get_labels()
    tinyimage.openTinyImage()
    images = []
    ignore_indices = get_indices()
    pick = len(ignore_indices)
    for keyword in progressbar(keywords):
        indexes = tinyimage.retrieveByTerm(keyword)
        for i in indexes:
            if i not in ignore_indices:
                image = tinyimage.sliceToBin(i).reshape(
                    32, 32, 3, order="F").astype('float32') / 255.
                images.append(image)
    relevant = np.array(images)
    np.random.shuffle(relevant)
    relevant = relevant[:pick]
    np.save("relevant_images", relevant)
    tinyimage.closeTinyImage()
def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it."""
    self.imdb = imdb
    self.params = params
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')

    # load answers
    with open(params['args']['answer_list_path'], 'r') as file_id:
        choices = [ii.strip('\n') for ii in file_id.readlines()]
    self.num_choices = len(choices)
    self.choices2ind = {ii: index for index, ii in enumerate(choices)}
    self.ind2choices = {index: ii for index, ii in enumerate(choices)}

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
        self.load_gt_layout = params['load_gt_layout']
    if self.load_gt_layout:
        self.T_decoder = params['max_dec_len']
        self.assembler = params['assembler']

    # load the mean of the images; allow_pickle is required in modern NumPy
    # because the stats were saved as a dict
    load_path = params['path'].split('/')[:-1] + ['train_image_mean.npy']
    load_path = '/'.join(load_path)
    print('Loading training image stats from: ' + load_path)
    img_stats = np.load(load_path, allow_pickle=True)[()]
    mean_img = img_stats['mean_img'].reshape([1, 1, -1])
    std_img = img_stats['std_img'].reshape([1, 1, -1])

    # read all the images
    images = {}
    print('Reading images..')
    # TODO: Change this back! (currently reads only every third datum)
    for datum in progressbar(self.imdb['data'][::3]):
        img_path = datum['image_path']
        if img_path not in images:
            cur_img = support.load_image(img_path)
            cur_img = (cur_img - mean_img) / std_img
            images[img_path] = cur_img
    self.images = images

    # get the shape from a random image
    for _, sample in self.images.items():
        self.img_size = sample.shape
        break

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
        self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
        self._construct_fact()
            image = self._add_planes(image)
        elif self.add_feature_planes == "simple":
            image = self._get_simple_planes(image)

        # corrupt each plane with masked Gaussian noise drawn from
        # that plane's own mean/std statistics
        noise_factor = 0.4
        planes = [image[i, :, :] for i in range(image.shape[0])]
        stats = [self.get_image_stat(i) for i in planes]
        masks = [np.random.binomial(1, 1 - noise_factor, i.shape)
                 for i in planes]
        noise = [masks[i] * np.random.normal(loc=stats[i][0],
                                             scale=stats[i][1],
                                             size=masks[i].shape)
                 for i in range(len(planes))]
        noisy = [planes[i] + noise[i] for i in range(len(planes))]
        noisy = np.stack(noisy, axis=0)
        item = {"inputs": noisy, "targets": image}
        if self.transform:
            item = self.transform(item)
        return item

    def vis(self, idx, average=False, prefix=""):
        base_dir = self.im_dir or "./"
        image1 = self[idx]["inputs"]
        image2 = self[idx]["targets"]
        self._vis_image(idx, image2, average, base_dir, prefix)
        self._vis_image(idx, image1, average, base_dir, "noise_" + prefix)


if __name__ == "__main__":
    ds1 = AutoEncoderDataset("../data/folds/test_0.npy", transform=None,
                             im_dir="../data/vis/test", colormap="inferno",
                             add_feature_planes="no")
    for i in progressbar(range(len(ds1))):
        sample = ds1[i]
        ds1.vis(i, average=True, prefix="")
hashes = json.loads(open(hash_file).read())

#######################

with open("word_embeddings/word2num.json", "r") as f:
    word2num = json.load(f)
word2num = {w: i for i, w in enumerate(word2num)}

#######################

shuffle(examples)
batch_size = 1024
dir_count = -1
res_dir = None
data = []
for ex in progressbar(examples):
    if res_dir is None or len(data) >= batch_size:
        if res_dir is not None:
            with open(os.path.join(res_dir, "data.json"), "w") as f:
                json.dump(data, f)
        dir_count += 1
        res_dir = os.path.join(DDIR, str(dir_count))
        data = []
        if not os.path.isdir(res_dir):
            os.mkdir(res_dir)
    ID = ex["identifier"].split("-")[:3]
    ID = "-".join(ID)
    if ID + "-img0.png" not in id2path:
        mu1, sigma1, med1, maximum_1, minimum_1, percentile75_1 = \
            IcebergDataset.get_image_stat(image[0, :, :])
        mu2, sigma2, med2, maximum_2, minimum_2, percentile75_2 = \
            IcebergDataset.get_image_stat(image[1, :, :])
        result.append((mu1, sigma1, med1, maximum_1, minimum_1,
                       percentile75_1, mu2, sigma2, med2, maximum_2,
                       minimum_2, percentile75_2, angle[0], label[0]))
    new_frame = pd.DataFrame(
        result, columns=["mu1", "sigma1", "med1", "max1", "min1", "per75_1",
                         "mu2", "sigma2", "med2", "max2", "min2", "per75_2",
                         "angle", "label"])
    new_frame.to_csv("../data/stats.csv", index=False)
    print()


if __name__ == "__main__":
    data = IcebergDataset("../data/orig/test.json", mu_sigma=None,
                          inference_only=True, colormap="inferno",
                          im_dir="../data/vis/test/cluster_1")
    X = np.array([i["inputs"].ravel() for i in data])
    # get_best_clusters(X)
    clusterer = KMeans(n_clusters=2, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    positives = []
    for i in progressbar(range(len(data))):
        if cluster_labels[i] == 1:
            # sample = data[i]
            # positives.append(sample["targets"][0])
            data.vis(i, prefix="C1_")
    print("Len", len(positives))
    print("Positives", sum(positives))
    # inspect_angle()
    print("Finished!")
def evaluate(self, dataloader, dtype):
    # network in evaluation mode
    self.eval()
    gtRanks = []
    numInst = dataloader.numInst[dtype]

    # save all scores and gtLabels
    scores = []
    gtLabels = []
    imageIds = []

    # Get gt scores for all options
    for startId in progressbar(range(0, numInst, self.batchSize)):
        # Obtain test batch, argument set and GT members
        batch = dataloader.getTestBatch(startId, dtype)
        batchSize = batch['set'].size(0)

        # Extract set, positive
        setEmbed = bottle(self.wordTransform, Variable(batch['set']))
        # if set is empty, reset to zero
        if self.setSize == 0:
            setEmbed.data.fill_(0.0)
        setEmbed, _ = self.selfatt(setEmbed, setEmbed, setEmbed)
        setEmbed = self.pooler(setEmbed, 1)
        if isinstance(setEmbed, tuple):
            setEmbed = setEmbed[0]
        setEmbed = setEmbed.squeeze()

        # If image exists
        if 'image' in batch:
            imgEmbed = self.imgTransform(Variable(batch['image']))
            setEmbed = torch.cat((setEmbed, imgEmbed), 1)

        # current batch scores
        batchScores = torch.FloatTensor(batchSize, self.vocabSize)

        # Get the scores for all possible options
        for ii in range(0, self.vocabSize, self.batchSize):
            end = min(ii + self.batchSize, self.vocabSize)

            # Interact gt and set to get score
            argInds = torch.arange(ii, end).long().unsqueeze(0)
            if self.useGPU:
                argInds = argInds.cuda()
            argInds = argInds.repeat(batchSize, 1)
            argEmbed = bottle(self.wordTransform, Variable(argInds))
            argScore = self.scoreInstanceSet(argEmbed, setEmbed)

            # save scores for this batch
            batchScores[:, ii:end] = argScore.data.float().squeeze()

        # Assign set elements the least possible score (-inf)
        rangeInds = torch.arange(0, batchSize).long()
        for ii in range(self.evalSize):
            # satwik: edits for new pytorch
            scatInds = torch.stack((rangeInds, batch['set'][:, ii].cpu()), 1)
            batchScores.scatter_(1, scatInds, float('-inf'))

        # Convert to numpy array
        batchScores = batchScores.numpy()
        # rankdata ranks ascending, so negate scores to rank descending
        batchRanks = np.apply_along_axis(rankdata, 1, -1 * batchScores)

        # save the batch scores
        scores.append(batchScores)

        # Assign the ranks
        gtLabels.extend(batch['pos'])
        if 'imageId' in batch:
            imageIds.extend(batch['imageId'])
        for ii in range(batchSize):
            gtRank = [batchRanks[ii, jj] for jj in batch['pos'][ii]]
            gtRanks.append(gtRank)

    # Compute rank statistics
    metrics = computeRankStats(np.concatenate(gtRanks))

    # network back in training mode
    self.train()
    return metrics, np.concatenate(scores), {'gtLabels': gtLabels,
                                             'imageId': imageIds}
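# Small illustration of the ranking step above (hypothetical demo, not part
# of the original module): scipy's rankdata ranks ascending, so ranking the
# negated scores gives rank 1 to the highest-scoring option.
def _demo_descending_ranks():
    from scipy.stats import rankdata
    scores = np.array([[0.1, 0.9, 0.5]])
    ranks = np.apply_along_axis(rankdata, 1, -1 * scores)
    assert ranks.tolist() == [[3.0, 1.0, 2.0]]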
def evaluate_agent(wizard, val_loader, args):
    """Evaluate a SIMMC agent given a dataloader.

    Args:
        wizard: SIMMC model
        val_loader: Dataloader to run the model on
        args: Arguments for evaluation
    """
    total_iters = int(val_loader.num_instances / args["batch_size"])
    # Turn autograd off for evaluation -- light-weight and faster.
    with torch.no_grad():
        wizard.eval()
        matches = []
        for batch in progressbar(val_loader.get_batch(),
                                 total=int(total_iters)):
            if args["bleu_evaluation"]:
                mode = {"next_token": "ARGMAX", "beam_size": 5}
            else:
                mode = None
            batch_outputs = wizard(batch, mode)
            # Stringify model responses.
            if args["bleu_evaluation"]:
                batch_outputs["model_response"] = (
                    val_loader.stringify_beam_outputs(
                        batch_outputs["beam_output"], batch))
                # Remove beam output to avoid memory issues.
                del batch_outputs["beam_output"]
            matches.append(batch_outputs)
        wizard.train()

    # Compute perplexity.
    total_loss_sum = sum(ii["loss_sum"].item() for ii in matches)
    num_tokens = sum(ii["num_tokens"].item() for ii in matches)
    avg_loss_eval = total_loss_sum / num_tokens

    # Compute BLEU score.
    if args["bleu_evaluation"]:
        model_responses = [jj for ii in matches for jj in ii["model_response"]]
        bleu_score = val_loader.evaluate_response_generation(model_responses)
    else:
        model_responses = None
        bleu_score = -1.

    # Evaluate retrieval score.
    if args["retrieval_evaluation"]:
        candidate_scores = [
            jj for ii in matches for jj in ii["candidate_scores"]
        ]
        retrieval_metrics = val_loader.evaluate_response_retrieval(
            candidate_scores)
        print(retrieval_metrics)
    else:
        retrieval_metrics = {}

    # Evaluate action prediction.
    action_predictions = [jj for ii in matches for jj in ii["action_preds"]]
    action_metrics = val_loader.evaluate_action_prediction(action_predictions)
    print(action_metrics["confusion_matrix"])
    print_str = ("\nEvaluation\n\tLoss: {:.2f}\n\t"
                 "Perplexity: {:.2f}\n\tBLEU: {:.3f}\n\t"
                 "Action: {:.2f}\n\t"
                 "Action Perplexity: {:.2f}\n\t"
                 "Action Attribute Accuracy: {:.2f}")
    print(print_str.format(
        avg_loss_eval, math.exp(avg_loss_eval), bleu_score,
        100 * action_metrics["action_accuracy"],
        action_metrics["action_perplexity"],
        100 * action_metrics["attribute_accuracy"]))

    # Save the results to a file.
    eval_dict = {
        "loss": avg_loss_eval,
        "perplexity": math.exp(avg_loss_eval),
        "bleu": bleu_score,
        "action_accuracy": action_metrics["action_accuracy"],
        "action_perplexity": action_metrics["action_perplexity"],
        "action_attribute": action_metrics["attribute_accuracy"]
    }
    eval_dict.update(retrieval_metrics)
    eval_outputs = {
        "model_actions": action_predictions,
        "model_responses": model_responses
    }
    return eval_dict, eval_outputs
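# Worked illustration of the perplexity computation above (hypothetical
# numbers): exp of the summed token cross-entropy divided by the total
# token count, aggregated over all evaluation batches.
def _demo_perplexity():
    import math
    loss_sums = [120.0, 95.0]    # summed token NLL per batch (made up)
    token_counts = [60, 50]
    avg_loss = sum(loss_sums) / sum(token_counts)   # 215 / 110
    print(math.exp(avg_loss))    # -> e^(215/110) ~ 7.06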
    id2synet[ID] = list(id2synet[ID])[0]

#####################

id2path = dict()
for root, _, files in os.walk(imgs_dir):
    for file in files:
        if os.path.splitext(file)[1] == ".png":
            id2path[file] = os.path.join(root, file)

examples = [json.loads(line) for line in open(json_file).readlines()]
hashes = json.loads(open(hash_file).read())

res_dict = dict()
for ID in progressbar(id2path):
    img = read_img(id2path[ID])
    if img is None:
        continue
    C = id2synet[ID]
    res_dir = os.path.join(DDIR, str(C))
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)
    # name the new file after the count of PNGs already in the directory
    path = os.listdir(res_dir)
    path = filter(lambda p: os.path.splitext(p)[1] == ".png", path)
    path = sum(1 for _ in path)
    path = os.path.join(res_dir, str(path) + ".png")
    res_dict[ID] = path
    cv2.imwrite(path, img)
    if len(members) == 1:
        return list(members)
    if len(drr) == 2 and drr[0] == "VP" and isinstance(drr[1], list):
        if len(drr[1]) == 0:
            return []
        elif drr[1][0] == "VP" and len(drr[1]) == 2:
            return [rr[1][0], rr[1][1]]
    return rr


def pp(lol):
    if isinstance(lol, str):
        return lol
    return "(%s)" % " ".join([pp(l) for l in lol])


with open(sys.argv[1]) as ptb_f:
    for line in progressbar(ptb_f):
        tree = ParentedTree.fromstring(line)
        # record the list of substitutions
        lookup = {}
        index = 0
        for st in tree.subtrees():
            if len(list(st.subtrees())) == 1:
                lookup[index] = st[0]
                st[0] = str(index)
                index += 1
        colparse = collapse(strip(tree))
        final = finalize(colparse)
        print(pp(final))
        # print(lookup)
        # print('')
        # pdb.set_trace()
snapshot_saver = tf.train.Saver(max_to_keep=None)  # keep all snapshots
snapshot_saver.restore(sess, args['checkpoint'])

print('Evaluating on %s' % args['testSplit'])
ansMatches = []
progMatches = []
totalIter = int(valLoader.batchLoader.numInst / args['batchSize'])
maxIters = 100
curIter = 0
toSave = {'output': [], 'batch': []}

for batch in progressbar(valLoader.batches(), total=totalIter):
    _, outputs = model.runVisualizeIteration(batch, sess)
    toSave['output'].append(outputs)
    toSave['batch'].append(batch)
    # debug -- also compute the ranks during visualization
    # ranks.append(batchRanks)
    curIter += 1
    if curIter >= maxIters:
        break

# save the output + batch
batchPath = args['checkpoint'] + '.100_batches.npy'
print('Printing the batches: ' + batchPath)
support.saveBatch(toSave, batchPath)
snapshot_saver = tf.train.Saver(max_to_keep=None)  # keep all snapshots
snapshot_saver.restore(sess, args['checkpoint'])

print('Evaluating on %s' % args['test_split'])
ranks = []
matches = []
total_iter = int(val_loader.batch_loader.num_inst / args['batch_size'])
num_iters = 0

# get confusion matrix only if using refer
confusion_mat = np.zeros((2, 2))
if args['use_refer']:
    refer_token = question_assembler.name2idx_dict['_Refer']
    find_token = question_assembler.name2idx_dict['_Find']

for batch in progressbar(val_loader.batches(), total=total_iter):
    batch_ranks, outputs = model.run_evaluate_iteration(batch, sess)
    ranks.append(batch_ranks)
    if 'matches' in outputs:
        matches.append(outputs['matches'])

    # debug: get confusion between find/refer
    if args['use_refer']:
        find_gt = batch['gt_layout'] == find_token
        refer_gt = batch['gt_layout'] == refer_token
        find_pred = outputs['pred_tokens'] == find_token
        refer_pred = outputs['pred_tokens'] == refer_token
        confusion_mat[0, 0] += np.sum(find_pred & find_gt)
        confusion_mat[0, 1] += np.sum(refer_pred & find_gt)
        confusion_mat[1, 0] += np.sum(find_pred & refer_gt)