def _get_unique_filename(self):
    # Draw random UUIDs until one does not collide with an existing
    # config file in self.folderpath.
    while True:
        filename = str(uuid.uuid4())
        if not tb_fs.file_exists(
                tb_fs.join_paths(
                    [self.folderpath, "config-%s.json" % filename])):
            return filename

def _get_unique_name(self, prefix):
    # Same collision-probing scheme as above, but with a caller-chosen
    # filename prefix (e.g., "config" or "results").
    while True:
        filename = str(uuid.uuid4())
        filepath = tb_fs.join_paths(
            [self.folderpath, "%s-%s.json" % (prefix, filename)])
        if not tb_fs.file_exists(filepath):
            return filename

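# Hedged usage sketch for the two helpers above: _get_unique_filename is just
# the prefix="config" special case of _get_unique_name. The owning class is
# not shown in this fragment, so the probe-until-free pattern is repeated
# standalone below; _example_unique_config_path is illustrative only.
def _example_unique_config_path(folderpath):
    # Keep drawing random UUID names until one is free on disk.
    while True:
        name = str(uuid.uuid4())
        filepath = tb_fs.join_paths([folderpath, "config-%s.json" % name])
        if not tb_fs.file_exists(filepath):
            return filepath
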
def load(self, name, x):
    # Look up how this named piece of state is serialized (JSON or pickle)
    # and, if a saved file exists, let its load_fn merge the stored data
    # into x. Returns x unchanged when nothing was saved.
    cfg = self.name_to_cfg[name]
    filepath = self._get_filepath(name, cfg['use_json'])
    if tb_fs.file_exists(filepath):
        if cfg['use_json']:
            out = tb_io.read_jsonfile(filepath)
        else:
            out = tb_io.read_picklefile(filepath)
        x = cfg['load_fn'](x, out)
    return x

def download_file(urlpath, folderpath, filename=None,
                  abort_if_file_exists=True):
    # Download urlpath into folderpath, defaulting the local filename to
    # the last path component of the URL. Aborts when the file already
    # exists and overwriting was not requested.
    from urllib.request import urlretrieve
    if filename is None:
        filename = urlpath.split('/')[-1]
    filepath = tb_fs.join_paths([folderpath, filename])
    assert tb_fs.folder_exists(folderpath)
    assert (not tb_fs.file_exists(filepath)) or (not abort_if_file_exists)
    urlretrieve(urlpath, filepath)

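# Hedged usage sketch for download_file; the URL and folder below are
# hypothetical placeholders, not taken from the source.
def _example_download():
    folderpath = 'data/downloads'
    tb_fs.create_folder(folderpath, abort_if_exists=False)
    download_file('http://example.com/archive.tar.gz', folderpath,
                  abort_if_file_exists=False)
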
def _fn(cfg_path):
    ds = []
    for name in json_filename_lst:
        p = tb_fs.join_paths([cfg_path, name])
        if (not abort_if_notexists) and (not tb_fs.file_exists(p)):
            # Missing files are tolerated and recorded as None.
            d = None
        else:
            # With abort_if_notexists set, a missing file makes the read
            # below fail, which is the desired abort behavior.
            d = tb_io.read_jsonfile(p)
        ds.append(d)
    return ds

def _fn(e_folderpath):
    # Read the run's config and, if present, its results (or its latest
    # checkpoint when use_checkpoints is set).
    cfg = tb_io.read_jsonfile(
        tb_fs.join_paths([e_folderpath, 'config.json']))
    res = None
    if not use_checkpoints:
        res_fpath = tb_fs.join_paths([e_folderpath, 'results.json'])
    else:
        res_fpath = tb_fs.join_paths([e_folderpath, 'checkpoint.json'])
    if tb_fs.file_exists(res_fpath):
        res = tb_io.read_jsonfile(res_fpath)
    return (cfg, res)

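# Hedged usage sketch: _fn above is meant to be mapped over the per-run
# subfolders of an experiment folder, which is presumably what
# explore_experiment (referenced below but not defined in this fragment)
# does. tb_fs.list_folders is an assumed toolbox helper, named here only
# for illustration.
def _example_collect_runs(experiment_folderpath):
    return [
        _fn(e_folderpath)
        for e_folderpath in tb_fs.list_folders(experiment_folderpath)
    ]
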
def create_table_from_experiment(experiment_name,
                                 rows,
                                 columns,
                                 values,
                                 abort_if_incomplete_configs=True,
                                 use_checkpoints=False,
                                 single_row_multitable=False,
                                 print_to_terminal=True,
                                 max_column_width=10**9,
                                 abort_if_different_keys=True):

    _, xs = explore_experiment('experiments/%s' % experiment_name,
                               use_checkpoints)

    cfgs = []
    res = []
    for (c, r) in xs:
        if r is not None:
            cfgs.append(c)
            res.append(r)
        else:
            assert not abort_if_incomplete_configs
    xs = tb_ut.zip_toggle([cfgs, res])

    ks = keys_with_variation(cfgs)
    c = dict(cfgs[0])
    for k in ks:
        c.pop(k)
    ks.pop('out_folder')
    print("***%s***" % experiment_name)
    pprint(ks)
    print()

    ds = [summarize_results(tb_ut.merge_dicts(x)) for x in xs]

    # if CoNLL-style precision/recall/F1 values are requested, extract them
    # from the prediction files with the conlleval script.
    if any(v in values for v in [
            'dev_precision', 'dev_recall', 'dev_fb1', 'test_precision',
            'test_recall', 'test_fb1'
    ]):

        def _extract_fn(fpath):
            out = subprocess.check_output(
                "cat %s | data/conll_2000/conlleval.txt" % fpath,
                shell=True).decode()
            res_line = out.split('\n')[1]
            p, r, fb1 = list(
                map(lambda x: 0.01 * float(x.split(': ')[1]),
                    res_line.split('%; ')))[1:]
            return p, r, fb1

        # add the dev and test performances to each summary.
        for d in ds:
            (d['dev_precision'], d['dev_recall'], d['dev_fb1']) = _extract_fn(
                tb_fs.join_paths([d['out_folder'], 'pred_dev.txt']))

            (d['test_precision'], d['test_recall'],
             d['test_fb1']) = _extract_fn(
                 tb_fs.join_paths([d['out_folder'], 'pred_test.txt']))

            # this is the final, last run for conll2000.
            fpath = tb_fs.join_paths([d['out_folder'], 'final_pred_test.txt'])
            if tb_fs.file_exists(fpath):
                (d['final_test_precision'], d['final_test_recall'],
                 d['final_test_fb1']) = _extract_fn(fpath)

    df = tb_ut.create_dataframe(ds, abort_if_different_keys)

    # shorten the column names appropriately.
    df = df.rename(columns={k: k[:max_column_width] for k in rows})
    rows = [k[:max_column_width] for k in rows]

    # determines the table layout.
    if not single_row_multitable:
        ts = [
            df.pivot_table(index=rows, columns=columns, values=[v])
            for v in values
        ]
    else:
        ts = [
            df.pivot_table(index=rows, columns=columns, values=values)
            # .sort_values('dev_accuracy', ascending=False)
        ]

    tb_fs.create_folder('analyses/%s' % experiment_name,
                        abort_if_exists=False)
    s_c = pformat(c)
    ss_df = [
        t.to_string(float_format=get_float_formatter(2, 100.0)) for t in ts
    ]

    lines = [s_c]
    for s in ss_df:
        lines.append('')
        lines.append(s)

    if print_to_terminal:
        # print to terminal
        for s in lines:
            print(s)

    # write to file
    tb_io.write_textfile('analyses/%s/results.txt' % experiment_name, lines)
    tb_io.write_csvfile(ds,
                        'analyses/%s/results.csv' % experiment_name,
                        sort_keys=True,
                        abort_if_different_keys=abort_if_different_keys)

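# Hedged usage sketch for create_table_from_experiment; the experiment name
# and the row/column/value keys are hypothetical placeholders chosen to match
# the CoNLL metrics handled above.
def _example_make_table():
    create_table_from_experiment('conll2000_tuning',
                                 rows=['loss_type'],
                                 columns=['step_size_start'],
                                 values=['dev_fb1', 'test_fb1'],
                                 print_to_terminal=True)
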
def train_model_with_config():
    import research_toolbox.tb_logging as tb_lg

    if cfg["optimizer_type"] == "sgd":
        trainer = dy.SimpleSGDTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "adam":
        trainer = dy.AdamTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "sgd_mom":
        trainer = dy.MomentumSGDTrainer(m, cfg["step_size_start"])
    else:
        raise ValueError("Unknown optimizer_type: %s" % cfg["optimizer_type"])
    trainer.set_sparse_updates(0)

    # restart from a checkpoint if it exists; optimizer state is not kept.
    ckpt_filepath = cfg["out_folder"] + "/checkpoint.json"
    if tb_fs.file_exists(ckpt_filepath):
        log_d = tb_io.read_jsonfile(ckpt_filepath)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = np.max(log_d["dev_acc"])
        m.populate(cfg["out_folder"] + '/model.ckpt')
    else:
        current_epoch = 0
        best_dev_acc = 0.0
        log_d = {
            'dev_acc': [],
            'avg_loss': [],
            'train_tks/sec': [],
            'eval_tks/sec': [],
            'secs_per_epoch': [],
            "lr": []
        }
        if cfg["debug"] or cfg["compute_train_acc"]:
            log_d["train_acc"] = []

    if cfg["loss_type"] == "log_neighbors":
        loss_fn = loss_log_neighbors
    elif cfg["loss_type"] == "log_beam":
        loss_fn = loss_log_beam
    elif cfg["loss_type"] == "cost_sensitive_margin_last":
        loss_fn = loss_cost_sensitive_margin_last
    elif cfg["loss_type"] == "margin_last":
        loss_fn = loss_margin_last
    elif cfg["loss_type"] == "perceptron_first":
        loss_fn = loss_perceptron_first
    elif cfg["loss_type"] == "perceptron_last":
        loss_fn = loss_perceptron_last
    elif cfg["loss_type"] == "upper_bound":
        loss_fn = loss_upper_bound
    else:
        raise ValueError("Unknown loss_type: %s" % cfg["loss_type"])

    cfg_accuracy = lambda data: beam_accuracy(data, cfg["beam_size"])
    cfg_train_graph = lambda e: train_beam_graph(e, cfg["beam_size"],
                                                 cfg["traj_type"], loss_fn)

    for epoch in range(current_epoch, cfg["num_epochs"]):
        if cfg["step_size_schedule_type"] == 'fixed':
            lr = cfg["step_size_start"]
        elif cfg["step_size_schedule_type"] == 'cosine':
            lr = cosine_get_lr(cfg["step_size_start"], cfg["step_size_end"],
                               cfg["num_epochs"], epoch)
        else:
            raise ValueError("Unknown step_size_schedule_type: %s" %
                             cfg["step_size_schedule_type"])
        log_d['lr'].append(lr)
        trainer.learning_rate = lr

        acc_loss = 0.0
        random.shuffle(train_data)
        epoch_timer = tb_lg.TimeTracker()
        train_timer = tb_lg.TimeTracker()
        for i, e in enumerate(train_data):
            if i % cfg["print_every_num_examples"] == 0 and i > 0:
                print("Epoch %d - Example %d/%d" %
                      (epoch, i, len(train_data)))
            loss = cfg_train_graph(e)
            acc_loss += loss.value()
            loss.backward()
            trainer.update()
        log_d["avg_loss"].append(acc_loss / len(train_data))
        log_d["train_tks/sec"].append(num_train_tokens /
                                      train_timer.time_since_start())

        eval_timer = tb_lg.TimeTracker()
        # log_d['train_acc'].append(accuracy(train_data))
        log_d['dev_acc'].append(cfg_accuracy(dev_data))
        # log_d['test_acc'].append(accuracy(test_data))
        log_d['eval_tks/sec'].append((
            # len(train_data) +
            num_dev_tokens
            # + num_test_tokens
        ) / eval_timer.time_since_start())
        log_d["secs_per_epoch"].append(epoch_timer.time_since_start())
        if cfg["debug"] or cfg["compute_train_acc"]:
            train_acc = cfg_accuracy(train_data)
            print("train_acc: ", train_acc)
            log_d["train_acc"].append(train_acc)
        pprint({k: vs[-1] for k, vs in log_d.items()})

        if best_dev_acc < log_d["dev_acc"][-1]:
            best_dev_acc = log_d["dev_acc"][-1]
            m.save(cfg["out_folder"] + '/best_model.ckpt')

        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/checkpoint.json")
        m.save(cfg["out_folder"] + '/model.ckpt')

    # after training, evaluate the best model on test and write final results.
    results_filepath = cfg["out_folder"] + "/results.json"
    if not tb_fs.file_exists(results_filepath):
        m.populate(cfg["out_folder"] + '/best_model.ckpt')
        log_d['test_acc'] = cfg_accuracy(test_data)
        tb_io.write_jsonfile(log_d, results_filepath)

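# Hedged sketch of the cosine_get_lr schedule referenced above but not
# defined in this fragment; the assumption is standard cosine annealing from
# lr_start at the first epoch down to lr_end at the last one.
def _example_cosine_get_lr(lr_start, lr_end, num_epochs, epoch):
    import math
    frac = float(epoch) / max(1, num_epochs - 1)
    return lr_end + 0.5 * (lr_start - lr_end) * (1.0 + math.cos(math.pi * frac))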