Example #1
def _get_unique_filename(self):
    # Draw random UUIDs until one yields a config filename that does not
    # already exist in self.folderpath.
    while True:
        filename = uuid.uuid4()
        if not tb_fs.file_exists(
                tb_fs.join_paths(
                    [self.folderpath,
                     "config-%s.json" % filename])):
            return filename
Example #2
def _get_unique_name(self, prefix):
    # Same pattern as Example #1, but with a caller-supplied filename prefix.
    while True:
        filename = uuid.uuid4()
        filepath = tb_fs.join_paths(
            [self.folderpath,
             "%s-%s.json" % (prefix, filename)])
        if not tb_fs.file_exists(filepath):
            return filename
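
Examples #1 and #2 follow the same pattern: keep drawing random UUIDs until the resulting filename does not yet exist in the target folder. A minimal standalone sketch of the idea using only the standard library (the helper name and the "config" prefix below are illustrative, not part of research_toolbox):

import os
import uuid

def get_unique_json_name(folderpath, prefix="config"):
    # Retry with a fresh UUID until the candidate file does not exist yet.
    while True:
        name = str(uuid.uuid4())
        filepath = os.path.join(folderpath, "%s-%s.json" % (prefix, name))
        if not os.path.exists(filepath):
            return name
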
Example #3
def load(self, name, x):
    # Look up how `name` is serialized, read the corresponding JSON or pickle
    # file if it exists, and let the registered load_fn merge the payload
    # into `x`.
    cfg = self.name_to_cfg[name]
    filepath = self._get_filepath(name, cfg['use_json'])
    if tb_fs.file_exists(filepath):
        if cfg['use_json']:
            out = tb_io.read_jsonfile(filepath)
        else:
            out = tb_io.read_picklefile(filepath)
        x = cfg['load_fn'](x, out)
    return x
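
Example #3 dispatches on a per-name configuration to decide between JSON and pickle, then hands the loaded payload to a caller-registered hook. A rough standalone sketch of that dispatch with the standard library, assuming the same (x, payload) -> x signature for the hook:

import json
import os
import pickle

def load_state(filepath, use_json, load_fn, x):
    # Read the serialized payload, if any, and let the hook merge it into x.
    if os.path.exists(filepath):
        if use_json:
            with open(filepath) as f:
                out = json.load(f)
        else:
            with open(filepath, "rb") as f:
                out = pickle.load(f)
        x = load_fn(x, out)
    return x
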
Example #4
def download_file(urlpath,
                  folderpath,
                  filename=None,
                  abort_if_file_exists=True):
    if filename is None:
        filename = urlpath.split('/')[-1]
    filepath = tb_fs.join_paths([folderpath, filename])
    assert tb_fs.folder_exists(folderpath)
    # Abort when the file already exists and overwriting is not allowed.
    assert not (abort_if_file_exists and tb_fs.file_exists(filepath))
    f = urllib.URLopener()
    f.retrieve(urlpath, filepath)
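
Example #4 relies on the legacy Python 2 urllib.URLopener API. A sketch of an equivalent helper for Python 3 using urllib.request (an illustration, not part of research_toolbox):

import os
import urllib.request

def download_file_py3(urlpath, folderpath, filename=None,
                      abort_if_file_exists=True):
    if filename is None:
        filename = urlpath.split('/')[-1]
    filepath = os.path.join(folderpath, filename)
    assert os.path.isdir(folderpath)
    # Refuse to overwrite an existing file when asked to abort.
    assert not (abort_if_file_exists and os.path.exists(filepath))
    urllib.request.urlretrieve(urlpath, filepath)
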
Example #5
    def _fn(cfg_path):
        ds = []
        for name in json_filename_lst:
            p = tb_fs.join_paths([cfg_path, name])

            if (not abort_if_notexists) and (not tb_fs.file_exists(p)):
                d = None
            # otherwise, let the read fail: aborting on a missing file is the
            # intended behavior when abort_if_notexists is set.
            else:
                d = tb_io.read_jsonfile(p)
            ds.append(d)
        return ds
Example #6
    def _fn(e_folderpath):
        cfg = tb_io.read_jsonfile(
            tb_fs.join_paths([e_folderpath, 'config.json']))

        res = None
        if not use_checkpoints:
            res_fpath = tb_fs.join_paths([e_folderpath, 'results.json'])
        else:
            res_fpath = tb_fs.join_paths([e_folderpath, 'checkpoint.json'])

        if tb_fs.file_exists(res_fpath):
            res = tb_io.read_jsonfile(res_fpath)
        return (cfg, res)
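
Example #6 is a per-experiment-folder loader: it always reads config.json and pairs it with results.json (or checkpoint.json), returning None for runs without results. A hedged sketch of how such a loader might be applied across the subfolders of an experiment directory; the directory layout and function name below are assumptions, not the library's explore_experiment:

import json
import os

def load_experiment_results(experiment_folderpath, use_checkpoints=False):
    # Pair each subfolder's config with its results (or checkpoint), if any.
    pairs = []
    for name in sorted(os.listdir(experiment_folderpath)):
        e_folderpath = os.path.join(experiment_folderpath, name)
        if not os.path.isdir(e_folderpath):
            continue
        with open(os.path.join(e_folderpath, 'config.json')) as f:
            cfg = json.load(f)
        res_name = 'checkpoint.json' if use_checkpoints else 'results.json'
        res_fpath = os.path.join(e_folderpath, res_name)
        res = None
        if os.path.exists(res_fpath):
            with open(res_fpath) as f:
                res = json.load(f)
        pairs.append((cfg, res))
    return pairs
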
Example #7
def create_table_from_experiment(experiment_name,
                                 rows,
                                 columns,
                                 values,
                                 abort_if_incomplete_configs=True,
                                 use_checkpoints=False,
                                 single_row_multitable=False,
                                 print_to_terminal=True,
                                 max_column_width=10**9,
                                 abort_if_different_keys=True):

    _, xs = explore_experiment('experiments/%s' % experiment_name,
                               use_checkpoints)

    cfgs = []
    res = []
    for (c, r) in xs:
        if r is not None:
            cfgs.append(c)
            res.append(r)
        else:
            assert not abort_if_incomplete_configs
    xs = tb_ut.zip_toggle([cfgs, res])

    ks = keys_with_variation(cfgs)
    c = dict(cfgs[0])
    for k in ks:
        c.pop(k)

    ks.pop('out_folder')
    print("***%s***" % experiment_name)
    pprint(ks)
    print()

    ds = [summarize_results(tb_ut.merge_dicts(x)) for x in xs]

    # if any of the requested values is a CoNLL evaluation metric, recompute
    # precision/recall/FB1 from the prediction files with conlleval.
    if any([
            v in values for v in [
                'dev_precision', 'dev_recall', 'dev_fb1', 'test_precision',
                'test_recall', 'test_fb1'
            ]
    ]):

        def _extract_fn(fpath):

            out = subprocess.check_output(
                ["cat %s | data/conll_2000/conlleval.txt" % fpath], shell=True)

            # the second line of conlleval's output reports the overall
            # accuracy, precision, recall and FB1.
            res_line = out.split('\n')[1]
            f1 = float(res_line.split(';')[-1].split(": ")[1])

            p, r, fb1 = map(lambda x: 0.01 * float(x.split(': ')[1]),
                            res_line.split('%; '))[1:]

            return p, r, fb1

        # add the test and dev performances to the file.
        for d in ds:
            (d['dev_precision'], d['dev_recall'], d['dev_fb1']) = _extract_fn(
                tb_fs.join_paths([d['out_folder'], 'pred_dev.txt']))

            (d['test_precision'], d['test_recall'],
             d['test_fb1']) = _extract_fn(
                 tb_fs.join_paths([d['out_folder'], 'pred_test.txt']))

            # metrics from the final test-set predictions for CoNLL-2000,
            # if that file exists.
            fpath = tb_fs.join_paths([d['out_folder'], 'final_pred_test.txt'])
            if tb_fs.file_exists(fpath):

                (d['final_test_precision'], d['final_test_recall'],
                 d['final_test_fb1']) = _extract_fn(fpath)

    df = tb_ut.create_dataframe(ds, abort_if_different_keys)

    # shorten the row key names to at most max_column_width characters.
    df = df.rename(columns={k: k[:max_column_width] for k in rows})
    rows = [k[:max_column_width] for k in rows]

    # determine the table layout.
    if not single_row_multitable:

        ts = [
            df.pivot_table(index=rows, columns=columns, values=[v])
            for v in values
        ]

    else:
        ts = [
            df.pivot_table(
                index=rows, columns=columns,
                values=values)  #.sort_values('dev_accuracy', ascending=False)
        ]

    tb_fs.create_folder('analyses/%s' % experiment_name, abort_if_exists=False)
    s_c = pformat(c)
    ss_df = [
        t.to_string(float_format=get_float_formatter(2, 100.0)) for t in ts
    ]

    lines = [s_c]
    for s in ss_df:
        lines.append('')
        lines.append(s)

    if print_to_terminal:
        # print to terminal
        for s in lines:
            print(s)

    # write to file
    tb_io.write_textfile('analyses/%s/results.txt' % experiment_name, lines)
    tb_io.write_csvfile(ds,
                        'analyses/%s/results.csv' % experiment_name,
                        sort_keys=True,
                        abort_if_different_keys=abort_if_different_keys)
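
The heart of Example #7 is pandas pivot_table: one table per value column when single_row_multitable is False, or a single table holding all value columns otherwise. A small self-contained demonstration with made-up records standing in for the per-run result dictionaries ds:

import pandas as pd

ds = [
    {"model": "lstm", "lr": 0.1, "dev_acc": 0.81, "test_acc": 0.79},
    {"model": "lstm", "lr": 0.01, "dev_acc": 0.84, "test_acc": 0.82},
    {"model": "gru", "lr": 0.1, "dev_acc": 0.80, "test_acc": 0.78},
    {"model": "gru", "lr": 0.01, "dev_acc": 0.83, "test_acc": 0.81},
]
df = pd.DataFrame(ds)

# One pivot table per value column (single_row_multitable=False).
ts = [df.pivot_table(index=["model"], columns=["lr"], values=[v])
      for v in ["dev_acc", "test_acc"]]

# A single table containing all value columns (single_row_multitable=True).
t_all = df.pivot_table(index=["model"], columns=["lr"],
                       values=["dev_acc", "test_acc"])

for t in ts:
    print(t.to_string())
print(t_all.to_string())
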
Example #8
def train_model_with_config():
    # NOTE: cfg, m (the DyNet model), train_data, dev_data, test_data and the
    # token counts are module-level globals in the original script.
    import research_toolbox.tb_logging as tb_lg

    if cfg["optimizer_type"] == "sgd":
        trainer = dy.SimpleSGDTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "adam":
        trainer = dy.AdamTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "sgd_mom":
        trainer = dy.MomentumSGDTrainer(m, cfg["step_size_start"])
    else:
        raise ValueError("unknown optimizer_type: %s" % cfg["optimizer_type"])
    trainer.set_sparse_updates(0)

    # restarting from a checkpoint if it exists.
    # optimizer state is not kept.
    ckpt_filepath = cfg["out_folder"] + "/checkpoint.json"
    if tb_fs.file_exists(ckpt_filepath):
        log_d = tb_io.read_jsonfile(ckpt_filepath)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = np.max(log_d["dev_acc"])
        m.populate(cfg["out_folder"] + '/model.ckpt')
    else:
        current_epoch = 0
        best_dev_acc = 0.0

        log_d = {
            'dev_acc': [],
            'avg_loss': [],
            'train_tks/sec': [],
            'eval_tks/sec': [],
            'secs_per_epoch': [],
            "lr": []
        }
        if cfg["debug"] or cfg["compute_train_acc"]:
            log_d["train_acc"] = []

    if cfg["loss_type"] == "log_neighbors":
        loss_fn = loss_log_neighbors
    elif cfg["loss_type"] == "log_beam":
        loss_fn = loss_log_beam
    elif cfg["loss_type"] == "cost_sensitive_margin_last":
        loss_fn = loss_cost_sensitive_margin_last
    elif cfg["loss_type"] == "margin_last":
        loss_fn = loss_margin_last
    elif cfg["loss_type"] == "perceptron_first":
        loss_fn = loss_perceptron_first
    elif cfg["loss_type"] == "perceptron_last":
        loss_fn = loss_perceptron_last
    elif cfg["loss_type"] == "upper_bound":
        loss_fn = loss_upper_bound
    else:
        raise ValueError("unknown loss_type: %s" % cfg["loss_type"])

    cfg_accuracy = lambda data: beam_accuracy(data, cfg["beam_size"])
    cfg_train_graph = lambda e: train_beam_graph(e, cfg["beam_size"], cfg[
        "traj_type"], loss_fn)

    for epoch in range(current_epoch, cfg["num_epochs"]):
        if cfg["step_size_schedule_type"] == 'fixed':
            lr = cfg["step_size_start"]
        elif cfg["step_size_schedule_type"] == 'cosine':
            lr = cosine_get_lr(cfg["step_size_start"], cfg["step_size_end"],
                               cfg["num_epochs"], epoch)
        else:
            raise ValueError("unknown step_size_schedule_type: %s" %
                             cfg["step_size_schedule_type"])
        log_d['lr'].append(lr)

        trainer.learning_rate = lr

        acc_loss = 0.0
        random.shuffle(train_data)
        epoch_timer = tb_lg.TimeTracker()
        train_timer = tb_lg.TimeTracker()
        for i, e in enumerate(train_data):
            if i % cfg["print_every_num_examples"] == 0 and i > 0:
                print "Epoch %d - Example %d/%d" % (epoch, i, len(train_data))
            loss = cfg_train_graph(e)
            acc_loss += loss.value()
            loss.backward()
            trainer.update()

        log_d["avg_loss"].append(acc_loss / len(train_data))
        log_d["train_tks/sec"].append(num_train_tokens /
                                      train_timer.time_since_start())
        eval_timer = tb_lg.TimeTracker()
        # log_d['train_acc'].append(accuracy(train_data))
        log_d['dev_acc'].append(cfg_accuracy(dev_data))
        # log_d['test_acc'].append(accuracy(test_data))
        # only dev tokens are counted here; train/test evaluation is disabled above.
        log_d['eval_tks/sec'].append(
            num_dev_tokens / eval_timer.time_since_start())
        log_d["secs_per_epoch"].append(epoch_timer.time_since_start())
        if cfg["debug"] or cfg["compute_train_acc"]:
            train_acc = cfg_accuracy(train_data)
            print "train_acc: ", train_acc
            log_d["train_acc"].append(train_acc)
        pprint({k: vs[-1] for k, vs in log_d.iteritems()})

        if best_dev_acc < log_d["dev_acc"][-1]:
            best_dev_acc = log_d["dev_acc"][-1]
            m.save(cfg["out_folder"] + '/best_model.ckpt')
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/checkpoint.json")
        m.save(cfg["out_folder"] + '/model.ckpt')

    results_filepath = cfg["out_folder"] + "/results.json"
    if not tb_fs.file_exists(results_filepath):
        m.populate(cfg["out_folder"] + '/best_model.ckpt')
        log_d['test_acc'] = cfg_accuracy(test_data)
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/results.json")
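
Example #8's checkpoint handling boils down to: if checkpoint.json exists, resume the epoch counter and best dev accuracy from the stored log; otherwise start fresh; after every epoch, rewrite the log and the model file. A minimal sketch of that bookkeeping with plain json, leaving out the DyNet-specific model saving (the log keys mirror the example, the function names are illustrative):

import json
import os

def resume_or_init_log(out_folder):
    # Resume from checkpoint.json when present; otherwise start a fresh log.
    ckpt_filepath = os.path.join(out_folder, "checkpoint.json")
    if os.path.exists(ckpt_filepath):
        with open(ckpt_filepath) as f:
            log_d = json.load(f)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = max(log_d["dev_acc"]) if log_d["dev_acc"] else 0.0
    else:
        log_d = {"dev_acc": [], "avg_loss": [], "lr": []}
        current_epoch = 0
        best_dev_acc = 0.0
    return log_d, current_epoch, best_dev_acc

def save_checkpoint(out_folder, log_d):
    # Persist the running log after every epoch so training can resume later.
    with open(os.path.join(out_folder, "checkpoint.json"), "w") as f:
        json.dump(log_d, f)
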