Example 1
def eval_ukp_with_nli(exp_name):
    step_per_epoch = 24544 + 970

    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.num_steps = step_per_epoch
    e_config.voca_size = 30522
    e_config.num_dev_batches = 30
    e_config.load_names = ['bert']
    encode_opt = "is_good"
    num_class_list = [3, 3]
    f1_list = []
    save_path = "/mnt/scratch/youngwookim/Chair/output/model/runs/argmix_AN_B_40000_abortion_is_good/model-21306"
    for topic in data_generator.argmining.ukp_header.all_topics[:1]:
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "argmix_{}_{}_{}".format(exp_name, topic, encode_opt)
        arg_data_loader = BertDataLoader(topic,
                                         True,
                                         hp.seq_max,
                                         "bert_voca.txt",
                                         option=encode_opt)
        f1_last = e.eval_ukp_on_shared(e_config, arg_data_loader,
                                       num_class_list, save_path)
        f1_list.append((topic, f1_last))
    print(exp_name)
    print(encode_opt)
    print(f1_list)
    for key, score in f1_list:
        print("{0}\t{1:.03f}".format(key, score))
Example 2
def ukp_train_test_repeat(load_id, exp_name, topic, n_repeat):
    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.num_epoch = 2
    e_config.save_interval = 100 * 60  # 100 minutes
    e_config.voca_size = 30522
    e_config.load_names = ['bert']
    encode_opt = "is_good"

    print(load_id)
    scores = []
    for i in range(n_repeat):
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "arg_{}_{}_{}".format(exp_name, topic, encode_opt)
        data_loader = BertDataLoader(topic,
                                     True,
                                     hp.seq_max,
                                     "bert_voca.txt",
                                     option=encode_opt)
        save_path = e.train_ukp(e_config, data_loader, load_id)
        f1_last = e.eval_ukp(e_config, data_loader, save_path)
        scores.append(f1_last)
    print(exp_name)
    print(encode_opt)
    for e in scores:
        print(e, end="\t")
    print()
    print("Avg\n{0:.03f}".format(average(scores)))
Example 3
def gen_tfrecord_w_tdev():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_tdev_{}".format(max_sequence))
    exist_or_mkdir(dir_path)
    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt",
                           "only_topic_word"))
        todo = [("dev", data_loader.get_dev_data())]

        train_data = list(data_loader.get_train_data())

        random.shuffle(train_data)
        validation_size = int(len(train_data) * 0.1)

        # Hold out the last 10% of the shuffled training data as the train-dev split.
        train_train_data = train_data[:-validation_size]
        train_dev_data = train_data[-validation_size:]
        todo.append(("ttrain", train_train_data))
        todo.append(("tdev", train_dev_data))

        for name, data in todo[::-1]:
            features = lmap(entry_to_feature_dict, data)
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)
            write_features_to_file(features, out_path)
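entry_to_feature_dict and write_features_to_file are repository helpers. The sketch below only illustrates the assumed contract and is not the repository's actual code: each entry is taken to be an (input_ids, input_mask, segment_ids, label) tuple, as unpacked in gen_pairwise below, and is converted into tf.train.Feature values for a TFRecord.

import tensorflow as tf

def entry_to_feature_dict_sketch(entry):
    # Hypothetical layout assumed for one example:
    # (input_ids, input_mask, segment_ids, label).
    input_ids, input_mask, segment_ids, label = entry

    def int_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    # The "label_ids" key name is an assumption, not taken from the source.
    return {
        "input_ids": int_feature(input_ids),
        "input_mask": int_feature(input_mask),
        "segment_ids": int_feature(segment_ids),
        "label_ids": int_feature([label]),
    }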
Example 4
def ukp_train_test(load_id, exp_name):
    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.num_epoch = 2
    e_config.save_interval = 100 * 60  # 100 minutes
    e_config.voca_size = 30522
    e_config.load_names = ['bert']
    encode_opt = "is_good"

    print(load_id)
    f1_list = []
    for topic in data_generator.argmining.ukp_header.all_topics:
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "arg_{}_{}_{}".format(exp_name, topic, encode_opt)
        data_loader = BertDataLoader(topic,
                                     True,
                                     hp.seq_max,
                                     "bert_voca.txt",
                                     option=encode_opt)
        save_path = e.train_ukp(e_config, data_loader, load_id)
        print(topic)
        f1_last = e.eval_ukp(e_config, data_loader, save_path)
        f1_list.append((topic, f1_last))
    print(exp_name)
    print(encode_opt)
    print(f1_list)
    for key, score in f1_list:
        print("{0}\t{1:.03f}".format(key, score))
Example 5
def gen_pairwise():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_pairwise_{}".format(max_sequence))
    exist_or_mkdir(dir_path)

    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt",
                           "only_topic_word"))
        todo = [("train", data_loader.get_train_data()),
                ("dev", data_loader.get_dev_data())]
        for name, data in todo[::-1]:
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)

            grouped = [[], [], []]
            for e in data:
                input_ids, input_mask, segment_ids, label = e
                grouped[label].append(e)

            combs = []
            combs.extend(generate_pairwise_combinations(
                grouped[0], grouped[1]))
            combs.extend(generate_pairwise_combinations(
                grouped[1], grouped[2]))
            combs.extend(generate_pairwise_combinations(
                grouped[2], grouped[0]))
            features = lmap(pairwise_entry_to_feature_dict, combs)
            write_features_to_file(features, out_path)
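generate_pairwise_combinations also comes from the repository. One plausible reading, assuming it simply pairs every entry of its first argument with every entry of its second, is sketched here; this is an assumption about the helper, not its actual implementation.

from itertools import product

def generate_pairwise_combinations_sketch(group_a, group_b):
    # Cross-product pairing of the two label groups (assumed behavior).
    return [(a, b) for a, b in product(group_a, group_b)]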
Example 6
    def __init__(self, topic, cheat=False, cheat_topic=None):
        self.voca_size = 30522
        self.topic = topic
        load_names = ['bert', "cls_dense"]
        if not cheat:
            run_name = "arg_key_neccesary_{}".format(topic)
        else:
            run_name = "arg_key_neccesary_{}".format(cheat_topic)
        self.hp = hyperparams.HPBert()
        self.model_dir = cpath.model_path
        self.data_loader = BertDataLoader(topic, True, self.hp.seq_max, "bert_voca.txt")

        self.task = transformer_nli(self.hp, self.voca_size, 0, False)
        self.sess = init_session()
        self.sess.run(tf.global_variables_initializer())
        self.merged = tf.summary.merge_all()
        self.load_model_white(run_name, load_names)

        self.batch_size = 512
Example 7
def do_fetch_param():
    hp = hyperparams.HPBert()
    voca_size = 30522
    encode_opt = "is_good"
    topic = "abortion"
    load_run_name = "arg_nli_{}_is_good".format(topic)
    run_name = "arg_{}_{}_{}".format("fetch_grad", topic, encode_opt)
    data_loader = BertDataLoader(topic, True, hp.seq_max, "bert_voca.txt", option=encode_opt)
    model_path = get_model_full_path(load_run_name)
    names, vars = fetch_params(hp, voca_size, run_name, data_loader, model_path)
    r = names, vars
    with open(os.path.join(output_path, "params.pickle"), "wb") as f:
        pickle.dump(r, f)
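Reading the dumped parameters back is symmetric. A minimal sketch, assuming the same output_path and the (names, vars) tuple written above:

import os
import pickle

def load_fetched_params(output_path):
    # Load the (names, vars) tuple that do_fetch_param() pickled.
    with open(os.path.join(output_path, "params.pickle"), "rb") as f:
        names, var_values = pickle.load(f)
    return names, var_values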
Example 8
def gen_tfrecord():
    max_sequence = 300
    is_3way = False
    dir_path = os.path.join(data_path, "ukp_{}_2way".format(max_sequence))
    exist_or_mkdir(dir_path)
    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, is_3way, max_sequence, "bert_voca.txt",
                           "only_topic_word"))
        todo = [("train", data_loader.get_train_data()),
                ("dev", data_loader.get_dev_data())]

        for name, data in todo[::-1]:
            features = lmap(entry_to_feature_dict, data)
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)
            write_features_to_file(features, out_path)
Example 9
def train_ukp_with_nli(load_id, exp_name):
    step_per_epoch = 24544 + 970

    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.num_steps = step_per_epoch
    e_config.save_interval = 100 * 60  # 100 minutes
    e_config.voca_size = 30522
    e_config.num_dev_batches = 30
    e_config.load_names = ['bert']
    e_config.valid_freq = 500
    encode_opt = "is_good"
    nli_setting = NLI()
    nli_setting.vocab_size = 30522
    nli_setting.vocab_filename = "bert_voca.txt"
    num_class_list = [3, 3]
    f1_list = []
    for topic in data_generator.argmining.ukp_header.all_topics:
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "argmix_{}_{}_{}".format(exp_name, topic, encode_opt)
        arg_data_loader = BertDataLoader(topic,
                                         True,
                                         hp.seq_max,
                                         "bert_voca.txt",
                                         option=encode_opt)
        nli_data_loader = nli.DataLoader(hp.seq_max,
                                         nli_setting.vocab_filename, True)

        shared_data_loader = SharedFeeder([arg_data_loader, nli_data_loader],
                                          [1, 5], ["Arg", "NLI"],
                                          hp.batch_size)

        save_path = e.train_shared(e_config, shared_data_loader,
                                   num_class_list, load_id)
        print(topic)
        f1_last = e.eval_ukp_on_shared(e_config, arg_data_loader,
                                       num_class_list, save_path)
        f1_list.append((topic, f1_last))
    print(exp_name)
    print(encode_opt)
    print(f1_list)
    for key, score in f1_list:
        print("{0}\t{1:.03f}".format(key, score))
Example 10
class Predictor:
    def __init__(self, topic, cheat=False, cheat_topic=None):
        self.voca_size = 30522
        self.topic = topic
        load_names = ['bert', "cls_dense"]
        if not cheat:
            run_name = "arg_key_neccesary_{}".format(topic)
        else:
            run_name = "arg_key_neccesary_{}".format(cheat_topic)
        self.hp = hyperparams.HPBert()
        self.model_dir = cpath.model_path
        self.data_loader = BertDataLoader(topic, True, self.hp.seq_max, "bert_voca.txt")

        self.task = transformer_nli(self.hp, self.voca_size, 0, False)
        self.sess = init_session()
        self.sess.run(tf.global_variables_initializer())
        self.merged = tf.summary.merge_all()
        self.load_model_white(run_name, load_names)

        self.batch_size = 512

    def encode_instance(self, topic, sentence):
        topic_str = topic + " is neccesary."
        entry = self.data_loader.encode_pair(topic_str, sentence)
        return entry["input_ids"], entry["input_mask"], entry["segment_ids"]

    def predict(self, target_topic, sents):
        inputs = list([self.encode_instance(target_topic, s) for s in sents])

        def batch2feed_dict(batch):
            x0, x1, x2 = batch
            feed_dict = {
                self.task.x_list[0]: x0,
                self.task.x_list[1]: x1,
                self.task.x_list[2]: x2,
            }
            return feed_dict


        def forward_run(inputs):
            batches = get_batches_ex(inputs, self.batch_size, 3)
            logit_list = []
            for batch in batches:
                logits, = self.sess.run([self.task.sout],
                                        feed_dict=batch2feed_dict(batch))
                logit_list.append(logits)
            return np.concatenate(logit_list)

        logits = forward_run(inputs)
        pred = np.argmax(logits, axis=1)
        return pred

    def load_model_white(self, name, include_namespace, verbose=True):
        run_dir = os.path.join(self.model_dir, 'runs')
        save_dir = os.path.join(run_dir, name)
        def get_last_id(save_dir):
            last_model_id = None
            for (dirpath, dirnames, filenames) in os.walk(save_dir):
                for filename in filenames:
                    if ".meta" in filename:
                        print(filename)
                        model_id = filename[:-5]
                        if last_model_id is None:
                            last_model_id = model_id
                        else:
                            # Compare by numeric step so model-21306 beats model-9000.
                            if int(model_id.split('-')[-1]) > int(last_model_id.split('-')[-1]):
                                last_model_id = model_id
            return last_model_id

        id = get_last_id(save_dir)
        path = os.path.join(save_dir, "{}".format(id))

        def condition(v):
            if v.name.split('/')[0] in include_namespace:
                return True
            return False

        variables = tf.contrib.slim.get_variables_to_restore()
        variables_to_restore = [v for v in variables if condition(v)]
        if verbose:
            print("Restoring: {} {}".format(name, id))
            for v in variables_to_restore:
                print(v)

        self.loader = tf.train.Saver(variables_to_restore, max_to_keep=1)
        self.loader.restore(self.sess, path)
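A hedged usage sketch for Predictor: the topic and sentences below are placeholder inputs, and the constructor loads whichever arg_key_neccesary_<topic> checkpoint already exists under the runs directory.

# Hypothetical usage; "abortion" and the sentences are placeholders.
predictor = Predictor("abortion")
sentences = [
    "Access to safe procedures protects public health.",
    "The weather was nice yesterday.",
]
predictions = predictor.predict("abortion", sentences)
for sent, label in zip(sentences, predictions):
    print(label, sent)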