Code Example #1
def train_lm(args):
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[args.data_name])
    train_dataset = dataset.load_train()
    test_dataset = dataset.load_test()
    corpus = Corpus(train_dataset.file_list, test_dataset.file_list,
                    args.reverse)
    suffix = "backward" if args.reverse else "forward"

    kwargs = {
        "vocab_size": corpus.glove_embed.shape[0],
        "embed_dim": corpus.glove_embed.shape[1],
        "corpus": corpus,
        "hparams": {
            "hidden_size": args.hidden_size,
            "num_layers": args.num_layers,
            "cell_type": args.cell_type,
            "tie_embed": args.tie_embed,
            "rnn_dropout": args.rnn_dropout,
            "hidden_dropout": args.hidden_dropout,
            "num_epochs": args.num_epochs,
            "batch_size": args.batch_size,
            "bptt": args.bptt,
            "log_interval": args.log_interval,
            "save_path": args.save_path + '_' + args.data_name + '_' + suffix,
            "lr": args.lr,
            "wdecay": args.wdecay,
        }
    }

    lm = LanguageModel(**kwargs)
    best_valid_loss = lm.fit()
    print("Best Valid Loss:", best_valid_loss)
Code Example #2
    def __init__(self, model_name, data_name, cv_runs, params_dict, logger,
                 eval_by_rel):
        dataset = DataSet(config.DATASET[data_name])
        self.train_triples, self.valid_triples, self.test_triples = dataset.load_data()
        self.e2id, self.r2id = dataset.load_idx()

        self.model_name = model_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        self.logger = logger
        self.n_entities = len(self.e2id)
        self.n_relations = len(self.r2id)
        if eval_by_rel:
            self.scorer = RelationScorer(self.train_triples,
                                         self.valid_triples, self.test_triples,
                                         self.n_relations)
        else:
            self.scorer = Scorer(self.train_triples, self.valid_triples,
                                 self.test_triples, self.n_entities)

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_path = os.path.abspath(config.CHECKPOINT_PATH)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        self.checkpoint_prefix = os.path.join(checkpoint_path, self.__str__())
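
A hypothetical instantiation of this trainer; the enclosing class name is not visible in the snippet, so Task below is a placeholder, and the parameter values are illustrative only.

import logging

# Placeholder usage; "Task" stands in for the unnamed enclosing class,
# and the params_dict keys depend on the model being trained.
logger = logging.getLogger(__name__)
params = {"embed_dim": 100, "lr": 0.001, "num_epochs": 100}
task = Task(model_name="TransE", data_name="wn18", cv_runs=1,
            params_dict=params, logger=logger, eval_by_rel=False)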
Code Example #3
def run_lm_coherence(args):
    logging.info("Loading data...")
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")

    dataset = DataSet(config.DATASET[args.data_name])
    train_dataset = dataset.load_train()
    test_df = dataset.load_test_perm()
    test_dataset = dataset.load_test()
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=1,
                                 shuffle=False)
    corpus = Corpus(train_dataset.file_list, test_dataset.file_list)

    # dataset = DataSet(config.DATASET["wsj_bigram"])
    # test_df = dataset.load_test_perm()
    # test_dataset = dataset.load_test()
    # test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

    hparams_path = os.path.join(config.CHECKPOINT_PATH,
                                args.lm_name + "_forward.pkl")
    with open(hparams_path, "rb") as f:
        hparams = pickle.load(f)

    kwargs = {
        "vocab_size": corpus.glove_embed.shape[0],
        "embed_dim": corpus.glove_embed.shape[1],
        "corpus": corpus,
        "hparams": hparams,
    }

    forward_model = LanguageModel(**kwargs)
    forward_model.load(
        os.path.join(config.CHECKPOINT_PATH, args.lm_name + "_forward.pt"))
    backward_model = LanguageModel(**kwargs)
    backward_model.load(
        os.path.join(config.CHECKPOINT_PATH, args.lm_name + "_backward.pt"))

    logging.info("Results for discrimination:")
    model = LMCoherence(forward_model.lm, backward_model.lm, corpus)
    dis_acc = model.evaluate_dis(test_dataloader, test_df)
    logging.info("Disc Accuracy: {}".format(dis_acc))

    logging.info("Results for insertion:")
    ins_acc = model.evaluate_ins(test_dataloader, test_df)
    logging.info("Disc Accuracy: {}".format(ins_acc))

    return dis_acc, ins_acc
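
run_lm_coherence rebuilds both language-model directions from a pickled hparams file plus a weights checkpoint, using the same <lm_name>_<suffix> naming that train_lm builds in Code Example #1. A sketch of the matching save side (this helper is an assumption, not part of the project, and it presumes LanguageModel exposes a save() mirroring the load() used above):

import os
import pickle

def save_lm_artifacts(lm, hparams, lm_name, suffix):
    # Hypothetical helper: persist hparams (.pkl) and weights (.pt)
    # under the <lm_name>_<suffix> naming run_lm_coherence expects.
    base = os.path.join(config.CHECKPOINT_PATH, lm_name + "_" + suffix)
    with open(base + ".pkl", "wb") as f:
        pickle.dump(hparams, f)
    lm.save(base + ".pt")  # assumes a save() counterpart to load()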
Code Example #4
def save_eval_perm(data_name, if_sample=False, random_seed=config.RANDOM_SEED):
    random.seed(random_seed)

    logging.info("Loading valid and test data...")
    if data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[data_name])
    # dataset.random_seed = random_seed
    if if_sample:
        valid_dataset = dataset.load_valid_sample()
        test_dataset = dataset.load_test_sample()
    else:
        valid_dataset = dataset.load_valid()
        test_dataset = dataset.load_test()
    valid_df = valid_dataset.article_df
    test_df = test_dataset.article_df

    logging.info("Generating permuted articles...")

    def permute(x):
        x = np.array(x).squeeze()
        # neg_x_list = permute_articles([x], config.NEG_PERM)[0]
        neg_x_list = permute_articles_with_replacement([x], config.NEG_PERM)[0]
        return "<BREAK>".join(["<PUNC>".join(i) for i in neg_x_list])

    valid_df["neg_list"] = valid_df.sentences.map(permute)
    valid_df["sentences"] = valid_df.sentences.map(lambda x: "<PUNC>".join(x))
    valid_nums = valid_df.neg_list.map(lambda x: len(x.split("<BREAK>"))).sum()
    test_df["neg_list"] = test_df.sentences.map(permute)
    test_df["sentences"] = test_df.sentences.map(lambda x: "<PUNC>".join(x))
    test_nums = test_df.neg_list.map(lambda x: len(x.split("<BREAK>"))).sum()

    logging.info("Number of validation pairs %d" % valid_nums)
    logging.info("Number of test pairs %d" % test_nums)

    logging.info("Saving...")
    dataset.save_valid_perm(valid_df)
    dataset.save_test_perm(test_df)
    logging.info("Finished!")
Code Example #5
File: read_data.py Project: zzb5233/dmn-tensorflow
def read_babi(data_dir, task_id, type, batch_size, word_table):
    """ Reads bAbi data set.
    :param data_dir: bAbi data directory
    :param task_id: task no. (int)
    :param type: 'train' or 'test'
    :param batch_size: how many examples in a minibatch?
    :param word_table: WordTable
    :return: DataSet
    """
    data = load_babi(data_dir, task_id, type)
    x, q, y, fc = process_babi(data, word_table)
    return DataSet(batch_size, x, q, y, fc, name=type)
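
A hypothetical call, assuming the bAbI files live under data/babi and that WordTable can be constructed without arguments (both are assumptions):

# Hypothetical usage; the data path and WordTable constructor
# signature are assumptions.
word_table = WordTable()
train_set = read_babi("data/babi", task_id=1, type="train",
                      batch_size=32, word_table=word_table)
test_set = read_babi("data/babi", task_id=1, type="test",
                     batch_size=32, word_table=word_table)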
Code Example #6
    def __init__(self, model_name, data_name, cv_runs, params_dict, logger, eval_by_rel):
        dataset = DataSet(config.DATASET[data_name])
        self.train_triples, self.valid_triples, self.test_triples = dataset.load_data()
        self.e2id, self.r2id = dataset.load_idx()

        self.groundings = dataset.load_groundings() if "SoLE" in model_name else None
        self.model_name = model_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        if "batch_size" not in self.hparams :
            if "batch_num" in self.hparams :
                self.hparams["batch_size"] = int(len(self.train_triples)/self.hparams["batch_num"])
            else:
                raise AttributeError("Need parameter batch_size or batch_num! (Check model_param_space.py)")

        self.logger = logger
        self.n_entities = len(self.e2id)
        self.n_relations = len(self.r2id)
        if eval_by_rel:
            self.scorer = RelationScorer(
                self.train_triples, self.valid_triples, self.test_triples, self.n_relations)
        else:
            self.scorer = Scorer(
                self.train_triples, self.valid_triples, self.test_triples, self.n_entities)

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        checkpoint_path = os.path.abspath(
            config.CHECKPOINT_PATH + "/" + self.__str__() +
            ("_NNE_" if "NNE_enable" in self.hparams and self.hparams.NNE_enable
             else "_noNNE_") + data_name)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        self.checkpoint_prefix = os.path.join(checkpoint_path, self.__str__())

        print(self.hparams)
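
A quick worked illustration of the batch_size fallback above (the triple count is made up):

train_size = 141442          # hypothetical number of training triples
batch_num = 100
batch_size = int(train_size / batch_num)  # -> 1414
assert batch_size == 1414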
Code Example #7
class args:
    batch_size = 2
    lr = 0.0002
    # display = 500
    display = 10
    weight_decay = 0.00001
    num_class = 6
    model_name = 'HRNet V2'

data_path_df = pd.read_csv('dataset/path_list.csv')

tf.logging.set_verbosity(tf.logging.INFO)
# Send TensorFlow log messages to the screen; TensorFlow prints all
# messages at this level and at every level of higher severity.
train_path, val_path = train_test_split(data_path_df, test_size=0.25, shuffle=True)
# train_test_split is a common cross-validation helper: it randomly
# splits the samples into train and test sets at the given ratio.
dataset_tr = DataSet(image_path=train_path['image'].values, label_path=train_path['label'].values)
dataset_val = DataSet(image_path=val_path['image'].values, label_path=val_path['label'].values)

model = HighResolutionNet(args.num_class)

image = tf.placeholder(tf.float32, [None, 256, 256, 3], name='input_x')
label = tf.placeholder(tf.int32, [None, 256, 256])
lr = tf.placeholder(tf.float32)

logits = model.forword(image)  # sic: "forword" is the method name defined by the project
print(logits)
predicts = tf.argmax(logits, axis=-1, name='predicts')
print(predicts)

# cross_entropy
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label))
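
A typical continuation of this graph, not part of the original snippet: minimize the cross-entropy with Adam through the lr placeholder and track pixel accuracy.

# Hypothetical continuation: train op plus a pixel-accuracy metric.
train_op = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
correct = tf.equal(predicts, tf.cast(label, tf.int64))  # argmax yields int64
pixel_acc = tf.reduce_mean(tf.cast(correct, tf.float32))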
Code Example #8
logits, loss = get_graph()

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

saver = tf.train.Saver()

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    # Useful for testing overfit model
    #batch_input = np.random.normal(size=(batch_size, max_seq_len, 109))
    #batch_targets = np.random.normal(size=(batch_size, max_seq_len)) / 20.
    #batch_weights = np.ones((batch_size, max_seq_len))

    dataset = DataSet()

    data = dataset.get_numpy_data()

    trainset, train_validset, validset = dataset.split_valid(*data, 0.25)

    print('Train dataset shape: {0}'.format(trainset[0].shape))
    # trainset stats:
    # shape: 712, 1520 when split 0.5
    # Epoch about every ~8000 steps (not true epoch due to shifted seq)

    if SAVE_ANALYTICS:
        with open(os.path.join(OUTDIR, 'log.csv'), 'w') as outfile:
            outfile.write('Step,Train R,Valid R\n')

    print('training...')
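
    # --- Hypothetical continuation, not in the original snippet ---
    # A minimal training loop; get_batches and the inputs/targets
    # placeholders are assumptions about what get_graph() provides.
    for step, (batch_input, batch_targets) in enumerate(
            get_batches(trainset, batch_size)):
        _, train_loss = sess.run(
            [optimizer, loss],
            feed_dict={inputs: batch_input, targets: batch_targets})
        if step % 100 == 0:
            print('step {}: loss {:.4f}'.format(step, train_loss))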
Code Example #9
def run_bigram_coherence(args):
    logging.info("Loading data...")
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[args.data_name])
    # dataset.random_seed = args.random_seed
    if not os.path.isfile(dataset.test_perm):
        save_eval_perm(args.data_name, random_seed=args.random_seed)

    train_dataset = dataset.load_train(args.portion)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True, drop_last=True)
    valid_dataset = dataset.load_valid(args.portion)
    valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=False)
    valid_df = dataset.load_valid_perm()
    test_dataset = dataset.load_test(args.portion)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
    test_df = dataset.load_test_perm()

    logging.info("Loading sent embedding...")
    if args.sent_encoder == "infersent":
        sent_embedding = get_infersent(args.data_name, if_sample=args.test)
        embed_dim = 4096
    elif args.sent_encoder == "average_glove":
        sent_embedding = get_average_glove(args.data_name, if_sample=args.test)
        embed_dim = 300
    elif args.sent_encoder == "lm_hidden":
        corpus = Corpus(train_dataset.file_list, test_dataset.file_list)
        sent_embedding = get_lm_hidden(args.data_name, "lm_" + args.data_name, corpus)
        embed_dim = 2048
    elif args.sent_encoder == "s2s_hidden":
        corpus = SentCorpus(train_dataset.file_list, test_dataset.file_list)
        sent_embedding = get_s2s_hidden(args.data_name, "s2s_" + args.data_name, corpus)
        embed_dim = 2048
    else:
        raise ValueError("Invalid sent encoder name!")

    logging.info("Training BigramCoherence model...")
    kwargs = {
        "embed_dim": embed_dim,
        "sent_encoder": sent_embedding,
        "hparams": {
            "loss": args.loss,
            "input_dropout": args.input_dropout,
            "hidden_state": args.hidden_state,
            "hidden_layers": args.hidden_layers,
            "hidden_dropout": args.hidden_dropout,
            "num_epochs": args.num_epochs,
            "margin": args.margin,
            "lr": args.lr,
            "l2_reg_lambda": args.l2_reg_lambda,
            "use_bn": args.use_bn,
            "task": "discrimination",
            "bidirectional": args.bidirectional,
        }
    }

    model = BigramCoherence(**kwargs)
    model.init()
    best_step, valid_acc = model.fit(train_dataloader, valid_dataloader, valid_df)
    if args.save:
        model_path = os.path.join(config.CHECKPOINT_PATH, "%s-%.4f" % (args.data_name, valid_acc))
        # model.save(model_path)
        torch.save(model, model_path + '.pth')
    model.load_best_state()

    # dataset = DataSet(config.DATASET["wsj_bigram"])
    # test_dataset = dataset.load_test()
    # test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
    # test_df = dataset.load_test_perm()
    # if args.sent_encoder == "infersent":
    #    model.sent_encoder = get_infersent("wsj_bigram", if_sample=args.test)
    # elif args.sent_encoder == "average_glove":
    #    model.sent_encoder = get_average_glove("wsj_bigram", if_sample=args.test)
    # else:
    #    model.sent_encoder = get_lm_hidden("wsj_bigram", "lm_" + args.data_name, corpus)

    logging.info("Results for discrimination:")
    dis_acc = model.evaluate_dis(test_dataloader, test_df)
    print("Test Acc:", dis_acc)
    logging.info("Disc Accuracy: {}".format(dis_acc[0]))

    logging.info("Results for insertion:")
    ins_acc = model.evaluate_ins(test_dataloader, test_df)
    print("Test Acc:", ins_acc)
    logging.info("Insert Accuracy: {}".format(ins_acc[0]))

    return dis_acc, ins_acc
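
For completeness, a hypothetical driver mirroring the logging above; run_bigram_coherence returns (dis_acc, ins_acc), and the [0] indexing in the log calls suggests each is a tuple whose first element is the accuracy.

# Hypothetical invocation; "args" must carry the attributes read above
# (data_name, portion, batch_size, sent_encoder, loss, lr, and so on).
dis_acc, ins_acc = run_bigram_coherence(args)
print("discrimination acc:", dis_acc[0])
print("insertion acc:", ins_acc[0])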