def train_lm(args):
    """Train a forward or backward language model on the configured dataset.

    Trains on the dataset named by ``args.data_name`` and prints the best
    validation loss after fitting.
    """
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")

    dataset = DataSet(config.DATASET[args.data_name])
    train_dataset = dataset.load_train()
    test_dataset = dataset.load_test()
    corpus = Corpus(train_dataset.file_list, test_dataset.file_list, args.reverse)

    # Direction suffix distinguishes checkpoints of the two model variants.
    direction = "backward" if args.reverse else "forward"
    hparams = {
        "hidden_size": args.hidden_size,
        "num_layers": args.num_layers,
        "cell_type": args.cell_type,
        "tie_embed": args.tie_embed,
        "rnn_dropout": args.rnn_dropout,
        "hidden_dropout": args.hidden_dropout,
        "num_epochs": args.num_epochs,
        "batch_size": args.batch_size,
        "bptt": args.bptt,
        "log_interval": args.log_interval,
        "save_path": "_".join([args.save_path, args.data_name, direction]),
        "lr": args.lr,
        "wdecay": args.wdecay,
    }
    lm = LanguageModel(
        vocab_size=corpus.glove_embed.shape[0],
        embed_dim=corpus.glove_embed.shape[1],
        corpus=corpus,
        hparams=hparams,
    )
    best_valid_loss = lm.fit()
    print("Best Valid Loss:", best_valid_loss)
def __init__(self, model_name, data_name, cv_runs, params_dict, logger, eval_by_rel):
    """Set up data, scorer, model and checkpoint location for one experiment run.

    :param model_name: identifier used to build the model
    :param data_name: key into config.DATASET
    :param cv_runs: number of cross-validation runs
    :param params_dict: raw hyperparameter dict (wrapped in AttrDict)
    :param logger: logger instance used by the runner
    :param eval_by_rel: if True, score per-relation instead of per-entity
    """
    dataset = DataSet(config.DATASET[data_name])
    triples = dataset.load_data()
    self.train_triples, self.valid_triples, self.test_triples = triples
    self.e2id, self.r2id = dataset.load_idx()

    self.model_name = model_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    self.logger = logger
    self.n_entities = len(self.e2id)
    self.n_relations = len(self.r2id)

    # Relation-level evaluation uses a different scorer (keyed by relation count).
    if eval_by_rel:
        self.scorer = RelationScorer(
            self.train_triples, self.valid_triples, self.test_triples,
            self.n_relations)
    else:
        self.scorer = Scorer(
            self.train_triples, self.valid_triples, self.test_triples,
            self.n_entities)

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables())

    # Make sure the checkpoint directory exists before training starts.
    checkpoint_path = os.path.abspath(config.CHECKPOINT_PATH)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    self.checkpoint_prefix = os.path.join(checkpoint_path, self.__str__())
def run_lm_coherence(args):
    """Evaluate an LM-based coherence model on discrimination and insertion.

    Loads a forward and a backward language model (sharing the hyperparameters
    pickled alongside the forward checkpoint) and runs both evaluation tasks.

    :param args: namespace with ``data_name`` and ``lm_name``
    :return: tuple ``(dis_acc, ins_acc)``
    :raises ValueError: if ``args.data_name`` is not a configured dataset
    """
    logging.info("Loading data...")
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[args.data_name])
    train_dataset = dataset.load_train()
    test_df = dataset.load_test_perm()
    test_dataset = dataset.load_test()
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
    corpus = Corpus(train_dataset.file_list, test_dataset.file_list)

    # Both directions share the hyperparameters saved with the forward model.
    with open(
            os.path.join(config.CHECKPOINT_PATH, args.lm_name + "_forward.pkl"),
            "rb") as f:
        hparams = pickle.load(f)
    kwargs = {
        "vocab_size": corpus.glove_embed.shape[0],
        "embed_dim": corpus.glove_embed.shape[1],
        "corpus": corpus,
        "hparams": hparams,
    }
    forward_model = LanguageModel(**kwargs)
    forward_model.load(
        os.path.join(config.CHECKPOINT_PATH, args.lm_name + "_forward.pt"))
    backward_model = LanguageModel(**kwargs)
    backward_model.load(
        os.path.join(config.CHECKPOINT_PATH, args.lm_name + "_backward.pt"))

    model = LMCoherence(forward_model.lm, backward_model.lm, corpus)

    logging.info("Results for discrimination:")
    dis_acc = model.evaluate_dis(test_dataloader, test_df)
    logging.info("Disc Accuracy: {}".format(dis_acc))

    logging.info("Results for insertion:")
    ins_acc = model.evaluate_ins(test_dataloader, test_df)
    # BUG FIX: this previously logged "Disc Accuracy" for the insertion result;
    # label now matches the insertion task (consistent with run_bigram_coherence).
    logging.info("Insert Accuracy: {}".format(ins_acc))

    return dis_acc, ins_acc
def save_eval_perm(data_name, if_sample=False, random_seed=config.RANDOM_SEED):
    """Generate and persist permuted validation/test articles for a dataset.

    For every article, sampled sentence permutations are packed into a single
    string: sentences joined by ``<PUNC>``, permutations separated by ``<BREAK>``.

    :param data_name: key into config.DATASET
    :param if_sample: load the sampled (smaller) valid/test splits when True
    :param random_seed: seed for the permutation sampling
    :raises ValueError: if ``data_name`` is not a configured dataset
    """
    random.seed(random_seed)
    logging.info("Loading valid and test data...")
    if data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[data_name])

    # FIX: the two consecutive `if if_sample:` checks were redundant; merged
    # into one branch selecting both splits.
    if if_sample:
        valid_dataset = dataset.load_valid_sample()
        test_dataset = dataset.load_test_sample()
    else:
        valid_dataset = dataset.load_valid()
        test_dataset = dataset.load_test()
    valid_df = valid_dataset.article_df
    test_df = test_dataset.article_df

    logging.info("Generating permuted articles...")

    def permute(x):
        # Sample NEG_PERM negative orderings (with replacement) of one article.
        x = np.array(x).squeeze()
        neg_x_list = permute_articles_with_replacement([x], config.NEG_PERM)[0]
        return "<BREAK>".join(["<PUNC>".join(i) for i in neg_x_list])

    # Keep valid-before-test order so the RNG consumption (and thus the saved
    # permutations for a given seed) matches the previous behavior.
    valid_df["neg_list"] = valid_df.sentences.map(permute)
    valid_df["sentences"] = valid_df.sentences.map(lambda x: "<PUNC>".join(x))
    valid_nums = valid_df.neg_list.map(lambda x: len(x.split("<BREAK>"))).sum()
    test_df["neg_list"] = test_df.sentences.map(permute)
    test_df["sentences"] = test_df.sentences.map(lambda x: "<PUNC>".join(x))
    test_nums = test_df.neg_list.map(lambda x: len(x.split("<BREAK>"))).sum()
    logging.info("Number of validation pairs %d" % valid_nums)
    logging.info("Number of test pairs %d" % test_nums)

    logging.info("Saving...")
    dataset.save_valid_perm(valid_df)
    dataset.save_test_perm(test_df)
    logging.info("Finished!")
def read_babi(data_dir, task_id, type, batch_size, word_table):
    """Read one bAbI task split and wrap it as a DataSet.

    :param data_dir: bAbI data directory
    :param task_id: task number (int)
    :param type: split name, 'train' or 'test'
        (NOTE(review): shadows the builtin `type`; kept for caller compatibility)
    :param batch_size: minibatch size for the resulting DataSet
    :param word_table: WordTable used to index tokens
    :return: DataSet
    """
    raw = load_babi(data_dir, task_id, type)
    stories, questions, answers, fact_counts = process_babi(raw, word_table)
    return DataSet(batch_size, stories, questions, answers, fact_counts, name=type)
def __init__(self, model_name, data_name, cv_runs, params_dict, logger, eval_by_rel):
    """Set up data, hyperparameters, scorer, model and checkpointing.

    :param model_name: identifier used to build the model; models containing
        "SoLE" additionally load rule groundings
    :param data_name: key into config.DATASET
    :param cv_runs: number of cross-validation runs
    :param params_dict: raw hyperparameter dict (wrapped in AttrDict); must
        contain either ``batch_size`` or ``batch_num``
    :param logger: logger instance used by the runner
    :param eval_by_rel: if True, score per-relation instead of per-entity
    :raises AttributeError: if neither batch_size nor batch_num is given
    """
    dataset = DataSet(config.DATASET[data_name])
    self.train_triples, self.valid_triples, self.test_triples = dataset.load_data()
    self.e2id, self.r2id = dataset.load_idx()
    # Groundings (rule instances) are only needed by SoLE-style models.
    self.groundings = dataset.load_groundings() if "SoLE" in model_name else None

    self.model_name = model_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    if "batch_size" not in self.hparams:
        if "batch_num" not in self.hparams:
            raise AttributeError("Need parameter batch_size or batch_num! (Check model_param_space.py)")
        # Derive the batch size from the requested number of batches.
        self.hparams["batch_size"] = int(
            len(self.train_triples) / self.hparams["batch_num"])
    self.logger = logger
    self.n_entities = len(self.e2id)
    self.n_relations = len(self.r2id)

    if eval_by_rel:
        self.scorer = RelationScorer(
            self.train_triples, self.valid_triples, self.test_triples,
            self.n_relations)
    else:
        self.scorer = Scorer(
            self.train_triples, self.valid_triples, self.test_triples,
            self.n_entities)

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    # Checkpoint directory name encodes the model, NNE flag and dataset.
    nne_tag = ("_NNE_" if "NNE_enable" in self.hparams
               and self.hparams.NNE_enable == True else "_noNNE_")
    checkpoint_path = os.path.abspath(
        config.CHECKPOINT_PATH + "/" + self.__str__() + nne_tag + data_name)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    self.checkpoint_prefix = os.path.join(checkpoint_path, self.__str__())
    print(self.hparams)
class args:
    """Training hyperparameters for the HRNet V2 segmentation run."""
    batch_size = 2
    lr = 0.0002
    display = 10  # log every N steps
    weight_decay = 0.00001
    num_class = 6
    model_name = 'HRNet V2'


data_path_df = pd.read_csv('dataset/path_list.csv')
# Emit TensorFlow log messages at INFO level and above to the console.
tf.logging.set_verbosity(tf.logging.INFO)
# Randomly split the sample paths into train (75%) and validation (25%) sets.
train_path, val_path = train_test_split(data_path_df, test_size=0.25, shuffle=True)
dataset_tr = DataSet(image_path=train_path['image'].values,
                     label_path=train_path['label'].values)
dataset_val = DataSet(image_path=val_path['image'].values,
                      label_path=val_path['label'].values)

model = HighResolutionNet(args.num_class)
image = tf.placeholder(tf.float32, [None, 256, 256, 3], name='input_x')
label = tf.placeholder(tf.int32, [None, 256, 256])
lr = tf.placeholder(tf.float32)
# NOTE(review): `forword` (sic) is the method name defined on HighResolutionNet.
logits = model.forword(image)
print(logits)
predicts = tf.argmax(logits, axis=-1, name='predicts')
print(predicts)
# Pixel-wise cross-entropy averaged over the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label))
# Build the graph and the training op, then open a session for training.
logits, loss = get_graph()
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
saver = tf.train.Saver()

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    dataset = DataSet()
    data = dataset.get_numpy_data()
    trainset, train_validset, validset = dataset.split_valid(*data, 0.25)
    print('Train dataset shape: {0}'.format(trainset[0].shape))
    # trainset stats: shape 712, 1520 when split 0.5; an "epoch" is roughly
    # every ~8000 steps (not a true epoch due to shifted sequences).

    if SAVE_ANALYTICS:
        # Start a fresh CSV log with just the header row.
        with open(os.path.join(OUTDIR, 'log.csv'), 'w') as outfile:
            outfile.write('Step,Train R,Valid R\n')

    print('training...')
def run_bigram_coherence(args):
    """Train a BigramCoherence model and evaluate discrimination and insertion.

    Builds the sentence encoder selected by ``args.sent_encoder``, fits the
    model on the train split, optionally saves it, then evaluates on the
    permuted test set.

    :param args: namespace with data/model/optimization options
    :return: tuple ``(dis_acc, ins_acc)`` as returned by the evaluators
    :raises ValueError: for an unknown data name or sentence encoder
    """
    logging.info("Loading data...")
    if args.data_name not in config.DATASET:
        raise ValueError("Invalid data name!")
    dataset = DataSet(config.DATASET[args.data_name])
    # Regenerate the permuted evaluation data if it is missing on disk.
    if not os.path.isfile(dataset.test_perm):
        save_eval_perm(args.data_name, random_seed=args.random_seed)

    train_dataset = dataset.load_train(args.portion)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True, drop_last=True)
    valid_dataset = dataset.load_valid(args.portion)
    valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=1,
                                  shuffle=False)
    valid_df = dataset.load_valid_perm()
    test_dataset = dataset.load_test(args.portion)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=1,
                                 shuffle=False)
    test_df = dataset.load_test_perm()

    logging.info("Loading sent embedding...")
    if args.sent_encoder == "infersent":
        sent_embedding = get_infersent(args.data_name, if_sample=args.test)
        embed_dim = 4096
    elif args.sent_encoder == "average_glove":
        sent_embedding = get_average_glove(args.data_name, if_sample=args.test)
        embed_dim = 300
    elif args.sent_encoder == "lm_hidden":
        corpus = Corpus(train_dataset.file_list, test_dataset.file_list)
        sent_embedding = get_lm_hidden(args.data_name, "lm_" + args.data_name,
                                       corpus)
        embed_dim = 2048
    elif args.sent_encoder == "s2s_hidden":
        corpus = SentCorpus(train_dataset.file_list, test_dataset.file_list)
        sent_embedding = get_s2s_hidden(args.data_name, "s2s_" + args.data_name,
                                        corpus)
        embed_dim = 2048
    else:
        raise ValueError("Invalid sent encoder name!")

    logging.info("Training BigramCoherence model...")
    kwargs = {
        "embed_dim": embed_dim,
        "sent_encoder": sent_embedding,
        "hparams": {
            "loss": args.loss,
            "input_dropout": args.input_dropout,
            "hidden_state": args.hidden_state,
            "hidden_layers": args.hidden_layers,
            "hidden_dropout": args.hidden_dropout,
            "num_epochs": args.num_epochs,
            "margin": args.margin,
            "lr": args.lr,
            "l2_reg_lambda": args.l2_reg_lambda,
            "use_bn": args.use_bn,
            "task": "discrimination",
            "bidirectional": args.bidirectional,
        }
    }
    model = BigramCoherence(**kwargs)
    model.init()
    best_step, valid_acc = model.fit(train_dataloader, valid_dataloader, valid_df)
    if args.save:
        model_path = os.path.join(config.CHECKPOINT_PATH,
                                  "%s-%.4f" % (args.data_name, valid_acc))
        # NOTE(review): torch.save pickles the whole model object, which ties
        # the checkpoint to this class's module path.
        torch.save(model, model_path + '.pth')
    model.load_best_state()

    logging.info("Results for discrimination:")
    dis_acc = model.evaluate_dis(test_dataloader, test_df)
    print("Test Acc:", dis_acc)
    logging.info("Disc Accuracy: {}".format(dis_acc[0]))

    logging.info("Results for insertion:")
    ins_acc = model.evaluate_ins(test_dataloader, test_df)
    print("Test Acc:", ins_acc)
    logging.info("Insert Accuracy: {}".format(ins_acc[0]))

    return dis_acc, ins_acc