import json
import math
import os
from collections import Counter
from pprint import pprint

import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Project-local helpers (DataSet, read_reuters, read_news, get_model,
# GraphHandler, Trainer, Evaluator) are assumed to be imported from this
# repo's own modules; their exact import paths are not shown in this section.


def _test(config):
    if config.data_from == "20newsgroup":
        config.test_batch_size = 281
    word2idx = Counter(json.load(open("../data/{}/word2idx_{}.json".format(
        config.data_from, config.data_from), "r"))["word2idx"])
    idx2word = json.load(open("../data/{}/word2idx_{}.json".format(
        config.data_from, config.data_from), "r"))["idx2word"]
    assert len(word2idx) == len(idx2word)
    for i in range(10):
        assert word2idx[idx2word[i]] == i
    vocab_size = len(word2idx)

    word2vec = Counter(json.load(open("../data/{}/word2vec_{}.json".format(
        config.data_from, config.pretrain_from), "r"))["word2vec"])
    # word2vec = {} if config.debug or config.load else get_word2vec(config, word2idx)
    idx2vec = {word2idx[word]: vec for word, vec in word2vec.items() if word in word2idx}
    # words without a pretrained vector share one random embedding ~ N(0, I)
    unk_embedding = np.random.multivariate_normal(
        np.zeros(config.word_embedding_size), np.eye(config.word_embedding_size))
    config.emb_mat = np.array([idx2vec[idx] if idx in idx2vec else unk_embedding
                               for idx in range(vocab_size)])
    config.vocab_size = vocab_size

    test_dict = {}
    test_path = "../data/{}/{}_{}{}.json".format(
        config.data_from, config.data_from, config.dev_type, config.clftype)
    if os.path.exists(test_path):
        test_dict = json.load(open(test_path, "r"))
    if config.data_from == "reuters":
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else \
            read_reuters(config, data_type="test", word2idx=word2idx)
    elif config.data_from == "20newsgroup":
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else \
            read_news(config, data_type="test", word2idx=word2idx)
    elif config.data_from == "ice":
        dev_data = DataSet(test_dict, config.dev_type)
    config.dev_size = dev_data.get_data_size()
    # if config.use_glove_for_unk:

    pprint(config.__flags, indent=2)
    model = get_model(config)
    graph_handler = GraphHandler(config, model)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)
    # check
    # w_embeddings = sess.run(model.word_embeddings)
    # print("w_embeddings:", w_embeddings.shape, w_embeddings)

    dev_evaluate = Evaluator(config, model)
    num_steps = math.floor(dev_data.num_examples / config.test_batch_size)
    if 0 < config.val_num_batches < num_steps:
        num_steps = config.val_num_batches
    # print("num_steps:", num_steps)
    e_dev = dev_evaluate.get_evaluation_from_batches(
        sess, tqdm(dev_data.get_batches(config.test_batch_size, num_batches=num_steps),
                   total=num_steps))
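
# Illustrative sketch (not part of the original code): a minimal, self-contained
# demo of how config.emb_mat is assembled in _test/_train above. Indices whose
# word appears in the pretrained word2vec dict keep their vector; every other
# index falls back to one shared random "unk" embedding drawn from N(0, I).
# The toy vocabulary and the 4-dim vectors below are hypothetical.
def _demo_build_emb_mat():
    word2idx = {"the": 0, "market": 1, "zzzrare": 2}            # toy vocab
    word2vec = {"the": [0.1, 0.2, 0.3, 0.4],
                "market": [0.5, 0.6, 0.7, 0.8]}                 # toy pretrained vectors
    dim = 4
    idx2vec = {word2idx[w]: v for w, v in word2vec.items() if w in word2idx}
    unk_embedding = np.random.multivariate_normal(np.zeros(dim), np.eye(dim))
    emb_mat = np.array([idx2vec[i] if i in idx2vec else unk_embedding
                        for i in range(len(word2idx))])
    assert emb_mat.shape == (3, 4)   # row 2 ("zzzrare") is the shared unk row
    return emb_mat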
def _train(config):
    word2idx = Counter(json.load(open("../data/{}/word2idx_{}.json".format(
        config.data_from, config.data_from), "r"))["word2idx"])
    idx2word = json.load(open("../data/{}/word2idx_{}.json".format(
        config.data_from, config.data_from), "r"))["idx2word"]
    assert len(word2idx) == len(idx2word)
    for i in range(10):
        assert word2idx[idx2word[i]] == i
    vocab_size = len(word2idx)
    print("vocab_size", vocab_size, idx2word[:10])

    word2vec = Counter(json.load(open("../data/{}/word2vec_{}.json".format(
        config.data_from, config.pretrain_from), "r"))["word2vec"])
    # word2vec = {} if config.debug or config.load else get_word2vec(config, word2idx)
    idx2vec = {word2idx[word]: vec for word, vec in word2vec.items() if word in word2idx}
    print("no unk words:", len(idx2vec))  # i.e. words covered by pretrained vectors
    unk_embedding = np.random.multivariate_normal(
        np.zeros(config.word_embedding_size), np.eye(config.word_embedding_size))
    config.emb_mat = np.array([idx2vec[idx] if idx in idx2vec else unk_embedding
                               for idx in range(vocab_size)])
    config.vocab_size = vocab_size
    print("emb_mat:", config.emb_mat.shape)

    # the "ice" corpus ships a dev split; the others use their test split
    test_type = "dev" if config.data_from == "ice" else "test"
    train_dict, test_dict = {}, {}
    ice_flat = ""
    if config.data_from == "ice" and config.model_name.endswith("flat"):
        ice_flat = "_flat"
    train_path = "../data/{}/{}_{}{}{}.json".format(
        config.data_from, config.data_from, "train", ice_flat, config.clftype)
    test_path = "../data/{}/{}_{}{}{}.json".format(
        config.data_from, config.data_from, test_type, ice_flat, config.clftype)
    if os.path.exists(train_path):
        train_dict = json.load(open(train_path, "r"))
    if os.path.exists(test_path):
        test_dict = json.load(open(test_path, "r"))

    # check: peek at the first example of each field
    for key, val in train_dict.items():
        if isinstance(val[0], list) and len(val[0]) > 10:
            print(key, val[0][:50])
        else:
            print(key, val[0:4])
    print("train:", len(train_dict))
    print("test:", len(test_dict))

    if config.data_from == "reuters":
        train_data = DataSet(train_dict, "train") if len(train_dict) > 0 else \
            read_reuters(config, data_type="train", word2idx=word2idx)
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else \
            read_reuters(config, data_type="test", word2idx=word2idx)
    elif config.data_from == "20newsgroup":
        train_data = DataSet(train_dict, "train") if len(train_dict) > 0 else \
            read_news(config, data_type="train", word2idx=word2idx)
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else \
            read_news(config, data_type="test", word2idx=word2idx)
    elif config.data_from == "ice":
        train_data = DataSet(train_dict, "train")
        dev_data = DataSet(test_dict, "dev")
    config.train_size = train_data.get_data_size()
    config.dev_size = dev_data.get_data_size()
    print("train/dev:", config.train_size, config.dev_size)

    # calculate average doc length over the training set  # TO CHECK
    avg_len = 0
    for d_l in train_dict["x_len"]:
        avg_len += d_l / config.train_size
    print("avg_len at train:", avg_len)
    if config.max_docs_length > 2000:
        config.max_docs_length = 2000

    pprint(config.__flags, indent=2)
    model = get_model(config)
    trainer = Trainer(config, model)
    graph_handler = GraphHandler(config, model)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_batches = config.num_batches or \
        int(math.ceil(train_data.num_examples / config.batch_size)) * config.num_epochs
    global_step = 0
    dev_evaluate = Evaluator(config, model)
    best_f1 = 0.50
    for batch in tqdm(train_data.get_batches(config.batch_size,
                                             num_batches=num_batches,
                                             shuffle=True,
                                             cluster=config.cluster),
                      total=num_batches):
        global_step = sess.run(model.global_step) + 1
        # print("global_step:", global_step)
        get_summary = global_step % config.log_period == 0
        loss, summary, train_op = trainer.step(sess, batch, get_summary)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # occasional saving
        # if global_step % config.save_period == 0:
        #     graph_handler.save(sess, global_step=global_step)

        if not config.eval:
            continue
        # occasional evaluation
        if global_step % config.eval_period == 0:
            # config.test_batch_size = config.dev_size / 3
            num_steps = math.ceil(dev_data.num_examples / config.test_batch_size)
            if 0 < config.val_num_batches < num_steps:
                num_steps = config.val_num_batches
            # print("num_steps:", num_steps)
            e_dev = dev_evaluate.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_batches(config.test_batch_size,
                                                num_batches=num_steps),
                           total=num_steps))
            # checkpoint only when dev F1 improves
            if e_dev.fv > best_f1:
                best_f1 = e_dev.fv
                # if global_step % config.save_period == 0:
                graph_handler.save(sess, global_step=global_step)
            graph_handler.add_summaries(e_dev.summaries, global_step)
    print("f1:", best_f1)
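
# Illustrative sketch (not part of the original code): the train loop above
# interleaves period-driven actions keyed off global_step -- a summary write
# every log_period steps, an evaluation every eval_period steps, and a
# checkpoint only when dev F1 improves on the running best. A dry run with
# plain ints shows the cadence; the periods and the F1 stand-in are hypothetical.
def _demo_step_cadence(log_period=10, eval_period=25, total_steps=50):
    best_f1 = 0.50
    for global_step in range(1, total_steps + 1):
        if global_step % log_period == 0:
            print("step", global_step, "-> write train summary")
        if global_step % eval_period == 0:
            f1 = 0.4 + 0.01 * global_step          # stand-in for e_dev.fv
            if f1 > best_f1:
                best_f1 = f1
                print("step", global_step, "-> new best f1, save checkpoint")
    return best_f1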
def _train(config):
    word2idx = Counter(
        json.load(
            open("data/{}/word2idx_{}.json".format(config.data_from, config.data_from),
                 "r"))["word2idx"])
    vocab_size = len(word2idx)
    print("vocab_size", vocab_size)
    word2vec = Counter(
        json.load(
            open("data/{}/word2vec_{}.json".format(config.data_from, config.pretrain_from),
                 "r"))["word2vec"])
    # word2vec = {} if config.debug or config.load else get_word2vec(config, word2idx)
    idx2vec = {
        word2idx[word]: vec
        for word, vec in word2vec.items() if word in word2idx and word != "UNK"
    }
    unk_embedding = np.random.multivariate_normal(
        np.zeros(config.word_embedding_size), np.eye(config.word_embedding_size))
    config.emb_mat = np.array([
        idx2vec[idx] if idx in idx2vec else unk_embedding
        for idx in range(vocab_size)
    ])
    config.vocab_size = vocab_size
    print("emb_mat:", config.emb_mat.shape)

    train_dict, test_dict = {}, {}
    if os.path.exists("data/{}/{}_{}.json".format(config.data_from, config.data_from, "train")):
        train_dict = json.load(
            open("data/{}/{}_{}.json".format(config.data_from, config.data_from, "train"), "r"))
    if os.path.exists("data/{}/{}_{}.json".format(config.data_from, config.data_from, "test")):
        test_dict = json.load(
            open("data/{}/{}_{}.json".format(config.data_from, config.data_from, "test"), "r"))

    # check
    if config.data_from == "reuters":
        train_data = DataSet(train_dict, "train") if len(train_dict) > 0 else read_reuters(
            config, data_type="train", word2idx=word2idx)
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else read_reuters(
            config, data_type="test", word2idx=word2idx)
    elif config.data_from == "20newsgroup":
        train_data = DataSet(train_dict, "train") if len(train_dict) > 0 else read_news(
            config, data_type="train", word2idx=word2idx)
        dev_data = DataSet(test_dict, "test") if len(test_dict) > 0 else read_news(
            config, data_type="test", word2idx=word2idx)
    config.train_size = train_data.get_data_size()
    config.dev_size = dev_data.get_data_size()
    print("train/dev:", config.train_size, config.dev_size)
    if config.max_docs_length > 2000:
        config.max_docs_length = 2000

    pprint(config.__flags, indent=2)
    model = get_model(config)
    graph_handler = GraphHandler(config, model)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)

    num_batches = config.num_batches or int(
        math.ceil(train_data.num_examples / config.batch_size)) * config.num_epochs
    global_step = 0
    dev_evaluate = Evaluator(config, model)
    for batch in tqdm(train_data.get_batches(config.batch_size,
                                             num_batches=num_batches,
                                             shuffle=True,
                                             cluster=config.cluster),
                      total=num_batches):
        batch_idx, batch_ds = batch   # each batch is an (index, DataSet) pair
        '''
        if config.debug:
            for key, value in batch_ds.data.items():
                if not key.startswith("x"):
                    print(key, value)
            continue
        '''
        global_step = sess.run(model.global_step) + 1
        # print("global_step:", global_step)
        get_summary = global_step % config.log_period == 0
        feed_dict = model.get_feed_dict(batch, config)
        logits, y, y_len, loss, summary, train_op = sess.run(
            [
                model.logits, model.y, model.y_seq_length, model.loss,
                model.summary, model.train_op
            ],
            feed_dict=feed_dict)
        # print("logits:", logits[0:3], y[0:3], y_len[0:3], logits.shape, y.shape, y_len.shape)
        print("loss:", loss)
        if get_summary:
            graph_handler.add_summary(summary, global_step)

        # occasional saving
        if global_step % config.save_period == 0:
            graph_handler.save(sess, global_step=global_step)

        if not config.eval:
            continue
        # occasional evaluation
        if global_step % config.eval_period == 0:
            # config.test_batch_size = config.dev_size / 3
            num_steps = math.ceil(dev_data.num_examples / config.test_batch_size)
            if 0 < config.val_num_batches < num_steps:
                num_steps = config.val_num_batches
            # print("num_steps:", num_steps)
            e_dev = dev_evaluate.get_evaluation_from_batches(
                sess, tqdm(dev_data.get_batches(config.test_batch_size,
                                                num_batches=num_steps),
                           total=num_steps))
            graph_handler.add_summaries(e_dev.summaries, global_step)
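
# Illustrative sketch (not part of the original code): how both _train variants
# size their loops. num_batches honours an explicit config override, otherwise
# it is ceil(examples / batch_size) * epochs; the eval step count is capped by
# val_num_batches when that flag is positive. The numbers are hypothetical.
def _demo_batch_counts(num_examples=1050, batch_size=100, num_epochs=3,
                       config_num_batches=0, val_num_batches=5):
    num_batches = config_num_batches or \
        int(math.ceil(num_examples / batch_size)) * num_epochs
    assert num_batches == 33        # ceil(1050/100) = 11 batches/epoch * 3 epochs
    num_steps = math.ceil(num_examples / batch_size)
    if 0 < val_num_batches < num_steps:
        num_steps = val_num_batches  # evaluate on a fixed subsample only
    assert num_steps == 5
    return num_batches, num_steps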