class InfluxClient:
    """Thin wrapper around ``InfluxDBClient`` for writing to the "vwa" bucket.

    Single lines go through a ``Batcher`` (whose flush callback is
    ``_send``); whole sequences can be written directly with
    ``sendSequence``.
    """

    def __init__(
        self,
        url="http://localhost:8086",
        token="IpLnoNkWhqmnSLO2ieeqmHejYrrokycO5Be8HRgM6UI1S_CO-Py2_opA2E1z6iCzJrv5U_gHGVHh5JMCFsgwjQ=="
    ):
        # A token can be generated from the "Tokens Tab" of the InfluxDB UI.
        # NOTE(review): a credential is hard-coded as the default value here;
        # consider supplying it from configuration instead.
        self.org = "vwa"
        self.bucket = "vwa"
        client = InfluxDBClient(url=url, token=token)
        self.client = client
        self.write_api = client.write_api(write_options=SYNCHRONOUS)
        # presumably (batch size, flush interval, callback) — confirm against Batcher
        self.b = Batcher(500, 5, self._send)
        self.q = client.query_api()

    def send(self, line):
        """Queue a single line on the batcher."""
        self.b.send(line)

    def sendSequence(self, sequence):
        """Write ``sequence`` immediately, bypassing the batcher."""
        self._send(sequence)

    def _send(self, sequence):
        """Write ``sequence`` to the bucket; report (never raise) write errors."""
        try:
            self.write_api.write(self.bucket, self.org, sequence)
        except Exception as err:
            # Best effort: a failed write is reported on stdout, not propagated.
            print("%d items not sent!" % len(sequence), err)
        else:
            print("%d items sent!" % len(sequence))
def main(unused_argv):
    """Build the batchers and run training or decoding according to ``hps.mode``.

    Args:
        unused_argv: Leftover command-line arguments; must hold exactly one
            element, otherwise the flags are treated as malformed.

    Raises:
        Exception: If ``unused_argv`` does not contain exactly one element.
        ValueError: If ``hps.mode`` is neither ``'train'`` nor ``'decode'``.
    """
    # Anything but a single leftover argument means a flag was mistyped.
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    hps, vocab = prepare_hps_vocab()

    def new_batcher():
        return Batcher(FLAGS.data_path, vocab, hps,
                       single_pass=FLAGS.single_pass)

    generator_batcher = new_batcher()
    discriminator_batcher = new_batcher()

    mode = hps.mode
    if mode not in ('train', 'decode'):
        raise ValueError("The 'mode' flag must be one of train/decode")

    if mode == 'train':
        generator, discriminator = build_seqgan_graph(hps, vocab)
        setup_training(generator, discriminator, generator_batcher,
                       discriminator_batcher)
        return

    # Decode: the model is configured with max_dec_steps=1 because only one
    # decoder step is ever run at a time (to do beam search).
    decode_model_hps = hps._replace(max_dec_steps=1)
    generator = SummarizationModel(decode_model_hps, vocab)
    BeamSearchDecoder(generator, generator_batcher, vocab).decode()
def create_train_eval_model(FLAGS):
    """Create the training model and a validation model sharing one processor.

    Args:
        FLAGS: Configuration tuple; reads ``bert_config_file``,
            ``max_seq_length``, ``output_dir`` and ``task_name`` and is
            converted with ``_asdict()`` to derive the validation config.

    Returns:
        ``(train_model, validate_model)``; both have had ``build_graph()``
        and ``create_or_load_recent_model()`` called.

    Raises:
        ValueError: If ``max_seq_length`` exceeds the BERT position
            embeddings, or the task name has no registered processor.
    """
    model_cls = model_pools["tagging_model"]
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    max_positions = bert_config.max_position_embeddings
    if FLAGS.max_seq_length > max_positions:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Look up the custom processor registered under the task name.
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    def build(flags):
        """Create a batcher and a ready-to-use model for ``flags``."""
        model = model_cls(bert_config, Batcher(processor, flags), flags)
        model.build_graph()
        model.create_or_load_recent_model()
        return model

    train_model = build(FLAGS)

    # The validation model uses the same settings with mode switched to "dev".
    eval_settings = FLAGS._asdict()
    eval_settings["mode"] = "dev"
    validate_model = build(config.generate_nametuple(eval_settings))

    return train_model, validate_model
def main():
    """Interactive debugging entry point for inspecting batches.

    With a non-empty ``args.data_path`` a ``Batcher`` is built and one batch
    is fetched; otherwise a single JSON article is tokenised with MeCab and
    wrapped in a one-example batch. Every path stops in ``pdb`` so the
    objects can be inspected by hand.
    """
    import pdb

    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()

    if args.data_path == "":
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])

        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()

        # Local names are kept short on purpose: they are what one types at
        # the pdb prompt.
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        pdb.set_trace()
    else:
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        pdb.set_trace()
        x = batcher.next_batch()
        pdb.set_trace()
def thread_decode(test_path, vocab, FLAGS):
    """Decode ``test_path`` with ``FLAGS.work_num`` worker threads.

    The best checkpoint under ``FLAGS.restore_path`` is restored into one
    session shared by every worker; the batches are split with
    ``split_batches`` and each share is handed to ``do`` in its own thread.

    Args:
        test_path: Data path handed to the ``Batcher``.
        vocab: Vocabulary handed to the model, the batcher and the workers.
        FLAGS: Run configuration. When ``FLAGS.beam`` is set, ``batch_size``
            and ``max_dec_steps`` are overwritten in place.
    """
    sess = tf.Session(config=get_config())
    if FLAGS.beam:
        FLAGS.batch_size = FLAGS.beam_size
        FLAGS.max_dec_steps = 1
    print('batch size ', FLAGS.batch_size)

    summarizationModel = PointerNet(FLAGS, vocab)
    summarizationModel.build_graph()
    saver = tf.train.Saver()
    COORD = tf.train.Coordinator()

    best_model = load_best_model(FLAGS.restore_path)
    print('best model : {0}'.format(best_model))
    saver.restore(sess, save_path=best_model)

    batcher = Batcher(test_path,
                      vocab,
                      FLAGS,
                      single_pass=FLAGS.single_pass,
                      decode_after=FLAGS.decode_after)
    batches = batcher.cpu_fill_batch_queue()
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # Split the data into one share per worker.
    split_datas, count_array = split_batches(batches, FLAGS.work_num)
    print("len batches : {0}".format(len(batches)))
    assert len(split_datas) == FLAGS.work_num

    work_threads = []
    for i in range(FLAGS.work_num):
        # Pass the arguments explicitly instead of closing over ``i``: a
        # lambda would look ``i`` up when the thread actually runs, by which
        # time the loop may already have moved on to the next worker.
        t = threading.Thread(
            target=do,
            args=(split_datas[i], summarizationModel, vocab, sess, FLAGS,
                  count_array[i], i))
        t.start()
        work_threads.append(t)
        print('work : {0}'.format(i))

    COORD.join(work_threads)
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
def train(args):
    """Train a sentence-pair model with periodic evaluation and early stopping.

    Every ``args.test_freq`` epochs the model is evaluated on the test data.
    Training stops early after ``args.max_consec_worse_epochs`` consecutive
    evaluations that are worse than the previous one. The weights of the
    best evaluation are restored before the final evaluation is printed.

    Args:
        args: Run configuration; reads ``batch_size``, ``test_batch_size``,
            ``model_name``, ``lr``, ``epochs``, ``test_freq`` and
            ``max_consec_worse_epochs`` (and is passed on to the helpers).
    """
    import copy

    # retrieve proper data, model, and vocabulary
    train_data = resolve_data(args, "train")
    test_data = resolve_data(args, "test")
    vocab = resolve_vocab(args)
    model = get_model(args, vocab)

    # initialize batchers
    train_batcher = Batcher(train_data, args.batch_size, args.model_name)
    test_batcher = Batcher(test_data, args.test_batch_size, args.model_name)

    # initialize training parameters
    loss = torch.nn.BCEWithLogitsLoss()
    # don't optimize fixed weights like GloVe embeddings
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=args.lr)

    # evaluation metrics
    best_accuracy = 0.0
    best_params = None
    best_epoch = 0
    prev_accuracy = 0
    consec_worse_epochs = 0

    for i in range(args.epochs):
        cost = 0.
        while not train_batcher.is_finished():
            sentences1, sentences2, labels = train_batcher.get_batch()
            cost += train_batch(model, loss, optimizer, sentences1,
                                sentences2, labels, vocab)
        print("Epoch = %d, average loss = %s" %
              (i + 1, cost / train_batcher.num_batches))

        if (i + 1) % args.test_freq != 0:
            continue

        test_acc, F_score = test(model, test_batcher, vocab, args)
        # Report the accuracy just measured (the original referenced ``acc``,
        # which is not assigned until after the loop).
        print("Accuracy (F-score) after epoch #%s --> %s%% (%s)" %
              (i + 1, int(test_acc * 100.0), F_score))

        if test_acc < prev_accuracy:
            consec_worse_epochs += 1
            if consec_worse_epochs >= args.max_consec_worse_epochs:
                print("Training incurred %s consecutive worsening epoch(s): from %s to %s" \
                      % (args.max_consec_worse_epochs,
                         i + 1 - (args.max_consec_worse_epochs * args.test_freq),
                         i + 1))
                break
        else:
            consec_worse_epochs = 0

        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_epoch = i + 1
            # state_dict() hands back references to the live parameters, so
            # snapshot them; otherwise later updates overwrite the "best".
            best_params = copy.deepcopy(model.state_dict())
        prev_accuracy = test_acc

    # Nothing to restore if no evaluation ever improved on the initial 0.0.
    if best_params is not None:
        model.load_state_dict(best_params)
    acc, F_score = test(model, test_batcher, vocab, args)
    print("Best Accuracy achieved after epoch #%s --> %s%% (%s)" %
          (best_epoch, int(acc * 100.0), F_score))
def fit_tfidf_vectorizer(hps, vocab):
    """Fit a stemming TF-IDF vectorizer on the article sentences of the dataset.

    Args:
        hps: Hyper-parameter namedtuple; a copy with ``max_dec_steps=1`` and
            ``batch_size=1`` is handed to the batcher.
        vocab: Vocabulary handed to the batcher.

    Returns:
        The fitted vectorizer (word 1- to 3-grams, English stop words,
        ``max_df=0.7``, Porter-stemmed tokens).
    """
    vectorizer_dir = os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer')
    if not os.path.exists(vectorizer_dir):
        os.makedirs(vectorizer_dir)

    decode_model_hps = hps._replace(max_dec_steps=1, batch_size=1)
    batcher = Batcher(FLAGS.data_path,
                      vocab,
                      decode_model_hps,
                      single_pass=FLAGS.single_pass)

    # Collect the sentences of the first article of every batch until the
    # batcher signals the end of the dataset with None (single_pass mode).
    all_sentences = []
    batch = batcher.next_batch()
    while batch is not None:
        all_sentences.extend(batch.raw_article_sents[0])
        batch = batcher.next_batch()

    stemmer = PorterStemmer()

    class StemmedTfidfVectorizer(TfidfVectorizer):
        """TfidfVectorizer whose analyzer stems every token it emits."""

        def build_analyzer(self):
            base_analyzer = super(TfidfVectorizer, self).build_analyzer()

            def stemmed(doc):
                return (stemmer.stem(tok) for tok in base_analyzer(doc))

            return stemmed

    tfidf_vectorizer = StemmedTfidfVectorizer(analyzer='word',
                                              stop_words='english',
                                              ngram_range=(1, 3),
                                              max_df=0.7)
    tfidf_vectorizer.fit_transform(all_sentences)
    return tfidf_vectorizer
def __init__(self, model_file_path, data_path, data_class='val'):
    """Prepare output directories, vocabulary, batcher and model for decoding.

    Args:
        model_file_path: Checkpoint path, e.g.
            ``../log/{MODE NAME}/best_model/model_best_XXXXX``.
        data_path: Data path handed to the ``Batcher``.
        data_class: Either ``'val'`` or ``'test'``; used in the result
            file name.

    Raises:
        ValueError: If ``data_class`` is not ``'val'`` or ``'test'``.
    """
    self.data_class = data_class
    if self.data_class != 'val' and self.data_class != 'test':
        print("data_class must be 'val' or 'test'.")
        raise ValueError

    # e.g. model_best_XXXXX
    model_name = os.path.basename(model_file_path)
    # Two levels up from the checkpoint, e.g. ../log/{MODE NAME}/
    log_root = os.path.dirname(os.path.dirname(model_file_path))

    # e.g. ../log/{MODE NAME}/decode_model_best_XXXXX/
    decode_dir = os.path.join(log_root, 'decode_%s' % (model_name))
    self._decode_dir = decode_dir
    self._rouge_ref_dir = os.path.join(decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(decode_dir, 'rouge_dec_dir')
    self._result_path = os.path.join(
        decode_dir, 'result_%s_%s.txt' % (model_name, self.data_class))

    # Start from a clean result file.
    if os.path.isfile(self._result_path):
        os.remove(self._result_path)

    # Parent first, so each mkdir only has to create a single level.
    for directory in (self._decode_dir, self._rouge_ref_dir,
                      self._rouge_dec_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(data_path,
                           self.vocab,
                           mode='decode',
                           batch_size=config.beam_size,
                           single_pass=True)
    # NOTE(review): presumably gives the batcher time to fill — confirm.
    time.sleep(5)

    self.model = Model(model_file_path, is_eval=True)
def decode(self, serializedInstance):
    """Predict a label for one serialized instance.

    Args:
        serializedInstance: Mapping with at least the keys ``'label'`` and
            ``'sentence'``; it is also passed to ``preprocess_dataset``.

    Returns:
        ``{'label': <predicted label or "OTHER">, 'confidence': <score str>}``.
        ``"OTHER"`` is returned when the best score is below
        ``self.threshold``.
    """
    # Not used further, but a missing key still raises KeyError as before.
    label = serializedInstance['label']
    print("serializedInstance sentence : " + serializedInstance["sentence"])

    dataset = self.preprocess_dataset(serializedInstance, self.label2id,
                                      self.word2id, self.feature2id)
    data = dataset["data"]
    batcher = Batcher(dataset["storage"], data, data.shape[0], 10,
                      self.id2vec)
    (context_data, mention_representation_data, target_data,
     feature_data) = batcher.next()

    scores = self.model.predict(context_data, mention_representation_data,
                                feature_data)

    def by_score(pair):
        return pair[1]

    # Highest-scoring (index, score) pair of the first instance.
    label_id, label_score = max(enumerate(list(scores[0])), key=by_score)

    predicted_label = (self.id2label[label_id]
                       if label_score >= self.threshold else "OTHER")

    print("predicted_label: " + predicted_label + ", label_id: " +
          str(label_id) + ", score: " + str(label_score))
    return {'label': predicted_label, 'confidence': str(label_score)}
def main(unused_argv):
    """Set up flags, vocabulary and batchers, then train or decode a GSNModel.

    Args:
        unused_argv: Leftover command-line arguments (not read).
    """
    set_random_seeds()
    get_datapath()  # resolves the dataset path
    get_steps()  # sets the step counts according to the data size

    tf.logging.set_verbosity(tf.logging.INFO)
    print('Now the mode of this mode is {} !'.format(FLAGS.mode))

    # Create the log directory when it does not exist yet.
    log_dir = FLAGS.log_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    if FLAGS.mode == 'decode':
        FLAGS.branch_batch_size = FLAGS.beam_size  # for beam search
        FLAGS.TS_mode = False

    hps = make_hps()  # hyper-parameter namedtuple
    vocab = Vocab(hps.vocab_path, hps.vocab_size)

    mode = hps.mode
    if mode == 'train':
        batcher = Batcher(hps.data_path, vocab, hps)
        eval_batcher = Batcher(hps.eval_data_path, vocab,
                               hps._replace(mode='eval'))
        train(GSNModel(hps, vocab), batcher, eval_batcher, vocab, hps)
    elif mode == 'decode':
        decode_mdl_hps = hps._replace(max_dec_steps=1)
        batcher = Batcher(hps.test_data_path, vocab, decode_mdl_hps)
        model = GSNModel(decode_mdl_hps, vocab)
        BeamSearchDecoder(model, batcher, vocab)._decode()
def create_training_model(FLAGS, vocab_in, vocab_out=None):
    """Build the training and dev ``SummarizationModel`` graphs.

    Args:
        FLAGS: Configuration tuple; reads ``data_path``, ``train_name`` and
            ``dev_name`` and is converted with ``_asdict()``.
        vocab_in: Input vocabulary.
        vocab_out: Output vocabulary (optional).

    Returns:
        ``(train_model, dev_model)``; the training model has had
        ``create_or_load_recent_model()`` called.
    """
    def make_batcher(data_file):
        return Batcher(FLAGS.data_path, vocab_in, vocab_out, FLAGS,
                       data_file=data_file)

    train_batcher = make_batcher(FLAGS.train_name)
    train_model = SummarizationModel(FLAGS, vocab_in, vocab_out,
                                     train_batcher)
    logging.info("Building graph...")
    train_model.build_graph()

    # Dev model: tf.flags cannot be deep-copied, so the flags are turned
    # into a dict, switched to eval mode and wrapped in a namedtuple again.
    eval_settings = FLAGS._asdict()
    eval_settings["mode"] = "eval"
    eval_flags = config.generate_nametuple(eval_settings)

    dev_batcher = make_batcher(FLAGS.dev_name)
    dev_model = SummarizationModel(eval_flags, vocab_in, vocab_out,
                                   dev_batcher)
    dev_model.build_graph()

    train_model.create_or_load_recent_model()
    return train_model, dev_model
def train(hidden_size, batch_size):
    """Train the two-headed (edit op / character) model one batch at a time.

    Each step draws a fresh chunk, trains on it and prints the validation
    metrics; every 100 steps the decoded predictions for the validation
    chunk are printed as well.

    Args:
        hidden_size: Passed to ``get_model``.
        batch_size: Chunk size requested from
            ``gen_large_chunk_single_thread``.
    """
    batcher = Batcher()
    print('Data:')
    print(batcher.inputs.shape)
    print(batcher.targets.shape)

    model = get_model(hidden_size, batcher.chars_len())
    losses = {
        'op': 'categorical_crossentropy',
        'char': 'categorical_crossentropy'
    }
    model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])
    model.summary()

    for grad_step in range(int(1e9)):
        (x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2,
         val_sub_inputs, val_sub_targets) = gen_large_chunk_single_thread(
             batcher, batcher.inputs, batcher.targets, chunk_size=batch_size)

        model.train_on_batch(x=x_train, y=[y_train_1, y_train_2])
        val_metrics = model.test_on_batch(x=x_val, y=[y_val_1, y_val_2])
        print(dict(zip(model.metrics_names, val_metrics)))

        if grad_step % 100 != 0:
            continue

        # Periodic qualitative check on the validation chunk.
        ops, char = model.predict(x_val, verbose=0)
        predicted_chars = list(batcher.decode(char))
        # Op index 0 -> insert, 1 -> replace, anything else -> delete.
        decoded_op = [
            'insert' if op == 0 else 'replace' if op == 1 else 'delete'
            for op in ops.argmax(axis=1)
        ]

        samples = zip(val_sub_inputs, val_sub_targets, predicted_chars,
                      decoded_op)
        for i, (x, y, pc, po) in enumerate(samples):
            print('x :', x)
            print('y :', y)
            print('predict char :', pc)
            print('predict op :', po)
            print('---------------------')
            if i >= 100:
                break
class Evaluate(object):
    """Computes the average teacher-forced loss of a model on the eval data."""

    def __init__(self, model_file_path):
        """Create the vocabulary, a single-pass eval batcher and the model."""
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=True)
        self.model_file_path = model_file_path
        # NOTE(review): presumably gives the batcher time to fill — confirm.
        time.sleep(5)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        """Return the mean per-example loss of ``batch`` as a Python float."""
        (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
         extra_zeros, c_t_1, coverage) = get_input_from_batch(batch, use_cuda)
        (dec_batch, dec_padding_mask, max_dec_len, dec_lens_var,
         target_batch) = get_output_from_batch(batch, use_cuda)

        model = self.model
        with torch.no_grad():
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(
                enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)

            num_steps = min(max_dec_len, config.max_dec_steps)
            step_losses = []
            for di in range(num_steps):
                y_t_1 = dec_batch[:, di]  # teacher forcing
                (final_dist, s_t_1, c_t_1, attn_dist, p_gen,
                 next_coverage) = model.decoder(y_t_1, s_t_1, encoder_outputs,
                                                encoder_feature,
                                                enc_padding_mask, c_t_1,
                                                extra_zeros,
                                                enc_batch_extend_vocab,
                                                coverage, di)

                # Negative log of the entry of final_dist selected by the
                # target index (eps added before the log).
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)

                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                # Weight this step's loss by the decoder padding mask.
                step_losses.append(step_loss * dec_padding_mask[:, di])

            sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_step_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

        return loss.item()

    def run_eval(self):
        """Evaluate every batch of the batcher and return the mean loss."""
        loss_list = []
        while True:
            batch = self.batcher.next_batch()
            if batch is None:
                break
            loss_list.append(self.eval_one_batch(batch))
        return np.mean(loss_list)
def train_generator(args, load_recent=True):
    '''Train the generator via the classical approach.

    Args:
        args: Run configuration; reads ``data_dir``, ``batch_size``,
            ``seq_length``, ``save_dir_gen``, ``num_epochs``,
            ``learning_rate``, ``decay_rate`` and ``save_every``.
        load_recent: When true, restore the latest checkpoint found in
            ``args.save_dir_gen`` before training.
    '''
    logging.debug('Batcher...')
    batcher = Batcher(args.data_dir, args.batch_size, args.seq_length)

    logging.debug('Vocabulary...')
    save_dir = args.save_dir_gen
    with open(os.path.join(save_dir, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(save_dir, 'real_beer_vocab.pkl'), 'w') as f:
        cPickle.dump((batcher.chars, batcher.vocab), f)

    logging.debug('Creating generator...')
    generator = Generator(args, is_training=True)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True)
    with tf.Session(config=session_config) as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        if load_recent:
            ckpt = tf.train.get_checkpoint_state(args.save_dir_gen)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

        for epoch in xrange(args.num_epochs):
            # Anneal learning rate
            new_lr = args.learning_rate * (args.decay_rate**epoch)
            sess.run(tf.assign(generator.lr, new_lr))
            batcher.reset_batch_pointer()
            # The same initial state is fed for every batch of the epoch;
            # the final state is not carried over.
            state = generator.initial_state.eval()

            for batch in xrange(batcher.num_batches):
                start = time.time()
                x, y = batcher.next_batch()
                feed = {
                    generator.input_data: x,
                    generator.targets: y,
                    generator.initial_state: state
                }
                train_loss, _ = sess.run([generator.cost, generator.train_op],
                                         feed)
                end = time.time()

                global_step = epoch * batcher.num_batches + batch
                total_steps = args.num_epochs * batcher.num_batches
                print('{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}'
                      .format(global_step, total_steps, epoch, train_loss,
                              end - start))

                if global_step % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir_gen,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=global_step)
                    print('Generator model saved to {}'.format(checkpoint_path))
def test_batcher():
    """Fetch batches forever and print how long each ``next_batch()`` took."""
    batcher = Batcher(hps.data_path, vocab, hps, hps.single_pass)
    while True:
        started_at = time.time()
        batcher.next_batch()
        print('elapse:', time.time() - started_at)
def main():
    """Parse options, then train a new or loaded model, or decode the test set.

    With ``opts.eval`` unset the model is trained on ``opts.train`` and
    validated on ``opts.dev``; otherwise the model at ``opts.model_path``
    is loaded and used to decode ``opts.test``.
    """
    opts = optparser.parse_args()[0]

    train_loader = Loader(opts.train)

    opts.vocab_len = len(train_loader._char_to_id)
    opts.pos_len = len(train_loader._pos_to_id)
    opts.max_pos_len = train_loader._pos_max_len
    opts.max_target_len = train_loader._char_max_len
    # The flags arrive as integers; turn them into real booleans.
    opts.use_cuda = opts.use_cuda == 1
    opts.eval = opts.eval == 1
    opts.data_size = train_loader.get_data_size()

    if not torch.cuda.is_available():
        opts.use_cuda = False
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)

    if not opts.eval:
        # weights for paddings, set to 0
        loss_weights = torch.ones(opts.vocab_len)
        loss_weights[0] = 0
        criterion = nn.NLLLoss(loss_weights, size_average=False)

        c2i, i2c, p2i, i2p = train_loader.get_mappings()
        dev_loader = Loader(opts.dev, c2i, i2c, p2i, i2p)
        if dev_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = dev_loader._pos_max_len

        model = Module(opts)
        # Compare by value: ``is not ''`` tests object identity, which only
        # happens to work when the interpreter shares the empty string.
        if opts.model_path != '':
            model = torch.load(opts.model_path)

        train_batcher = Batcher(opts.batch_size, train_loader.get_data(),
                                opts.max_pos_len, opts.eval)
        dev_batcher = Batcher(decode_batch, dev_loader.get_data(),
                              opts.max_pos_len, True)

        print(model)
        start_train(model, criterion, opts, train_batcher, dev_batcher)
    else:
        model = torch.load(opts.model_path)
        model.eval()

        c2i, i2c, p2i, i2p = train_loader.get_mappings()
        test_loader = Loader(opts.test, c2i, i2c, p2i, i2p)
        if test_loader._pos_max_len > opts.max_pos_len:
            opts.max_pos_len = test_loader._pos_max_len

        test_batcher = Batcher(1, test_loader.get_data(), opts.max_pos_len,
                               opts.eval)
        opts.data_size = test_loader.get_data_size()
        decode(model, opts, test_batcher, i2c, i2p)
def run_training():
    """Train the pointer network with periodic evaluation and patience.

    Runs up to ``FLAGS.epoch`` passes over the training batches. Every
    ``FLAGS.eval_step`` steps the model is evaluated; a new best reward is
    checkpointed under ``FLAGS.checkpoint`` and resets the patience counter.
    An evaluation more than ``FLAGS.threshold`` below the best reward costs
    one unit of patience, and training stops once patience is negative.
    """
    print('batch size', FLAGS.batch_size)
    summarizationModel = PointerNet(FLAGS, vocab)
    summarizationModel.build_graph()

    batcher = Batcher(FLAGS.data_path,
                      vocab,
                      FLAGS,
                      single_pass=FLAGS.single_pass,
                      decode_after=FLAGS.decode_after)
    val_batcher = Batcher(FLAGS.val_data_path,
                          vocab,
                          FLAGS,
                          single_pass=FLAGS.single_pass,
                          decode_after=FLAGS.decode_after)

    sess = tf.Session(config=get_config())
    sess.run(tf.global_variables_initializer())
    eval_max_reward = -float('inf')
    saver = tf.train.Saver(max_to_keep=10)
    if FLAGS.restore_path:
        print('loading params...')
        saver.restore(sess, FLAGS.restore_path)

    epoch = FLAGS.epoch  # counts down: number of passes still to run
    step = 0
    patient = FLAGS.patient
    out_of_patience = False

    while epoch > 0 and not out_of_patience:
        batches = batcher.fill_batch_queue()
        print('load batch...')
        for batch in batches:
            print('start training...')
            step += 1
            feed_dict = make_feed_dict(summarizationModel, batch)
            loss, _ = sess.run(
                [summarizationModel.loss, summarizationModel.train_op],
                feed_dict)
            print("epoch : {0}, step : {1}, loss : {2}".format(
                abs(epoch - FLAGS.epoch), step, loss))

            if step % FLAGS.eval_step != 0:
                continue

            eval_reward = run_eval(summarizationModel, val_batcher, sess)
            print('eval reward ', eval_reward)
            if eval_max_reward < eval_reward:
                if not os.path.exists(FLAGS.checkpoint):
                    os.mkdir(FLAGS.checkpoint)
                saver.save(sess,
                           save_path=os.path.join(
                               FLAGS.checkpoint,
                               'model_{0}_{1}.ckpt'.format(step, eval_reward)))
                eval_max_reward = eval_reward
                patient = FLAGS.patient
            print('eval max reward : {0}'.format(eval_max_reward))

            if patient < 0:
                # Leave both loops; a bare break would only end this pass.
                out_of_patience = True
                break
            if eval_max_reward - eval_reward > FLAGS.threshold:
                patient -= 1

        # One pass done. Without this the ``while`` condition never changes
        # and the logged epoch index stays at 0.
        epoch -= 1
def train(params):
    """Train the character model, logging summaries and saving checkpoints.

    Args:
        params: Run configuration; reads ``save_dir``, ``log_dir``,
            ``num_epochs``, ``learning_rate`` and ``save_every``, and has
            ``vocab_size`` set from the data loader.
    """
    data_loader = Batcher(params)
    params.vocab_size = data_loader.vocab_size

    if not os.path.isdir(params.save_dir):
        os.makedirs(params.save_dir)

    with open(os.path.join(params.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(params, f)
    with open(os.path.join(params.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(params)

    with tf.Session() as sess:
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(params.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)

        for e in range(params.num_epochs):
            # Decay the learning rate by a factor of 0.97 per epoch.
            sess.run(tf.assign(model.lr, params.learning_rate * (0.97**e)))

            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)

            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                # Carry the recurrent state over from the previous batch.
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                # A single run fetches the summaries and applies the update.
                # Running train_op a second time on the same feed, as
                # before, trained on every batch twice.
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op],
                    feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)

                end = time.time()
                logging.info(
                    "Epoch #{e} / Batch #{b} -- Loss {train_loss:.3f} "
                    "Time {time_diff:.3f}".format(e=e,
                                                  b=b,
                                                  train_loss=train_loss,
                                                  time_diff=end - start))

            if e % params.save_every == 0 or e == params.num_epochs - 1:
                checkpoint_path = os.path.join(params.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e)

        # Flush pending summaries and release the event file.
        writer.close()
def __init__(self, model_file_path):
    """Create the vocabulary, a single-pass eval batcher and the model.

    Args:
        model_file_path: Checkpoint path handed to ``Model``; also kept on
            the instance.
    """
    vocab = Vocab(config.vocab_path, config.vocab_size)
    self.vocab = vocab
    self.batcher = Batcher(config.eval_data_path,
                           vocab,
                           mode='eval',
                           batch_size=config.batch_size,
                           single_pass=True)
    self.model_file_path = model_file_path
    # NOTE(review): presumably gives the batcher time to fill — confirm.
    time.sleep(5)

    self.model = Model(model_file_path, is_eval=True)
def generate_batch(self, mode):
    """Yield ``([enc_batch, dec_batch], target_batch)`` for a dataset split.

    Args:
        mode: Split name (train/test/val); selects ``<data_path>/<mode>.bin``
            and is stored under ``'mode'`` in ``self._hps``.

    Yields:
        A two-element list of encoder and decoder inputs together with the
        target batch, all taken from ``make_feed_dict``.
    """
    # hps is the instance's own dict, so the mode change persists on self.
    hps = self._hps
    hps['mode'] = mode

    data_file = hps['data_path'] + '/{}.bin'.format(mode)
    batcher = Batcher(data_file, self._vocab, hps, single_pass=True)

    while True:
        feed = self.make_feed_dict(batcher.next_batch())
        inputs = [feed['enc_batch'], feed['dec_batch']]
        yield inputs, feed['target_batch']
def __init__(self):
    """Create the vocabulary and training batcher and ensure the model dir exists."""
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path,
                           self.vocab,
                           batch_size=config.batch_size)

    train_dir = os.path.join(config.log_root)
    self.model_dir = os.path.join(train_dir, 'model')

    # Parent first: os.mkdir only creates a single level.
    for directory in (train_dir, self.model_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)
def __init__(
    self,
    url="http://localhost:8086",
    token="IpLnoNkWhqmnSLO2ieeqmHejYrrokycO5Be8HRgM6UI1S_CO-Py2_opA2E1z6iCzJrv5U_gHGVHh5JMCFsgwjQ=="
):
    """Connect to InfluxDB and set up the write, batching and query helpers.

    Args:
        url: Address of the InfluxDB server.
        token: API token used to authenticate.
    """
    # A token can be generated from the "Tokens Tab" of the InfluxDB UI.
    # NOTE(review): a credential is hard-coded as the default value here;
    # consider supplying it from configuration instead.
    self.org = "vwa"
    self.bucket = "vwa"
    client = InfluxDBClient(url=url, token=token)
    self.client = client
    self.write_api = client.write_api(write_options=SYNCHRONOUS)
    # presumably (batch size, flush interval, callback) — confirm against Batcher
    self.b = Batcher(500, 5, self._send)
    self.q = client.query_api()
def __init__(self):
    """Create vocabulary, training batcher, a timestamped run dir and a writer."""
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path,
                           self.vocab,
                           mode='train',
                           batch_size=config.batch_size,
                           single_pass=False)
    # NOTE(review): presumably gives the batcher time to fill — confirm.
    time.sleep(15)

    # Each run gets its own directory: <log_root>/train_<YYYYmmdd_HHMMSS>
    stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    train_dir = os.path.join(config.log_root, 'train_{}'.format(stamp))
    self.model_dir = os.path.join(train_dir, 'model')

    # Parent first: os.mkdir only creates a single level.
    for directory in (train_dir, self.model_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    self.summary_writer = tf.summary.FileWriter(train_dir)
def __init__(self):
    """Load the data via ``Preprocessor`` and set up the tuning defaults."""
    prep = Preprocessor()
    prep.load_data()

    # Train / validation splits.
    self.x_train = prep.get_x_train()
    self.y_train = prep.get_y_train()
    self.x_val = prep.get_x_val()
    self.y_val = prep.get_y_val()

    # presumably positive / negative class counts — confirm in Preprocessor
    self.pos = prep.get_pos()
    self.neg = prep.get_neg()

    self.hyper_model = DenseHyperModel(self.pos, self.neg)
    self.tuner = None  # not created here

    self.batch_size = 1024
    self.epochs = 100
    self.batcher = Batcher()
    self.objective = 'val_auc'
def __init__(self, model_file_path):
    """Prepare decode/ROUGE directories, vocabulary, batcher and model.

    Args:
        model_file_path: Checkpoint path; its base name is used in the
            decode directory name and the path is handed to ``Model``.
    """
    model_name = os.path.basename(model_file_path)
    decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
    self._decode_dir = decode_dir
    self._rouge_ref_dir = os.path.join(decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(decode_dir, 'rouge_dec_dir')

    # Parent first: os.mkdir only creates a single level.
    for directory in (self._decode_dir, self._rouge_ref_dir,
                      self._rouge_dec_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path,
                           self.vocab,
                           mode='decode',
                           batch_size=config.beam_size,
                           single_pass=True)
    # NOTE(review): presumably gives the batcher time to fill — confirm.
    time.sleep(5)

    self.model = Model(model_file_path, is_eval=True)
def get_decode_results(sess, model, vocab, hps, data_path):
    """Evaluate ``model`` on every batch of ``data_path`` and collect results.

    Args:
        sess: Session handed to ``model.run_eval_step``.
        model: Model exposing ``run_eval_step(sess, batch)``.
        vocab: Vocabulary handed to the batcher.
        hps: Hyper-parameters handed to the batcher.
        data_path: Data path handed to the batcher.

    Returns:
        ``(eval_loss, accuracy, original_comments, gold_labels,
        predicted_labels, attention_scores, probabilities)`` where loss and
        accuracy are averaged over the examples seen and the probabilities
        are returned as an array of strings.
    """
    eval_batcher = Batcher(data_path, vocab, hps, True)

    label_names = np.array(["male", "female"])
    total_loss = 0.0
    total_correct_preds = 0.0
    n = 0

    original_comments = []
    gold_labels = []
    attention_scores = []
    predicted_labels = []
    probabilities = np.array([])

    while True:
        try:
            eval_batch = eval_batcher.next_batch()
            if eval_batch is None:
                break

            eval_results = model.run_eval_step(sess, eval_batch)
            # Every batch is counted as FLAGS.batch_size examples.
            batch_size = FLAGS.batch_size

            loss = eval_results['loss']
            correct_predictions = eval_results['correct_predictions']
            predictions = eval_results['predictions']

            # Map predicted class indices to their label names.
            predicted_labels = np.concatenate(
                (predicted_labels, label_names[predictions]))
            probabilities = np.concatenate(
                (probabilities, eval_results['probs']))

            gold_labels += eval_batch.original_labels
            original_comments += eval_batch.original_comments
            attention_scores += list(eval_results['attention_scores'])

            total_loss += loss * batch_size
            total_correct_preds += correct_predictions
            n += batch_size
        except StopIteration:
            break

    eval_loss = total_loss / n
    accuracy = total_correct_preds / n
    return (eval_loss, accuracy, original_comments, gold_labels,
            predicted_labels, attention_scores,
            np.array(probabilities, dtype=str))
def decode_Beam(FLAGS):
    """Run beam-search decoding over the test file.

    Args:
        FLAGS: Run configuration; reads ``data_path`` and ``test_name`` and
            is passed to ``data.load_dict_data``.
    """
    vocab_in, vocab_out = data.load_dict_data(FLAGS)

    # Two configurations are derived from the re-typed flags: the batcher
    # keeps the full max_dec_steps because the batches need to contain the
    # full summaries, while the model gets max_dec_steps=1 because only one
    # decoder step is run at a time (to do beam search).
    FLAGS_batcher = config.retype_FLAGS()
    decode_settings = FLAGS_batcher._asdict()
    decode_settings["max_dec_steps"] = 1
    decode_settings["mode"] = "decode"
    FLAGS_decode = config.generate_nametuple(decode_settings)

    batcher = Batcher(FLAGS.data_path,
                      vocab_in,
                      vocab_out,
                      FLAGS_batcher,
                      data_file=FLAGS.test_name)
    model = SummarizationModel(FLAGS_decode, vocab_in, vocab_out, batcher)

    BeamSearchDecoder(model, batcher, vocab_out).decode()
def run(size):
    """Configure the flags for a sequence length and run train/eval/decode.

    Args:
        size: Used as ``max_enc_steps`` and ``max_dec_steps``;
            ``min_dec_steps`` is set to ``size // 4``.

    Raises:
        Exception: If the log directory is missing outside train mode, or
            ``single_pass`` is set outside decode mode.
        ValueError: If the mode is not one of train/eval/decode.
    """
    FLAGS.min_dec_steps = size // 4
    FLAGS.max_dec_steps = size
    FLAGS.max_enc_steps = size

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to <log_path>/<exp_name> and create it if necessary.
    FLAGS.log_root = log_path
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode != "train":
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))
        os.makedirs(FLAGS.log_root)

    print("vocab path is ", FLAGS.vocab_path)
    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # In decode mode one example is decoded at a time, and each step needs
    # a batch holding the beam_size hypotheses currently in the beam.
    if FLAGS.mode == 'decode':
        FLAGS.batch_size = FLAGS.beam_size

    # single_pass only makes sense when decoding.
    if FLAGS.single_pass and FLAGS.mode != 'decode':
        raise Exception("The single_pass flag should only be True in decode mode")

    # Collect the hyper-parameters the model needs into a namedtuple.
    hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
                   'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim',
                   'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps',
                   'coverage', 'cov_loss_wt', 'pointer_gen']
    hps_dict = {
        name: FLAGS[name].value for name in FLAGS if name in hparam_list
    }
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    batcher = Batcher(FLAGS.data_path, vocab, hps,
                      single_pass=FLAGS.single_pass)

    tf.set_random_seed(111)  # a seed value for randomness

    mode = hps.mode
    if mode == 'train':
        print("creating model...")
        setup_training(SummarizationModel(hps, vocab), batcher)
    elif mode == 'eval':
        run_eval(SummarizationModel(hps, vocab), batcher, vocab)
    elif mode == 'decode':
        # The model is configured with max_dec_steps=1 because only one
        # decoder step is run at a time (to do beam search). The batcher
        # keeps the full max_dec_steps because the batches need to contain
        # the full summaries.
        decode_model_hps = hps._replace(max_dec_steps=1)
        model = SummarizationModel(decode_model_hps, vocab)
        decoder = BeamSearchDecoder(model, batcher, vocab)
        # Decodes indefinitely unless single_pass=True, in which case the
        # dataset is decoded exactly once.
        decoder.decode()
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
def __init__(self, model_file_path):
    """Create vocabulary, eval batcher, an eval summary writer and the model.

    Args:
        model_file_path: Checkpoint path; its base name is used in the eval
            directory name and the path is handed to ``Model``.
    """
    vocab = Vocab(config.vocab_path, config.vocab_size)
    self.vocab = vocab
    self.batcher = Batcher(config.eval_data_path,
                           vocab,
                           mode='eval',
                           batch_size=config.batch_size,
                           single_pass=True)
    # NOTE(review): presumably gives the batcher time to fill — confirm.
    time.sleep(15)

    # Summaries go to <log_root>/eval_<checkpoint name>.
    eval_dir = os.path.join(config.log_root,
                            'eval_%s' % (os.path.basename(model_file_path)))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    self.model = Model(model_file_path, is_eval=True)
def main():
    """Decode-only entry point: build the model and run beam-search decoding."""
    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # Each decode step works on a batch of beam_size hypotheses.
    FLAGS.batch_size = FLAGS.beam_size

    # Collect the hyper-parameters the model needs into a namedtuple.
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage',
        'cov_loss_wt', 'pointer_gen'
    ]
    hps_dict = {
        key: val
        for key, val in FLAGS.__flags.iteritems()
        if key in hparam_list
    }
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    batcher = Batcher(FLAGS.data_path,
                      vocab,
                      hps,
                      single_pass=FLAGS.single_pass)

    tf.set_random_seed(111)  # a seed value for randomness

    # The model is configured with max_dec_steps=1 because only one decoder
    # step is run at a time (to do beam search). The batcher keeps the full
    # max_dec_steps because the batches need to contain the full summaries.
    decode_model_hps = hps._replace(max_dec_steps=1)
    model = SummarizationModel(decode_model_hps, vocab)
    decoder = BeamSearchDecoder(model, batcher, vocab)
    # Decodes indefinitely unless single_pass=True, in which case the
    # dataset is decoded exactly once.
    decoder.decode()
def train_generator(args, load_recent=True):
    '''Train the generator on batches produced by ``Batcher``.

    Pickles ``args`` and the batcher's ``(chars, vocab)`` pair into
    ``args.save_dir_gen``, builds a ``Generator`` in training mode and runs
    ``args.num_epochs`` epochs, decaying the learning rate once per epoch
    and writing a checkpoint every ``args.save_every`` global steps.

    Args:
        args: settings object; this function reads ``data_dir``,
            ``batch_size``, ``seq_length``, ``save_dir_gen``, ``num_epochs``,
            ``learning_rate``, ``decay_rate`` and ``save_every`` from it.
        load_recent: when true, restore the latest checkpoint found in
            ``args.save_dir_gen`` (if any) before training.
    '''
    logging.debug('Batcher...')
    batcher = Batcher(args.data_dir, args.batch_size, args.seq_length)

    logging.debug('Vocabulary...')
    config_path = os.path.join(args.save_dir_gen, 'config.pkl')
    with open(config_path, 'w') as config_file:
        cPickle.dump(args, config_file)
    vocab_path = os.path.join(args.save_dir_gen, 'real_beer_vocab.pkl')
    with open(vocab_path, 'w') as vocab_file:
        cPickle.dump((batcher.chars, batcher.vocab), vocab_file)

    logging.debug('Creating generator...')
    generator = Generator(args, is_training = True)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True)
    with tf.Session(config=session_config) as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        if load_recent:
            ckpt = tf.train.get_checkpoint_state(args.save_dir_gen)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

        for epoch in xrange(args.num_epochs):
            # Anneal learning rate
            annealed_lr = args.learning_rate * (args.decay_rate ** epoch)
            sess.run(tf.assign(generator.lr, annealed_lr))
            batcher.reset_batch_pointer()
            state = generator.initial_state.eval()

            for batch in xrange(batcher.num_batches):
                tick = time.time()
                x, y = batcher.next_batch()
                # The epoch's initial state is fed for every batch; the
                # final state of a step is not fetched or carried over.
                feed = {
                    generator.input_data: x,
                    generator.targets: y,
                    generator.initial_state: state,
                }
                train_loss, _ = sess.run(
                    [generator.cost, generator.train_op], feed)
                tock = time.time()

                global_step = epoch * batcher.num_batches + batch
                total_steps = args.num_epochs * batcher.num_batches
                print('{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}'
                      .format(global_step, total_steps, epoch, train_loss,
                              tock - tick))

                if global_step % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir_gen,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step = global_step)
                    print('Generator model saved to {}'.format(checkpoint_path))
# Output layer: logistic regression over the concatenated representation.
output, pre_activation = modules.logistic_regression(
    concat,
    INPUT_SIZE + args.lstm_hidden_size * 2,
    LABEL_SIZE,
)

## loss,optimizer,init
loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(pre_activation, y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
init = tf.initialize_all_variables()

## batcher
print("loading dicts...")
dicts = joblib.load("../../data/dicts_gillick")
print("obtaining batch...")
test_dataset = joblib.load("../../data/data_test_gillick")
# 8963 is passed as the third Batcher argument.
# NOTE(review): presumably the batch size (whole test set at once) -- confirm.
test_batcher = Batcher(
    test_dataset["storage"],
    test_dataset["data"],
    8963,
    args.context_length,
    dicts["id2vec"],
)

# saver
saver = tf.train.Saver()

# session
sess = tf.Session()
sess.run(init)
print("restoring...")
# Restoring after running `init` replaces the freshly initialised values
# with the ones stored at args.model_name.
saver.restore(sess, args.model_name)
x_context_data, x_target_mean_data, y_data = test_batcher.next()
# Remaining command-line options, then parse.
argparser.add_argument('--hidden_layer_size', default=128, type=int)
argparser.add_argument('--num_layers', default=2, type=int)
argparser.add_argument('--compiled_output', default='cg.pkl')
argparser.add_argument('--iterations', default=20, type=int)
argparser.add_argument('--compile', action='store_true')
argparser.add_argument('--load', type=str)
args = argparser.parse_args()
main(args)

# Read the whole input text and turn it into the training tensor.
logger.info("Loading input file...")
loader = InputMapper()
with open(args.input) as fp:
    text = fp.read()
X = loader.convert_to_tensor(text)

batcher = Batcher(
    X,
    loader.vocab_size(),
    batch_size=args.batch_size,
    sequence_length=args.sequence_length,
)
Xvalid, yvalid = batcher.get_validation()

# With --compile no cache location is passed on; otherwise the
# --compiled_output path is used.
if args.compile:
    cache_location = None
else:
    cache_location = args.compiled_output

if args.load:
    logger.info("Loading LSTM model from file...")
    loaded_generator = CharacterGenerator.load_model(args.load)
    cg = loaded_generator.compile(cache=cache_location)
else:
    lstm = LSTM(1, args.hidden_layer_size, num_layers=args.num_layers)
    softmax = Softmax(args.hidden_layer_size, loader.vocab_size())
    cg = CharacterGenerator(lstm, softmax).compile(cache=cache_location)

logger.info("Running SGD")
learning_rate = 0.1
def train(word2vec, dataset, parameters):
    """Build the premise/hypothesis attention model and train it.

    Creates the run directories, builds the graph (a plain LSTM over the
    premise and an attention LSTM over the hypothesis that reads the
    premise's hidden states), then iterates over the training batches.
    Every 100 steps a progress line is written; every 5000 steps the first
    test batch is evaluated and a checkpoint is saved.

    Args:
        word2vec: embedding source handed to ``Batcher``.
        dataset: mapping with "train" and "test" entries; the "train" entry
            must expose a sized "targets" sequence.
        parameters: dict of settings. Keys read here: "runs_dir",
            "model_name", "gpu", "sequence_length", "embedding_dim",
            "num_units", "learning_rate", "weight_decay", "num_epochs",
            "keep_prob" and "batch_size" (itself with "train" and "test").
    """
    # NOTE(review): block nesting below was reconstructed from source that
    # had lost its indentation -- confirm against the original layout.
    modeldir = os.path.join(parameters["runs_dir"], parameters["model_name"])
    logdir = os.path.join(modeldir, "log")
    logdir_train = os.path.join(logdir, "train")
    logdir_test = os.path.join(logdir, "test")
    logdir_dev = os.path.join(logdir, "dev")
    # Parents come first so that each os.mkdir finds its parent in place.
    for directory in (modeldir, logdir, logdir_train, logdir_test, logdir_dev):
        if not os.path.exists(directory):
            os.mkdir(directory)
    savepath = os.path.join(modeldir, "save")

    # Any falsy "gpu" value (None, "", but also 0) selects the CPU.
    device_string = "/gpu:{}".format(parameters["gpu"]) if parameters["gpu"] else "/cpu:0"
    with tf.device(device_string):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
        config_proto = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        sess = tf.Session(config=config_proto)

        # Sequence placeholders are declared as
        # [sequence_length, None, embedding_dim].
        premises_ph = tf.placeholder(
            tf.float32,
            shape=[parameters["sequence_length"], None, parameters["embedding_dim"]],
            name="premises")
        hypothesis_ph = tf.placeholder(
            tf.float32,
            shape=[parameters["sequence_length"], None, parameters["embedding_dim"]],
            name="hypothesis")
        targets_ph = tf.placeholder(tf.int32, shape=[None], name="targets")
        keep_prob_ph = tf.placeholder(tf.float32, name="keep_prob")

        # One projection weight tensor shared by both RNNs.
        _projecter = TensorFlowTrainable()
        projecter = _projecter.get_4Dweights(
            filter_height=1,
            filter_width=parameters["embedding_dim"],
            in_channels=1,
            out_channels=parameters["num_units"],
            name="projecter")

        with tf.variable_scope(name_or_scope="premise"):
            premise = RNN(
                cell=LSTMCell,
                num_units=parameters["num_units"],
                embedding_dim=parameters["embedding_dim"],
                projecter=projecter,
                keep_prob=keep_prob_ph)
            premise.process(sequence=premises_ph)

        with tf.variable_scope(name_or_scope="hypothesis"):
            hypothesis = RNN(
                cell=AttentionLSTMCell,
                num_units=parameters["num_units"],
                embedding_dim=parameters["embedding_dim"],
                hiddens=premise.hiddens,
                states=premise.states,
                projecter=projecter,
                keep_prob=keep_prob_ph)
            hypothesis.process(sequence=hypothesis_ph)

        loss, loss_summary, accuracy, accuracy_summary = hypothesis.loss(targets=targets_ph)
        # The penalty is the sum of the raw parameter values (not squared).
        weight_decay = tf.reduce_sum(
            [tf.reduce_sum(parameter)
             for parameter in premise.parameters + hypothesis.parameters])
        global_loss = loss + parameters["weight_decay"] * weight_decay

        train_summary_op = tf.merge_summary([loss_summary, accuracy_summary])
        train_summary_writer = tf.train.SummaryWriter(logdir_train, sess.graph)
        test_summary_op = tf.merge_summary([loss_summary, accuracy_summary])
        test_summary_writer = tf.train.SummaryWriter(logdir_test)

        saver = tf.train.Saver(max_to_keep=10)
        # summary_writer and loader are not used again below; they are kept
        # because constructing them has side effects.
        summary_writer = tf.train.SummaryWriter(logdir)
        tf.train.write_graph(sess.graph_def, modeldir, "graph.pb", as_text=False)
        loader = tf.train.Saver(tf.all_variables())

        # Single optimizer instance (an identical one used to be created
        # earlier and immediately overwritten).
        optimizer = tf.train.AdamOptimizer(
            learning_rate=parameters["learning_rate"],
            name="ADAM", beta1=0.9, beta2=0.999)
        train_op = optimizer.minimize(global_loss)

        sess.run(tf.initialize_all_variables())

        batcher = Batcher(word2vec=word2vec)
        train_batches = batcher.batch_generator(
            dataset=dataset["train"],
            num_epochs=parameters["num_epochs"],
            batch_size=parameters["batch_size"]["train"],
            sequence_length=parameters["sequence_length"])

        # Force true division: under Python 2 `int / int` floors before
        # math.ceil sees it, which under-counts the steps and yields 0 (and
        # a ZeroDivisionError in the modulo below) when the training set is
        # smaller than one batch.
        num_step_by_epoch = int(math.ceil(
            len(dataset["train"]["targets"])
            / float(parameters["batch_size"]["train"])))

        for train_step, (train_batch, epoch) in enumerate(train_batches):
            # Batches are transposed with axes (1, 0, 2) before feeding.
            feed_dict = {
                premises_ph: np.transpose(train_batch["premises"], (1, 0, 2)),
                hypothesis_ph: np.transpose(train_batch["hypothesis"], (1, 0, 2)),
                targets_ph: train_batch["targets"],
                keep_prob_ph: parameters["keep_prob"],
            }
            _, summary_str, train_loss, train_accuracy = sess.run(
                [train_op, train_summary_op, loss, accuracy],
                feed_dict=feed_dict)
            train_summary_writer.add_summary(summary_str, train_step)

            if train_step % 100 == 0:
                sys.stdout.write(
                    "\rTRAIN | epoch={0}/{1}, step={2}/{3} | loss={4:.2f}, accuracy={5:.2f}% ".format(
                        epoch + 1,
                        parameters["num_epochs"],
                        train_step % num_step_by_epoch,
                        num_step_by_epoch,
                        train_loss,
                        100. * train_accuracy))
                sys.stdout.flush()

            if train_step % 5000 == 0:
                test_batches = batcher.batch_generator(
                    dataset=dataset["test"],
                    num_epochs=1,
                    batch_size=parameters["batch_size"]["test"],
                    sequence_length=parameters["sequence_length"])
                # Only the first test batch is evaluated (note the break);
                # dropout is disabled by feeding keep_prob = 1.
                for test_batch, _ in test_batches:
                    feed_dict = {
                        premises_ph: np.transpose(test_batch["premises"], (1, 0, 2)),
                        hypothesis_ph: np.transpose(test_batch["hypothesis"], (1, 0, 2)),
                        targets_ph: test_batch["targets"],
                        keep_prob_ph: 1.,
                    }
                    summary_str, test_loss, test_accuracy = sess.run(
                        [test_summary_op, loss, accuracy],
                        feed_dict=feed_dict)
                    print("\nTEST | loss={0:.2f}, accuracy={1:.2f}% ".format(
                        test_loss, 100. * test_accuracy))
                    print("")
                    test_summary_writer.add_summary(summary_str, train_step)
                    break

                # Checkpoint on the same schedule as the test evaluation.
                saver.save(sess, save_path=savepath, global_step=train_step)

        # Terminate the "\r" progress line once training is over.
        print("")
init = tf.initialize_all_variables() ## batcher print "loading dataset..." train_dataset = joblib.load("../../data/"+DATA+"_train") train_data = train_dataset["data"][:50000,:] print train_data.shape #exit() dev_dataset = joblib.load("../../data/"+DATA+"_dev") test_dataset = joblib.load("../../data/"+DATA+"_test") print "loading dicts..." dicts = joblib.load("../../data/dict_"+DATA) print "obtaining batch..." train_batcher = Batcher(train_dataset["storage"],train_data,BATCH_SIZE,args.context_length,dicts["id2vec"]) dev_batcher = Batcher(dev_dataset["storage"],dev_dataset["data"],2202,args.context_length,dicts["id2vec"]) #2202 10000 test_batcher = Batcher(test_dataset["storage"],test_dataset["data"],8885,args.context_length,dicts["id2vec"]) #8885 563 # saver saver = tf.train.Saver() # session sess = tf.Session() sess.run(init) [x_context_data, x_target_mean_data, y_data, feature_data] = test_batcher.next() test_feed = {y:y_data,keep_prob_context:[1],keep_prob_target:[1],feature:feature_data} for i in range(args.context_length*2+1):