def dump_data(path):
    """Dump every problem of the module-level ``task`` into text files.

    ``path`` is recreated as an empty directory.  For each problem in
    ``task.problems`` the following files are written inside it, all named
    after ``problem.id``:

    * ``<id>.sent``      -- one cleaned sentence per line
    * ``<id>.doc``       -- ``doc.id`` of the document each sentence came from
    * ``<id>.par``       -- ``int(sent.paragraph_starter)`` for each sentence
    * ``<id>.query``     -- ``problem.title`` followed by ``problem.narr``
    * ``<id>.gold_sent`` -- every sentence found in ``problem.training``
    * ``<id>.gold_doc``  -- the ``problem.training`` key of each such sentence

    The ``.sent``, ``.doc`` and ``.par`` files are line-aligned.  Sentences
    whose first two characters are lower case are reported as bad parses and
    left out of all three.

    Args:
        path: directory to (re)create; any existing content is deleted.

    Raises:
        OSError / IOError: if the directory cannot be reset or a file cannot
            be written.
    """
    import shutil
    import sentence_cleaner

    ## get an empty directory
    # Done through the filesystem API rather than a shell command so that
    # paths containing spaces or shell metacharacters are handled safely.
    if os.path.isdir(path):
        if os.path.islink(path):
            # Same effect as the former 'rm -rf': drop the link, keep its target.
            os.unlink(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    for problem in task.problems:
        prefix = '%s/%s' % (path, problem.id)

        # 'with' guarantees the handles are closed even if cleaning fails.
        with open(prefix + '.sent', 'w') as sent_fh, \
                open(prefix + '.doc', 'w') as doc_fh, \
                open(prefix + '.par', 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:
                    ## cleaning
                    if sent.original[0:2].islower():
                        print('bad parse: %s' % sent.original)
                        continue
                    # The first sentence of a document gets the aggressive cleaner.
                    if sent.order == 0:
                        cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    else:
                        cleaned = sentence_cleaner.clean(sent.original)
                    sent_fh.write('%s\n' % cleaned)
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        with open(prefix + '.query', 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        # Gold sentences and the key they are stored under, line-aligned.
        with open(prefix + '.gold_sent', 'w') as gold_fh, \
                open(prefix + '.gold_doc', 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for sent in sents:
                    gold_fh.write('%s\n' % sent)
                    gold_doc_fh.write('%s\n' % ann)
def test():
    """Print what the sentence cleaners change, plus overall word counts.

    Walks every sentence of every new document of every problem in the
    module-level ``task``.  The first sentence of a document
    (``sent.order == 0``) is cleaned with ``clean_aggressive``, all others
    with ``clean``.  Each sentence that the applied cleaner modified is
    printed before and after cleaning.

    The final line reports whitespace-separated word counts:

    * ``total`` -- words in the original sentences
    * ``reg``   -- words left after ``clean`` on every sentence
    * ``agg``   -- words left after the cleaner that was actually applied
    """
    import sentence_cleaner

    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                original = sent.original
                # Slice rather than index so an empty sentence cannot raise
                # IndexError.
                if original[:1].islower():
                    print('** %s' % original)

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(original)
                    regular = sentence_cleaner.clean(original)
                else:
                    cleaned = regular = sentence_cleaner.clean(original)

                agg += len(cleaned.split())
                reg += len(regular.split())
                total += len(original.split())

                # Only show sentences the cleaner actually touched.
                if original == cleaned:
                    continue
                print(original)
                print(cleaned)
                print('----------')
                # NOTE(review): a disabled debug line
                # ("if sent.order == 0: print sent") preceded this separator and
                # the flattened source leaves its nesting ambiguous; it is kept
                # at sentence level here -- confirm against the original layout.
                print('+++')
    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))
def test():
    """Print what the sentence cleaners change, plus overall word counts.

    Walks every sentence of every new document of every problem in the
    module-level ``task``.  The first sentence of a document
    (``sent.order == 0``) is cleaned with ``clean_aggressive``, all others
    with ``clean``.  Each sentence that the applied cleaner modified is
    printed before and after cleaning.

    The final line reports whitespace-separated word counts:

    * ``total`` -- words in the original sentences
    * ``reg``   -- words left after ``clean`` on every sentence
    * ``agg``   -- words left after the cleaner that was actually applied
    """
    import sentence_cleaner

    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                original = sent.original
                # Slice rather than index so an empty sentence cannot raise
                # IndexError.
                if original[:1].islower():
                    print('** %s' % original)

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(original)
                    regular = sentence_cleaner.clean(original)
                else:
                    cleaned = regular = sentence_cleaner.clean(original)

                agg += len(cleaned.split())
                reg += len(regular.split())
                total += len(original.split())

                # Only show sentences the cleaner actually touched.
                if original == cleaned:
                    continue
                print(original)
                print(cleaned)
                print('----------')
                # NOTE(review): a disabled debug line
                # ("if sent.order == 0: print sent") preceded this separator and
                # the flattened source leaves its nesting ambiguous; it is kept
                # at sentence level here -- confirm against the original layout.
                print('+++')
    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))
def dump_data(path):
    """Dump every problem of the module-level ``task`` into text files.

    ``path`` is recreated as an empty directory.  For each problem in
    ``task.problems`` the following files are written inside it, all named
    after ``problem.id``:

    * ``<id>.sent``      -- one cleaned sentence per line
    * ``<id>.doc``       -- ``doc.id`` of the document each sentence came from
    * ``<id>.par``       -- ``int(sent.paragraph_starter)`` for each sentence
    * ``<id>.query``     -- ``problem.title`` followed by ``problem.narr``
    * ``<id>.gold_sent`` -- every sentence found in ``problem.training``
    * ``<id>.gold_doc``  -- the ``problem.training`` key of each such sentence

    The ``.sent``, ``.doc`` and ``.par`` files are line-aligned.  Sentences
    whose first two characters are lower case are reported as bad parses and
    left out of all three.

    Args:
        path: directory to (re)create; any existing content is deleted.

    Raises:
        OSError / IOError: if the directory cannot be reset or a file cannot
            be written.
    """
    import shutil
    import sentence_cleaner

    ## get an empty directory
    # Done through the filesystem API rather than a shell command so that
    # paths containing spaces or shell metacharacters are handled safely.
    if os.path.isdir(path):
        if os.path.islink(path):
            # Same effect as the former 'rm -rf': drop the link, keep its target.
            os.unlink(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    for problem in task.problems:
        prefix = '%s/%s' % (path, problem.id)

        # 'with' guarantees the handles are closed even if cleaning fails.
        with open(prefix + '.sent', 'w') as sent_fh, \
                open(prefix + '.doc', 'w') as doc_fh, \
                open(prefix + '.par', 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:
                    ## cleaning
                    if sent.original[0:2].islower():
                        print('bad parse: %s' % sent.original)
                        continue
                    # The first sentence of a document gets the aggressive cleaner.
                    if sent.order == 0:
                        cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    else:
                        cleaned = sentence_cleaner.clean(sent.original)
                    sent_fh.write('%s\n' % cleaned)
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        with open(prefix + '.query', 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        # Gold sentences and the key they are stored under, line-aligned.
        with open(prefix + '.gold_sent', 'w') as gold_fh, \
                open(prefix + '.gold_doc', 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for sent in sents:
                    gold_fh.write('%s\n' % sent)
                    gold_doc_fh.write('%s\n' % ann)
def main(_):
    """Entry point: train, evaluate, export or score text with the model.

    The behaviour is selected by ``FLAGS.action``:

    * ``train`` / ``continue`` -- build the training and validation models
      and run epochs until ``config.max_max_epoch``, saving the config after
      every epoch (``continue`` first restores the saved session).
    * ``test`` -- restore the session and print the test perplexity.
    * ``export`` -- restore the session, write ``graph.pb`` to
      ``FLAGS.model_dir`` and exit.
    * ``ppl`` / ``loglikes`` / ``predict`` / ``probs`` -- restore the session,
      then read stdin one line at a time; each line is split on tabs and
      every entry is scored separately.

    Args:
        _: unused positional argument.

    Raises:
        AssertionError: if ``FLAGS.action`` or ``FLAGS.loss`` is not a known
            value.
        ValueError: if ``--data_path`` is required but not set.
    """
    assert (FLAGS.action in ACTIONS)
    assert (FLAGS.loss in LOSS_FCTS)

    loss_fct = FLAGS.loss
    action = FLAGS.action
    train = action in ["train", "continue"]
    test = action == "test"
    linebyline = action in ["ppl", "loglikes", "predict", "probs", "export"]

    util.mkdirs(FLAGS.model_dir)

    if not (FLAGS.data_path or linebyline):
        raise ValueError("Must set --data_path to data directory")

    config = get_config()
    word_to_id_path = os.path.join(FLAGS.model_dir, "word_to_id")
    if action != "train":
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point --model_dir at trusted model directories.
        # Binary mode is required for pickle on Python 3.
        with open(word_to_id_path, 'rb') as f:
            word_to_id = pickle.load(f)
    else:
        word_to_id = None
        config.epoch = 1
        config.step = 0

    # Reading fast_test.
    # This option is enabled by 'transpose.py'
    fast_test = False
    if "fast_test" in config.__dict__:
        # Be sure to set a boolean
        fast_test = bool(config.fast_test)

    # Evaluation processes one token at a time.
    eval_config = Config(clone=config)
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    # Load data
    if not linebyline:
        raw_data = reader.raw_data(FLAGS.data_path, training=train,
                                   word_to_id=word_to_id)
        train_data, valid_data, test_data, word_to_id = raw_data

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        # Defining model(s)
        if train:
            # Saving word_to_id & conf to file
            with open(word_to_id_path, 'wb') as f:
                pickle.dump(word_to_id, f)

            with tf.name_scope("Train"):
                with tf.variable_scope("Model", reuse=False,
                                       initializer=initializer):
                    m = Model(is_training=True, config=config,
                              loss_fct=loss_fct)
                tf.scalar_summary("Training Loss", m.cost)
                tf.scalar_summary("Learning Rate", m.lr)

            with tf.name_scope("Valid"):
                with tf.variable_scope("Model", reuse=True,
                                       initializer=initializer):
                    mvalid = Model(is_training=False, config=config)
                tf.scalar_summary("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            # Variables are reused only when the training model created them.
            with tf.variable_scope("Model", reuse=train,
                                   initializer=initializer):
                mtest = Model(is_training=False, config=eval_config,
                              test_opti=fast_test)

        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        with tf.Session() as session:
            session.run(init_op)

            if train:
                config.save()
                if action == "continue":
                    session = _restore_session(saver, session)
                saver = None if FLAGS.nosave else saver

                print("Starting training from epoch %d using %s"
                      % (config.epoch, loss_fct))

                while config.epoch <= config.max_max_epoch:
                    i = config.epoch
                    # The rate is constant up to max_epoch, then decays
                    # geometrically.
                    lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)

                    print("Epoch: %d Learning rate: %.3f"
                          % (i, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m, train_data,
                                                 eval_op=m.train_op,
                                                 verbose=True, saver=saver,
                                                 log=FLAGS.log)
                    print("Epoch: %d Train Perplexity: %.3f"
                          % (i, train_perplexity))
                    valid_perplexity = run_epoch(session, mvalid, valid_data)
                    print("Epoch: %d Valid Perplexity: %.3f"
                          % (i, valid_perplexity))

                    config.step = 0
                    config.epoch += 1
                    config.save()
            else:
                session = _restore_session(saver, session)

            if FLAGS.action == "export":
                tf.train.write_graph(session.graph_def, FLAGS.model_dir,
                                     'graph.pb')
                sys.exit()

            # Line by line processing (=ppl, predict, loglikes)
            if linebyline:
                # id -> word lookup; built once, it does not change per line.
                inverse_dict = {idx: word for word, idx in word_to_id.items()}

                while True:
                    lines = sys.stdin.readline()
                    if not lines:
                        break

                    lines = lines.strip().split('\t')
                    results = []
                    for line in lines:
                        test_data = sentence_cleaner.clean(line, word_to_id)
                        if len(test_data) < 2:
                            # Entry rejected as too short; emit a sentinel
                            # score.
                            print(-9999)
                            continue
                        result = run_epoch(session, mtest, test_data,
                                           idict=inverse_dict,
                                           action=FLAGS.action)
                        results.append((test_data, result))

                    if FLAGS.action == 'predict':
                        print(json.dumps(results))

                    if FLAGS.action in ['ppl', 'loglikes']:
                        if len(lines) == 2:
                            # Pairwise mode: both scores and their difference.
                            # Skipped when an entry was rejected above (its
                            # -9999 sentinel has already been printed).
                            if len(results) == 2:
                                print("%.2f, %.2f, %.2f"
                                      % (results[0][1], results[1][1],
                                         results[0][1] - results[1][1]))
                            if FLAGS.debug is True:
                                print(lines)
                        elif results and results[0][1] > FLAGS.threshold:
                            print("%.2f" % results[0][1])
                    elif FLAGS.action == "probs":
                        for j, result in enumerate(results):
                            if FLAGS.debug is True:
                                print(lines[j])
                            out_str = ""
                            test_data = result[0]
                            probs = result[1]
                            count = 0
                            for i, prob in enumerate(probs):
                                # The first position is never reported.
                                if i == 0:
                                    continue
                                if FLAGS.debug is True:
                                    out_str += "(%s %.3f) " % (
                                        inverse_dict[test_data[i]], prob * 100)
                                if prob * 100 < FLAGS.threshold:
                                    count = count + 1
                            if FLAGS.debug is True:
                                print(out_str)
                            if count > 0:
                                # 100.0 forces true division on Python 2 as
                                # well.
                                print("{:.2f}% words below {:.3f} prob".format(
                                    100.0 * count / (len(probs) - 1),
                                    FLAGS.threshold))

            # Whole text processing
            elif test:
                test_perplexity = run_epoch(session, mtest, test_data)
                print("Test Perplexity: %.3f" % test_perplexity)