Example #1
def dump_data(path):
    import os
    import shutil
    import sentence_cleaner

    ## start from an empty output directory
    if os.path.isdir(path): shutil.rmtree(path)
    os.mkdir(path)

    for problem in task.problems:
        ## one line per sentence: the sentence itself, its document id,
        ## and a 0/1 flag marking paragraph-initial sentences
        sent_file = '%s/%s.sent' % (path, problem.id)
        doc_file = '%s/%s.doc' % (path, problem.id)
        par_file = '%s/%s.par' % (path, problem.id)
        sent_fh = open(sent_file, 'w')
        doc_fh = open(doc_file, 'w')
        par_fh = open(par_file, 'w')
        for doc in problem.new_docs:
            for sent in doc.sentences:

                ## skip sentences that look like parse failures
                if sent.original[0:2].islower():
                    print('bad parse:', sent.original)
                    continue
                ## clean document-initial sentences more aggressively
                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(sent.original)
                else:
                    cleaned = sentence_cleaner.clean(sent.original)

                sent_fh.write('%s\n' % cleaned)
                doc_fh.write('%s\n' % doc.id)
                par_fh.write('%d\n' % int(sent.paragraph_starter))
        sent_fh.close()
        doc_fh.close()
        par_fh.close()

        ## query file: title on the first line, narrative on the second
        query_file = '%s/%s.query' % (path, problem.id)
        query_fh = open(query_file, 'w')
        query_fh.write('%s\n' % problem.title)
        query_fh.write('%s\n' % problem.narr)
        query_fh.close()

        ## gold summary sentences paired with their annotator ids
        gold_file = '%s/%s.gold_sent' % (path, problem.id)
        gold_doc_file = '%s/%s.gold_doc' % (path, problem.id)
        gold_fh = open(gold_file, 'w')
        gold_doc_fh = open(gold_doc_file, 'w')
        for ann, sents in problem.training.items():
            for sent in sents:
                gold_fh.write('%s\n' % sent)
                gold_doc_fh.write('%s\n' % ann)
        gold_fh.close()
        gold_doc_fh.close()
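
dump_data() above and test() in the next example both rely on a sentence_cleaner module and a module-level task object defined elsewhere. As a reading aid, here is a hypothetical minimal stand-in for the cleaner interface; the two function names come from the calls above, but the bodies are illustrative assumptions, not the project's actual cleaning rules:

## hypothetical sentence_cleaner stand-in, for illustration only
import re

def clean(text):
    ## light cleanup: collapse runs of whitespace
    return re.sub(r'\s+', ' ', text).strip()

def clean_aggressive(text):
    ## heavier cleanup for document-initial sentences: additionally strip
    ## a dateline-style prefix such as 'WASHINGTON (AP) --'
    text = re.sub(r'^[^a-z]*\([^)]*\)\s*[-_]+\s*', '', text)
    return clean(text)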
Example #2
def test():
    import sentence_cleaner
    ## compare token counts with no cleaning (total), regular cleaning
    ## (reg), and aggressive cleaning of document-initial sentences (agg)
    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                if sent.original[:1].islower(): print('**', sent.original)
                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    agg += len(cleaned.split())
                    reg += len(sentence_cleaner.clean(sent.original).split())
                else:
                    cleaned = sentence_cleaner.clean(sent.original)
                    agg += len(cleaned.split())
                    reg += len(cleaned.split())
                total += len(sent.original.split())
                ## show only the sentences the cleaner actually changed
                if sent.original == cleaned: continue
                print(sent.original)
                print(cleaned)
                print('----------')
            print('+++')
    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))
Example #3
import json
import os
import pickle
import sys

import tensorflow as tf

# FLAGS, ACTIONS, LOSS_FCTS, Config, Model, get_config, run_epoch,
# _restore_session, reader, util, and sentence_cleaner are defined
# elsewhere in this module.

def main(_):
    assert FLAGS.action in ACTIONS
    assert FLAGS.loss in LOSS_FCTS

    loss_fct = FLAGS.loss
    action = FLAGS.action

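    # Each action selects one of three execution modes.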
    train = action in ["train", "continue"]
    test = action == "test"
    linebyline = action in ["ppl", "loglikes", "predict", "probs", "export"]

    util.mkdirs(FLAGS.model_dir)

    if not (FLAGS.data_path or linebyline):
        raise ValueError("Must set --data_path to data directory")

    config = get_config()

    word_to_id_path = os.path.join(FLAGS.model_dir, "word_to_id")
    if action != "train":
        # Binary mode so the pickle round-trips under Python 3.
        with open(word_to_id_path, 'rb') as f:
            word_to_id = pickle.load(f)

    else:
        word_to_id = None
        config.epoch = 1
        config.step = 0

    # Read the fast_test option, which is enabled by 'transpose.py'.
    fast_test = False
    if "fast_test" in config.__dict__:
        # Make sure we store a plain boolean.
        fast_test = bool(config.fast_test)

    eval_config = Config(clone=config)
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    # Load the corpus unless input arrives line by line on stdin.
    if not linebyline:
        raw_data = reader.raw_data(FLAGS.data_path,
                                   training=train,
                                   word_to_id=word_to_id)
        train_data, valid_data, test_data, word_to_id = raw_data
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        # Defining model(s)
        if train:
            # Save word_to_id to file (binary mode for Python 3 pickles);
            # the config itself is saved later via config.save().
            with open(word_to_id_path, 'wb') as f:
                pickle.dump(word_to_id, f)

            with tf.name_scope("Train"):
                with tf.variable_scope("Model",
                                       reuse=False,
                                       initializer=initializer):
                    m = Model(is_training=True,
                              config=config,
                              loss_fct=loss_fct)
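                # tf.scalar_summary is the pre-TF-1.0 summary API.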
                tf.scalar_summary("Training Loss", m.cost)
                tf.scalar_summary("Learning Rate", m.lr)

            with tf.name_scope("Valid"):
                with tf.variable_scope("Model",
                                       reuse=True,
                                       initializer=initializer):
                    mvalid = Model(is_training=False, config=config)
                tf.scalar_summary("Validation Loss", mvalid.cost)

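        # Reuse the variables created by the Train/Valid models above when
        # training; otherwise this scope creates them fresh.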
        with tf.name_scope("Test"):
            with tf.variable_scope("Model",
                                   reuse=train,
                                   initializer=initializer):
                mtest = Model(is_training=False,
                              config=eval_config,
                              test_opti=fast_test)

        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init_op)
            if train:
                config.save()
                if action == "continue":
                    session = _restore_session(saver, session)

                saver = None if FLAGS.nosave else saver
                print("Starting training from epoch %d using %s" %
                      (config.epoch, loss_fct))

                while config.epoch <= config.max_max_epoch:
                    i = config.epoch
                    lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)

                    print("Epoch: %d Learning rate: %.3f" %
                          (i, session.run(m.lr)))
                    train_perplexity = run_epoch(session,
                                                 m,
                                                 train_data,
                                                 eval_op=m.train_op,
                                                 verbose=True,
                                                 saver=saver,
                                                 log=FLAGS.log)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i, train_perplexity))

                    valid_perplexity = run_epoch(session, mvalid, valid_data)
                    print("Epoch: %d Valid Perplexity: %.3f" %
                          (i, valid_perplexity))

                    config.step = 0
                    config.epoch += 1
                    config.save()

            else:
                session = _restore_session(saver, session)

                if FLAGS.action == "export":
                    tf.train.write_graph(session.graph_def, FLAGS.model_dir,
                                         'graph.pb')
                    sys.exit()

                # Line-by-line processing (ppl, loglikes, predict, probs)
                if linebyline:
                    # Build the id->word map once, not once per input.
                    inverse_dict = dict(
                        zip(word_to_id.values(), word_to_id.keys()))
                    while True:
                        # Each stdin line may carry several tab-separated
                        # inputs.
                        lines = sys.stdin.readline()
                        if not lines: break

                        lines = lines.strip().split('\t')
                        results = []
                        for line in lines:
                            test_data = sentence_cleaner.clean(
                                line, word_to_id)

                            if len(test_data) < 2:
                                print(-9999)
                                continue

                            result = run_epoch(session,
                                               mtest,
                                               test_data,
                                               idict=inverse_dict,
                                               action=FLAGS.action)
                            results.append((test_data, result))

                        if FLAGS.action == 'predict':
                            print(json.dumps(results))

                        if FLAGS.action in ['ppl', 'loglikes']:
                            # With two tab-separated inputs, also report
                            # the difference of their scores.
                            if len(lines) == 2:
                                print("%.2f, %.2f, %.2f" %
                                      (results[0][1], results[1][1],
                                       results[0][1] - results[1][1]))
                                if FLAGS.debug:
                                    print(lines)
                            else:
                                if results[0][1] > FLAGS.threshold:
                                    print("%.2f" % results[0][1])

                        elif FLAGS.action == "probs":
                            for j, result in enumerate(results):

                                if FLAGS.debug:
                                    print(lines[j])

                                out_str = ""
                                test_data = result[0]
                                probs = result[1]
                                count = 0
                                for i, prob in enumerate(probs):
                                    if i == 0:
                                        continue

                                    if FLAGS.debug:
                                        out_str += "(%s %.3f) " % (
                                            inverse_dict[test_data[i]],
                                            prob * 100)

                                    if prob * 100 < FLAGS.threshold:
                                        count += 1

                                if FLAGS.debug:
                                    print(out_str)

                                if count > 0:
                                    print("{:.2f}% words below {:.3f} prob".
                                          format(
                                              (count / (len(probs) - 1) * 100),
                                              FLAGS.threshold))

                # Whole-text processing
                elif test:
                    test_perplexity = run_epoch(session, mtest, test_data)
                    print("Test Perplexity: %.3f" % test_perplexity)