Example 1
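All three variants below assume the same module-level context. A minimal sketch of the imports and globals Example 1 relies on (the logical_parser module path and the _buckets/subset values are illustrative assumptions; create_model is defined elsewhere in the same script):

import json
import logging
import os
import sys

import numpy as np
import tensorflow as tf

import data_utils  # vocabulary/id helpers referenced below
from logical_parser import logicalParser  # hypothetical module path

FLAGS = tf.app.flags.FLAGS  # data_dir, test_dir, from/to_vocab_size, enable_table_test, ...
_buckets = [(15, 25), (30, 50)]  # illustrative (source, target) bucket sizes
subset = 'geo'                   # illustrative dataset prefix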
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode the train/test files (originally decoded from standard input).
        # changed by Kaifeng, for test
        testTableFile = FLAGS.test_dir + '/test.json'

        offset = 0
        # the test data is the last 20000 items in the table
        if FLAGS.enable_table_test:
            print('loading database table')
            with open(testTableFile) as testTables:
                tables = json.load(testTables)
            answerOutput = open(FLAGS.test_dir + '/answer.out', 'w')

        testQuestionFile = FLAGS.data_dir + '/%s_test.qu.ids1500' % subset
        testLogicFile = FLAGS.data_dir + '/%s_test.lon.ids150' % subset  # For tagging model, Hongyu
        #0530 newly added
        geoQuestionFile = FLAGS.data_dir + '/%s_train.qu.ids1500' % subset
        geoLogicFile = FLAGS.data_dir + '/%s_train.lon.ids150' % subset  # For tagging model, Hongyu
        logicalTemp_geo = open(FLAGS.test_dir + '/%s_train.out' % subset, 'w')

        logicalTemp_test = open(FLAGS.test_dir + '/%s_test.out' % subset, 'w')

        print('======= start testing =======')
        print('=== train dataset ===')
        with tf.gfile.GFile(geoQuestionFile, mode='r') as geoQuestions, \
                tf.gfile.GFile(geoLogicFile, mode='r') as geoLogics:
            q_index = 0
            sentence, logic_sen = geoQuestions.readline(), geoLogics.readline()
            while sentence and logic_sen:
                if q_index % 200 == 0:
                    print("  reading data line %d" % q_index)
                    sys.stdout.flush()
                qid = 'qID_' + str(q_index)
                print('training question: ', qid)
                # Get token-ids for the input sentence.
                token_ids = [int(x) for x in sentence.split()]
                logic_ids = [int(x) for x in logic_sen.split()]
                print(token_ids)
                print(logic_ids)
                # Which bucket does it belong to?
                bucket_id = len(_buckets) - 1
                for i, bucket in enumerate(_buckets):
                    if bucket[0] > len(token_ids) and bucket[1] > len(
                            logic_ids):
                        bucket_id = i
                        break
                else:
                    logging.warning("Sentence truncated: %s", sentence)
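                # (for-else: the warning above fires only when no bucket fits both
                # the question and the logic form; the pre-set last bucket is then
                # used and the sentence is truncated to its size.)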

                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits, _ = model.step(sess, encoder_inputs,
                                                    decoder_inputs,
                                                    target_weights, bucket_id,
                                                    True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                # If there is an EOS symbol in outputs, cut them at that point.
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Map the output ids back to logical-form tokens, replacing
                # out-of-vocabulary ids with 3 (UNK) so the lookup cannot fail.
                print(outputs)
                outputs = [
                    output if output < len(rev_fr_vocab) else 3
                    for output in outputs
                ]
                resultLogical = " ".join(
                    tf.compat.as_str(rev_fr_vocab[output]) for output in outputs)
                if FLAGS.enable_table_test:
                    resultAnswer = logicalParser(tables[qid], resultLogical)
                    answerOutput.write(str(resultAnswer) + '\n')

                logicalTemp_geo.write(str(resultLogical) + '\n')
                q_index += 1
                sentence, logic_sen = geoQuestions.readline(), geoLogics.readline()
        print('=== test dataset ===')
        with tf.gfile.GFile(testQuestionFile, mode='r') as testQuestions, \
                tf.gfile.GFile(testLogicFile, mode='r') as testLogics:
            q_index = 0
            sentence, logic_sen = testQuestions.readline(), testLogics.readline()
            while sentence and logic_sen:
                if q_index % 200 == 0:
                    print("  reading data line %d" % q_index)
                    sys.stdout.flush()
                qid = 'qID_' + str(q_index)
                print('testing question: ', qid)
                # Get token-ids for the input sentence.
                token_ids = [int(x) for x in sentence.split()]
                logic_ids = [int(x) for x in logic_sen.split()]
                # Which bucket does it belong to?
                bucket_id = len(_buckets) - 1
                for i, bucket in enumerate(_buckets):
                    if bucket[0] > len(token_ids) and bucket[1] > len(
                            logic_ids):
                        bucket_id = i
                        break
                else:
                    logging.warning("Sentence truncated: %s", sentence)

                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits, _ = model.step(sess, encoder_inputs,
                                                    decoder_inputs,
                                                    target_weights, bucket_id,
                                                    True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                print(outputs)
                # If there is an EOS symbol in outputs, cut them at that point.
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Map the output ids back to logical-form tokens, replacing
                # out-of-vocabulary ids with 3 (UNK) so the lookup cannot fail.
                outputs = [
                    output if output < len(rev_fr_vocab) else 3
                    for output in outputs
                ]
                resultLogical = " ".join(
                    tf.compat.as_str(rev_fr_vocab[output]) for output in outputs)
                if FLAGS.enable_table_test:
                    resultAnswer = logicalParser(tables[qid], resultLogical)
                    answerOutput.write(str(resultAnswer) + '\n')

                logicalTemp_test.write(str(resultLogical) + '\n')
                q_index += 1
                sentence, logic_sen = testQuestions.readline(), testLogics.readline()
        logicalTemp_geo.close()
        # logicalTemp_train.close()
        # logicalTemp_dev.close()
        logicalTemp_test.close()
        if FLAGS.enable_table_test:
            answerOutput.close()
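The id-to-token postprocessing (greedy argmax, EOS truncation, OOV guard, string join) recurs in every loop of all three variants. A minimal helper that factors it out, as a sketch only: the name ids_to_logical is hypothetical, and eos_id=2 / unk_id=3 follow the usual data_utils constants assumed by the code above.

def ids_to_logical(output_logits, rev_vocab, eos_id=2, unk_id=3):
    """Greedily decode per-step logits into a logical-form string."""
    # Greedy decoder: each step's output is the argmax over the vocabulary axis.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # Cut the sequence at the first EOS symbol, if any.
    if eos_id in outputs:
        outputs = outputs[:outputs.index(eos_id)]
    # Guard against ids that fall outside the reverse vocabulary.
    outputs = [o if o < len(rev_vocab) else unk_id for o in outputs]
    return " ".join(tf.compat.as_str(rev_vocab[o]) for o in outputs)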
Example 2
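Example 2 is the tagging-model variant: questions and tags are read as raw text and tokenized with data_utils_tag.sentence_to_token_ids, and model.get_batch/model.step take an extra per-token tag input. Besides the imports sketched under Example 1, it assumes the tagging counterpart of the data helpers (module name taken from the code):

import data_utils_tag  # tagging-model variant of data_utils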
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils_tag.initialize_vocabulary(en_vocab_path)
        fr_vocab, rev_fr_vocab = data_utils_tag.initialize_vocabulary(
            fr_vocab_path)

        # Decode the train/dev/test files (originally decoded from standard input).
        # changed by Kaifeng, for test
        offset = 0
        # the test data is the last 20000 items in the table
        testTableFile = FLAGS.test_dir + '/test.json'
        if FLAGS.enable_table_test:
            print('loading database table')
            with open(testTableFile) as testTables:
                tables = json.load(testTables)
            answerOutput = open(FLAGS.test_dir + '/answer.out', 'w')

        trainQuestionFile = FLAGS.data_dir + '/rand_train.qu'
        trainTagFile = FLAGS.data_dir + '/rand_train.ta'  # For tagging model, Hongyu
        devQuestionFile = FLAGS.data_dir + '/rand_dev.qu'
        devTagFile = FLAGS.data_dir + '/rand_dev.ta'  # For tagging model, Hongyu
        testQuestionFile = FLAGS.data_dir + '/rand_test.qu'
        testTagFile = FLAGS.data_dir + '/rand_test.ta'  # For tagging model, Hongyu

        #0530 newly added
        geoQuestionFile = FLAGS.data_dir + '/GeoQuery/geo880.qu'
        geoTagFile = FLAGS.data_dir + '/GeoQuery/geo880.ta'  # For tagging model, Hongyu
        logicalTemp_geo = open(FLAGS.test_dir + '/logicalTemp_geo.out', 'w')

        logicalTemp_train = open(FLAGS.test_dir + '/logicalTemp_train.out',
                                 'w')
        logicalTemp_dev = open(FLAGS.test_dir + '/logicalTemp_dev.out', 'w')
        logicalTemp_test = open(FLAGS.test_dir + '/logicalTemp_test.out', 'w')

        ### evaluating tagging model, Hongyu

        print('======= start testing =======')
        print('=== train dataset ===')
        with open(trainQuestionFile, 'r') as trainQuestions:
            with open(trainTagFile, 'r') as trainTags:
                q_index = 0
                sentence, tag_sen = trainQuestions.readline(), trainTags.readline()
                while sentence and tag_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('training question: ', qid)
                    # Get token-ids for the input sentence.
                    token_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    tag_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(tag_sen), fr_vocab)
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] >= len(token_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, _ = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    # Map the output ids back to logical-form tokens.
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_train.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen = trainQuestions.readline(), trainTags.readline()

        print('=== dev dataset ===')
        with open(devQuestionFile, 'r') as devQuestions:
            with open(devTagFile, 'r') as devTags:
                q_index = 0
                sentence, tag_sen = devQuestions.readline(), devTags.readline()
                while sentence and tag_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('dev question: ', qid)
                    # Get token-ids for the input sentence.
                    token_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    tag_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(tag_sen), fr_vocab)
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] >= len(token_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, _ = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    # Map the output ids back to logical-form tokens.
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_dev.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen = devQuestions.readline(), devTags.readline()

        print('=== test dataset ===')
        with open(testQuestionFile, 'r') as testQuestions:
            with open(testTagFile, 'r') as testTags:
                q_index = 0
                sentence, tag_sen = testQuestions.readline(), testTags.readline()
                while sentence and tag_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('testing question: ', qid)
                    # Get token-ids for the input sentence.
                    token_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    tag_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(tag_sen), fr_vocab)
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] >= len(token_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)

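                    # (The fourth value returned by model.step below is treated as a
                    # matrix of shape (1, target_len, source_len); the exact semantics
                    # depend on the model implementation. Each target position is
                    # appended to confusion_matrix.txt as one comma-separated row.)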
                    # Get output logits for the sentence and CONFUSION matrix. # 0531 newly added
                    _, _, output_logits, confusion_matrix = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)
                    confusion_path = os.path.join("./PCA-visual/",
                                                  "confusion_matrix.txt")
                    with open(confusion_path, 'a+') as f_con:
                        f_con.write('*** example: ' + str(q_index) + ' ***\n')
                        for row in range(confusion_matrix.shape[1]):
                            words = [str(x) for x in confusion_matrix[0][row]]
                            f_con.write(','.join(words) + '\n')

                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    # Map the output ids back to logical-form tokens.
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_test.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen = testQuestions.readline(), testTags.readline()

        print('=== geo dataset ===')
        with open(geoQuestionFile, 'r') as geoQuestions:
            with open(geoTagFile, 'r') as geoTags:
                q_index = 0
                sentence, tag_sen = geoQuestions.readline(), geoTags.readline()
                while sentence and tag_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('geo question: ', qid)
                    # Get token-ids for the input sentence.
                    token_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    tag_ids = data_utils_tag.sentence_to_token_ids(
                        tf.compat.as_bytes(tag_sen), fr_vocab)
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] >= len(token_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, _ = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    # Map the output ids back to logical-form tokens.
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_geo.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen = geoQuestions.readline(), geoTags.readline()
        logicalTemp_geo.close()
        logicalTemp_train.close()
        logicalTemp_dev.close()
        logicalTemp_test.close()
        if FLAGS.enable_table_test:
            answerOutput.close()
Example 3
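Example 3 reads pre-tokenized id files (*.ids1500/*.ids150: one space-separated list of integer token ids per line) instead of tokenizing raw text, and opens them through gfile directly, which in TF 1.x is importable as:

from tensorflow.python.platform import gfile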
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils_tag.initialize_vocabulary(en_vocab_path)
        fr_vocab, rev_fr_vocab = data_utils_tag.initialize_vocabulary(
            fr_vocab_path)

        # Decode the train/test id files (originally decoded from standard input).
        # changed by Kaifeng, for test
        offset = 0
        # the test data is the last 20000 items in the table
        testTableFile = FLAGS.test_dir + '/test.json'
        if FLAGS.enable_table_test:
            print('loading database table')
            with open(testTableFile) as testTables:
                tables = json.load(testTables)
            answerOutput = open(FLAGS.test_dir + '/answer.out', 'w')

        # trainQuestionFile = FLAGS.data_dir + '/rand_train.qu'
        # trainTagFile = FLAGS.data_dir + '/rand_train.ta'   # For tagging model, Hongyu
        # devQuestionFile = FLAGS.data_dir + '/rand_dev.qu'
        # devTagFile = FLAGS.data_dir + '/rand_dev.ta'   # For tagging model, Hongyu
        testQuestionFile = FLAGS.data_dir + '/%s_test.qu.ids1500' % subset
        testTagFile = FLAGS.data_dir + '/%s_test.ta.ids150' % subset  # For tagging model, Hongyu
        testLogicFile = FLAGS.data_dir + '/%s_test.lox.ids150' % subset  # For tagging model, Hongyu
        #0530 newly added
        geoQuestionFile = FLAGS.data_dir + '/%s_train.qu.ids1500' % subset
        geoTagFile = FLAGS.data_dir + '/%s_train.ta.ids150' % subset  # For tagging model, Hongyu
        geoLogicFile = FLAGS.data_dir + '/%s_train.lox.ids150' % subset  # For tagging model, Hongyu
        logicalTemp_geo = open(FLAGS.test_dir + '/%s_train.out' % subset, 'w')

        # logicalTemp_train = open(FLAGS.test_dir + '/logicalTemp_train.out', 'w')
        # logicalTemp_dev = open(FLAGS.test_dir + '/logicalTemp_dev.out', 'w')
        logicalTemp_test = open(FLAGS.test_dir + '/%s_test.out' % subset, 'w')

        ### evaluating tagging model, Hongyu

        print('======= start testing =======')
        print('=== testing dataset ===')
        with gfile.GFile(testQuestionFile, mode='r') as testQuestions, \
                gfile.GFile(testLogicFile, mode='r') as testLogics:
            with gfile.GFile(testTagFile, mode='r') as testTags:
                q_index = 0
                sentence, tag_sen, logic_sen = (testQuestions.readline(),
                                                testTags.readline(),
                                                testLogics.readline())
                while sentence and tag_sen and logic_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('testing question: ', qid)
                    # Get token-ids for the input sentence.
                    # token_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
                    # tag_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(tag_sen), fr_vocab)
                    # logic_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(logic_sen), fr_vocab)
                    token_ids = [int(x) for x in sentence.split()]
                    tag_ids = [int(x) for x in tag_sen.split()]
                    logic_ids = [int(x) for x in logic_sen.split()]
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] > len(token_ids) and bucket[1] > len(
                                logic_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)

                    # Get output logits for the sentence and CONFUSION matrix. # 0531 newly added
                    _, _, output_logits, confusion_matrix = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)

                    # Newly modified 0624: a Constraint-Greedy decoder was tried here
                    # (commented out below); the active code is the plain greedy
                    # decoder - outputs are just argmaxes of output_logits.
                    # resultLogical = []
                    # for i in range(len(output_logits)):
                    #   output = int(np.argmax(output_logits[i], axis=1))
                    # Constraint 1: advanced ending
                    # if i < len(logic_ids)-1 and output == data_utils_tag.EOS_ID:
                    #   output = int(np.argmax(output_logits[i][:,data_utils_tag.EOS_ID+1:], axis=1)) + data_utils_tag.EOS_ID+1
                    #   if i == 0:
                    #     prev_idx = output
                    #     if output >= len(rev_fr_vocab):
                    #       output = data_utils_tag.UNK_ID
                    #     prev = tf.compat.as_str(rev_fr_vocab[output])
                    #     resultLogical.append(prev)
                    #   else: # i>0
                    #     if str(prev) in ['equal','less','greater','neq','nl','ng']:
                    #       # Constraint 2: after 'equal' should be 'value'
                    #       output = int(np.argmax(output_logits[i][:,5:17], axis=1)) + 5
                    #     if output == 2: #data_utils_tag.EOS_ID:
                    #       if i < len(logic_ids)-1:
                    #         output = int(np.argmax(output_logits[i][:,3:], axis=1)) + 3
                    #       else:
                    #         break
                    #     prev_idx = output
                    #     if output >= len(rev_fr_vocab):
                    #       output = data_utils_tag.UNK_ID
                    #     prev = tf.compat.as_str(rev_fr_vocab[output])
                    #     resultLogical.append(prev)
                    # if str(resultLogical[-1]) in ['equal','less','greater','neq','nl','ng']:
                    #   resultLogical.append(resultLogical[-2])
                    # # Constraint 3, formats
                    # resultLogical = " ".join(resultLogical)
                    # resultLogical = resultLogical.replace('<field>:1 equal <field>:1', '<field>:1')
                    # resultLogical = resultLogical.replace('<value>:1 where <field>', '<value>:1 and <field>')
                    # resultLogical = resultLogical.replace('and where', 'and')
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_test.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen, logic_sen = (testQuestions.readline(),
                                                    testTags.readline(),
                                                    testLogics.readline())

        print('=== train dataset ===')
        with gfile.GFile(geoQuestionFile, mode='r') as geoQuestions, \
                gfile.GFile(geoLogicFile, mode='r') as geoLogics:
            with gfile.GFile(geoTagFile, mode='r') as geoTags:
                q_index = 0
                sentence, tag_sen, logic_sen = (geoQuestions.readline(),
                                                geoTags.readline(),
                                                geoLogics.readline())
                while sentence and tag_sen and logic_sen:
                    if q_index % 200 == 0:
                        print("  reading data line %d" % q_index)
                        sys.stdout.flush()
                    qid = 'qID_' + str(q_index)
                    print('training question: ', qid)
                    # Get token-ids for the input sentence.
                    # token_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
                    # tag_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(tag_sen), fr_vocab)
                    # logic_ids = data_utils_tag.sentence_to_token_ids(tf.compat.as_bytes(logic_sen), fr_vocab)
                    token_ids = [int(x) for x in sentence.split()]
                    tag_ids = [int(x) for x in tag_sen.split()]
                    logic_ids = [int(x) for x in logic_sen.split()]
                    # Which bucket does it belong to?
                    bucket_id = len(_buckets) - 1
                    for i, bucket in enumerate(_buckets):
                        if bucket[0] > len(token_ids) and bucket[1] > len(
                                logic_ids):
                            bucket_id = i
                            break
                    else:
                        logging.warning("Sentence truncated: %s", sentence)

                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, tag_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, tag_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, _ = model.step(
                        sess, encoder_inputs, tag_inputs, decoder_inputs,
                        target_weights, bucket_id, True)
                    # Newly modified 0624: a Constraint-Greedy decoder was tried here
                    # (commented out below); the active code is the plain greedy
                    # decoder - outputs are just argmaxes of output_logits.
                    # resultLogical = []
                    # for i in range(len(output_logits)):
                    #   output = int(np.argmax(output_logits[i], axis=1))
                    # Constraint 1: advanced ending
                    # if i < len(logic_ids)-1 and output == data_utils_tag.EOS_ID:
                    #   output = int(np.argmax(output_logits[i][:,data_utils_tag.EOS_ID+1:], axis=1)) + data_utils_tag.EOS_ID+1
                    #   if i == 0:
                    #     prev_idx = output
                    #     if output >= len(rev_fr_vocab):
                    #       output = data_utils_tag.UNK_ID
                    #     prev = tf.compat.as_str(rev_fr_vocab[output])
                    #     resultLogical.append(prev)
                    #   else: # i>0
                    #     if str(prev) in ['equal','less','greater','neq','nl','ng']:
                    #       # Constraint 2: after 'equal' should be 'value'
                    #       output = int(np.argmax(output_logits[i][:,5:17], axis=1)) + 5
                    #     if output == 2: #data_utils_tag.EOS_ID:
                    #       if i < len(logic_ids)-1:
                    #         output = int(np.argmax(output_logits[i][:,3:], axis=1)) + 3
                    #       else:
                    #         break
                    #     prev_idx = output
                    #     if output >= len(rev_fr_vocab):
                    #       output = data_utils_tag.UNK_ID
                    #     prev = tf.compat.as_str(rev_fr_vocab[output])
                    #     resultLogical.append(prev)
                    # if str(resultLogical[-1]) in ['equal','less','greater','neq','nl','ng']:
                    #   resultLogical.append(resultLogical[-2])
                    # # Constraint 3, formats
                    # resultLogical = " ".join(resultLogical)
                    # resultLogical = resultLogical.replace('<field>:1 equal <field>:1', '<field>:1')
                    # resultLogical = resultLogical.replace('<value>:1 where <field>', '<value>:1 and <field>')
                    # resultLogical = resultLogical.replace('and where', 'and')
                    outputs = [
                        int(np.argmax(logit, axis=1))
                        for logit in output_logits
                    ]
                    if data_utils_tag.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils_tag.EOS_ID)]
                    resultLogical = " ".join([
                        tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs
                    ])
                    if FLAGS.enable_table_test:
                        resultAnswer = logicalParser(tables[qid],
                                                     resultLogical)
                        answerOutput.write(str(resultAnswer) + '\n')

                    logicalTemp_geo.write(str(resultLogical) + '\n')
                    q_index += 1
                    sentence, tag_sen, logic_sen = (geoQuestions.readline(),
                                                    geoTags.readline(),
                                                    geoLogics.readline())
        logicalTemp_geo.close()
        # logicalTemp_train.close()
        # logicalTemp_dev.close()
        logicalTemp_test.close()
        if FLAGS.enable_table_test:
            answerOutput.close()