Example #1
def generate(middle_sentence, forwards_sentence, backwards_sentence):
    '''
    Generates forwards and backwards sentences given a middle sentence.
    Args:
      middle_sentence: middle sentence (not tokenized)
      forwards_sentence: preceding sentence (not tokenized)
      backwards_sentence: following sentence (not tokenized)
    '''
    train_path, vocab_path, train_ids_path = data_utils.prepare_skip_thought_data(
        FLAGS.data_dir, FLAGS.train_data_name, FLAGS.vocab_size)

    with tf.Session() as sess:
        m = SkipThoughtModel(FLAGS.vocab_size, max_sentence_len=FLAGS.max_sentence_len,
                             batch_size=FLAGS.batch_size,
                             learning_rate=FLAGS.learning_rate,
                             learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
                             encoder_cell_size=FLAGS.encoder_cell_size,
                             word_embedding_size=FLAGS.word_embedding_size,
                             decoder_cell_size=FLAGS.decoder_cell_size,
                             max_gradient_norm=FLAGS.max_gradient_norm,
                             initial_decoder_state=None)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            m.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No model found")
            return

        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        tokenized_middle_sentence = data_utils.sentence_to_token_ids(
            middle_sentence, vocab)
        tokenized_forwards_sentence = data_utils.sentence_to_token_ids(
            forwards_sentence, vocab)
        tokenized_backwards_sentence = data_utils.sentence_to_token_ids(
            " ".join(reversed(backwards_sentence.split())), vocab)

        forwards_batch_logits, backwards_batch_logits = m.step(
            sess,
            [m.forwards_batch_logits_tensor, m.backwards_batch_logits_tensor],
            *m.prep_data([tokenized_middle_sentence],
                         [tokenized_forwards_sentence],
                         [tokenized_backwards_sentence]))

        forwards_logits = forwards_batch_logits[:, 0, :]
        backwards_logits = backwards_batch_logits[:, 0, :]
        print(forwards_logits)
        print(forwards_logits.shape)

        # Greedy decoding: argmax each step, then look the id up in rev_vocab.
        # New names avoid shadowing the input arguments.
        generated_forwards = [rev_vocab[int(np.argmax(logit))] for logit in forwards_logits]
        generated_backwards = [rev_vocab[int(np.argmax(logit))] for logit in backwards_logits]

        print("Generated Forwards Sentence")
        print(" ".join(generated_forwards))
        print("Generated Backwards Sentence")
        print(" ".join(generated_backwards))
Example #2
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[: outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([rev_fr_vocab[output] for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
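The bucket lookup in this decoder picks the smallest bucket whose source side is longer than the tokenized input. A standalone sketch of the same selection rule, using the bucket sizes from the classic translate tutorial:

# (source_length, target_length) pairs, as in the translate tutorial.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

def pick_bucket(token_ids):
    # Smallest bucket whose source side can hold the sentence.
    candidates = [b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)]
    if not candidates:
        raise ValueError("sentence longer than the largest bucket")
    return min(candidates)

print(pick_bucket([1, 2, 3]))        # -> 0, fits in the (5, 10) bucket
print(pick_bucket(list(range(12))))  # -> 2, needs the (20, 25) bucket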
Example #3
def decode_from_stdin(show_all_n_best=False, FLAGS=None, buckets=None):

    assert FLAGS is not None
    assert buckets is not None

    # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # Create model and load parameters.
        model = create_seq2seq_model(sess, True, FLAGS, buckets, translate=True)

        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.source_lang)

        target_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.target_lang)

        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:

            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)

            # Get output logits for the sentence.
            output_hypotheses, output_scores = model.translation_step(sess, token_ids, beam_size=FLAGS.beam_size, dump_remaining=False)

            outputs = []
            scores = []

            # Keep only hypotheses that contain an EOS symbol, truncated there.
            # Dropping a hypothesis without dropping its score would misalign
            # the two lists, so they are filtered together.
            for x, score in zip(output_hypotheses, output_scores):
                try:
                    outputs.append(x[:x.index(data_utils.EOS_ID)])
                    scores.append(score)
                except ValueError:
                    pass

            output_hypotheses = outputs

            # print translations
            if show_all_n_best:
                for x in xrange(len(outputs)):
                    # Print each hypothesis with its probability.
                    print(str(numpy.exp(-scores[x])) + "\t" + " ".join([rev_tgt_vocab[output] for output in outputs[x]]))
            else:
                # Print only the best hypothesis.
                print(str(numpy.exp(-scores[0])) + "\t" + " ".join([rev_tgt_vocab[output] for output in outputs[0]]))

            # wait for a new sentence to translate
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
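The scores returned alongside the hypotheses are negative log-probabilities, which is why the print statements convert them with numpy.exp(-score). A toy illustration of that conversion:

import numpy as np

# Hypothetical beam-search scores: negative log-probabilities, lower is better.
output_scores = [0.22, 1.61, 2.30]
for score in output_scores:
    print("score %.2f -> probability %.2f" % (score, np.exp(-score)))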
Example #4
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.tgt" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.src" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode the dev-set file rather than standard input.
    with open('./narou/narou_dev.src.txt', 'r') as f:
        lines = f.read().split('\n')
        print(lines)
    for sentence in lines:
      print(sentence)
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      print(token_ids)
      # Which bucket does it belong to?
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
      except ValueError:
        # The sentence is longer than the largest bucket; skip it.
        continue
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]

      # Print out the decoded sentence corresponding to outputs.
      print("ANS:>", " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      continue
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example #5
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        ast_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.ast" % FLAGS.ast_vocab_size)
        nl_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.nl" % FLAGS.nl_vocab_size)
        ast_vocab, _ = data_utils.initialize_vocabulary(ast_vocab_path)
        _, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), ast_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the natural-language sentence corresponding to outputs.
            print(" ".join([
                tf.compat.as_str(rev_nl_vocab[output]) for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
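Instead of min(), this example uses Python's for/else to pick a bucket: the else branch runs only when the loop finishes without break, i.e. when no bucket is big enough and the sentence will be truncated. A compact demonstration of the pattern:

_buckets = [(5, 10), (10, 15), (20, 25)]
token_ids = list(range(30))  # longer than every bucket

bucket_id = len(_buckets) - 1
for i, bucket in enumerate(_buckets):
    if bucket[0] >= len(token_ids):
        bucket_id = i
        break
else:
    # Reached only when the loop never hit break.
    print("sentence truncated, falling back to bucket", bucket_id)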
Example #6
def decode():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        enc_vocab_path = os.path.join(working_directory,
                                      "vocab%d.enc" % enc_vocab_size)
        dec_vocab_path = os.path.join(working_directory,
                                      "vocab%d.dec" % dec_vocab_size)

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # 'lines' is assumed to be a module-level list of known inputs.
            if sentence[:-1] in lines:
                temp_output = " ".join([
                    tf.compat.as_str(rev_dec_vocab[output])
                    for output in outputs
                ])
                trigger_check = trigger_activator(temp_output)
                if trigger_check:
                    print(" ".join([
                        tf.compat.as_str(rev_dec_vocab[output])
                        for output in outputs[:-1]
                    ]))
                else:
                    print(temp_output)
            else:
                print("I don't understand you")
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #7
def scorer():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    test_file = os.path.join(gConfig['encoder_test_file'])
    test_captions = open(test_file,'r').readlines()

    model.batch_size = 1  # We decode one sentence at a time.
    output_captions = []
    for sentence in test_captions:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      token_ids = token_ids[:40]
      try:
        bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] >= len(token_ids)])
      except ValueError:
        # Sentence length greater than the largest bucket size.
        pdb.set_trace()
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)

      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Join the output tokens into a caption.
      output_caption = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
      output_captions.append(output_caption)
    # pdb.set_trace()
    gt_info = pkl.load(open(test_file[:-4]+'.pkl','r'))

    id_list = gt_info['ids']
    gt_dict = gt_info['gt']
    pred_dict = {idx: [{'image_id':idx,'caption':sent}] for idx, sent in enumerate(output_captions)}
    with open(gConfig['result_dir']+gConfig['encoder_test_file'].split('/')[-1][:-4]+'_output.txt','w') as f:
        for caption in output_captions:
            f.write(caption + '\n')
    scorer = COCOScorer()
    total_score = scorer.score(gt_dict, pred_dict, id_list)
Example #8
def encode():
    """Encode all of the sentences to vector form"""
    train, dev, test = loader.getData()
    sentences = []
    tokens = []

    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)

    # Collect all the training sentences
    for i, row in pd.concat((train, test)).iterrows():
        if isinstance(row["sentence1"], basestring) and isinstance(
                row["sentence2"], basestring):
            sentences.append(row["sentence1"])
            sentences.append(row["sentence2"])

    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)

    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode 64 sentence at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs,
                                              decoder_inputs, target_weights,
                                              bucket_id)
                features = np.hstack(contexts)
                print 'Extracted another set of features with shape:', features.shape
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
    print sentence
    print mapped[sentence]
    print "Saving sentences to %s" % JSON_NAME
    with open(JSON_NAME, 'w') as file:
        json.dump(mapped, file)
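This example batches sentences with two small helpers, get_bucket and chunker. A chunker consistent with how it is called here (the name and behavior are inferred from the snippet, not taken from a library) can be as simple as:

def chunker(seq, size):
    # Yield successive fixed-size slices; the final slice may be shorter.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

print(list(chunker([1, 2, 3, 4, 5], 2)))  # -> [[1, 2], [3, 4], [5]]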
Example #9
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.dec" % gConfig['dec_vocab_size'])

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            #print(token_ids)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out corresponding response
            print(" ".join([
                tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #10
    def train(self, pretrained_model_path=None):
        # get captions and feats
        captions = data_utils.get_some_captions(5000)
        # shape:[5000, 192, 512]
        feats = data_utils.get_features(FEATURES_PATH)

        maxlen = self.n_time_step

        # get word2ix, ixtoword dictionary
        word2ix, ixtoword = data_utils.initialize_vocabulary(VOCAB_PATH)

        learning_rate = self.learning_rate
        n_words = len(word2ix)

        sess = tf.InteractiveSession()

        loss, context, sentence = self.build_model()
        saver = tf.train.Saver(max_to_keep=25)

        train_op = self.optimizer(learning_rate, loss)
        tf.initialize_all_variables().run()

        if pretrained_model_path is not None:
            print("Starting with pretrained model")
            saver.restore(sess, pretrained_model_path)

        for epoch in range(self.epochs):
            for start, end in zip(range(0, len(captions), self.batch_size),
                                  range(self.batch_size, len(captions), self.batch_size)):
                current_feats = feats[start:end]
                current_feats = current_feats.reshape(-1, self.D, self.L).swapaxes(1, 2)

                current_captions = captions[start:end]

                current_captions_ind = []
                for caption in current_captions:
                    caption2id = data_utils.sentence_to_token_ids(caption, word2ix)
                    # Wrap with GO/EOS and pad to maxlen; <= maxlen - 2 ensures
                    # the two markers still fit within the fixed length.
                    if len(caption2id) <= maxlen - 2:
                        caption2id = [data_utils.GO_ID] + caption2id + [data_utils.EOS_ID]
                        caption2id = caption2id + [data_utils.PAD_ID] * (maxlen - len(caption2id))
                    current_captions_ind.append(caption2id)

                current_captions_ind = np.asarray(current_captions_ind)

                _, loss_value = sess.run([train_op, loss], feed_dict={
                    context: current_feats,
                    sentence: current_captions_ind
                })

                print("Epoch:%d, Current loss:" % epoch, loss_value)
            saver.save(sess, MODEL_PATH, global_step=epoch)
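The caption preparation above wraps each id sequence in GO/EOS markers and pads it to the fixed time-step length. A minimal sketch of that layout, assuming the tutorial-style special ids (PAD=0, GO=1, EOS=2):

# Special token ids as defined in tutorial-style data_utils (assumed here).
PAD_ID, GO_ID, EOS_ID = 0, 1, 2

def pad_caption(ids, maxlen):
    ids = [GO_ID] + ids + [EOS_ID]
    return ids + [PAD_ID] * (maxlen - len(ids))

print(pad_caption([7, 8, 9], 8))  # -> [1, 7, 8, 9, 2, 0, 0, 0]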
Example #11
def multi_test():
    """generate paraphrasing sentences for multiple input sentences."""
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # test_file = os.path.join(gConfig['encoder_test_file'])
        test_dir = 'data/top5/'
        output_dir = 'data/top5/'
        # test_captions = open(test_file,'r').readlines()
        for filename in os.listdir(test_dir):
            if filename.endswith(".txt"):
                test_captions = open(test_dir + filename,'r').readlines()

                output_captions = []
                for sentence in test_captions:
                    # Get token-ids for the input sentence.
                    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence.lower()), enc_vocab)
                    # Which bucket does it belong to?
                    token_ids = token_ids[:40]
                    try:
                        bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] >= len(token_ids)])
                    except ValueError:
                        # Sentence length greater than the largest bucket size.
                        pdb.set_trace()
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, project = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    # Join the output tokens into the paraphrased sentence.
                    output_caption = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
                    output_captions.append(output_caption)

                with open(output_dir + 'paraphrase_' + filename, 'w') as f:
                    for item in output_captions:
                        f.write("%s\n" % item)
Example #12
def inter_decode():
  if not (FLAGS.inter_decode_sent and FLAGS.inter_decode_position and FLAGS.inter_decode_map):
    raise ValueError("Invalid argument: please set the inter_decode flags!")
  with tf.Session() as sess:
    # Load dictionary
    srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min)
    trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min)
    srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Create model
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Decode from standard input.  ---> interactive decoding
    # sys.stdout.write("> ")
    # sys.stdout.flush()
    # sentence = sys.stdin.readline()
    sentence = FLAGS.inter_decode_sent

    # Read supplementary input (children positions, weight map). The flags hold
    # Python literals, so eval() assumes trusted input.
    # init_pos = eval(sys.stdin.readline())
    # mapp = eval(sys.stdin.readline())
    init_pos = eval(FLAGS.inter_decode_position)
    mapp = eval(FLAGS.inter_decode_map)

    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    # pdb.set_trace()
    encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
        {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, 
                decoder_inputs_positions=pos, decoder_inputs_maps=maps)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
      outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    
    final_pos = out_pos[0].tolist()
    for l in xrange(len(outputs)-1):
      final_pos.extend(out_pos[l+1].tolist())

    return final_pos
Example #13
def evaluate(filename):
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    writer = open('pred.txt', 'w')
    count = 0
    with open(filename, 'r') as reader:
      for sentence in reader:
        count += 1
        if count % 1000 == 0:
          print(count)
        chunks = parser(sentence)
        #print (chunks)
        # Get token-ids for the input sentence.
        for sen in chunks:
          token_ids = data_utils.sentence_to_token_ids(sen.strip('\n'), en_vocab)
          # Which bucket does it belong to?
          bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
          # Get a 1-element batch to feed the sentence to the model.
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
          # Get output logits for the sentence.
          _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          # This is a greedy decoder - outputs are just argmaxes of output_logits.
          outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
          # If there is an EOS symbol in outputs, cut them at that point.
          if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
          # Join the output tokens and write only the first predicted word.
          output = " ".join([rev_fr_vocab[output] for output in outputs])
          writer.write(output.split()[0]+'\n')
    writer.close()
Example #14
def test():
    print("Loading data...")
    vocab_word, vocab_word_list, train_words, train_labels, test_words, test_labels \
        = data_utils.prepare_data(args.data_path)
    max_text_len = max([len(words) for words in train_words])
    vocab_len = len(vocab_word_list)
    # Build the model
    model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size,
                           list(map(int, args.filter_sizes.split(","))),
                           args.num_filters, args.max_gradient_norm,
                           args.learning_rate, args.l2_reg_lambda)
    # Test the model on the given data (the sample reviews stay in Chinese to match the model's vocabulary)
    text = '''说实话没吃成, 但是对这家太不满意了, 所有都给差评!到那之后满屋子都是座, 服务员非得给安排在一个犄角旮旯, 黑咕隆咚的冷气还吹不到.
    点了餐喊半天服务员都不来拿单子, 而且是眼看着服务员从身边经过, 喊着服务员服务员, 她们就只当没听见.
    然后我自己换了个显眼的位置, 举着手喊服务员, 她们还是无视的从我旁边走过.我k, 你又不服务, 没事走来走去干什么?所以后来干脆不吃了.
    请问, 您家是要做生意么?'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "Prediction:", predicts
    if predicts == [0]:
        print "Positive review"
    else:
        print "Negative review"

    text = '''瘦了点,可能和季节有关吧吃完加点青菜做泡饭满嗲的~孔雀开屏 45.00很大一条鱼,摆盘很漂亮,肉质挺嫩,如果加点醋更好,
    去腥更美味~~香菇菜心这个 我喜欢的呀~上面酱很嗲~ 香菇很入味,菜心很爽口~ 解油腻 总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,
    摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~雨天滴滴答答,不是很舒服,但却并没影响到FB的心情~~~店开在 比较老式的弄堂里,
    周围都是居民区,门面并不大,不过据说这里生意很好。性价比高么做的是绍兴菜,装修比较朴素,菜单也是很简单的A4纸
    塑封一下总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "Prediction:", predicts
    if predicts == [0]:
        print "Positive review"
    else:
        print "Negative review"
Example #15
def decode():
  '''
  Manually input sentence interactively and the headline will be printed out
  '''
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = FLAGS.batch_size  # Repeat a single sentence batch_size times as one batch.
    
    # Load vocabularies.
    vocab_path = os.path.join(FLAGS.data_dir,"vocab")
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    
    # Decode from standard input interactively
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      if len(sentence.strip('\n')) == 0:
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        continue  # skip empty input lines
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
      # print (token_ids) # print token ids
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(buckets)) if buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      # print ("current bucket id" + str(bucket_id))
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      
      # Get output logits for the sentence.
      _, _, output_logits_batch = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.     
      output_logits = []
      for item in output_logits_batch:
        output_logits.append(item[0])
      
      #print (output_logits)
      #print (len(output_logits))
      #print (output_logits[0])
      
      outputs = [int(np.argmax(logit)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example #16
def translate_fun(sentence, sess, model, nl_vocab, rev_cm_vocab, FLAGS):
    # Get token-ids for the input sentence.
    if FLAGS.char:
        token_ids = data_utils.sentence_to_token_ids(sentence, nl_vocab,
                                                     data_tools.char_tokenizer, data_tools.basic_tokenizer)
    else:
        token_ids = data_utils.sentence_to_token_ids(sentence, nl_vocab,
                                                     data_tools.basic_tokenizer, None)

    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(model.buckets))
                    if model.buckets[b][0] > len(token_ids)])

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_example([token_ids], [[data_utils.ROOT_ID]],
                                             bucket_id=bucket_id)

    # Get output for the sentence.
    output_symbols, output_logits, losses, attn_masks = \
                model.step(sess, formatted_example, bucket_id, forward_only=True)
    batch_outputs = decode(output_symbols, rev_cm_vocab, FLAGS)

    return batch_outputs, output_logits
Example #17
def decode():
    with tf.Session() as sess:
        # Load vocabularies.
        src_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.src" % FLAGS.src_vocab_size)
        trg_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.trg" % FLAGS.trg_vocab_size)
        src_vocab, rev_src_vocab = data_utils.initialize_vocabulary(
            src_vocab_path)
        trg_vocab, rev_trg_vocab = data_utils.initialize_vocabulary(
            trg_vocab_path)

        if FLAGS.src_vocab_size > len(src_vocab):
            FLAGS.src_vocab_size = len(src_vocab)
        if FLAGS.trg_vocab_size > len(trg_vocab):
            FLAGS.trg_vocab_size = len(trg_vocab)

        # Create model and load parameters.
        model = create_model(sess, True, FLAGS.model)
        model.batch_size = 1  # We decode one sentence at a time.

        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), src_vocab)
            token_ids.append(data_utils.EOS_ID)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, encoder_mask, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             encoder_mask, decoder_inputs,
                                             target_weights, bucket_id, True)

            # This is a beam search decoder - output is the best result from beam search
            outputs = [int(logit) for logit in output_logits]

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tf.compat.as_str(rev_trg_vocab[output]) for output in outputs
            ]))
            sentence = sys.stdin.readline()
Example #18
 def reply_all(message):
     sentence = (message.text).lower()
     token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
     bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
     encoder_inputs, decoder_inputs, target_weights = model.get_batch(
         {bucket_id: [(token_ids, [])]}, bucket_id)
     _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
     outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
     if data_utils.EOS_ID in outputs:
         outputs = outputs[:outputs.index(data_utils.EOS_ID)]
     message_text = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
     bot.reply_to(message, message_text)
Example #19
def decode(config):
    with tf.Session() as sess:
        # Create model and load parameters.

        model = create_model(sess, config, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), config.embeddings_mentions)
            # Which bucket does it belong to?
            bucket_id = len(config.buckets) - 1
            for i, bucket in enumerate(config.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            # else:
            #   logging.warning("Sentence truncated: %s", sentence)

            # Get a 1-element batch to feed the sentence to the model.
            # encoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            encoder_inputs, target_weights = model.get_batch([token_ids],
                                                             bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit[0])) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the sentence corresponding to outputs.
            # print(" ".join([tf.compat.as_str(config.embeddings_mentions_list[output]) for output in outputs]))
            out_str = ""
            for output in outputs:
                try:
                    out_str += config.embeddings_mentions_list[output] + ' '
                except IndexError:
                    # Skip ids that fall outside the mentions list.
                    pass
            print(out_str)

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #20
def decode(originalText):
    global sess
    global model
    global en_vocab_path
    global cn_vocab_path
    global en_vocab
    global rev_cn_vocab

    if model is None:
        # Create model and load parameters.
        sess = tf.Session()
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one originalText at a time.
        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.train_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        cn_vocab_path = os.path.join(FLAGS.train_dir,
                                     "vocab%d.cn" % FLAGS.cn_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(
            en_vocab_path, data_utils.basic_decoder)
        _, rev_cn_vocab = data_utils.initialize_vocabulary(
            cn_vocab_path, data_utils.chinese_decoder)

    # Get token-ids for the input originalText.
    token_ids = data_utils.sentence_to_token_ids(originalText, en_vocab,
                                                 data_utils.basic_decoder)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("originalText truncated: %s", originalText)

    # Get a 1-element batch to feed the originalText to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the originalText.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Join the outputs into the Chinese text corresponding to the input.
    res = ("".join([rev_cn_vocab[output] for output in outputs]))
    return res
Example #21
def decode():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        print("enc_vocab:", enc_vocab, "\n")
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        print("rev_dec_vocab:", rev_dec_vocab, "\n")
        sys.stdout.write(">")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            print("sentence:", sentence)
            token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
            print("token_ids:", token_ids)
            bucket_id = min([
                b for b in range(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            print("bucket_id:", bucket_id)
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            print("output_logits:", output_logits)
            outputs = []
            for logit in output_logits:
                print("logit:", logit)
                _mx = np.argmax(logit, axis=1)
                print("_mx:", _mx)
                print("int(_mx):", int(_mx))
                outputs.append(int(_mx))
            print("outputs:", outputs)
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            res_str = ""
            for output in outputs:
                res_str += rev_dec_vocab[output][0] + " "
            print(res_str)
            print(">", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #22
def testBLEU():
  source = sys.argv[1]
  target = sys.argv[2]
  with tf.Session() as sess:
    model = create_model(sess, True, True)
    model.batch_size = 1  
    s_vocab_path = os.path.join(FLAGS.data_dir,
                                "vocab%d.%s" % (FLAGS.s_vocab_size, source))
    t_vocab_path = os.path.join(FLAGS.data_dir,
                                "vocab%d.%s" % (FLAGS.t_vocab_size, target))
    s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
    _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)
    BLEUscore = {0:[], 1:[], 2:[], 3:[]}
    s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
    t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
    f_s = open(s_test_path, 'r')
    f_t = open(t_test_path, 'r')
    step = 0
    for sentence in f_s:
      print(step)
      
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab)
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence) 

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
      reference = f_t.readline().split(' ')
      try:
        temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
      except ZeroDivisionError:
        # Very short candidates can break the default 4-gram weights;
        # fall back to bigram-only weights.
        temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5))
      BLEUscore[bucket_id].append(temp_score)
      step += 1
      print(temp_score)
    for key,val in BLEUscore.iteritems():
      print(key, ": ", np.mean(val))
Example #23
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        in_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.in" % FLAGS.in_vocab_size)
        out_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.out" % FLAGS.out_vocab_size)
        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

        # Decode the first line of a test file rather than standard input.
        with gfile.GFile("test.txt", "r") as f:
            sentence = f.readline()
        for _ in range(1):
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Write the output sentence corresponding to outputs.
            with gfile.GFile("output.txt", "w") as f:
                f.write(" ".join([rev_out_vocab[output]
                                  for output in outputs]))
Example #24
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = FLAGS.batch_size  # We decode batch_size sentences at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode a file of sentences in batches instead of standard input.
    predif = open(FLAGS.predifname).readlines()
    predof = open(FLAGS.predofname, 'w')
    batch_decode = []
    for predin in predif:
        token_ids = data_utils.sentence_to_token_ids(predin, en_vocab)
        # All sentences are forced into bucket 0; the usual smallest-fitting
        # bucket lookup is disabled here.
        bucket_id = 0
        batch_decode.append((token_ids, []))
        if len(batch_decode) == FLAGS.batch_size:

            # Feed a full batch of sentences to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              {bucket_id: batch_decode}, bucket_id)

            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
            # Transpose time-major logits to batch-major, then argmax per step.
            outputs = np.transpose(np.array(output_logits), (1, 0, 2))
            outputs = np.argmax(outputs, axis=2)
            # If there is an EOS symbol in outputs, cut them at that point.
            for ii, out in enumerate(outputs):
                idxx = np.where(out == data_utils.EOS_ID)[0]
                if len(idxx)>0:
                    out = out[:idxx[0]]
                predo = " ".join([rev_fr_vocab[word] for word in out])
                print (predo)
                predof.write(predo + '\n')
            batch_decode = []
    # Note: sentences left in an incomplete final batch are never decoded.
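Unlike the one-sentence decoders elsewhere on this page, this variant post-processes a whole batch: the time-major logits are transposed to batch-major, argmaxed per step, and each row is cut at its first EOS. A toy version of that post-processing, assuming EOS_ID = 2:

import numpy as np

EOS_ID = 2  # assumed, matching tutorial-style data_utils

# Fake logits, time-major as model.step returns them: (time, batch, vocab).
output_logits = np.random.randn(6, 3, 10)

outputs = np.transpose(output_logits, (1, 0, 2))  # -> (batch, time, vocab)
outputs = np.argmax(outputs, axis=2)              # -> (batch, time) token ids

for out in outputs:
    idx = np.where(out == EOS_ID)[0]
    if len(idx) > 0:
        out = out[:idx[0]]  # cut at the first EOS
    print(out.tolist())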
Example #25
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode each sentence and store the predicted headline.
    with open(gConfig["test_enc"], 'r') as test_enc:
        with open(gConfig["output"], 'w') as predicted_headline:
            sentence_count = 0
            for sentence in test_enc:
                # Get token-ids for the input sentence.
                token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
                # Which bucket does it belong to? Fall back to the last bucket
                # if the token sequence is longer than every bucket.
                bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1])
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                           target_weights, bucket_id, True)

                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

                # If there is an EOS symbol in outputs, cut them at that point.
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Write predicted headline corresponding to article.
                predicted_headline.write(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])+'\n')
                sentence_count += 1
                if sentence_count % 100 == 0:
                    print("predicted data line %d" % sentence_count)
                    sys.stdout.flush()

    print("Finished decoding and stored predicted results in %s!" % gConfig["output"])
Example #26
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
            # Truncate the sentence to the maximum bucket size.
            token_ids = token_ids[0:479]
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([rev_fr_vocab[output] for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
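
The greedy decoding step shared by nearly every example can be isolated as below: a self-contained sketch using fake logits, assuming only numpy and an EOS id of 2 (the usual value of data_utils.EOS_ID in these tutorials).

import numpy as np

EOS_ID = 2  # assumed id, mirroring data_utils.EOS_ID

def greedy_decode(output_logits):
    # Pick the argmax token at each step and cut at the first EOS.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    if EOS_ID in outputs:
        outputs = outputs[:outputs.index(EOS_ID)]
    return outputs

# Three decoder steps over a 5-token vocabulary: tokens 4, 3, then EOS.
fake_logits = [np.array([[0., 0., 0., 0., 1.]]),
               np.array([[0., 0., 0., 1., 0.]]),
               np.array([[0., 0., 1., 0., 0.]])]
print(greedy_decode(fake_logits))  # -> [4, 3]
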
Example #27
0
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        source_vocab_path = os.path.join(FLAGS.data_dir,
                                         ("vocab%d." + FLAGS.source_ext) % FLAGS.source_vocab_size)
        target_vocab_path = os.path.join(FLAGS.data_dir,
                                         ("vocab%d." + FLAGS.target_ext) % FLAGS.target_vocab_size)
        source_vocab, _ = data_utils.initialize_vocabulary(source_vocab_path)
        _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), source_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out target language sentence corresponding to outputs.
            out_sentence = " ".join([tf.compat.as_str(rev_target_vocab[output]) for output in outputs])
            print(out_sentence)
            if FLAGS.translation_file != "":
                with gfile.GFile(FLAGS.translation_file, mode="ab") as fw:
                    fw.write(FLAGS.source_ext + "> " + sentence)
                    fw.write(FLAGS.target_ext + "> " + out_sentence + b"\n\n")
                    fw.flush()
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get the required batch.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output_logits
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)

      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      final_output = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])

      if '_UNK' in final_output:
          final_output = "I didn't learn how to respond to that."
      print(final_output)
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
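
Several of these examples cap GPU memory with per_process_gpu_memory_fraction. In the same TF1 API, allow_growth is the usual alternative: the process starts with a small allocation and grows it on demand. A minimal sketch:

import tensorflow as tf

# Hard cap: reserve at most 20% of GPU memory up front (as above) ...
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
# ... or grow the allocation on demand instead:
# gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=config)
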
Example #29
0
def decode_input():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d_enc.txt" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d_dec.txt" % gConfig['dec_vocab_size'])

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ] + [len(_buckets) - 1])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]

            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ]))

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #30
0
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.in" % FLAGS.vocab_size)
        vocab, vocab_rev = data_utils.initialize_vocabulary(vocab_path)
        # Decode from standard input.
        sys.stdout.write("You> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline().lower()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out our response sentence corresponding to outputs.
            try:
                print('%s: %s' % (name, buildSentence(outputs, vocab_rev)))
            except Exception as e:
                print(e)
            print("You> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #31
0
def catreco():
    # parse out GET request parameters, e.g. /api/catreco?title=iphone5s&nbest=10
    title = request.args.get('title')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 10

    # inference tensorflow model
    # Get token-ids for the input sentence.
    source_tokens = data_utils.sentence_to_token_ids(tf.compat.as_bytes(title),
                                                     app.src_vocab,
                                                     normalize_digits=True)
    src_len = len(source_tokens)
    if src_len > FLAGS.max_seq_length:
        source_tokens = source_tokens[:FLAGS.max_seq_length]
    else:
        source_tokens = source_tokens + [data_utils.PAD_ID
                                         ] * (FLAGS.max_seq_length - src_len)

    feed_dict = app.model.get_predict_feed_dict(np.array([source_tokens]),
                                                app.target_inputs,
                                                np.array([src_len]),
                                                app.target_lens)

    pred_conf, pred_labels = app.sess.run(
        [app.model.predicted_tgts_score, app.model.predicted_labels],
        feed_dict=feed_dict)
    pred_labels = np.vstack(pred_labels)
    pred_conf = np.vstack(pred_conf)
    top_confs = pred_conf[0][:nbest]
    top_tgtIDs = [
        app.fullLabel_tgtID_Map[lbl] for lbl in pred_labels[0][:nbest]
    ]
    top_tgtNames = [app.tgtID_Name_Map[id] for id in top_tgtIDs]
    topCategories = []
    for idx in range(nbest):
        print('top%d:  %s , %f ,  %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        entry = {}
        entry['leafCatId'] = top_tgtIDs[idx]
        entry['leafCatName'] = top_tgtNames[idx]
        entry['confScore'] = float(top_confs[idx])
        topCategories.append(entry)
    return jsonify({'RequestTitle': title, 'ClassifyResults': topCategories})
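
Assuming the function above is registered as the /api/catreco route of a local Flask app (the host and port below are assumptions, not from the source), a request and the shape of its response look like:

# curl 'http://localhost:5000/api/catreco?title=iphone5s&nbest=2'
# {"RequestTitle": "iphone5s",
#  "ClassifyResults": [{"leafCatId": "...", "leafCatName": "...", "confScore": 0.93},
#                      {"leafCatId": "...", "leafCatName": "...", "confScore": 0.04}]}
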
Example #32
0
def decode():
    """Propagate forward and create a response to an input sentence"""
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(
            sess, True
        )  # forward_only is True, because we don't need to backpropagate
        model.batch_size = 1  # We decode one sentence at a time.

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(buckets))
                if buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                # Creating dictionary, not list, because there's only one bucket_id which is maybe != 0
                {bucket_id: [(token_ids, [])]},
                bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:  # TODO rewrite according to data_utils
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the response sentence corresponding to outputs.
            print(" ".join(data_utils.TOTAL_VOCAB[output] for output in outputs))

            # Read next input line
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #33
0
def run(self, sentence):
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, self.en_vocab)
    # Which bucket does it belong to?
    bucket_id = min(b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids))
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = self.model.step(self.sess, encoder_inputs, decoder_inputs,
                                          target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Return the French sentence corresponding to outputs.
    return "".join([self.rev_fr_vocab[output] for output in outputs])
Example #34
0
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    """
    DOCSTRING
    """
    token_ids = data_utils.sentence_to_token_ids(
        tensorflow.compat.as_bytes(sentence), enc_vocab)
    bucket_id = min(
        [b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    outputs = [int(numpy.argmax(logit, axis=1)) for logit in output_logits]
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    return " ".join([
        tensorflow.compat.as_str(rev_dec_vocab[output]) for output in outputs
    ])
def decode():
  with tf.device("/cpu:0") and tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = './en-vocab.txt'
    ner_vocab_path = './ner-vocab.txt'
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)

    _, rev_ner_vocab = data_utils.initialize_vocabulary(ner_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Pair each input token with its predicted NER tag.
      inp_sentence = list(sentence.split())
      out_sentence = [tf.compat.as_str(rev_ner_vocab[output]) for output in outputs]
      tagging = zip(inp_sentence, out_sentence)
      for tags in tagging:
        print (tags)
      #print(" ".join([tf.compat.as_str(rev_ner_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example #36
0
def decode():
    """
    DOCSTRING
    """
    gpu_options = tensorflow.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tensorflow.ConfigProto(gpu_options=gpu_options)
    with tensorflow.Session(config=config) as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tensorflow.compat.as_bytes(sentence), enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(numpy.argmax(logit, axis=1)) for logit in output_logits
            ]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tensorflow.compat.as_str(rev_dec_vocab[output])
                for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #37
0
def decode():
    # Decode
    net = network_building()
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL_PATH)
    en_vocab, _ = data_utils.initialize_vocabulary(vocabulary_Path)
    #text = 'The paper is great. However, it comes glued and to start the roll there is quite a bit of waste. Perhaps, you can find a way to package it. Thanks'


    print('start commenting')
    HOST = ''  # Symbolic name meaning all available interfaces
    PORT = 8082  # Arbitrary non-privileged port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)
    conn, addr = s.accept()
    print('Connected by', addr)

    while True:
        text = conn.recv(1024)
        if not text:
            break
        print(str(text,'utf-8').rstrip())

        text = data_utils.sentence_to_token_ids(tf.compat.as_bytes(text), vocabulary=en_vocab, tokenizer=None)
        # print(text)
        datalist = []
        datalist.append(text)
        datalist = pad_sequences(datalist, maxlen=300, value=0.)
        # print(datalist)
        result = model.predict(datalist)
        # resultlabel = model.predict_label(datalist)
        print(result)
        # print(resultlabel)
        resultlist = result[0]
        maxnum = 0.0
        score = 0
        # print(resultlist)
        for i in range(len(resultlist)):
            if resultlist[i] > maxnum:
                maxnum = resultlist[i]
                score = i
        print(score)
        conn.sendall(bytes(" ".join(str(score))+'\n','utf-8'))
Example #38
0
def decode_tester(sess, model):
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.from" % FLAGS.from_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.to" % FLAGS.to_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    # sys.stdout.write("> ")
    # sys.stdout.flush()
    sentence = "Who is the president of the United States?"
    # print(" input: " + sentence)
    # while sentence:
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Print out French sentence corresponding to outputs.
    print("\toutput: " + " ".join(
        [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
    sys.stdout.flush()
def decode():
    with tf.Session(config=config) as sess:
        #print ("Hello!!")
        model = create_model(sess, True)
        model.batch_size = 1

        in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt")
        out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt")

        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

        print("Hello!!")
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            sentence = wakati(sentence)
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)

            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)

            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)

            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]

            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]

            print("".join([rev_out_vocab[output] for output in outputs]))
            print("\n> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #40
0
def decode():

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get the required batch. 
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output_logits
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example #41
0
def decode_word(word, sess, model, gr_vocab, rev_ph_vocab):
  # Get token-ids for the input sentence.
  token_ids = data_utils.sentence_to_token_ids(word, gr_vocab)
  # Which bucket does it belong to?
  bucket_id = min([b for b in xrange(len(_buckets))
                   if _buckets[b][0] > len(token_ids)])
  # Get a 1-element batch to feed the sentence to the model.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      {bucket_id: [(token_ids, [])]}, bucket_id)
  # Get output logits for the sentence.
  _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, True)
  # This is a greedy decoder - outputs are just argmaxes of output_logits.
  outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  # If there is an EOS symbol in outputs, cut them at that point.
  if data_utils.EOS_ID in outputs:
    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  # Print out phoneme corresponding to outputs.
  res_phoneme_seq = " ".join([rev_ph_vocab[output] for output in outputs])
  return res_phoneme_seq
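
A hedged usage sketch for the grapheme-to-phoneme helper above; the vocabulary paths and the sample output are illustrative, and the session/model setup is assumed to match the other examples:

# gr_vocab, _ = data_utils.initialize_vocabulary(grapheme_vocab_path)
# _, rev_ph_vocab = data_utils.initialize_vocabulary(phoneme_vocab_path)
# print(decode_word("tensor", sess, model, gr_vocab, rev_ph_vocab))
# -> e.g. "T EH N S ER"
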
Example #42
0
def decode():
    print("Decoding interactively")
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        # Load vocabularies.
        vocab_path = "vocab%d" % FLAGS.vocab_size
        vocab, rev_vocab = data_utils.initialize_vocab(vocab_path)
        
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence) 

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #43
0
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)

    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)

    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)

    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]

    return " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
Example #44
0
def decode():
  with tf.Session() as sess:
    print ("Hello!!")
    model = create_model(sess, True)                         
    model.batch_size = 1  
    
    in_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab_in.txt")     
    out_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab_out.txt" )
                                                                        
    in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)        
    _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)    

    
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()    
    while sentence:

      token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)   
      
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])               

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
    
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,      
                                       target_weights, bucket_id, True)

      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]       

      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]                      

      print(" ".join([rev_out_vocab[output] for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()                                             
def inter_decode(sent, position, mapp):
  with tf.Session() as sess:
    # Load dictionary
    srce_vocab_path = os.path.join(data_dir, "train", "vocab%d.srce" % 2)
    trgt_vocab_path = os.path.join(data_dir, "train", "vocab%d.trgt" % 0)
    srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Create model
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    # model.batch_size = 1  # We decode one sentence at a time.

    sentence = sent
    # `position` and `mapp` arrive as Python-literal strings; parse them.
    init_pos = eval(position)
    mapp = eval(mapp)

    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
        {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, 
                decoder_inputs_positions=pos, decoder_inputs_maps=maps)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
      outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    
    final_pos = out_pos[0].tolist()
    for l in xrange(len(outputs)-1):
      final_pos.extend(out_pos[l+1].tolist())

    return final_pos
Example #46
0
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # at inference we feed one sentence at a time

    # Load the vocabularies
    en_vocab_path = os.path.join(FLAGS.data_dir,"vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,"vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Translation: read an English sentence from the console
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    # Decode/translate each input sentence
    while sentence:
      # First convert the input words into token ids
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Pick the bucket that matches the sentence length
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get the sequence of output logits
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # Take the argmax (highest-probability) word at each output step
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

      if data_utils.EOS_ID in outputs:  # if EOS_ID appears, keep only the words before it as the result
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print the result
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    if FLAGS.beam_size > 0:
        use_beamsearch = True
    else:
        use_beamsearch = False
    model = create_model(sess, True, use_beamsearch=use_beamsearch)
    model.batch_size = 1  # We decode one sentence at a time.
    if FLAGS.use_ori:
      tokenizer = useori_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'ori')
    else:
      tokenizer = cut_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'cut')

    # Load vocabularies.
    en_vocab_path = os.path.join(vocab_data_dir,
                                 "vocab%d.q" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(vocab_data_dir,
                                 "vocab%d.a" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      try:
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab, tokenizer=tokenizer)
        if len(token_ids) >= _buckets[-1][0]:
            token_ids = token_ids[0:(_buckets[-1][0]-1)]
        print(token_ids)
        # Which bucket does it belong to?
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # TODO: indeed can produce longer answers, but with some repeat parts consequently
        #bucket_id = len(_buckets) - 1

        if FLAGS.beam_size > 0:
            def cal_function(decoder_token_ids, idx):
                print('decoder_token_ids:', decoder_token_ids)
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, decoder_token_ids)]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                #print(np.shape(output_logits[idx-1]))
                #print(output_logits[idx-1])
                fake_logits = output_logits[idx-1].reshape([-1])
                return log_sigmoid(inputs=fake_logits)
            beam_search = BeamSearch(beam_size=FLAGS.beam_size)
            beam_search.run(max_step=model.buckets[bucket_id][1], cal_function=cal_function)
            final_token_paths = beam_search.get_final_token_paths()
            for outputs in final_token_paths:
                print(outputs)
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Print out French sentence corresponding to outputs.
                print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
                print('done')
        else:
            model.use_beamsearch = False
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            print(outputs)
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print('done')
      except ValueError, e:
        print(e)
        print("Bad input! Try again:")
      finally:
        # Read the next sentence regardless of whether decoding succeeded.
        print("> ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
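
The BeamSearch helper used above is not shown in this example. As a rough illustration of the idea (not the repository's implementation), a generic beam search that accumulates per-step log-probabilities looks like:

def beam_search(step_fn, beam_size, max_steps, eos_id):
    # step_fn(prefix) -> list of (token_id, log_prob) continuations.
    beams = [([], 0.0)]  # (token prefix, cumulative log-prob)
    for _ in range(max_steps):
        candidates = []
        for prefix, score in beams:
            if prefix and prefix[-1] == eos_id:
                candidates.append((prefix, score))  # finished beams carry over
                continue
            for token, logp in step_fn(prefix):
                candidates.append((prefix + [token], score + logp))
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
    return beams
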
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        # The second argument is True because the model is not training (forward only).
        model = create_model(sess, True)
        # We decode one sentence at a time.
        model.batch_size = 1

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "Word_map.txt")
        vocab, Q_vocab = data_utils.initialize_vocabulary(vocab_path)

        while 1:
            # Get token-ids for the input sentence.
            sys.stdout.write("Input >> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)

            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            # (A sampling decoder over the per-step softmax was sketched here
            # but is disabled; greedy argmax is used instead.)
            max_outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in max_outputs:
                max_outputs = max_outputs[:max_outputs.index(data_utils.EOS_ID)]

            print("output >>")
            print(" ".join([tf.compat.as_str(Q_vocab[output]) for output in max_outputs]))
            print("=====================")
Example #49
0
def decode_from_file(files, model_path=None, use_best=False, get_ids=True, FLAGS=None, buckets=None):

    assert FLAGS is not None
    assert buckets is not None

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # load model parameters.
        model = create_seq2seq_model(sess, model_path=model_path, forward_only=True,
                                     use_best=use_best, FLAGS=FLAGS, buckets=buckets,
                                     translate=True)

        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.source_lang)

        target_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.target_lang)

        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

        start_total_time = time.time()
        total_sentence_count = 0

        for file_path in files:

            print("Translating file %s\n" % file_path)

            sentence_count = 0

            # Decode from file.
            with gfile.GFile(file_path, mode='r') as source:
                with gfile.GFile(file_path + '.trans', mode='w') as destiny:
                    sentence = source.readline()

                    start_time = time.time()
                    while sentence:

                        sentence_count += 1
                        print("Translating sentence %d ", sentence_count)

                        if get_ids:

                            # Get token-ids for the input sentence.
                            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)

                        else:

                            # if sentence is already converted, just split the ids
                            token_ids = [int(ss) for ss in sentence.strip().split()]

                        # Get output logits for the sentence.
                        output_hypotheses, output_scores = model.translation_step(sess,
                                                                                  token_ids,
                                                                                  FLAGS.beam_size,
                                                                                  normalize=True,
                                                                                  dump_remaining=True)

                        outputs = output_hypotheses[0]

                        # Print out sentence corresponding to outputs.
                        destiny.write(" ".join([rev_tgt_vocab[output] for output in outputs]))
                        destiny.write("\n")
                        sentence = source.readline()

                    end_time = time.time() - start_time

                    print("\nDone file %s" % file_path)
                    print("Avg. %.3f sentences/sec" % (sentence_count / end_time))

            total_sentence_count += sentence_count

        end_total_time = time.time() - start_total_time

        print("\nDone!")
        print("Avg. %.3f sentences/sec" % (total_sentence_count / end_total_time))
Example #50
0
def translate_file(source_path=dev_code_file, target_path=translated_dev_code): 
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        code_vocab_path = os.path.join(data_dir,
           "vocab%d.code" % FLAGS.code_vocab_size)
        en_vocab_path = os.path.join(data_dir,
           "vocab%d.en" % FLAGS.en_vocab_size)
        code_vocab, _ = data_utils.initialize_vocabulary(code_vocab_path)
        _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

        with tf.gfile.GFile(source_path, mode="r") as source_file:
            with tf.gfile.GFile(target_path, mode="w") as translated_file:
            
                sentence = source_file.readline()
                counter = 0
                print (" Translating file %s " % dev_code_file)
                
                while sentence:
                    # Get token-ids for the input sentence.
                    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), code_vocab)

                    buckets = [b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]
                    if buckets:
                        bucket_id = min(buckets)
                    else:
                        # print ("line %d with tokens %d" % (counter, len(token_ids)))
                        translated_file.write("_UNK \n")
                        sentence = source_file.readline()
                        continue
                    
                    # Which bucket does it belong to?
                    # bucket_id = min([b for b in xrange(len(_buckets))
                                    # if _buckets[b][0] > len(token_ids)])
                                    
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)

                    # Get output logits for the sentence.
                    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, bucket_id, True)
                                                                    
                                                                    
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        
                    # Write translated sentence to translation file.
                    translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
                    
                    # print ("> %s" % sentence)
                    # print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
                    
                    # Get next sentence and print checkpoints.
                    counter += 1
                    sentence = source_file.readline()
                    if counter % 500 == 0:
                        print(" Line %d translated" % counter)
                    
                print (" File translated")
def test_BLEU():
    # Perform BLEU score testing here
    with tf.Session() as sess:
      model = create_model(sess, True, False)
      source = sys.argv[1]
      target = sys.argv[2]
      model.batch_size = 1  # We decode one sentence at a time.

      # Load vocabularies.
      s_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.%s" % (FLAGS.s_vocab_size, source))
      t_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.%s" % (FLAGS.t_vocab_size, target))
      s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
      _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)

      # Decode from standard input.
      BLEUscore = {0:[], 1:[], 2:[], 3:[]}
      s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
      t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
      f_s = open(s_test_path, 'r')
      f_t = open(t_test_path, 'r')
      # print(f_s.readline())
      step = 0
      for sentence in f_s:
        print(step)
        # sentence = f_ja.readline()
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab)
        # Which bucket does it belong to?
        bucket_id = len(_buckets) - 1
        for i, bucket in enumerate(_buckets):
          if bucket[0] >= len(token_ids):
            bucket_id = i
            break
        else:
          logging.warning("Sentence truncated: %s", sentence) 

        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out Japanese sentence corresponding to outputs.
        candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
        reference = f_t.readline().split(' ')
        print(candidate, reference)
        try:
          temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
        except Exception:
          temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5))
        BLEUscore[bucket_id].append(temp_score)
        step += 1
        print(temp_score)
      for key,val in BLEUscore.iteritems():
        print(key, ": ", np.mean(val))
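
The try/except above works around zero higher-order n-gram matches on short outputs; nltk's own remedy is a smoothing function. A minimal sketch:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Smoothing avoids degenerate scores when 3- and 4-gram matches are absent,
# the case the bare except above falls back from.
smooth = SmoothingFunction().method1
reference = "the cat sat on the mat".split()
candidate = "the cat is on the mat".split()
print(sentence_bleu([reference], candidate, smoothing_function=smooth))
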
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    # en_vocab_path = os.path.join(FLAGS.data_dir,
    #                              "vocab%d.from" % FLAGS.from_vocab_size)
    # fr_vocab_path = os.path.join(FLAGS.data_dir,
    #                              "vocab%d.to" % FLAGS.to_vocab_size)
    # en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    # _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Read the Chinese word -> id map (one "word id" pair per line).
    chinese_word2id = {}
    with open('chinese_word2id.txt', 'r') as f:
      for line in f:
        if not line.strip():
          continue
        words = line.split(' ')
        chinese_word2id[words[0]] = int(words[1].strip('\n'))

    # Read the English word -> id map and invert it to id -> word.
    english_id2word = {}
    with open('english_word2id.txt', 'r') as f:
      for line in f:
        if not line.strip():
          continue
        words = line.split(' ')
        english_id2word[int(words[1].strip('\n'))] = words[0]

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), chinese_word2id)
      # Which bucket does it belong to?
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence)

      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(english_id2word[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()