Example #1
def __init__(self, args):
    super(DataManager, self).__init__()
    self.args = args
    self.pairs = args.pairs.split(',')
    # token <-> id mappings for the language vocab and the joint vocab
    self.lang_vocab, self.lang_ivocab = ut.init_vocab(join(args.data_dir, 'lang.vocab'))
    self.vocab, self.ivocab = ut.init_vocab(join(args.data_dir, 'vocab.joint'))
    # per-language 0/1 masks over the joint vocab (built in Example #4)
    self.logit_masks = {}
    for lang in self.lang_vocab:
        mask = np.load(join(args.data_dir, 'mask.{}.npy'.format(lang)))
        self.logit_masks[lang] = torch.from_numpy(mask)
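Every example on this page unpacks init_vocab into a token-to-id mapping plus its inverse (vocab, ivocab), and Example #4 below writes the joint vocab file with one "token id count" entry per line. The helper itself never appears in these snippets, so the following is only a hypothetical reconstruction consistent with those call sites, not the repository's actual code:

def init_vocab(vocab_file):
    # Hypothetical sketch, not the real implementation: build token -> id and
    # id -> token mappings. Assumes each line holds a token optionally followed
    # by its id and count (the format written in Example #4); if the id column
    # is missing, the line number is used instead.
    vocab = {}
    ivocab = {}
    with open(vocab_file, 'r') as fin:
        for line_no, line in enumerate(fin):
            parts = line.split()
            if not parts:
                continue
            token = parts[0]
            idx = int(parts[1]) if len(parts) > 1 else line_no
            vocab[token] = idx
            ivocab[idx] = token
    return vocab, ivocab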
Example #2
def decode():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1

        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.form_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)

        en_vocab, _     = utils.init_vocab(en_vocab_path)
        _, rev_fr_vocab = utils.init_vocab(fr_vocab_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            token_ids = utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]

            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
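Example #2 relies on utils.sentence_to_token_ids to turn the raw input line into ids before bucketing. That helper is not shown in the snippet; below is a minimal sketch, assuming it simply whitespace-tokenizes and maps out-of-vocabulary words to an unknown id (the unk_id default is an assumption, and the real utils module may also normalize digits or use a custom tokenizer):

import tensorflow as tf

def sentence_to_token_ids(sentence, vocab, unk_id=3):
    # Hypothetical sketch: split the (possibly bytes) sentence on whitespace and
    # look each word up in the token -> id mapping, falling back to unk_id.
    words = tf.compat.as_str(sentence).strip().split()
    return [vocab.get(word, unk_id) for word in words]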
Example #3
def get_summary(input_path, reference_path, summary_path):
    input_file = open(input_path, 'r')
    reference_file = open(reference_path, 'r')
    out_file = open(summary_path, 'w')

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True

    with tf.Session(config=gpu_config) as sess:
        model = create_model(session=sess, forward_only=True, train=False)
        _, index2word_vocab = utils.init_vocab('./data/vocab.txt')
        for content_ids, reference_ids in zip(input_file, reference_file):
            content = [int(index) for index in content_ids.split()]
            reference = [int(index) for index in reference_ids.split()]

            bucket_ids = [
                b for b in range(len(buckets)) if buckets[b][0] > len(content)
            ]
            if len(bucket_ids) == 0:
                print('sentence length %d exceeds the max bucket length' %
                      len(content))
                continue
            bucket_id = min(bucket_ids)
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(content, [])]}, bucket_id)

            # output logits for the sentence
            _, _, output_logits_batch = model.step(sess, encoder_inputs,
                                                   decoder_inputs,
                                                   target_weights, bucket_id,
                                                   True)
            output_logits = []
            for item in output_logits_batch:
                output_logits.append(item[0])
            output_index = [int(np.argmax(logit)) for logit in output_logits]

            # truncate at the first EOS_ID, if present
            if utils.EOS_ID in output_index:
                output_index = output_index[:output_index.index(utils.EOS_ID)]

            summary = [
                tf.compat.as_str(index2word_vocab[index])
                for index in output_index
            ]
            out_file.write(''.join(summary) + '\n')
            print(' '.join(summary))
Example #4
    start_vocab = ac._START_VOCAB
    sorted_keys = joint_vocab.most_common()
    sorted_keys = [kv[0] for kv in sorted_keys]
    vocab_keys = start_vocab + sorted_keys
    max_vocab_size = args.max_vocab_size
    if 0 < max_vocab_size < len(vocab_keys):
        print('Cut off vocab to top {} types'.format(max_vocab_size))
        vocab_keys = vocab_keys[:max_vocab_size]

    joint_vocab_file = join(data_dir, 'vocab.joint')
    open(joint_vocab_file, 'w').close()
    with open(joint_vocab_file, 'w') as fout:
        for idx, key in enumerate(vocab_keys):
            fout.write('{} {} {}\n'.format(key, idx, joint_vocab.get(key, 0)))

    joint_vocab, _ = ut.init_vocab(joint_vocab_file)

    # get logit mask for each language
    for lang in langs:
        # 0 means masked out, 1 means kept
        mask = np.zeros(len(joint_vocab), dtype=np.uint8)
        mask[ac.UNK_ID] = 1
        mask[ac.EOS_ID] = 1
        for key in sub_vocabs[lang]:
            mask[joint_vocab.get(key, ac.UNK_ID)] = 1

        mask_file = join(data_dir, 'mask.{}.npy'.format(lang))
        np.save(mask_file, mask)

    # save all training data as npy files
    for pair in pairs:
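Example #1 loads the per-language mask.<lang>.npy files that Example #4 builds, but none of the snippets show where the mask is actually applied. Below is a minimal sketch of one plausible use, assuming the model suppresses vocabulary entries of other languages right before the softmax; the actual Transformer code may apply the mask differently:

import torch

def apply_logit_mask(logits, mask):
    # Hypothetical helper: mask is the uint8 vector from Example #4
    # (1 = keep, 0 = mask out), broadcast over a [..., vocab_size] logit tensor.
    # Masked-out entries become -inf, i.e. zero probability after the softmax.
    return logits.masked_fill(mask == 0, float('-inf'))

With the DataManager from Example #1, this might be invoked as apply_logit_mask(logits, self.logit_masks[lang]).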
Example #5
if __name__ == '__main__':
    args = get_parser().parse_args()
    config = getattr(configurations, args.config)()
    for k, v in config.items():
        setattr(args, k, v)

    if not exists(args.bleu_script):
        raise ValueError('Bleu script not found at {}'.format(
            args.bleu_script))

    dump_dir = args.dump_dir
    Popen('mkdir -p %s' % dump_dir, shell=True).wait()

    # the model needs these vocab sizes, but it's better to compute them here
    vocab_file = join(args.data_dir, 'vocab.joint')
    vocab, _ = ut.init_vocab(vocab_file)
    args.joint_vocab_size = len(vocab)

    lang_vocab_file = join(args.data_dir, 'lang.vocab')
    lang_vocab, _ = ut.init_vocab(lang_vocab_file)
    args.lang_vocab_size = len(lang_vocab)

    # since args is passed to many modules, keep the logger on it instead of reinitializing it every time
    log_file = join(dump_dir, 'DEBUG.log')
    logger = args.logger = ut.get_logger(log_file)

    # log args for future reference
    logger.info(args)

    model = Transformer(args)
    # TODO: nicer formatting?