def __init__(self, args):
    super(DataManager, self).__init__()
    self.args = args
    self.pairs = args.pairs.split(',')
    self.lang_vocab, self.lang_ivocab = ut.init_vocab(join(args.data_dir, 'lang.vocab'))
    self.vocab, self.ivocab = ut.init_vocab(join(args.data_dir, 'vocab.joint'))

    # load the per-language logit masks saved during preprocessing
    self.logit_masks = {}
    for lang in self.lang_vocab:
        mask = np.load(join(args.data_dir, 'mask.{}.npy'.format(lang)))
        self.logit_masks[lang] = torch.from_numpy(mask)
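The masks loaded above are only stored here; how they are applied is not shown in these snippets. A minimal sketch of one way to use such a mask on decoder logits, assuming PyTorch and a hypothetical apply_logit_mask helper (mask semantics come from the preprocessing step below: 0 means masked out, 1 means kept):

import torch

def apply_logit_mask(logits, mask):
    # Hypothetical helper, not from the original code: set the logits of
    # masked-out vocabulary entries (mask == 0) to -inf so that only the
    # target language's sub-vocabulary (plus UNK/EOS) can be predicted.
    return logits.masked_fill(~mask.bool(), float('-inf'))

# e.g. masked = apply_logit_mask(decoder_logits, data_manager.logit_masks[target_lang])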
def decode():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1  # decode one sentence at a time
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = utils.init_vocab(en_vocab_path)
        _, rev_fr_vocab = utils.init_vocab(fr_vocab_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
            # pick the smallest bucket that fits the sentence
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # greedy decoding: take the argmax at every output position
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # truncate at the first EOS token
            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def get_summary(input_path, reference_path, summary_path):
    input_file = open(input_path, 'r')
    reference_file = open(reference_path, 'r')
    out_file = open(summary_path, 'w')

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    with tf.Session(config=gpu_config) as sess:
        model = create_model(session=sess, forward_only=True, train=False)
        _, index2word_vocab = utils.init_vocab('./data/vocab.txt')

        for content_ids, reference_ids in zip(input_file, reference_file):
            content = [int(index) for index in content_ids.split()]
            reference = [int(index) for index in reference_ids.split()]

            bucket_ids = [
                b for b in xrange(len(buckets)) if buckets[b][0] > len(content)
            ]
            if len(bucket_ids) == 0:
                print('sentence length %d exceeds max length in buckets' % len(content))
                continue
            bucket_id = min(bucket_ids)

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(content, [])]}, bucket_id)
            # output logits for the sentence
            _, _, output_logits_batch = model.step(sess, encoder_inputs, decoder_inputs,
                                                   target_weights, bucket_id, True)
            output_logits = []
            for item in output_logits_batch:
                output_logits.append(item[0])

            output_index = [int(np.argmax(logit)) for logit in output_logits]
            # truncate at the first EOS_ID, if present
            if utils.EOS_ID in output_index:
                output_index = output_index[:output_index.index(utils.EOS_ID)]

            summary = [
                tf.compat.as_str(index2word_vocab[index]) for index in output_index
            ]
            out_file.write(''.join(summary) + '\n')
            print(' '.join(summary))
start_vocab = ac._START_VOCAB
sorted_keys = joint_vocab.most_common()
sorted_keys = [kv[0] for kv in sorted_keys]
vocab_keys = start_vocab + sorted_keys
max_vocab_size = args.max_vocab_size
if 0 < max_vocab_size < len(vocab_keys):
    print('Cut off vocab to top {} types'.format(max_vocab_size))
    vocab_keys = vocab_keys[:max_vocab_size]

joint_vocab_file = join(data_dir, 'vocab.joint')
open(joint_vocab_file, 'w').close()
with open(joint_vocab_file, 'w') as fout:
    for idx, key in enumerate(vocab_keys):
        fout.write('{} {} {}\n'.format(key, idx, joint_vocab.get(key, 0)))

joint_vocab, _ = ut.init_vocab(joint_vocab_file)

# get logit mask for each language
for lang in langs:
    # 0 means masked out, 1 means kept
    mask = np.zeros(len(joint_vocab), dtype=np.uint8)
    mask[ac.UNK_ID] = 1
    mask[ac.EOS_ID] = 1
    for key in sub_vocabs[lang]:
        mask[joint_vocab.get(key, ac.UNK_ID)] = 1

    mask_file = join(data_dir, 'mask.{}.npy'.format(lang))
    np.save(mask_file, mask)

# save all training data as npy files
for pair in pairs:
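init_vocab itself is not shown in any of these snippets. A minimal sketch of what it could look like, assuming the "token id count" line format written in the snippet above and the usual (token-to-id, id-to-token) return pair; this is an illustration, not the original implementation:

def init_vocab(vocab_file):
    # Illustrative sketch only: assumes each line is "token id count",
    # matching the format written by the preprocessing loop above.
    vocab = {}   # token -> id
    ivocab = {}  # id -> token
    with open(vocab_file, 'r') as fin:
        for line in fin:
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            token, idx = parts[0], int(parts[1])
            vocab[token] = idx
            ivocab[idx] = token
    return vocab, ivocab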
if __name__ == '__main__':
    args = get_parser().parse_args()
    config = getattr(configurations, args.config)()
    for k, v in config.items():
        setattr(args, k, v)

    if not exists(args.bleu_script):
        raise ValueError('Bleu script not found at {}'.format(args.bleu_script))

    dump_dir = args.dump_dir
    Popen('mkdir -p %s' % dump_dir, shell=True).wait()

    # the model needs these vocab sizes, but it's better to calculate them here
    vocab_file = join(args.data_dir, 'vocab.joint')
    vocab, _ = ut.init_vocab(vocab_file)
    args.joint_vocab_size = len(vocab)

    lang_vocab_file = join(args.data_dir, 'lang.vocab')
    lang_vocab, _ = ut.init_vocab(lang_vocab_file)
    args.lang_vocab_size = len(lang_vocab)

    # since args is passed to many modules, keep the logger on it instead of re-initializing it every time
    log_file = join(dump_dir, 'DEBUG.log')
    logger = args.logger = ut.get_logger(log_file)

    # log args for future reference
    logger.info(args)

    model = Transformer(args)
    # TODO: nicer formatting?