def __init__(self):
    """Load a pickled state and trained model, then compile beam search.

    Configuration is hard-coded: reads 'search_state.pkl' and
    'search_model.npz' from the current working directory.

    Fixes over the original: the vocabulary pickle files are now closed
    via context managers (they were leaked), and the redundant
    ``self.beam_search = None`` that was immediately overwritten is gone.
    """
    # para setting
    self.arg_state = 'search_state.pkl'
    self.arg_changes = ""
    self.arg_model_path = 'search_model.npz'
    self.arg_beam_search = True
    self.arg_ignore_unk = False
    self.arg_normalize = False

    self.state = prototype_state()
    with open(self.arg_state) as src:
        self.state.update(cPickle.load(src))
    # NOTE(review): eval of a change string is unsafe on untrusted input.
    self.state.update(eval("dict({})".format(self.arg_changes)))

    logging.basicConfig(
        level=getattr(logging, self.state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(self.state['seed'])
    self.enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
    self.enc_dec.build()
    self.lm_model = self.enc_dec.create_lm_model()
    self.lm_model.load(self.arg_model_path)

    # Close the vocabulary files after unpickling (originals leaked them).
    with open(self.state['word_indx'], 'rb') as f:
        self.indx_word = cPickle.load(f)
    self.beam_search = BeamSearch(self.enc_dec)
    self.beam_search.compile()
    with open(self.state['indx_word'], 'r') as f:
        self.idict_src = cPickle.load(f)
def main():
    """Entry point: assemble state, model and data, then run training."""
    args = parse_args()

    # Start from the chosen prototype state, then layer overrides on top:
    # first the optional state file, then each command-line change.
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng,
                                skip_init=args.skip_init,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    sample_hooks = None
    if state['hookFreq'] >= 0:
        sample_hooks = [RandomSamplePrinter(state, lm_model, train_data)]
    loop = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'], hooks=sample_hooks)
    if state['reload']:
        loop.load()
    if state['loopIters'] > 0:
        loop.main()
def create_loop(state, skip_init=False):
    """Build a ready-to-run training :class:`MainLoop` from a state dict.

    :param state: fully-populated configuration dictionary; the keys read
        here are ``seed``, ``algo``, ``reset`` and ``hookFreq``.
    :param skip_init: when True, skip random parameter initialization of
        the encoder-decoder (useful when weights will be loaded later).
    :returns: a MainLoop wired with the model, trainer and data iterator.
    """
    log.debug("State:\n{}".format(pprint.pformat(state)))
    rng = numpy.random.RandomState(state["seed"])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    log.debug("Load data")
    train_data = get_batch_iterator(state)
    log.debug("Compile trainer")
    # The training algorithm class name lives in the state; eval turns it
    # into the class object (state files are trusted input here).
    algo = eval(state["algo"])(lm_model, state, train_data)

    log.debug("Run training")
    return MainLoop(
        train_data,
        None,
        None,
        lm_model,
        algo,
        state,
        None,
        reset=state["reset"],
        hooks=[RandomSamplePrinter(state, lm_model, train_data)]
        if state["hookFreq"] >= 0 else None,
    )
def get_models():
    """Load the en->fr model and, when configured, the fr->en model.

    :returns: ``[lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
        indx_word_trgt, state_en2fr, lm_model_fr_2_en, enc_dec_fr_2_en,
        state_fr2en]`` — the last three entries are None when no fr->en
        state was supplied on the command line.

    Fixes over the original: the fr->en availability condition is
    computed once instead of twice, and the vocabulary pickle files are
    closed via context managers (they were leaked).
    """
    args = parse_args()

    state_en2fr = prototype_state()
    if hasattr(args, 'state_en2fr'):
        with open(args.state_en2fr) as src:
            state_en2fr.update(cPickle.load(src))
        state_en2fr.update(eval("dict({})".format(args.changes)))

    # Evaluate the optional reverse-direction condition exactly once.
    have_fr2en = hasattr(args, 'state_fr2en') and args.state_fr2en is not None
    state_fr2en = prototype_state()
    if have_fr2en:
        with open(args.state_fr2en) as src:
            state_fr2en.update(cPickle.load(src))
        state_fr2en.update(eval("dict({})".format(args.changes)))

    rng = numpy.random.RandomState(state_en2fr['seed'])
    enc_dec_en_2_fr = RNNEncoderDecoder(state_en2fr, rng, skip_init=True)
    enc_dec_en_2_fr.build()
    lm_model_en_2_fr = enc_dec_en_2_fr.create_lm_model()
    lm_model_en_2_fr.load(args.model_path_en2fr)

    with open(state_en2fr['word_indx'], 'rb') as f:
        indx_word_src = cPickle.load(f)
    with open(state_en2fr['word_indx_trgt'], 'rb') as f:
        indx_word_trgt = cPickle.load(f)

    if have_fr2en:
        rng = numpy.random.RandomState(state_fr2en['seed'])
        enc_dec_fr_2_en = RNNEncoderDecoder(state_fr2en, rng, skip_init=True)
        enc_dec_fr_2_en.build()
        lm_model_fr_2_en = enc_dec_fr_2_en.create_lm_model()
        lm_model_fr_2_en.load(args.model_path_fr2en)
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr,
                lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
            indx_word_trgt, state_en2fr,
            None, None, None]
def __init__(self, args):
    """Build the translator from parsed command-line arguments.

    Loads the pickled state and model, compiles either a beam search
    or a sampler, and loads the source vocabularies.

    Fix over the original: the vocabulary pickle files are now closed
    via context managers (they were leaked).
    """
    self.args = args

    self.state = prototype_state()
    with open(self.args.state) as src:
        self.state.update(cPickle.load(src))
    # NOTE(review): eval of the --changes string is unsafe on untrusted input.
    self.state.update(eval("dict({})".format(self.args.changes)))

    logging.basicConfig(
        level=getattr(logging, self.state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(self.state['seed'])
    enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
    enc_dec.build()
    self.lm_model = enc_dec.create_lm_model()
    self.lm_model.load(self.args.model_path)

    with open(self.state['word_indx'], 'rb') as f:
        self.indx_word = cPickle.load(f)

    # Exactly one of sampler / beam_search is compiled, chosen by CLI flag.
    self.sampler = None
    self.beam_search = None
    if self.args.beam_search:
        self.beam_search = BeamSearch(enc_dec)
        self.beam_search.compile()
    else:
        self.sampler = enc_dec.create_sampler(many_samples=True)

    with open(self.state['indx_word'], 'r') as f:
        self.idict_src = cPickle.load(f)
def main():
    """Entry point: build state, model and data iterator, then train."""
    args = parse_args()

    # Prototype state first, then optional state-file and CLI overrides.
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    sample_hooks = None
    if state['hookFreq'] >= 0:
        sample_hooks = [RandomSamplePrinter(state, lm_model, train_data)]
    loop = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'], hooks=sample_hooks)
    if state['reload']:
        loop.load()
    if state['loopIters'] > 0:
        loop.main()
def main():
    """Train the model, optionally with BLEU-based validation."""
    args = parse_args()

    # this loads the state specified in the prototype, then applies the
    # state file and command-line changes on top of it
    # (this is based on the suggestion in the README.md in this foloder)
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    # If we are going to use validation with the bleu script, we will need
    # early stopping; that requires a script, a validation set and its
    # ground truth all to be configured.
    bleu_validator = None
    if state['bleu_script'] is not None \
            and state['validation_set'] is not None \
            and state['validation_set_grndtruth'] is not None:
        # make beam search
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
        bleu_validator = BleuValidator(state, lm_model, beam_search,
                                       verbose=state['output_validation_set'])

    logger.debug("Load data")
    train_data = get_batch_iterator(state)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    sample_hooks = None
    if state['hookFreq'] >= 0 and state['validation_set'] is not None:
        sample_hooks = [RandomSamplePrinter(state, lm_model, train_data)]
    loop = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    bleu_val_fn=bleu_validator,
                    hooks=sample_hooks)
    if state['reload']:
        loop.load()
    if state['loopIters'] > 0:
        loop.main()
def get_models():
    """Load the en->fr model and, when configured, the fr->en model.

    :returns: ``[lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
        indx_word_trgt, state_en2fr, lm_model_fr_2_en, enc_dec_fr_2_en,
        state_fr2en]`` — the last three entries are None when no fr->en
        state was supplied on the command line.

    Fixes over the original: the fr->en availability condition is
    computed once instead of twice, and the vocabulary pickle files are
    closed via context managers (they were leaked).
    """
    args = parse_args()

    state_en2fr = prototype_state()
    if hasattr(args, 'state_en2fr'):
        with open(args.state_en2fr) as src:
            state_en2fr.update(cPickle.load(src))
        state_en2fr.update(eval("dict({})".format(args.changes)))

    # Evaluate the optional reverse-direction condition exactly once.
    have_fr2en = hasattr(args, 'state_fr2en') and args.state_fr2en is not None
    state_fr2en = prototype_state()
    if have_fr2en:
        with open(args.state_fr2en) as src:
            state_fr2en.update(cPickle.load(src))
        state_fr2en.update(eval("dict({})".format(args.changes)))

    rng = numpy.random.RandomState(state_en2fr['seed'])
    enc_dec_en_2_fr = RNNEncoderDecoder(state_en2fr, rng, skip_init=True)
    enc_dec_en_2_fr.build()
    lm_model_en_2_fr = enc_dec_en_2_fr.create_lm_model()
    lm_model_en_2_fr.load(args.model_path_en2fr)

    with open(state_en2fr['word_indx'], 'rb') as f:
        indx_word_src = cPickle.load(f)
    with open(state_en2fr['word_indx_trgt'], 'rb') as f:
        indx_word_trgt = cPickle.load(f)

    if have_fr2en:
        rng = numpy.random.RandomState(state_fr2en['seed'])
        enc_dec_fr_2_en = RNNEncoderDecoder(state_fr2en, rng, skip_init=True)
        enc_dec_fr_2_en.build()
        lm_model_fr_2_en = enc_dec_fr_2_en.create_lm_model()
        lm_model_fr_2_en.load(args.model_path_fr2en)
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr,
                lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
            indx_word_trgt, state_en2fr,
            None, None, None]
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig( level=getattr(logging, state["level"]), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s" ) server_address = ("", args.port) httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler) rng = numpy.random.RandomState(state["seed"]) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state["word_indx"], "rb")) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state["indx_word"], "r")) tokenizer_cmd = [os.getcwd() + "/tokenizer.perl", "-l", "en", "-q", "-"] detokenizer_cmd = [os.getcwd() + "/detokenizer.perl", "-l", "fr", "-q", "-"] sampler = Sampler( state, lm_model, indx_word, idict_src, beam_search=beam_search, tokenizer_cmd=tokenizer_cmd, detokenizer_cmd=detokenizer_cmd, ) httpd.sampler = sampler print "Server starting.." httpd.serve_forever() """
def main():
    """Load the configured state and restore a trained model from disk."""
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    encoder_decoder = RNNEncoderDecoder(state, rng, skip_init=True)
    encoder_decoder.build()
    language_model = encoder_decoder.create_lm_model()
    language_model.load(args.model_path)
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) scoreMaker = ScoreMaker(enc_dec) ScoreMaker.compile() indx_word_src = cPickle.load(open(state['word_indx'],'rb')) indx_word_trg = cPickle.load(open(state['word_indx_trgt'],'rb')) idict_src = cPickle.load(open(state['indx_word'],'r')) idict_trg = cPickle.load(open(state['indx_word_target'],'r')) fsrc = open(args.source, 'r') ftrg = open(args.target, 'r') for srcline, trgline in zip(fsrc, ftrg): src_seqin = srcline.strip() trg_seqin = trgline.strip() src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin, idx2word=idict_src) trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin, idx2word=idict_trg) print "Parsed Input:", src_parsed_in ScoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg) fsrc.close() ftrg.close()
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) ScoreMaker = ScoreMaker(enc_dec) ScoreMaker.compile() indx_word_src = cPickle.load(open(state['word_indx'], 'rb')) indx_word_trg = cPickle.load(open(state['word_indx_trgt'], 'rb')) idict_src = cPickle.load(open(state['indx_word'], 'r')) idict_trg = cPickle.load(open(state['indx_word_target'], 'r')) fsrc = open(args.source, 'r') ftrg = open(args.target, 'r') for srcline, trgline in zip(fsrc, ftrg): src_seqin = srcline.strip() trg_seqin = trgline.strip() src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin, idx2word=idict_src) trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin, idx2word=idict_trg) print "Parsed Input:", src_parsed_in ScoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg) fsrc.close() ftrg.close()
def main():
    """Compute word alignments for a parallel corpus with a trained model."""
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    # Probability computer that also returns the soft alignment matrix.
    alignment_fun = enc_dec.create_probs_computer(return_alignment=True)

    word_indx_src = cPickle.load(open(state['word_indx'], 'rb'))
    word_indx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    source_file = args.source
    target_file = args.target
    output_file = args.output
    comput_alignment(source_file, target_file, output_file, alignment_fun,
                     word_indx_src, word_indx_trg, state)
def main(): args = parse_args() state = prototype_phrase_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig( level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") server_address = ('', args.port) httpd = ThreadedHTTPServer(server_address, MTReqHandler) #httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'], 'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'], 'r')) tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-'] detokenizer_cmd = [ os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-' ] sampler = Sampler(state, lm_model, indx_word, idict_src, beam_search=beam_search, tokenizer_cmd=tokenizer_cmd, detokenizer_cmd=detokenizer_cmd) httpd.sampler = sampler print 'Server starting..' httpd.serve_forever() '''
def main():
    """Score translations with a trained model in one of three modes:

    - "batch":    score whole batches from text or HDF5 data (or the
                  training data when no files are given).
    - "interact": read source/target lines from stdin and print probs.
    - "txt":      score a parallel text corpus line by line, optionally
                  printing the soft alignment.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    # Scoring must preserve corpus order and visit each pair exactly once.
    state['sort_k_batches'] = 1  # which means don't sort
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        # Text input unless both files are .h5 (then the HDF5 iterator).
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5")
                                  and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state, args.src, indx_word_src,
                                           args.trg, indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout
        scorer = enc_dec.create_scorer(batch=True)
        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch == None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break
            if args.y_noise:
                # Randomly corrupt a fraction of target words to measure
                # the model's sensitivity to noise.
                y = batch['y']
                random_words = numpy.random.randint(0, 100,
                                                    y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise,
                                                    y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y
            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                              batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >> score_file, "{:.5e}".format(float(s))
            n_samples += batch['x'].shape[1]
            count += 1
            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
                logger.debug(
                    "{} batches, {} samples, {} per sample; example scores: {}".
                    format(count, n_samples, up_time / scores.shape[0],
                           scores[:5]))
        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                # Keep the REPL alive on bad input.
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(
                    state, indx_word_src, src_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_source'],
                    null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(
                    state, indx_word_trgt, trgt_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_target'],
                    null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20),
                                                alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            # Most-attended source word for each target word.
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(
                                word,
                                src_words[j] if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            # Reached the end of one of the parallel files.
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
def main():
    """Beam-search decode a source file with a trained model whose costs
    combine RNN, language-model, translation-model and word-penalty
    features; optionally write an n-best list in Moses ``|||`` format.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # Command-line overrides for individual feature resources.
    if args.config:
        state.update(eval(open(args.config).read()))
    if args.weights:
        state['weights'] = args.weights
    if args.lm_file:
        state['lm_file'] = args.lm_file
    if args.lm_vocab:
        state['lm_vocab'] = args.lm_vocab
    if args.pt_file:
        state['phrase_table'] = args.pt_file
    if args.lm_ngram:
        state['lm_ngram'] = args.lm_ngram
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'], 'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'], 'r'))
    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])
    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')
    start_time = time.time()
    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin,
                                     idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(
            lm_model, seqin, seq, n_samples, beam_search=beam_search,
            ignore_unk=args.ignore_unk, normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))
        if args.nbest:
            # Sort every per-hypothesis feature array by total cost so the
            # n-best list comes out best-first.
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(
                nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(
                nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))
            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                # Word penalty: token count plus one (presumably <eos>) —
                # TODO confirm against the feature definition.
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (
                    t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\
                        -rnn_cost * beam_search.weight_rnn, \
                        -sum_lm * beam_search.weight_lm, \
                        -pure_tm * beam_search.weight_tm, \
                        -sum_tm * beam_search.weight_tm, \
                        -sum_wp * beam_search.weight_wp, c)
            # Blank line separates n-best lists of consecutive sentences.
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                # Positions of <unk> tokens in the best hypothesis.
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))
            print >> ftrans, out_str
        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best],
                                                  ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)
    fsrc.close()
    ftrans.close()
def main():
    """Translate with a coverage-model NMT system: batch mode when both
    --source and --trans are given (beam search only), otherwise an
    interactive sampling loop.
    """
    args = parse_args()
    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            # The sample() return arity depends on which coverage options
            # the loaded model was trained with.
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(
                        lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(
                        lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(
                    lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if len(trans) == 0:
                # Beam search produced nothing; emit a placeholder so the
                # output stays line-aligned with the input.
                trans = ['Failed']
                costs = [0.0]
            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape: (target_len, source_len)
                # we reverse it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()
                if lm_model.maintain_coverage:
                    # since we filtered <eos> from trans[best], thus the index adds 1
                    coverage = coverages[best]
                    print "Coverage:",
                    words = parsed_in.split()
                    for k in xrange(len(words)):
                        print '%s/%.2f' % (words[k], coverage[k]),
                    print ''
                    if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                        print 'Fertility: ',
                        for k in xrange(len(words)):
                            print '%s/%.2f' % (words[k], fertility[k]),
                        print ''
                    print
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        # Interactive loop: read a sentence and sample/beam-search it.
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples, sampler=sampler,
                   beam_search=beam_search, ignore_unk=args.ignore_unk,
                   normalize=args.normalize, alpha=alpha, verbose=True)
state['indx_word'] = prel('ivocab.lang1.pkl') state['indx_word_target'] = prel('ivocab.lang2.pkl') state['word_indx'] = prel('vocab.lang1.pkl') state['word_indx_trgt'] = prel('vocab.lang2.pkl') update_custom_keys(state, conf, ['bs', 'loopIters', 'timeStop', 'dim', 'null_sym_source', 'null_sym_target']) if conf['method'] == 'RNNenc-50': state['prefix'] = 'encdec-50_' state['seqlen'] = 50 state['sort_k_batches'] = 20 log.debug("State:\n{}".format(pprint.pformat(state))) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, False) enc_dec.build() lm_model = enc_dec.create_lm_model() log.debug("Load data") train_data = get_batch_iterator(state) log.debug("Compile trainer") algo = eval(state['algo'])(lm_model, state, train_data) log.debug("Run training") main = MainLoop(train_data, None, None, lm_model, algo, state, None, reset=state['reset'], hooks=[RandomSamplePrinter(state, lm_model, train_data)] if state['hookFreq'] >= 0 else None) if state['reload']: main.load()
def main():
    """Train a model, optionally with a rolling target vocabulary kept in
    shelve files (extra embedding/softmax parameters swapped per batch).
    """
    args = parse_args()
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))
    # Backward-compatible defaults for states saved before these options
    # existed.
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'fixed_embeddings' not in state:
        state['fixed_embeddings'] = False
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    logger.debug("Load data")
    train_data = get_batch_iterator(state, rng)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)
    if state['rolling_vocab']:
        logger.debug("Initializing extra parameters")
        init_extra_parameters(lm_model, state)
        if not state['fixed_embeddings']:
            init_adadelta_extra_parameters(algo, state)
        with open(state['rolling_vocab_dict'], 'rb') as f:
            lm_model.rolling_vocab_dict = cPickle.load(f)
        lm_model.total_num_batches = max(lm_model.rolling_vocab_dict)
        # Per-batch vocabulary mappings live on disk in shelve files.
        lm_model.Dx_shelve = shelve.open(state['Dx_file'])
        lm_model.Dy_shelve = shelve.open(state['Dy_file'])
    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                    if state['hookFreq'] >= 0 else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
    if state['rolling_vocab']:
        lm_model.Dx_shelve.close()
        lm_model.Dy_shelve.close()
def main():
    # Ensemble translation with a restricted ("rolled") target vocabulary:
    # the target embedding/softmax matrices of every model are re-sliced
    # per sentence (or per group of lines with --less-transfer) to only
    # the candidate target words suggested by a unigram translation table.
    args = parse_args()
    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    # Defaults for keys that older pickled states may not contain.
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f)  # Load dictionary (source word index : list of target word indices)
    if args.less_transfer:
        for elt in topn:
            topn[elt] = topn[elt][:args.num_ttables]  # Take the first args.num_ttables only
    else:
        for elt in topn:
            topn[elt] = set(topn[elt][:args.num_ttables])  # Take the first args.num_ttables only and convert list to set
    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    # Full copies of each model's target-side parameters; the live
    # (GPU) parameters are replaced by sliced views per sentence below.
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])
        original_W_0_dec_approx_embdr.append(lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
        # On GPU, this will free memory for the next models
        # Additional gains could be made by rolling the source vocab
        lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].set_value(
                numpy.zeros((1), dtype=numpy.float32))
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  # Source w2i
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError
        #sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))  # Source i2w
    original_target_i2w = lm_models[0].word_indxs.copy()
    # I don't think that we need target_word2index
    max_words = len(original_b_dec_deep_softmax[0])
    if args.less_transfer:
        # First pass over the source file: pre-compute, for groups of
        # lines, the restricted target vocabulary to use. Results are
        # saved in D_dict keyed by the first line number of each group.
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Up to now
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False
        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, parsed_in = parse_input(
                    state, indx_word, seqin,
                    idx2word=idict_src)  # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt])  # Add topn best unigram translations for each source word
                output = update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    output = True
                if output:
                    D_dict[prev_line] = D.copy()  # Save dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        # NOTE(review): this inner loop reuses the name
                        # `i`, shadowing the line counter until the next
                        # enumerate() iteration rebinds it.
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [
                        state['null_sym_target'], state['unk_sym_target']
                    ]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common)  # Assumes you cannot fill d with only 1 line
        D_dict[prev_line] = D.copy()  # Save the dictionary for the final group
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(
                state, indx_word, seqin,
                idx2word=idict_src)  # seq is the ndarray of indices
            # For now, keep all input words in the model.
            # In the future, we may want to filter them to save on memory, but this isn't really much of an issue now
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
                if i in D_dict:
                    # Switch to the restricted vocabulary pre-computed
                    # for the group of lines starting here.
                    indices = D_dict[i].keys()
                    eos_id = indices.index(state['null_sym_target'])  # Find new eos and unk positions
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices)
                    ])  # target index2word
                trans, costs, _ = sample(lm_models[0],
                                         seq,
                                         n_samples,
                                         sampler=sampler,
                                         beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize,
                                         normalize_p=args.normalize_p,
                                         eos_id=eos_id,
                                         unk_id=unk_id,
                                         final=True,
                                         wp=args.wp)
            else:
                # Extract the indices you need
                indices = set()
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        indices = indices.union(topn[elt])  # Add topn best unigram translations for each source word
                num_common_words = args.num_common
                # Retry loop: grow the common-word pool until the beam
                # search succeeds (or the full vocabulary is reached).
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False
                    if args.final:  # No matter the number of words
                        final = True
                    indices = indices.union(set(xrange(num_common_words)))  # Add common words
                    indices = list(indices)  # Convert back to list for advanced indexing
                    eos_id = indices.index(state['null_sym_target'])  # Find new eos and unk positions
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices)
                    ])  # target index2word
                    try:
                        trans, costs, _ = sample(lm_models[0],
                                                 seq,
                                                 n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize,
                                                 normalize_p=args.normalize_p,
                                                 eos_id=eos_id,
                                                 unk_id=unk_id,
                                                 final=final)
                        break  # Breaks only if it succeeded (If final=True, will always succeed)
                    except RuntimeError:
                        # Search failed with this vocabulary: double the
                        # number of common words and retry.
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >> ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >> ftrans, str(
                        i + args.start) + ' ||| ' + trans[elt] + ' ||| ' + str(
                            costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
def main(): args = parse_args() state = prototype_search_with_coverage_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig( level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'], 'rb')) t_indx_word = cPickle.load(open(state['word_indx_trgt'], 'rb')) sampler = None beam_search = BeamSearch(enc_dec) beam_search.compile() idict_src = cPickle.load(open(state['indx_word'], 'r')) t_idict_src = cPickle.load(open(state['indx_word_target'], 'r')) fsrc = open(args.source, 'r') ftrg = open(args.target, 'r') start_time = time.time() total_cost = 0.0 # for i, line in enumerate(fsrc): i = 0 while 1: try: seqin = fsrc.next().strip() seqout = ftrg.next().strip() except StopIteration: break seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) out, parsed_out = parse_target(state, t_indx_word, seqout, idx2word=t_idict_src) if lm_model.maintain_coverage: if lm_model.use_linguistic_coverage and lm_model.use_fertility_model: aligns, costs, coverage, fertility = force_decoding( lm_model, seq, out, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: aligns, costs, coverage = force_decoding( lm_model, seq, out, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: aligns, costs = force_decoding(lm_model, seq, out, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) print "Parsed Input:", parsed_in print "Parsed Target:", parsed_out print 'Aligns:' print aligns.tolist() if lm_model.maintain_coverage: # since we filtered <eos> from 
trans[best], thus the index adds 1 print "Coverage:", words = parsed_in.split() for k in xrange(len(words)): print '%s/%.2f' % (words[k], coverage[k]), print '' if lm_model.use_linguistic_coverage and lm_model.use_fertility_model: print 'Fertility: ', for k in xrange(len(words)): print '%s/%.2f' % (words[k], fertility[k]), print '' print total_cost += costs[0] if (i + 1) % 100 == 0: logger.debug("Current speed is {} per sentence".format( (time.time() - start_time) / (i + 1))) print "Total cost of the translations: {}".format(total_cost) fsrc.close() ftrg.close()
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) if args.config: state.update(eval(open(args.config).read())) if args.weights: state['weights'] = args.weights if args.lm_file: state['lm_file'] = args.lm_file if args.lm_vocab: state['lm_vocab'] = args.lm_vocab if args.pt_file: state['phrase_table'] = args.pt_file if args.lm_ngram: state['lm_ngram'] = args.lm_ngram logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) idict_src = cPickle.load(open(state['indx_word'],'r')) trg_idx2word = cPickle.load(open(state['indx_word_target'],'r')) trg_word2idx = cPickle.load(open(state['word_indx_trgt'],'r')) #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight fea_weights = map(float, state['weights'].split(',')) beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word) beam_search.compile() beam_search.init_features(state, fea_weights) #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2]) #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:]) fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if args.verbose: print >> sys.stderr, "Parsed Input:", parsed_in trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(lm_model, seqin, seq, n_samples, beam_search=beam_search, ignore_unk=args.ignore_unk, 
normalize=args.normalize) #for (i, t) in enumerate(trans): # costs[i] = costs[i] / len(t) best = numpy.argmin(costs) align_str = [] for (idx, _a) in enumerate(aligns[best]): align_str.append("[%s]" % ' '.join(map(str, _a))) if args.nbest: nbest_trans = trans nbest_costs = costs nbest_lm_costs = lm_costs nbest_tm_costs = tm_costs nbest_unk_nums = unk_nums nbest_rnn_costs = rnn_costs nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)] nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(nbest_costs)] nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(nbest_costs)] nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(nbest_costs)] nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(nbest_costs)] nbest_costs = numpy.array(sorted(nbest_costs)) for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs, nbest_tm_costs, nbest_costs, nbest_unk_nums, nbest_rnn_costs): sum_lm = numpy.sum(lm) sum_unk = numpy.sum(u) sum_tm = numpy.sum(tm) rnn_cost = numpy.sum(r) sum_wp = len(t.split(' ')) + 1 #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value #rnn_cost = sum_rnn / beam_search.weight_rnn #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp) #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp) print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp) if args.verbose: print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\ -rnn_cost * beam_search.weight_rnn, \ -sum_lm * beam_search.weight_lm, \ -pure_tm * beam_search.weight_tm, \ -sum_tm * beam_search.weight_tm, \ -sum_wp * beam_search.weight_wp, c) print >> ftrans, '' #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs)) #out_str += 
"\t" + nbest_str else: out_str = trans[best] if args.alignment: out_str += "\t" + ' '.join(align_str) if args.show_unk: best_ids = trans_ids[best] unk_ids = [] for (i, idx) in enumerate(best_ids): if idx == beam_search.unk_id: unk_ids.append(i) out_str += "\t" + ' '.join(map(str, unk_ids)) print >>ftrans, out_str if args.verbose: print "[Translation]%s\t[Align]%s" % (trans[best], ' '.join(align_str)) total_cost += costs[best] if (i + 1) % 100 == 0: ftrans.flush() logger.debug("Current speed is {} per sentence". format((time.time() - start_time) / (i + 1))) print "Total cost of the translations: {}".format(total_cost) print "Total used time: {}".format(time.time() - start_time) fsrc.close() ftrans.close()
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'],'r')) if args.source and args.trans: # Actually only beam search is currently supported here assert beam_search assert args.beam_size fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if args.verbose: print "Parsed Input:", parsed_in trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) try: best = numpy.argmin(costs) print >>ftrans, trans[best] total_cost += costs[best] except: print >> ftrans, "FAIL" if args.verbose: print "Translation:", trans[best] if (i + 1) % 100 == 0: ftrans.flush() logger.debug("Current speed is {} per sentence". format((time.time() - start_time) / (i + 1))) print "Total cost of the translations: {}".format(total_cost) fsrc.close() ftrans.close() else: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = None if not args.beam_search: alpha = float(raw_input('Inverse Temperature? 
')) seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) print "Parsed Input:", parsed_in except Exception: print "Exception while parsing your input:" traceback.print_exc() continue sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize, alpha=alpha, verbose=True)
class SampleBlock(object):
    ''' class for sampling '''

    def __init__(self):
        # para setting
        # Hard-coded configuration: pickled search state, model weights,
        # and search flags (beam search, UNK handling, normalization).
        self.arg_state = 'search_state.pkl'
        self.arg_changes = ""
        self.arg_model_path = 'search_model.npz'
        self.arg_beam_search = True
        self.arg_ignore_unk = False
        self.arg_normalize = False
        self.state = prototype_state()
        with open(self.arg_state) as src:
            self.state.update(cPickle.load(src))
        self.state.update(eval("dict({})".format(self.arg_changes)))
        logging.basicConfig(
            level=getattr(logging, self.state['level']),
            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
        rng = numpy.random.RandomState(self.state['seed'])
        # Build the encoder-decoder and load the trained parameters.
        self.enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
        self.enc_dec.build()
        self.lm_model = self.enc_dec.create_lm_model()
        self.lm_model.load(self.arg_model_path)
        # Source word -> index mapping.
        self.indx_word = cPickle.load(open(self.state['word_indx'], 'rb'))
        self.beam_search = None
        self.beam_search = BeamSearch(self.enc_dec)
        self.beam_search.compile()
        # Source index -> word mapping.
        self.idict_src = cPickle.load(open(self.state['indx_word'], 'r'))

    '''
    seqin: input sentence, k sample number
    return a list of tuple(sentence, score)
    '''

    def getSamples(self, seqori, k):
        # split the sentence
        # NOTE(review): the raw input is cut into fixed 3-byte chunks --
        # presumably one multi-byte-encoded (e.g. Chinese) character
        # each; confirm the expected input encoding.
        seqin = ""
        for i in range(0, len(seqori), 3):
            w = seqori[i:i + 3]
            seqin = seqin + w + " "
        print "split seq:#%s#" % (seqin)
        #return
        seq, parsed_in = parse_input(self.state,
                                     self.indx_word,
                                     seqin,
                                     idx2word=self.idict_src)
        ans, align, rester, updater = self.sample(seq, k)
        return ans, align, rester, updater

    def sample(self, seq, n_samples):
        # Run beam search on an index sequence and convert the resulting
        # index sequences back into (sentence, cost) tuples.
        ans = []
        trans, costs, align, rester, updater = self.beam_search.search(
            seq,
            n_samples,
            ignore_unk=self.arg_ignore_unk,
            minlen=len(seq) / 2)
        if self.arg_normalize:
            # Length-normalize the costs.
            counts = [len(s) for s in trans]
            costs = [co / cn for co, cn in zip(costs, counts)]
        for i in range(len(trans)):
            sen = indices_to_words(self.lm_model.word_indxs, trans[i])
            ans.append((" ".join(sen), costs[i]))
        return ans, align, rester, updater

    def getRep(self, seqori):
        # Same 3-byte splitting as getSamples, but asks the beam search
        # only for the representation (getRep=True) with a fixed beam
        # width of 20.
        seqin = ""
        for i in range(0, len(seqori), 3):
            w = seqori[i:i + 3]
            seqin = seqin + w + " "
        print "split seq:#%s#" % (seqin)
        seq, parsed_in = parse_input(self.state,
                                     self.indx_word,
                                     seqin,
                                     idx2word=self.idict_src)
        rep = self.beam_search.search(seq,
                                      20,
                                      ignore_unk=self.arg_ignore_unk,
                                      minlen=len(seq) / 2,
                                      getRep=True)
        return rep
def main():
    # Translate with a single model, optionally emitting word alignments
    # (compute_alignment=True) and n-best lists; falls back to an
    # interactive stdin loop when no --source/--trans files are given.
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    ###########################################################
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            # Beam search returns args.beam_size candidates; the random
            # sampler path draws a single sample.
            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []
            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)
            if args.beam_search and args.nbest:
                # Append the cost-sorted n-best list to the output line.
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(
                    nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join(
                    "%s | %f" % (t, c)
                    for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str
            print >> ftrans, out_str
            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)
        fsrc.close()
        ftrans.close()
    else:
        # Interactive mode: read sentences from stdin until interrupted.
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) state['sort_k_batches'] = 1 state['shuffle'] = False state['use_infinite_loop'] = False state['force_enc_repr_cpu'] = False logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word_src = cPickle.load(open(state['word_indx'],'rb')) indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb')) if args.mode == "batch": data_given = args.src or args.trg txt = data_given and not (args.src.endswith(".h5") and args.trg.endswith(".h5")) if data_given and not txt: state['source'] = [args.src] state['target'] = [args.trg] if not data_given and not txt: logger.info("Using the training data") if txt: data_iter = BatchBiTxtIterator(state, args.src, indx_word_src, args.trg, indx_word_trgt, state['bs'], raise_unk=not args.allow_unk) data_iter.start() else: data_iter = get_batch_iterator(state) data_iter.start(0) score_file = open(args.scores, "w") if args.scores else sys.stdout scorer = enc_dec.create_scorer(batch=True) count = 0 n_samples = 0 logger.info('Scoring phrases') for i, batch in enumerate(data_iter): if batch == None: continue if args.n_batches >= 0 and i == args.n_batches: break if args.y_noise: y = batch['y'] random_words = numpy.random.randint(0, 100, y.shape).astype("int64") change_mask = numpy.random.binomial(1, args.y_noise, y.shape).astype("int64") y = change_mask * random_words + (1 - change_mask) * y batch['y'] = y st = time.time() [scores] = scorer(batch['x'], batch['y'], batch['x_mask'], batch['y_mask']) if args.print_probs: scores = numpy.exp(scores) up_time = time.time() - st for s in scores: print 
>>score_file, "{:.5e}".format(float(s)) n_samples += batch['x'].shape[1] count += 1 if count % 100 == 0: score_file.flush() logger.debug("Scores flushed") logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format( count, n_samples, up_time/scores.shape[0], scores[:5])) logger.info("Done") score_file.flush() elif args.mode == "interact": scorer = enc_dec.create_scorer() while True: try: compute_probs = enc_dec.create_probs_computer() src_line = raw_input('Source sequence: ') trgt_line = raw_input('Target sequence: ') src_seq = parse_input(state, indx_word_src, src_line, raise_unk=not args.allow_unk, unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source']) trgt_seq = parse_input(state, indx_word_trgt, trgt_line, raise_unk=not args.allow_unk, unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target']) print "Binarized source: ", src_seq print "Binarized target: ", trgt_seq probs = compute_probs(src_seq, trgt_seq) print "Probs: {}, cost: {}".format(probs, -numpy.sum(numpy.log(probs))) except Exception: traceback.print_exc() elif args.mode == "txt": assert args.src and args.trg scorer = enc_dec.create_scorer() src_file = open(args.src, "r") trg_file = open(args.trg, "r") compute_probs = enc_dec.create_probs_computer(return_alignment=True) try: numpy.set_printoptions(precision=3, linewidth=150, suppress=True) i = 0 while True: src_line = next(src_file).strip() trgt_line = next(trg_file).strip() src_seq, src_words = parse_input(state, indx_word_src, src_line, raise_unk=not args.allow_unk, unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source']) trgt_seq, trgt_words = parse_input(state, indx_word_trgt, trgt_line, raise_unk=not args.allow_unk, unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target']) probs, alignment = compute_probs(src_seq, trgt_seq) if args.verbose: print "Probs: ", probs.flatten() if alignment.ndim == 3: print "Alignment:".ljust(20), src_line, "<eos>" for i, word in enumerate(trgt_words): 
print "{}{}".format(word.ljust(20), alignment[i, :, 0]) print "Generated by:" for i, word in enumerate(trgt_words): j = numpy.argmax(alignment[i, :, 0]) print "{} <--- {}".format(word, src_words[j] if j < len(src_words) else "<eos>") i += 1 if i % 100 == 0: sys.stdout.flush() logger.debug(i) print -numpy.sum(numpy.log(probs)) except StopIteration: pass else: raise Exception("Unknown mode {}".format(args.mode))
def main():
    # Training entry point with rolling-vocabulary support and an
    # optional external validation hook.
    args = parse_args()
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))
    # Defaults for keys that older pickled states may not contain.
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'fixed_embeddings' not in state:
        state['fixed_embeddings'] = False
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    if 'reprocess_each_iteration' not in state:
        state['reprocess_each_iteration'] = False
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    logger.debug("Load data")
    train_data = get_batch_iterator(state, rng)
    logger.debug("Compile trainer")
    # state['algo'] names a trainer class defined in this project.
    algo = eval(state['algo'])(lm_model, state, train_data)
    if state['rolling_vocab']:
        logger.debug("Initializing extra parameters")
        init_extra_parameters(lm_model, state)
        if not state['fixed_embeddings']:
            init_adadelta_extra_parameters(algo, state)
        with open(state['rolling_vocab_dict'], 'rb') as f:
            lm_model.rolling_vocab_dict = cPickle.load(f)
        lm_model.total_num_batches = max(lm_model.rolling_vocab_dict)
        # Shelves hold the full-vocabulary parameter rows on disk.
        lm_model.Dx_shelve = shelve.open(state['Dx_file'])
        lm_model.Dy_shelve = shelve.open(state['Dy_file'])
    # Optional hooks: periodic random-sample printing and an external
    # validation script run by ExternalValidator.
    hooks = []
    if state['hookFreq'] >= 0:
        hooks.append(RandomSamplePrinter(state, lm_model, train_data))
    if 'external_validation_script' in state and state['external_validation_script']:
        hooks.append(ExternalValidator(state, lm_model))
    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo,
                    state, None,
                    reset=state['reset'],
                    hooks=hooks)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
    if state['rolling_vocab']:
        lm_model.Dx_shelve.close()
        lm_model.Dy_shelve.close()
# Replace unknown words in translations using an ensemble of alignment models.
# Loads the models given by --models, compiles one Theano alignment function per
# model, and — when --num-common/--num-ttables/--topn-file are all given —
# restricts the target vocabulary per chunk of `change_every` source lines:
# candidate target indices (top-n unigram translations of each source word) are
# accumulated in OrderedDicts (chosen over sets for reproducibility, per the
# inline comments), and the decoder embedding/softmax parameters are re-sliced
# to each chunk's vocabulary. For every (source, translation) pair it then
# computes hard alignments in batches of args.batchsize and rewrites UNK tokens
# via the mapping/heuristic into --new-trans; --n_best input is also handled.
# NOTE(review): the source below has been collapsed onto a few physical lines;
# its indentation as written is not reliable, so the code is left byte-identical.
def main(): args = parse_args() state = prototype_phrase_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") if 'rolling_vocab' not in state: state['rolling_vocab'] = 0 if 'save_algo' not in state: state['save_algo'] = 0 if 'save_gs' not in state: state['save_gs'] = 0 if 'save_iter' not in state: state['save_iter'] = -1 if 'var_src_len' not in state: state['var_src_len'] = False if args.num_common and args.num_ttables and args.topn_file: with open(args.topn_file, 'rb') as f: topn = cPickle.load(f) # Load dictionary (source word index : list of target word indices) for elt in topn: topn[elt] = topn[elt][:args.num_ttables] # Take the first args.num_ttables only num_models = len(args.models) rng = numpy.random.RandomState(state['seed']) enc_decs = [] lm_models = [] alignment_fns = [] if args.num_common and args.num_ttables and args.topn_file: original_W_0_dec_approx_embdr = [] original_W2_dec_deep_softmax = [] original_b_dec_deep_softmax = [] for i in xrange(num_models): enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)) enc_decs[i].build() lm_models.append(enc_decs[i].create_lm_model()) lm_models[i].load(args.models[i]) alignment_fns.append(theano.function(inputs=enc_decs[i].inputs, outputs=[enc_decs[i].alignment], name="alignment_fn")) if args.num_common and args.num_ttables and args.topn_file: original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value()) original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value()) original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value()) lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1,1), 
dtype=numpy.float32)) lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1,1), dtype=numpy.float32)) lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1), dtype=numpy.float32)) if args.mapping: with open(args.mapping, 'rb') as f: mapping = cPickle.load(f) heuristic = args.heuristic else: heuristic = 0 mapping = None word2idx_src = cPickle.load(open(state['word_indx'], 'rb')) idict_src = cPickle.load(open(state['indx_word'], 'r')) word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb')) idict_trg = cPickle.load(open(state['indx_word_target'], 'r')) word2idx_trg['<eos>'] = state['null_sym_target'] word2idx_trg[state['oov']] = state['unk_sym_target'] # 'UNK' may be in the vocabulary. Now points to the right index. idict_trg[state['null_sym_target']] = '<eos>' idict_trg[state['unk_sym_target']] = state['oov'] if args.num_common and args.num_ttables and args.topn_file: # Use OrderedDict instead of set for reproducibility d = OrderedDict() # Up to now D = OrderedDict() # Full C = OrderedDict() # Allowed to reject prev_line = 0 logger.info("%d" % prev_line) D_dict = OrderedDict() output = False for i in xrange(args.num_common): D[i] = 0 C[i] = 0 null_unk_indices = [state['null_sym_target'],state['unk_sym_target']] update_dicts(null_unk_indices, d, D, C, args.num_common) with open(args.source, 'r') as f: for i, line in enumerate(f): seqin = line.strip() seq, _ = parse_input(state, word2idx_src, seqin) # seq is the ndarray of indices indices = [] for elt in seq[:-1]: # Exclude the EOL token if elt != 1: # Exclude OOV (1 will not be a key of topn) indices.extend(topn[elt]) # Add topn best unigram translations for each source word update_dicts(indices, d, D, C, args.num_common) if (i % args.change_every) == 0 and args.change_every > 0 and i > 0: D_dict[prev_line] = D.copy() # Save dictionary for the lines preceding this one prev_line = i logger.info("%d" % i) output = False d = OrderedDict() 
if args.no_reset: C = D.copy() else: D = OrderedDict() # Full C = OrderedDict() # Allowed to reject for i in xrange(args.num_common): D[i] = 0 C[i] = 0 null_unk_indices = [state['null_sym_target'], state['unk_sym_target']] update_dicts(null_unk_indices, d, D, C, args.num_common) update_dicts(indices, d, D, C, args.num_common) # Assumes you cannot fill d with only 1 line D_dict[prev_line] = D.copy() start_time = time.time() if args.source and args.trans and args.new_trans: with open(args.source, 'r') as src_file: with open(args.trans, 'r') as trans_file: with open(args.new_trans, 'w') as new_trans_file: if not (args.num_common and args.num_ttables and args.topn_file): eos_id = state['null_sym_target'] unk_id = state['unk_sym_target'] new_word2idx_trg = word2idx_trg prev_i = -1 if args.n_best: full_trans_line = trans_file.readline() if full_trans_line == '': raise IOError("File is empty") full_trans_line = full_trans_line.split('|||') n_best_start = int(full_trans_line[0].strip()) trans_file.seek(0) while True: if args.n_best: full_trans_line = trans_file.readline() if full_trans_line == '': break full_trans_line = full_trans_line.split('|||') i = int(full_trans_line[0].strip()) - n_best_start trans_line = full_trans_line[1].strip() else: trans_line = trans_file.readline() if trans_line == '': break i = prev_i + 1 if i == (prev_i + 1): prev_i = i if (i % args.change_every) == 0 and i > 0: hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize) replace_unknown_words( src_word_seqs, trg_seqs, trg_word_seqs, hard_alignments, heuristic, mapping, unk_id, new_trans_file, args.n_best, full_trans_lines) if (i % 100 == 0) and i > 0: new_trans_file.flush() logger.debug("Current speed is {} per sentence". 
format((time.time() - start_time) / i)) src_line = src_file.readline() src_seq, src_words = parse_input(state, word2idx_src, src_line.strip()) src_words.append('<eos>') if (i % args.change_every) == 0: src_seqs = [] src_word_seqs = [] trg_seqs = [] trg_word_seqs = [] full_trans_lines = [] # Only used with n-best lists if args.num_common and args.num_ttables and args.topn_file: indices = D_dict[i].keys() eos_id = indices.index(state['null_sym_target']) # Find new eos and unk positions unk_id = indices.index(state['unk_sym_target']) for j in xrange(num_models): lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices]) lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices]) lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices]) new_word2idx_trg = dict([(idict_trg[index], k) for k, index in enumerate(indices)]) elif i != prev_i: raise ValueError("prev_i: %d, i: %d" % (prev_i, i)) trans_seq, trans_words = parse_output(new_word2idx_trg, trans_line.strip(), eos_id=eos_id, unk_id=unk_id) trans_words.append('<eos>') src_seqs.append(src_seq) src_word_seqs.append(src_words) trg_seqs.append(trans_seq) trg_word_seqs.append(trans_words) if args.n_best: full_trans_lines.append(full_trans_line) # Out of loop hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize) replace_unknown_words(src_word_seqs, trg_seqs, trg_word_seqs, hard_alignments, heuristic, mapping, unk_id, new_trans_file, args.n_best, full_trans_lines) else: raise NotImplementedError
# Interactive visualization of a gated recursive convolutional encoder.
# Asserts state['enc_rec_layer'] == "RecursiveConvolutionalLayer", embeds a
# user-entered sentence (proj_x), then repeatedly applies the encoder
# transition's step_fprop (with return_gates=True) to collect gate values level
# by level. It builds a networkx DiGraph of input / merge / aggregation nodes
# with edges weighted by the gater outputs, keeps only edges above a
# user-chosen threshold, optionally colors edges by weight via a matplotlib
# colormap, draws the graph and saves it to a user-given filename.
# NOTE(review): the source below has been collapsed onto a few physical lines;
# its indentation as written is not reliable, so the code is left byte-identical.
def main(): args = parse_args() state = getattr(experiments.nmt, args.state_fn)() if hasattr(args, 'state') and args.state: with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) assert state['enc_rec_layer'] == "RecursiveConvolutionalLayer", "Only works with gated recursive convolutional encoder" logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) idict_src = cPickle.load(open(state['indx_word'],'r')) x = TT.lvector() h = TT.tensor3() proj_x = theano.function([x], enc_dec.encoder.input_embedders[0]( enc_dec.encoder.approx_embedder(x)).out, name='proj_x') new_h, gater = enc_dec.encoder.transitions[0].step_fprop( None, h, return_gates = True) step_up = theano.function([h], [new_h, gater], name='gater_step') while True: try: seqin = raw_input('Input Sequence: ') seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) print "Parsed Input:", parsed_in except Exception: print "Exception while parsing your input:" traceback.print_exc() continue # get the initial embedding new_h = proj_x(seq) new_h = new_h.reshape(new_h.shape[0], 1, new_h.shape[1]) nodes = numpy.arange(len(seq)).tolist() node_idx = len(seq)-1 rules = [] nodes_level = copy.deepcopy(nodes) G = nx.DiGraph() input_nodes = [] merge_nodes = [] aggregate_nodes = [] nidx = 0 vpos = 0 nodes_pos = {} nodes_labels = {} # input nodes for nn in nodes[:-1]: nidx += 1 G.add_node(nn, pos=(nidx, 0), ndcolor="blue", label="%d"%nn) nodes_pos[nn] = (nidx, vpos) nodes_labels[nn] = idict_src[seq[nidx-1]] input_nodes.append(nn) node_idx = len(seq) - 1 vpos += 6 for dd in xrange(len(seq)-1): new_h, gater = step_up(new_h) decisions = 
numpy.argmax(gater, -1) new_nodes_level = numpy.zeros(len(seq) - (dd+1)) hpos = float(len(seq)+1) - 0.5 * (dd+1) last_node = True for nn in xrange(len(seq)-(dd+1)): hpos -= 1 if not last_node: # merge nodes node_idx += 1 G.add_node(node_idx, ndcolor="red", label="m") nodes_labels[node_idx] = "" nodes_pos[node_idx] = (hpos, vpos) G.add_edge(nodes_level[-(nn+1)], node_idx, weight=gater[-(nn+1),0,0]) G.add_edge(nodes_level[-(nn+2)], node_idx, weight=gater[-(nn+1),0,0]) merge_nodes.append(node_idx) merge_node = node_idx # linear aggregation nodes node_idx += 1 G.add_node(node_idx, ndcolor="red", label="") nodes_labels[node_idx] = "$+$" nodes_pos[node_idx] = (hpos, vpos+6) G.add_edge(merge_node, node_idx, weight=gater[-(nn+1),0,0]) G.add_edge(nodes_level[-(nn+2)], node_idx, weight=gater[-(nn+1),0,1]) G.add_edge(nodes_level[-(nn+1)], node_idx, weight=gater[-(nn+1),0,2]) aggregate_nodes.append(node_idx) new_nodes_level[-(nn+1)] = node_idx last_node = False nodes_level = copy.deepcopy(new_nodes_level) vpos += 12 # TODO: Show only strong edges. threshold = float(raw_input('Threshold: ')) edges = [(u,v,d) for (u,v,d) in G.edges(data=True) if d['weight'] > threshold] #edges = G.edges(data=True) use_weighting = raw_input('Color according to weight [Y/N]: ') if use_weighting == 'Y': cm = plt.get_cmap('binary') cNorm = colors.Normalize(vmin=0., vmax=1.) 
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm) colorList = [scalarMap.to_rgba(d['weight']) for (u,v,d) in edges] else: colorList = 'k' nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=input_nodes, node_color='white', alpha=1., edge_color='white') nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=merge_nodes, node_color='blue', alpha=0.8, node_size=20) nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=aggregate_nodes, node_color='red', alpha=0.8, node_size=80) nx.draw_networkx_edges(G, pos=nodes_pos, edge_color=colorList, edgelist=edges) nx.draw_networkx_labels(G,pos=nodes_pos,labels=nodes_labels,font_family='sans-serif') plt.axis('off') figname = raw_input('Save to: ') if figname[-3:] == "pdf": plt.savefig(figname, type='pdf') else: plt.savefig(figname) plt.close() G.clear()
# Translate with a coverage-aware search model (prototype_search_with_coverage_state).
# Batch mode (--source/--trans) requires beam search: for each source line it
# samples translations (the sample() call's returned tuple varies with
# lm_model.maintain_coverage and, within that, with use_linguistic_coverage +
# use_fertility_model), writes the lowest-cost candidate, and in verbose mode
# prints the transposed alignment matrix, per-source-word coverage values and —
# when a fertility model with linguistic coverage is used — per-word fertility.
# Interactive mode reads sentences from stdin and samples verbosely, asking for
# an inverse temperature when beam search is disabled.
# NOTE(review): the source below has been collapsed onto a few physical lines;
# its indentation as written is not reliable, so the code is left byte-identical.
def main(): args = parse_args() state = prototype_search_with_coverage_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig( level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'], 'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'], 'r')) if args.source and args.trans: # Actually only beam search is currently supported here assert beam_search assert args.beam_size fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if lm_model.maintain_coverage: if lm_model.use_linguistic_coverage and lm_model.use_fertility_model: trans, aligns, costs, coverages, fertility, _ = sample( lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: trans, aligns, costs, coverages, _ = sample( lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: trans, aligns, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) if args.verbose: print "Parsed Input:", parsed_in if len(trans) == 0: trans = ['Failed'] costs = [0.0] best = numpy.argmin(costs) print >> ftrans, 
trans[best] if args.verbose: print "Translation:", trans[best] print "Aligns:" # aligns shape: (target_len, source_len) # we reverse it to the shape (source_len, target_len) to show the matrix print numpy.array(aligns[best]).transpose().tolist() if lm_model.maintain_coverage: # since we filtered <eos> from trans[best], thus the index adds 1 coverage = coverages[best] print "Coverage:", words = parsed_in.split() for k in xrange(len(words)): print '%s/%.2f' % (words[k], coverage[k]), print '' if lm_model.use_linguistic_coverage and lm_model.use_fertility_model: print 'Fertility: ', for k in xrange(len(words)): print '%s/%.2f' % (words[k], fertility[k]), print '' print total_cost += costs[best] if (i + 1) % 100 == 0: ftrans.flush() logger.debug("Current speed is {} per sentence".format( (time.time() - start_time) / (i + 1))) print "Total cost of the translations: {}".format(total_cost) fsrc.close() ftrans.close() else: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = None if not args.beam_search: alpha = float(raw_input('Inverse Temperature? ')) seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) print "Parsed Input:", parsed_in except Exception: print "Exception while parsing your input:" traceback.print_exc() continue sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize, alpha=alpha, verbose=True)
# Translate with a (possibly coverage-maintaining) model: beam-search or
# sampler driver that writes the lowest-cost translation per source line,
# accumulates total cost, and flushes/reports speed every 100 sentences.
# NOTE(review): this function appears TRUNCATED in this file — it ends at the
# trailing `if lm_model.maintain_coverage:` with no body before the next
# `def main():`. Left byte-identical; recover the missing tail from version
# control before editing.
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'],'r')) if args.source and args.trans: # Actually only beam search is currently supported here assert beam_search assert args.beam_size fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if lm_model.maintain_coverage: trans, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) if args.verbose: print "Parsed Input:", parsed_in if len(trans) == 0: trans = ['Failed'] costs = [0.0] best = numpy.argmin(costs) print >>ftrans, trans[best] if args.verbose: print "Translation:", trans[best] if lm_model.maintain_coverage:
def main():
    """Train an NMT model with optional BLEU-based validation / early stopping.

    Loads the prototype state named by ``--proto``, applies an optional state
    file and command-line overrides, builds the encoder-decoder, and runs the
    training main loop.
    """
    args = parse_args()

    # Prototype first, then the state file (.py source or pickle), then
    # per-option overrides from the command line (per this folder's README).
    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    # BLEU validation (and hence early stopping) needs a scoring script, a
    # validation set and its ground truth; otherwise it stays disabled.
    bleu_validator = None
    if state['bleu_script'] is not None and state['validation_set'] is not None\
            and state['validation_set_grndtruth'] is not None:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
        bleu_validator = BleuValidator(state, lm_model, beam_search,
                                       verbose=state['output_validation_set'])

    logger.debug("Load data")
    train_data = get_batch_iterator(state)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    if state['hookFreq'] >= 0 and state['validation_set'] is not None:
        hooks = [RandomSamplePrinter(state, lm_model, train_data)]
    else:
        hooks = None
    # Renamed from `main` to avoid shadowing this function's own name.
    training_loop = MainLoop(train_data, None, None, lm_model, algo, state,
                             None, reset=state['reset'],
                             bleu_val_fn=bleu_validator, hooks=hooks)
    if state['reload']:
        training_loop.load()
    if state['loopIters'] > 0:
        training_loop.main()