def main():
    # Serve translations over HTTP, using either beam search or sampling.
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    server_address = ('', args.port)
    httpd = ThreadedHTTPServer(server_address, MTReqHandler)
    #httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  # Source word -> index

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))  # Source index -> word

    tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-']
    detokenizer_cmd = [os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-']
    sampler = Sampler(state, lm_model, indx_word, idict_src,
                      beam_search=beam_search,
                      tokenizer_cmd=tokenizer_cmd,
                      detokenizer_cmd=detokenizer_cmd)
    httpd.sampler = sampler

    print 'Server starting...'
    httpd.serve_forever()
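# A minimal smoke test for the server above; a sketch only. The URL path and
# the query parameter name are assumptions -- MTReqHandler is not shown here,
# so check its do_GET/do_POST for the actual interface.
import urllib
import urllib2

def translate(text, host='localhost', port=8888):
    query = urllib.urlencode({'q': text})  # Hypothetical parameter name
    return urllib2.urlopen(
        'http://{}:{}/translate?{}'.format(host, port, query)).read()
    # e.g. print translate('hello world')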
def main():
    # Ensemble beam-search translation with a rolling (restricted) target vocabulary.
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # Defaults for options that older pickled states may not contain
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f)  # Dictionary: source word index -> list of target word indices
    if args.less_transfer:
        for elt in topn:
            topn[elt] = topn[elt][:args.num_ttables]  # Keep the first args.num_ttables only
    else:
        for elt in topn:
            # Keep the first args.num_ttables only and convert the list to a set
            topn[elt] = set(topn[elt][:args.num_ttables])

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        # Keep host-side copies of the target embeddings and softmax parameters,
        # then replace them with dummies on the device.
        original_W_0_dec_approx_embdr.append(lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
        # On GPU, this frees memory for the next models.
        # Additional gains could be made by rolling the source vocabulary as well.
        lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(
            numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(
            numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(
            numpy.zeros((1,), dtype=numpy.float32))

    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  # Source word -> index

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError
        #sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))  # Source index -> word
    # We do not seem to need a target word2index here.
    original_target_i2w = lm_models[0].word_indxs.copy()
    max_words = len(original_b_dec_deep_softmax[0])

    if args.less_transfer:
        # First pass over the source file: build D_dict, mapping the first line of
        # each segment to the reduced target vocabulary used for that segment.
        # Use OrderedDict instead of set for reproducibility.
        d = OrderedDict()  # Seen so far
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False
        for k in xrange(args.num_common):
            D[k] = 0
            C[k] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, parsed_in = parse_input(
                    state, indx_word, seqin, idx2word=idict_src)  # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt])  # Add the topn best unigram translations of each source word
                output = update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    output = True
                if output:
                    D_dict[prev_line] = D.copy()  # Save the dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for k in xrange(args.num_common):
                            D[k] = 0
                            C[k] = 0
                    null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common)  # Assumes a single line cannot fill d
        D_dict[prev_line] = D.copy()

    if args.source and args.trans:
        # Actually, only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(
                state, indx_word, seqin, idx2word=idict_src)  # seq is the ndarray of indices
            # For now, keep all input words in the model.
            # In the future, we may want to filter them to save memory,
            # but this is not really an issue yet.
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
                if i in D_dict:
                    # Entering a new segment: restrict the target vocabulary to the
                    # precomputed indices and find the new eos and unk positions.
                    indices = D_dict[i].keys()
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(
                            original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(
                            original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(
                            original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict(
                        [(k, original_target_i2w[index])
                         for k, index in enumerate(indices)])  # Target index -> word
                trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                         sampler=sampler, beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize,
                                         normalize_p=args.normalize_p,
                                         eos_id=eos_id, unk_id=unk_id,
                                         final=True, wp=args.wp)
            else:
                # Extract the indices we need
                indices = set()
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        indices = indices.union(topn[elt])  # Add the topn best unigram translations of each source word
                num_common_words = args.num_common
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False
                    if args.final:  # No matter the number of words
                        final = True
                    indices = indices.union(set(xrange(num_common_words)))  # Add the common words
                    indices = list(indices)  # Convert back to a list for advanced indexing
                    eos_id = indices.index(state['null_sym_target'])  # Find the new eos and unk positions
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(
                            original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(
                            original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(
                            original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict(
                        [(k, original_target_i2w[index])
                         for k, index in enumerate(indices)])  # Target index -> word
                    try:
                        trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                                 sampler=sampler, beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize,
                                                 normalize_p=args.normalize_p,
                                                 eos_id=eos_id, unk_id=unk_id,
                                                 final=final)
                        break  # Breaks only on success (with final=True it always succeeds)
                    except RuntimeError:
                        # Sampling failed with the restricted vocabulary:
                        # retry with more common words.
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >>ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >>ftrans, str(i + args.start) + ' ||| ' + trans[elt] + ' ||| ' + str(costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
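# The topn file loaded above is just a pickled dict mapping each source word
# index to a list of target word indices, best first. Below is a minimal sketch
# of how such a file could be produced from a lexical translation table;
# `lex_probs` and the file name are illustrative assumptions, not part of this
# codebase.
import cPickle
from collections import defaultdict

def build_topn(lex_probs, n=500, path='topn.pkl'):
    # lex_probs: dict mapping (source index, target index) -> p(target | source)
    by_src = defaultdict(list)
    for (src, trg), p in lex_probs.iteritems():
        by_src[src].append((p, trg))
    # For each source index, keep the n most probable target indices, best first
    topn = dict((src, [trg for _, trg in sorted(pairs, reverse=True)[:n]])
                for src, pairs in by_src.iteritems())
    with open(path, 'wb') as f:
        cPickle.dump(topn, f, protocol=cPickle.HIGHEST_PROTOCOL)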
def main():
    # Replace unknown words in existing translations, using the models'
    # soft-alignment weights to pick the source word each UNK came from.
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # Defaults for options that older pickled states may not contain
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    if args.num_common and args.num_ttables and args.topn_file:
        with open(args.topn_file, 'rb') as f:
            topn = cPickle.load(f)  # Dictionary: source word index -> list of target word indices
        for elt in topn:
            topn[elt] = topn[elt][:args.num_ttables]  # Keep the first args.num_ttables only

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    alignment_fns = []
    if args.num_common and args.num_ttables and args.topn_file:
        original_W_0_dec_approx_embdr = []
        original_W2_dec_deep_softmax = []
        original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True,
                                          compute_alignment=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])
        alignment_fns.append(theano.function(inputs=enc_decs[i].inputs,
                                             outputs=[enc_decs[i].alignment],
                                             name="alignment_fn"))
        if args.num_common and args.num_ttables and args.topn_file:
            # Keep host-side copies of the target embeddings and softmax
            # parameters, then replace them with dummies to free GPU memory.
            original_W_0_dec_approx_embdr.append(lm_models[i].params[
                lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
            original_W2_dec_deep_softmax.append(lm_models[i].params[
                lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
            original_b_dec_deep_softmax.append(lm_models[i].params[
                lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
            lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(
                numpy.zeros((1,), dtype=numpy.float32))

    if args.mapping:
        with open(args.mapping, 'rb') as f:
            mapping = cPickle.load(f)
        heuristic = args.heuristic
    else:
        heuristic = 0
        mapping = None

    word2idx_src = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))
    word2idx_trg['<eos>'] = state['null_sym_target']
    word2idx_trg[state['oov']] = state['unk_sym_target']  # 'UNK' may be in the vocabulary; make it point to the right index
    idict_trg[state['null_sym_target']] = '<eos>'
    idict_trg[state['unk_sym_target']] = state['oov']

    if args.num_common and args.num_ttables and args.topn_file:
        # First pass over the source file: build D_dict, mapping the first line
        # of each segment to the reduced target vocabulary for that segment.
        # Use OrderedDict instead of set for reproducibility.
        d = OrderedDict()  # Seen so far
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        for k in xrange(args.num_common):
            D[k] = 0
            C[k] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, _ = parse_input(state, word2idx_src, seqin)  # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1 and elt in topn:  # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt])  # Add the topn best unigram translations of each source word
                update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    D_dict[prev_line] = D.copy()  # Save the dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for k in xrange(args.num_common):
                            D[k] = 0
                            C[k] = 0
                    null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common)  # Assumes a single line cannot fill d
        D_dict[prev_line] = D.copy()

    start_time = time.time()

    if args.source and args.trans and args.new_trans:
        with open(args.source, 'r') as src_file:
            with open(args.trans, 'r') as trans_file:
                with open(args.new_trans, 'w') as new_trans_file:
                    if not (args.num_common and args.num_ttables and args.topn_file):
                        eos_id = state['null_sym_target']
                        unk_id = state['unk_sym_target']
                        new_word2idx_trg = word2idx_trg
                    prev_i = -1
                    if args.n_best:
                        # Peek at the first line to find the starting sentence id
                        full_trans_line = trans_file.readline()
                        if full_trans_line == '':
                            raise IOError("File is empty")
                        full_trans_line = full_trans_line.split('|||')
                        n_best_start = int(full_trans_line[0].strip())
                        trans_file.seek(0)
                    while True:
                        if args.n_best:
                            full_trans_line = trans_file.readline()
                            if full_trans_line == '':
                                break
                            full_trans_line = full_trans_line.split('|||')
                            i = int(full_trans_line[0].strip()) - n_best_start
                            trans_line = full_trans_line[1].strip()
                        else:
                            trans_line = trans_file.readline()
                            if trans_line == '':
                                break
                            i = prev_i + 1
                        if i == (prev_i + 1):
                            # A new source sentence starts here
                            prev_i = i
                            if (i % args.change_every) == 0 and i > 0:
                                # Flush the previous batch
                                hard_alignments = compute_alignment(
                                    src_seqs, trg_seqs, alignment_fns, args.batchsize)
                                replace_unknown_words(
                                    src_word_seqs, trg_seqs, trg_word_seqs,
                                    hard_alignments, heuristic, mapping, unk_id,
                                    new_trans_file, args.n_best, full_trans_lines)
                            if (i % 100 == 0) and i > 0:
                                new_trans_file.flush()
                                logger.debug("Current speed is {} per sentence".format(
                                    (time.time() - start_time) / i))
                            src_line = src_file.readline()
                            src_seq, src_words = parse_input(
                                state, word2idx_src, src_line.strip())
                            src_words.append('<eos>')
                            if (i % args.change_every) == 0:
                                src_seqs = []
                                src_word_seqs = []
                                trg_seqs = []
                                trg_word_seqs = []
                                full_trans_lines = []  # Only used with n-best lists
                                if args.num_common and args.num_ttables and args.topn_file:
                                    # Restrict the target vocabulary for this segment
                                    indices = D_dict[i].keys()
                                    eos_id = indices.index(state['null_sym_target'])  # Find the new eos and unk positions
                                    unk_id = indices.index(state['unk_sym_target'])
                                    for j in xrange(num_models):
                                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(
                                            original_W_0_dec_approx_embdr[j][indices])
                                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(
                                            original_W2_dec_deep_softmax[j][:, indices])
                                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(
                                            original_b_dec_deep_softmax[j][indices])
                                    new_word2idx_trg = dict(
                                        [(idict_trg[index], k)
                                         for k, index in enumerate(indices)])
                        elif i != prev_i:
                            raise ValueError("prev_i: %d, i: %d" % (prev_i, i))
                        trans_seq, trans_words = parse_output(
                            new_word2idx_trg, trans_line.strip(),
                            eos_id=eos_id, unk_id=unk_id)
                        trans_words.append('<eos>')
                        src_seqs.append(src_seq)
                        src_word_seqs.append(src_words)
                        trg_seqs.append(trans_seq)
                        trg_word_seqs.append(trans_words)
                        if args.n_best:
                            full_trans_lines.append(full_trans_line)
                    # Out of the loop: flush the final batch
                    hard_alignments = compute_alignment(
                        src_seqs, trg_seqs, alignment_fns, args.batchsize)
                    replace_unknown_words(
                        src_word_seqs, trg_seqs, trg_word_seqs, hard_alignments,
                        heuristic, mapping, unk_id, new_trans_file,
                        args.n_best, full_trans_lines)
    else:
        raise NotImplementedError
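# For reference, a simplified sketch of what the replace_unknown_words step
# does with the hard alignments computed above. This is NOT the function used
# by the script (which also handles n-best lists and index sequences); it only
# illustrates the usual heuristics from the UNK-replacement literature:
# 0 copies the aligned source word, 1 looks it up in a source-to-target
# mapping, 2 uses the mapping only for lowercase words so that names are
# copied verbatim. Whether this codebase uses exactly these three is an
# assumption.
def replace_unk_sketch(src_words, trg_words, hard_alignment,
                       heuristic=0, mapping=None, unk_token='UNK'):
    out = []
    for i, word in enumerate(trg_words):
        if word != unk_token:
            out.append(word)
            continue
        src_word = src_words[hard_alignment[i]]  # Most-attended source word
        if heuristic == 0 or not mapping:
            out.append(src_word)
        elif heuristic == 1:
            out.append(mapping.get(src_word, src_word))
        else:  # heuristic == 2
            out.append(mapping.get(src_word, src_word)
                       if src_word.islower() else src_word)
    return ' '.join(out)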
def main():
    # Score existing source/target pairs under the model. Three modes:
    # "batch" scores whole files (text or HDF5), "interact" reads pairs from
    # stdin, and "txt" scores line-aligned text files and can print alignments.
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # Defaults for options that older pickled states may not contain
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5") and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                                           args.src, indx_word_src,
                                           args.trg, indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state, rng)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout

        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
                # Randomly corrupt a fraction y_noise of the target words
                y = batch['y']
                random_words = numpy.random.randint(0, 100, y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise, y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'], batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >>score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format(
                count, n_samples, up_time / scores.shape[0], scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line,
                                      raise_unk=not args.allow_unk)
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line,
                                       raise_unk=not args.allow_unk)
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(state, indx_word_src, src_line,
                                                 raise_unk=not args.allow_unk)
                trgt_seq, trgt_words = parse_input(state, indx_word_trgt, trgt_line,
                                                   raise_unk=not args.allow_unk)
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        # Use k here: reusing i would clobber the sentence counter
                        for k, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20), alignment[k, :, 0])
                        print "Generated by:"
                        for k, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[k, :, 0])
                            print "{} <--- {}".format(
                                word, src_words[j] if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
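# Small helper for the batch mode above: score_file receives one "{:.5e}"
# value per line, which is a per-sentence log-probability unless
# args.print_probs converted it with numpy.exp. A sketch under that
# assumption; the file name is illustrative.
import numpy

def mean_cost(path='scores.txt'):
    log_probs = numpy.loadtxt(path)
    return float(-log_probs.mean())  # Average negative log-probability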
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually, only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            trans, costs, _ = sample(lm_model, seq, n_samples,
                                     sampler=sampler, beam_search=beam_search,
                                     ignore_unk=args.ignore_unk,
                                     normalize=args.normalize,
                                     normalize_p=args.normalize_p)
            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples,
                   sampler=sampler, beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   normalize_p=args.normalize_p,
                   alpha=alpha, verbose=True)
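# For clarity, the kind of length normalization the normalize/normalize_p
# options above usually control, sketched as a standalone function. Whether
# sample() applies exactly this form is an assumption; the common scheme
# divides a hypothesis cost by its length raised to a power p, so that beam
# search does not favor overly short translations.
def normalized_cost(cost, length, p=1.0):
    return cost / float(length) ** p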