def save_hyp_graph(self, filename, word_idict_trg, detailed=True, highlight_best=True):
    """
    Writes this translation's search graph to disk.
    """
    if self.hyp_graph:
        renderer = HypGraphRenderer(self.hyp_graph)
        renderer.wordify(word_idict_trg)
        renderer.save_png(filename, detailed, highlight_best)
    else:
        # TODO: warn if no search graph was constructed during decoding
        pass
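# A minimal usage sketch for save_hyp_graph, assuming `translation` is an
# object of the enclosing class with its `hyp_graph` attribute populated
# during decoding, and `word_idict_trg` is the inverted target dictionary
# built in main() below (the output file name is hypothetical):
#
#   translation.save_hyp_graph('search_graph.png', word_idict_trg,
#                              detailed=True, highlight_best=True)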
def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
    source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
    while source_file_t[0] != "EOT":
        # preprocess: tokenize each sentence, then apply BPE segmentation
        for i in range(len(source_file_t)):
            source_file_t[i] = bpe.segment(
                tokenizer.tokenize(source_file_t[i], return_str=True)).strip()

        detokenized = ''
        queue = Queue()
        rqueue = Queue()
        processes = [None] * n_process
        for midx in xrange(n_process):
            processes[midx] = Process(
                target=translate_model,
                args=(queue, rqueue, midx, models, options, k, normalize,
                      verbose, nbest, save_alignment is not None,
                      suppress_unk, return_hyp_graph, fs_init, fs_next))
            processes[midx].start()

        n_samples, source_sentences = _send_jobs(source_file_t, processes, queue)
        _finish_processes(queue)

        # model loading happens lazily during the first pass through _retrieve_jobs
        for i, trans in enumerate(
                _retrieve_jobs(n_samples, processes, queue, rqueue)):
            if nbest:
                samples, scores, word_probs, alignment, hyp_graph = trans
                if return_hyp_graph:
                    renderer = HypGraphRenderer(hyp_graph)
                    renderer.wordify(word_idict_trg)
                    renderer.save_png(return_hyp_graph, detailed=True,
                                      highlight_best=True)
                order = numpy.argsort(scores)
                for j in order:
                    if print_word_probabilities:
                        probs = " ||| " + " ".join(
                            "{0}".format(prob) for prob in word_probs[j])
                    else:
                        probs = ""
                    saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                        i, _seqs2words(samples[j]), scores[j], probs))
                    # print alignment matrix for each hypothesis
                    # header: sentence id ||| translation ||| score ||| source |||
                    #         source_token_count+eos translation_token_count+eos
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(
                                alignment[j], source_sentences[i],
                                _seqs2words(samples[j]).split(), i, i + j,
                                save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                    i, _seqs2words(samples[j]), scores[j],
                                    ' '.join(source_sentences[i]),
                                    len(source_sentences[i]) + 1,
                                    len(samples[j])))
                            print_matrix(alignment[j], save_alignment)
            else:
                samples, scores, word_probs, alignment, hyp_graph = trans
                if return_hyp_graph:
                    renderer = HypGraphRenderer(hyp_graph)
                    renderer.wordify(word_idict_trg)
                    renderer.save_png(return_hyp_graph, detailed=True,
                                      highlight_best=True)
                # postprocess: detokenize and capitalize the first character
                x = _seqs2words(samples)
                detokenized += detokenizer.detokenize(
                    (x.decode('utf-8') + " ").split(), return_str=True)
                detokenized = detokenized[0].upper() + detokenized[1:]
                if print_word_probabilities:
                    for prob in word_probs:
                        saveto.write("{} ".format(prob))
                    saveto.write('\n')
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment, source_sentences[i],
                                          _seqs2words(trans[0]).split(), i, i,
                                          save_alignment)
                    else:
                        save_alignment.write(
                            '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                i, _seqs2words(trans[0]), 0,
                                ' '.join(source_sentences[i]),
                                len(source_sentences[i]) + 1, len(trans[0])))
                        print_matrix(alignment, save_alignment)

        # strip any remaining BPE markers and send the result back
        c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
    c.close()
    sys.stderr.write('Done\n')
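# A client-side sketch of the socket protocol _parallelized_main expects,
# assuming the server side accepts a connection and passes it in as `c`
# (host and port here are hypothetical). Each recv() is capped at 4096
# bytes, and a message starting with "EOT" ends the session:
#
#   import socket
#   client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
#   client.connect(('localhost', 12345))
#   client.send(u'This is a test. It has two sentences.'.encode('utf-8'))
#   print client.recv(4096).decode('utf-8')  # detokenized translation
#   client.send(u'EOT'.encode('utf-8'))      # terminates the server loop
#   client.close()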
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalization_alpha=0.0, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False,
         device_list=[]):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionaries and invert them
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert it
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # debug output: the 100 most frequent entries of each vocabulary
    print 'input dict - 100 most common'
    for i in xrange(100):
        print i, " ", word_idict[i]
    print 'output dict - 100 most common'
    for i in xrange(100):
        print i, " ", word_idict_trg[i]

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        deviceid = ''
        if device_list is not None and len(device_list) != 0:
            deviceid = device_list[midx % len(device_list)].strip()
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k,
                  normalization_alpha, verbose, nbest,
                  save_alignment is not None, suppress_unk,
                  return_hyp_graph, deviceid))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = []
            for w in words:
                # factored input: factors of a word are separated by '|'
                w = [word_dicts[i][factor] if factor in word_dicts[i] else 1
                     for (i, factor) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'.format(
                            options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    # if the queue is empty after 5s, check whether the
                    # worker processes are still alive
                    resp = rqueue.get(True, 5)
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive() and processes[midx].exitcode != 0:
                            # kill all other processes and report the error
                            # if one worker dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for pidx in xrange(n_process):
                                processes[pidx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}".format(
                                    processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1
    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                    i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source |||
                #         source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i],
                                          _seqs2words(samples[j]).split(),
                                          i, i + j, save_alignment)
                    else:
                        save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, _seqs2words(samples[j]), scores[j],
                            ' '.join(source_sentences[i]),
                            len(source_sentences[i]) + 1, len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            saveto.write(_seqs2words(samples) + "\n")
            # debug output: echo each source sentence and its translation
            print 'input:'
            print ' '.join(source_sentences[i])
            print 'output:'
            print _seqs2words(samples) + "\n"
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    print_matrix_json(alignment, source_sentences[i],
                                      _seqs2words(trans[0]).split(), i, i,
                                      save_alignment)
                else:
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                        i, _seqs2words(trans[0]), 0,
                        ' '.join(source_sentences[i]),
                        len(source_sentences[i]) + 1, len(trans[0])))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
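# An invocation sketch for main(), assuming the model and its dictionaries
# exist on disk and the caller opens the file handles (paths are
# hypothetical):
#
#   with open('test.src') as fin, open('test.nbest', 'w') as fout:
#       main(models=['model.npz'], source_file=fin, saveto=fout,
#            k=12, normalization_alpha=1.0, n_process=4, nbest=True)
#
# With nbest=True, each hypothesis is written to saveto as:
#   <sentence id> ||| <translation> ||| <score>[ ||| <word probabilities>]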
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionaries and invert them
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert it
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize,
                  verbose, nbest, save_alignment is not None, suppress_unk,
                  return_hyp_graph))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = []
            for w in words:
                # factored input: factors of a word are separated by '|'
                w = [word_dicts[i][factor] if factor in word_dicts[i] else 1
                     for (i, factor) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'.format(
                            options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                    i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source |||
                #         source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i],
                                          _seqs2words(samples[j]).split(),
                                          i, i + j, save_alignment)
                    else:
                        save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, _seqs2words(samples[j]), scores[j],
                            ' '.join(source_sentences[i]),
                            len(source_sentences[i]) + 1, len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    print_matrix_json(alignment, source_sentences[i],
                                      _seqs2words(trans[0]).split(), i, i,
                                      save_alignment)
                else:
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                        i, _seqs2words(trans[0]), 0,
                        ' '.join(source_sentences[i]),
                        len(source_sentences[i]) + 1, len(trans[0])))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')