def create_conversation_vocab(dir):
    # Assumes module-level: import os; from nltk.tokenize import moses; import config_parameters
    for file in os.listdir(dir):
        if 'train' in file:
            tokenizer = moses.MosesTokenizer()
            vocab = dict()
            f = open(dir + '/' + file, mode='r', encoding='utf-8')
            for line in f.readlines():
                for token in tokenizer.tokenize(line):
                    # Create Vocab Dictionary
                    if token not in vocab:
                        vocab[token] = 0
                    vocab[token] += 1
            f.close()
            sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
            f_train = open('data/vocab.{}'.format(file[-7:]), 'w')
            f_train.write('<pad>' + '\n')
            f_train.write('<unk>' + '\n')
            f_train.write('<s>' + '\n')
            f_train.write('<\s>' + '\n')
            index = 4
            for word in sorted_vocab:
                if vocab[word] < config_parameters.THRESHOLD:
                    # Record the resulting vocabulary size in the config module
                    with open('config_parameters.py', 'a') as cf:
                        if file[-7:] == 'encoder':
                            cf.write('\n' + 'ENC_VOCAB = ' + str(index))
                        else:
                            cf.write('\n' + 'DEC_VOCAB = ' + str(index))
                    break
                f_train.write(str(word) + '\n')
                index += 1
            f_train.close()
    return print('Vocab Created!')

def main(truecase, sock):
    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = sock                  # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port
    # Now wait for client connection.

    # Initialise truecaser
    with codecs.open(truecase, 'r', encoding='utf-8') as f:
        tc_init = f.read().split('\n')
    truecaser = defaultdict(str)
    for line in tc_init:
        truecaser[line.split(' ')[0].lower()] = line.split(' ')[0]

    # Initialise nltk.moses tokenizer and detokenizer
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # Start listening for connections
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=listen,
                                 args=(c, addr, tokenizer, detokenizer, truecaser))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()

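The `listen` handler that each connection thread runs is not part of this snippet. A minimal sketch of what it might look like, assuming a one-shot protocol in which the client sends UTF-8 text and receives the truecased version back (the handler body here is an assumption, not taken from the original code):

def listen(c, addr, tokenizer, detokenizer, truecaser):
    # Hypothetical handler: tokenize the incoming text, truecase each token
    # via the lookup table built in main(), detokenize, and reply.
    try:
        text = c.recv(4096).decode('utf-8')
        tokens = tokenizer.tokenize(text)
        cased = [truecaser[t.lower()] or t for t in tokens]  # fall back to the original token
        c.send(detokenizer.detokenize(cased, return_str=True).encode('utf-8'))
    finally:
        c.close()
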
def main():
    global tokenizer
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', type=str)
    parser.add_argument('--output', '-o', type=str)
    parser.add_argument('--lang', '-l', type=str, default='en')
    parser.add_argument('--poolsize', '-p', type=int, default=10)
    args = parser.parse_args()

    if args.input:
        input_file = open(args.input)
    else:
        input_file = sys.stdin
    if args.output:
        output_file = open(args.output, 'w')
    else:
        output_file = sys.stdout

    tokenizer = moses.MosesTokenizer(lang=args.lang)
    with Pool(args.poolsize) as p:
        for line in p.imap(tokenize, input_file):
            print(line, file=output_file)

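The `tokenize` worker mapped over the pool is not defined in this snippet. Since `tokenizer` is declared global and set before the pool is created, a plausible module-level worker (an assumption, together with the surrounding imports and script tail) could be:

import argparse
import sys
from multiprocessing import Pool
from nltk.tokenize import moses  # or sacremoses as a drop-in replacement

def tokenize(line):
    # Uses the global tokenizer initialised in main(); workers forked by the
    # Pool inherit it on platforms that use the fork start method.
    return tokenizer.tokenize(line.strip(), return_str=True)

if __name__ == '__main__':
    main()
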
def main(truecase, sock, fasttext, bpe):
    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = sock                  # Reserve a port for your service.
    s.bind(('', port))           # Bind to the port
    # Now wait for client connection.

    # Initialise truecaser
    with codecs.open(truecase, 'r', encoding='utf-8') as f:
        tc_init = f.read().split('\n')
    truecaser = defaultdict(str)
    for line in tc_init:
        truecaser[line.split(' ')[0].lower()] = line.split(' ')[0]

    # Load the fastText model and the Moses tokenizer/detokenizer
    ft_mdl = fastText.load_model(fasttext)
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # Start listening for connections
    while True:
        try:
            s.listen(5)
            LOG.info("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=listen,
                                 args=(c, addr, tokenizer, detokenizer, truecaser, ft_mdl, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()

def parse_corpus(dataset):
    dataset_path = "data/" + dataset + "/"
    try:
        dataset_file = open(dataset_path + "raw.csv")
    except OSError:
        print("Could not find the file " + dataset_path + "raw.csv. Terminating.")
        return

    print("*** Parsing raw corpus ... ***")
    tokenizer = moses.MosesTokenizer()
    poems_split_by_words = []
    word2i = {"SOP": 0, "EOP": 1, "CAP": 2, '\n': 3}
    i2word = {0: "SOP", 1: "EOP", 2: "CAP", 3: '\n'}
    for poem in csv.reader(dataset_file):
        poem = poem[0]
        poem_split_by_words = ["SOP"]
        for line in poem.split('\n')[:-1]:
            tokens = []
            for token in tokenizer.tokenize(line, escape=False):
                assert len(token) > 0, "Empty token."
                if token[0].isupper():
                    tokens.append("CAP")
                # TODO: Support data like "\'When ... ". Currently, we lowercase every word.
                token = token.lower()
                tokens.append(token)
                if token not in word2i:
                    new_index = len(i2word)
                    word2i[token] = new_index
                    i2word[new_index] = token
            tokens.append('\n')
            poem_split_by_words.extend(tokens)
        poem_split_by_words.append("EOP")
        poems_split_by_words.append(poem_split_by_words)

    with open(dataset_path + "processed.pkl", 'wb') as file:
        pickle.dump(poems_split_by_words, file)
    with open(dataset_path + "index.pkl", 'wb') as file:
        pickle.dump(word2i, file)
        pickle.dump(i2word, file)
    print("*** Finished parsing corpus. ***")

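For completeness, a possible loader for the files `parse_corpus` writes; the function name is hypothetical, but the two consecutive `pickle.load` calls mirror the two dumps into `index.pkl` above:

import pickle

def load_corpus(dataset):
    dataset_path = "data/" + dataset + "/"
    with open(dataset_path + "processed.pkl", 'rb') as f:
        poems_split_by_words = pickle.load(f)
    with open(dataset_path + "index.pkl", 'rb') as f:
        word2i = pickle.load(f)   # dumped first
        i2word = pickle.load(f)   # dumped second
    return poems_split_by_words, word2i, i2word
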
def __create_conv_vocab__(filename):
    in_path = os.path.join(Config_Params.final_data, filename)
    out_path = os.path.join(Config_Params.final_data, 'vocab.{}'.format(filename[-3:]))
    tokenizer = moses.MosesTokenizer()  # For Tokens

    vocab = {}
    with open(in_path, 'rb') as f:
        for line in f.readlines():
            for token in tokenizer.tokenize(line.decode('utf-8')):
                # Create Vocab Dictionary
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1
    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)

    # Write Vocab To File (text mode, so plain strings can be written)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write('<\s>' + '\n')
        index = 4
        for word in sorted_vocab:
            if vocab[word] < Config_Params.THRESHOLD:
                # Record the vocabulary size in the config module
                with open('Config_Params.py', 'a') as cf:
                    if filename[-3:] == 'enc':
                        cf.write('ENC_VOCAB = ' + str(index) + '\n')
                    else:
                        cf.write('DEC_VOCAB = ' + str(index) + '\n')
                break
            f.write(word + '\n')
            index += 1

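A hypothetical companion loader for the vocab files written above (not from the original code): each token's line number is its id, matching the `<pad>`/`<unk>`/`<s>`/`<\s>` header written first.

def __load_vocab__(vocab_path):
    with open(vocab_path, encoding='utf-8') as f:
        words = [line.rstrip('\n') for line in f]
    return {word: idx for idx, word in enumerate(words)}
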
def main(models, saveto, bpe_file, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO SERVER
    ###### The following functions should be already a part of serverisation

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1
                     for (i, f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'.format(
                            options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}".format(
                                    processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        #print(source_file_t[i])
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # print source_file_t[i].decode('utf-8')
                #pipe = subprocess.Popen("echo " + source_file_t[i] + "| perl truecase.perl --model en-truecase.mdl", shell=True)
                #pipe = subprocess.Popen(["echo", '"' + source_file_t[i] + '"', "|", "perl", "truecase.perl",
                #                         "--model", "en-truecase.mdl"], stdout=subprocess.PIPE)
                #result = pipe.stdout.read()
                #print pipe.communicate()
                #print pipe
                #print pipe.stdout
                #print pipe.stdout.read()
                #print "Here"
                #print result
                #source_file_t[i] = subprocess.check_output()
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i], return_str=True)).strip()
                #print "Passed"
            print source_file_t

            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes, queue)
            _finish_processes(queue)

            #### The model loading takes place in the head of the for loop, probably in _retrieve_jobs
            for i, trans in enumerate(_retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source |||
                        #         source_token_count+eos translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(alignment[j], source_sentences[i],
                                                  _seqs2words(samples[j]).split(),
                                                  i, i + j, save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                        i, _seqs2words(samples[j]), scores[j],
                                        ' '.join(source_sentences[i]),
                                        len(source_sentences[i]) + 1, len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    ## TODO: Handle the output here
                    #print((_seqs2words(samples) + "\n").encode('utf-8'))
                    #text.append(_seqs2words(samples) + "\n")
                    x = _seqs2words(samples)
                    #print x[0].upper() + x[1:]
                    detokenized += detokenizer.detokenize((x.decode('utf-8') + " ").split(),
                                                          return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    #print "ref this"
                    #print detokenized
                    #detokenized[0] = detokenized[0].upper()
                    #c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(),
                                              i, i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                    i, _seqs2words(trans[0]), 0,
                                    ' '.join(source_sentences[i]),
                                    len(source_sentences[i]) + 1, len(trans[0])))
                            print_matrix(alignment, save_alignment)

            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))

        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                #if fname == 'exit':
                #    print "Terminating connection with client."
                #    c.close()
                #    break
                #else:
                #t = threading.Thread(target=_parallelized_main, args=(fname, fs_init, fs_next, c))
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe, tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345                 # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port
    # Now wait for client connection.

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=save_alignment is not None)
        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()

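A rough client sketch for the server above, under the assumption that it runs on the local hostname and port 12345; the function name is hypothetical, and the protocol (greeting, "okay" handshake, one text/translation exchange, "EOT" terminator) is inferred from `_listen` and `_parallelized_main`:

import socket

def translate_remote(text, host=None, port=12345):
    s = socket.socket()
    s.connect((host or socket.gethostname(), port))
    s.send(b'hello')                       # greeting; _listen only prints it
    assert s.recv(4096) == b'okay'         # handshake from the server
    s.send(text.encode('utf-8'))           # source text; the server sentence-splits it
    translation = s.recv(4096).decode('utf-8')
    s.send(b'EOT')                         # stops the loop in _parallelized_main
    s.close()
    return translation
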
def sentence2id(vocab, line):
    tokenizer = moses.MosesTokenizer()
    return [vocab.get(token, vocab['<unk>'])
            for token in tokenizer.tokenize(line)]
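Example usage with a toy vocabulary (the dict here is illustrative); unknown tokens fall back to the id of `<unk>`, so that entry must be present:

vocab = {'<pad>': 0, '<unk>': 1, '<s>': 2, 'hello': 4, 'world': 5}
print(sentence2id(vocab, "hello there world"))  # e.g. [4, 1, 5]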