def handle_websocket():
    """Serve translations over a WebSocket using a JSON n-best protocol.

    Each incoming message is a JSON object with a 'segments' list; the
    reply is a JSON object whose 'segments' holds the n-best hypotheses
    parsed out of Marian's ' ||| '-separated, newline-joined output.
    The loop runs until the peer disconnects (WebSocketError).
    """
    wsock = request.environ.get('wsgi.websocket')
    if not wsock:
        abort(400, 'Expected WebSocket request.')
    while True:
        try:
            message = wsock.receive()
            if message is not None:
                segments = json.loads(message)['segments']
                # force potential unicode to str() for boost conversion
                encoded = [seg.encode('utf8') for seg in segments]
                # Drop a trailing empty segment (artefact of a final newline).
                if encoded and encoded[-1] == "":
                    encoded.pop()
                trans = nmt.translate(encoded)
                assert len(trans) == 1, 'we only support single inputs for now (we decode one segment at a time)'
                decoded = trans[0].decode('utf8')
                # Parse the n-best list output of Marian: one hypothesis per
                # line, fields separated by ' ||| ', translation in field 1.
                n_best_outputs = [line.split(u' ||| ')[1]
                                  for line in decoded.split(u'\n')]
                wsock.send(json.dumps({'segments': n_best_outputs}))
        except WebSocketError:
            break
def parallelized_main(c, tokenizer, detokenizer, truecaser):
    """Serve one client socket: receive JSON requests, translate, reply.

    Protocol: each request is a JSON object (<=4096 bytes) consumed by
    pre_processing(); the session ends when the first pre-processed
    segment is the sentinel "EOT" or the client sends non-JSON.

    :param c: connected socket (recv/send/close).
    :param tokenizer: passed through to pre_processing().
    :param detokenizer: passed through to post_processing().
    :param truecaser: passed through to pre_processing().

    Fix applied: 'connecntion' typo in the stderr error message.
    """
    gotthis = c.recv(4096)
    info = json.loads(gotthis, encoding='utf-8')
    source_file_t = pre_processing(tokenizer, truecaser, info)
    try:
        while source_file_t[0][0] != "EOT":
            trans = []
            weights = []
            raw_in = []
            # Even indices hold lists of translatable sentences; odd indices
            # hold separator text that is appended verbatim to the previous
            # translation.
            for j in range(len(source_file_t)):
                if j % 2 == 0:
                    for i in source_file_t[j]:
                        temp = nmt.translate([i])[0][:-4].split(' ||| ')
                        # To handle different outputs, origin of difference
                        # yet to be determined: a 2-field result is
                        # (translation, weight); a 3-field result is
                        # (input, translation, weight).
                        if len(temp) == 2:
                            trans += [temp[0]]
                            weights += [temp[1]]
                            raw_in += [i]
                        else:
                            trans += [temp[1]]
                            weights += [temp[2]]
                            raw_in += [temp[0]]
                else:
                    trans[-1] += str(source_file_t[j])
            postproced = post_processing(detokenizer, trans, info)
            if info['alignweights']:
                msg = json.dumps({'raw_trans': trans,
                                  'raw_input': raw_in,
                                  'weights': weights,
                                  'final_trans': postproced},
                                 encoding='utf-8')
            else:
                msg = json.dumps({'raw_trans': trans,
                                  'raw_input': source_file_t[0],
                                  'final_trans': postproced},
                                 encoding='utf-8')
            c.send(msg)
            gotthis = c.recv(4096)
            try:
                info = json.loads(gotthis, encoding='utf-8')
                source_file_t = pre_processing(tokenizer, truecaser, info)
            except ValueError:
                # Client sent non-JSON (e.g. hung up): end the session.
                break
        c.close()
        sys.stderr.write('Done\n')
    except IndexError:
        # pre_processing produced an empty structure: malformed request.
        c.close()
        sys.stderr.write('Bad connection made\n')
def handle_websocket():
    """Translate newline-separated plain-text WebSocket messages.

    Each incoming message is split on '\n', translated as a batch, and
    the translations are sent back joined by '\n'. Runs until the peer
    disconnects (WebSocketError).
    """
    wsock = request.environ.get('wsgi.websocket')
    if not wsock:
        abort(400, 'Expected WebSocket request.')
    while True:
        try:
            message = wsock.receive()
            if message is None:
                continue
            translations = nmt.translate(message.split('\n'))
            wsock.send('\n'.join(translations))
        except WebSocketError:
            break
def parallelized_main(c, tokenizer, detokenizer, tokenize, truecaser): print tokenize gotthis = c.recv(4096).decode('utf-8') source_file_t = pre_processing(tokenize, tokenizer, truecaser, gotthis) try: while source_file_t[0][0] != "EOT": detokenized = '' trans = [] for j in range(len(source_file_t)): if j % 2 == 0: for i in source_file_t[j]: trans += nmt.translate([i]) else: trans[-1] += str(source_file_t[j]) # print trans # print source_file_t if tokenize: for i in trans: # print i splitting = re.split('([\t\n\r\f\v]+)', i) # print splitting for j in range(len(splitting)): if j % 2 == 0: try: detokenized_par = detokenizer.detokenize( (splitting[j].decode('utf-8') + " ").split(), return_str=True) detokenized += detokenized_par[0].upper( ) + detokenized_par[1:] + " " except IndexError: pass else: detokenized += splitting[j] else: for i in trans: detokenized_par = i.decode('utf-8') + " " detokenized += detokenized_par[0].upper( ) + detokenized_par[1:] # print detokenized.replace('@@ ', '').encode('utf-8').strip() print detokenized.replace('@@ ', '').encode('utf-8') c.send(detokenized.replace('@@ ', '').encode('utf-8').strip()) gotthis = c.recv(4096).decode('utf-8') source_file_t = pre_processing(tokenize, tokenizer, truecaser, gotthis) c.close() sys.stderr.write('Done\n') except IndexError: c.close() sys.stderr.write('Bad connecntion made\n')
def handle_websocket():
    """Translate newline-separated text over a WebSocket.

    A trailing empty element (caused by a final '\n' in the message) is
    dropped before translation. Translations are returned joined by
    '\n'. Runs until the peer disconnects (WebSocketError).
    """
    wsock = request.environ.get('wsgi.websocket')
    if not wsock:
        abort(400, 'Expected WebSocket request.')
    while True:
        try:
            message = wsock.receive()
            if message is not None:
                sentences = message.split('\n')
                # Trailing newline yields an empty final element: drop it.
                if sentences and sentences[-1] == "":
                    sentences.pop()
                wsock.send('\n'.join(nmt.translate(sentences)))
        except WebSocketError:
            break
#!/usr/bin/env python
"""Batch translator: read sentences from stdin, write translations to stdout.

Usage: script.py <config> < source.txt
"""
import libamunmt as nmt
import sys

nmt.init(sys.argv[1])

sentences = [line.rstrip() for line in sys.stdin]

# NOTE(review): translations are written verbatim, which presumes they
# already carry their own line terminators -- confirm against libamunmt.
for translation in nmt.translate(sentences):
    sys.stdout.write(translation)
def parallelized_main(c, tokenizer, detokenizer, truecaser): # print tokenize gotthis = c.recv(1024) print(gotthis) # Change the input message size dynamically, useful for handling long inputs # Size control code "msize:", if not prepended, then assumes size within 1024 if gotthis[0:5] == 'msize:': size = int(gotthis[6:]) c.send('OK') c.recv(size) # RECEIVE JSON MESSAGE info = json.loads(gotthis, encoding='utf-8') # APPLY PRE-PROCESSING source_file_t = pre_processing(tokenizer, truecaser, info) # try: while source_file_t[0][0] != "EOT": detokenized = '' trans = [] weights = [] raw_in = [] for j in range(len(source_file_t)): if j % 2 == 0: for i in source_file_t[j]: translated_sent = nmt.translate([i]) LOG.debug(translated_sent) qe_total = utils.is_good_sentence(translated_sent[0], THRESHOLD) LOG.debug("Estimation: " + str(qe_total)) qe_total = math.exp(qe_total) LOG.debug("Estimation: " + str(qe_total)) temp = translated_sent[0].split(' |||') trans += [temp[0]]# hacekd for marian [temp[1]] weights += ['0'] raw_in += [i]# hacked for marian [temp[0]] else: trans[-1] += str(source_file_t[j]) if info['tok']: for i in trans: splitting = re.split('([\t\n\r\f\v]+)', i) for j in range(len(splitting)): if j % 2 == 0: try: detokenized_par = detokenizer.detokenize((splitting[j] + " ").decode('utf-8').split(), return_str=True) detokenized += detokenized_par[0].upper() + detokenized_par[1:] + " " except IndexError: pass else: detokenized += splitting[j] else: for i in trans: detokenized_par = i.decode('utf-8') + " " detokenized += detokenized_par[0].upper() + detokenized_par[1:] print detokenized.replace('@@ ', '').encode('utf-8') ret = {'raw_trans': trans, 'raw_input': raw_in, 'final_trans': detokenized.replace('@@ ', '').encode('utf-8').strip()} if info.get('align_weights'): ret['weights'] = weights if info.get('quality_estimation'): ret['estimation'] = unicode(qe_total) msg = json.dumps(ret, encoding='utf-8') size = sys.getsizeof(msg) if size >= 1024: LOG.debug("Size bigger 
than 1024, initialisisng message size control protocol.") c.send('msize:' + str(sys.getsizeof(msg))) gotthis = c.recv(1024) LOG.debug(gotthis) c.send(msg) LOG.debug("Sent") LOG.debug(msg) gotthis = c.recv(4096) LOG.debug("Got response") print(gotthis) try: info = json.loads(gotthis, encoding='utf-8') source_file_t = pre_processing(tokenizer, truecaser, info) except ValueError: break c.close() sys.stderr.write('Done\n') except IndexError: c.close() sys.stderr.write('Bad connecntion made\n')
def test_translate():
    """Smoke test: one English sentence yields its German translation."""
    result = nmt.translate(['Alice has a cat .'])
    assert len(result) == 1
    assert result[0].strip() == 'Alice hat eine Katze .'
#!/usr/bin/env python # script by Ulrich Germann # This script is meant to test the python interface of amun by emulating the amun executable. import sys, os if 'AMUN_PYLIB_DIR' in os.environ: sys.path.append(os.environ['AMUN_PYLIB_DIR']) pass import libamunmt if __name__ == "__main__": libamunmt.init(" ".join(sys.argv[1:])) print libamunmt.translate(sys.stdin.readlines()) libamunmt.shutdown()