Example #1
    def setUp(self):

        with codecs.open(os.path.join(currentdir,'data','bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        self.infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir,'data','corpus.bpe.ref.en'), encoding='utf-8')
Example #2
    def setUp(self):

        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'USA']
        rglossaries = ['M[Manuel]*l']
        self.bpe = BPE(amock, glossaries=glossaries, rglossaries=rglossaries)
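Note: rglossaries is not a parameter in mainline subword-nmt (current releases accept regular-expression patterns directly through glossaries, as Example #17 below shows), so this snippet presumably targets a fork. For reference, a minimal sketch of glossary protection with the standard API; the codes path is a placeholder:

import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open('codes.bpe', encoding='utf-8') as codes:  # placeholder path
    bpe = BPE(codes, glossaries=['USA'])

# 'USA' matches a glossary entry and is left intact; other words may be
# split into subword units joined by the default '@@ ' separator.
print(bpe.process_line('I visited USA\n'))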
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
    subparsers = parser.add_subparsers(dest='command',
                                       help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    learn_bpe_parser = create_learn_bpe_parser(subparsers)
    apply_bpe_parser = create_apply_bpe_parser(subparsers)
    get_vocab_parser = create_get_vocab_parser(subparsers)
    learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

        if args.vocabulary:
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

        for line in args.input:
            args.output.write(bpe.process_line(line))

    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
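For orientation, the learn-then-apply pipeline dispatched above can also be driven programmatically; a minimal sketch, assuming the subword_nmt package layout and placeholder file names:

import codecs
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

# learn merge operations (the codes file holds one space-separated
# symbol pair per line, after an optional '#version' header)
with codecs.open('train.txt', encoding='utf-8') as infile, \
     codecs.open('codes.bpe', 'w', encoding='utf-8') as outfile:
    learn_bpe(infile, outfile, num_symbols=10000)

# apply them line by line, mirroring the 'apply-bpe' branch above
with codecs.open('codes.bpe', encoding='utf-8') as codes, \
     codecs.open('input.txt', encoding='utf-8') as inp, \
     codecs.open('output.bpe', 'w', encoding='utf-8') as out:
    bpe = BPE(codes)
    for line in inp:
        out.write(bpe.process_line(line))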
Example #4
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)

        self.translator = onmt.Translator(opt)

        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")
Example #5
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)
Example #6
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):

        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'),
                         encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        self.infile = codecs.open(os.path.join(currentdir, 'data',
                                               'corpus.en'),
                                  encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir, 'data',
                                                'corpus.bpe.ref.en'),
                                   encoding='utf-8')

    def tearDown(self):

        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):

        for line, ref in zip(self.infile, self.reffile):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace"""

        orig = '  iron cement  \n'
        exp = '  ir@@ on c@@ ement  \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""

        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):

        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
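Taken together, these tests pin down the contract of the two public entry points; schematically, for any loaded bpe object like the one built in setUp:

# process_line() keeps leading/trailing whitespace and the newline;
# only ASCII whitespace acts as a word boundary (see test_utf8_whitespace):
bpe.process_line('  iron cement  \n')   # -> '  ir@@ on c@@ ement  \n'

# segment() takes an already whitespace-tokenized sentence and returns
# the segmented string without any newline handling:
bpe.segment('iron cement')              # -> 'ir@@ on c@@ ement'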
Example #7
class SplitWord():
    def __init__(self, config):
        if "BPE" in config:
            if "BPE" in config["BPE"]:
                self.way = config["BPE"]
                if config["BPE"] == "BPE":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe.code',
                                               encoding='utf-8'),
                                   separator='')
                elif config["BPE"] == "BPE1000":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe1000.code',
                                               encoding='utf-8'),
                                   separator='')
                else:
                    print("BPE definition error")
                    exit()
            else:
                self.way = config["BPE"]
        else:
            self.way = "Normal"

    def __call__(self, word):
        if self.way == "BPE":
            return self.bpe.process_line(word).split(" ")
        elif self.way == "Ngram":
            list_of_ngram = []
            for i in range(3, 7):
                list_of_ngram.extend(ngram(word, i))
            return list_of_ngram
        else:
            return word
Example #8
class BPEService(object):

    def __init__(self,codes):
        self.bpe = BPE(codecs.open(codes,encoding='utf-8'))

    def process_line(self,line):
        return self.bpe.process_line(line.decode("UTF-8")).encode("UTF-8")
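The decode/encode round-trip marks this as Python 2 byte-string handling. Under Python 3, where lines arrive as str, the service reduces to a sketch like:

import codecs
from subword_nmt.apply_bpe import BPE

class BPEService(object):

    def __init__(self, codes):
        # BPE() consumes the codes file in its constructor, so the
        # handle can be closed immediately afterwards
        with codecs.open(codes, encoding='utf-8') as f:
            self.bpe = BPE(f)

    def process_line(self, line):
        return self.bpe.process_line(line)  # str in, str out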
Example #9
 def __init__(self, config):
     self.config = config
     self.f_dict = config['vhred_dict']
     # Load the VHRED model.
     self.model, self.enc_fn, self.dec_fn = self._build_vhred_model()
     # Load in Twitter dictionaries for BPE conversion.
     f_bpe_dictionary = config['vhred_bpe_file']
     with open(f_bpe_dictionary, 'r') as handle:
         self.bpe = BPE(handle.readlines(), '@@')
     with open(self.f_dict, 'r') as handle:
         twitter_dict = cPickle.load(handle)
     self.str_to_idx = dict([(tok, tok_id)
                             for tok, tok_id, _, _ in twitter_dict])
     self.idx_to_str = dict([(tok_id, tok)
                             for tok, tok_id, _, _ in twitter_dict])
     self.MODELS = ['hred', 'human', 'tfidf', 'de']
Example #10
    def __init__(self,
                 srclang,
                 targetlang,
                 sourcebpe=None,
                 targetbpe=None,
                 sourcespm=None,
                 targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)
Example #11
 def __init__(self, config):
     if "BPE" in config:
         if "BPE" in config["BPE"]:
             self.way = config["BPE"]
             if config["BPE"] == "BPE":
                 self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe.code',
                                            encoding='utf-8'),
                                separator='')
             elif config["BPE"] == "BPE1000":
                 self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe1000.code',
                                            encoding='utf-8'),
                                separator='')
             else:
                 print("BPE definition error")
                 exit()
         else:
             self.way = config["BPE"]
     else:
         self.way = "Normal"
Example #12
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"

    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)

    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))

    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")

    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")

    feature_sizes = [
        8, 8, 2, 4
    ]  # num of features for each feature group: capitalization, words, other, prefix_2, suffix_2, previous_tags
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]

    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
    logging.info("loading data and precomputing features...")
    train_data = ConllData(train_data_path, wordMap, tagMap, pMap, sMap, bpe)
    dev_data = ConllData(dev_data_path, wordMap, tagMap, pMap, sMap, bpe)

    with tf.Session(FLAGS.tf_master) as sess:
        Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims,
              wordMap, tagMap, pMap, sMap, train_data, dev_data, bpe)
Example #13
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):

        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'w@@ o@@ r@@ d@@ like@@ w@@ o@@ r@@ d l@@ i@@ k@@ e@@ M@@ a@@ n@@ u@@ e@@ l@@ word'
        test_case = (orig, exp)
        self._run_test_case(test_case)
Example #14
class TestBPEIsolateGlossariesMethod(unittest.TestCase):

    def setUp(self):
        
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeUSAwordManuelManuelwordUSA'
        exp = ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA']
        test_case = (orig, exp)
        self._run_test_case(test_case)
Example #15
class TestBPEIsolateGlossariesMethod(unittest.TestCase):
    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeUSAwordManuelManuelwordUSA'
        exp = [
            'word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA'
        ]
        test_case = (orig, exp)
        self._run_test_case(test_case)
Example #16
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        orig, expected = test_case
        out = self.bpe.segment(orig)

        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        test_case = (orig, exp)
        self._run_test_case(test_case)
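Note the contrast with Example #13: there segment() runs the real encoder, so words outside the glossary fall back to character-level pieces ('w@@ o@@ r@@ d'), whereas here apply_bpe.encode is replaced by encode_mock and the expected 'wo@@ rd' chunks come from the mock. The patched tests therefore exercise only glossary isolation and whitespace handling, independent of any learned merge operations.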
Example #17
class TestRegexIsolateGlossaries(unittest.TestCase):

    def setUp(self):

        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
        test_case = (orig, exp)
        self._run_test_case(test_case) 
Example #18
class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):

        orig, expected = test_case
        out = self.bpe.segment(orig)

        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        test_case = (orig, exp)
        self._run_test_case(test_case)
Example #19
class E2C(object):
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print 'e2c sentenceList : ', sentenceList
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        #ipdb.set_trace()
        for idx in range(len(pred)):
            rstr += ''.join(' '.join(pred[idx][0]).replace(
                self.sep, '').split()) + "\n\n"
        print 'e2c rstr : ', rstr.strip()
        return rstr.strip()
Example #20
from onmt import model_builder

model = model_builder.build_base_model(model_opt, fields, cur_device == 'gpu',
                                       checkpoint, gpu_id)

model.to(cur_device)
model.eval()

from apply_bpe import BPE
import codecs

codes = codecs.open(
    "/scratch/project_2001970/AleModel/bpe-model.de-en-35k.wmt19-news-para.norm.tok.tc",
    encoding='utf-8')
bpe = BPE(codes)


def prepare(params, samples):
    #_, params.word2id = create_dictionary(samples)
    #params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
    #params.wvec_dim = 300
    import ipdb
    ipdb.set_trace()
    if params['save_embedds']:
        # get rid of empty lines
        samples = [sent if sent != [] else ['.'] for sent in samples]
        #  apply BPE to batch
        sents = []
        for sent in samples:
            str1 = ' '.join(sent)
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols', '-s', type=int, default=32000, help="Vocabulary size")
    parser.add_argument(
        '--min-frequency', type=int, default=6, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    parser.add_argument('--dict-input', action="store_true",
                        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    parser.add_argument(
        '--separator', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folder if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge files into one.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train, opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test, opt.prefix + '-test')

    # Build up the code from training files if not exist
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols, opt.min_frequency, True)
    sys.stderr.write(f"BPE codes prepared.\n")

    sys.stderr.write(f"Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write(f"Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir, opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write(f"Done.\n")

    field = torchtext.data.Field(
        tokenize=str.split,
        lower=True,
        pad_token=Constants.PAD_WORD,
        init_token=Constants.BOS_WORD,
        eos_token=Constants.EOS_WORD)

    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(
        fields=fields,
        path=os.path.join(opt.data_dir, enc_train_files_prefix),
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {'settings': opt, 'vocab': field, }
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
Example #22
        response_list.append(train_responses_txt[ind])
        print(train_contexts_txt[i])
        print(response_list[i])

    with open(output_file, 'w') as f1:
        for response in response_list:
            f1.write(response)


if __name__ == '__main__':
    twitter_bpe_dictionary = '../TwitterData/BPE/Twitter_Codes_5000.txt'
    twitter_bpe_separator = '@@'
    twitter_model_dictionary = '../TwitterData/BPE/Dataset.dict.pkl'

    # Load in Twitter dictionaries
    twitter_bpe = BPE(
        open(twitter_bpe_dictionary, 'r').readlines(), twitter_bpe_separator)
    twitter_dict = pickle.load(open(twitter_model_dictionary, 'r'))
    twitter_str_to_idx = dict([(tok, tok_id)
                               for tok, tok_id, _, _ in twitter_dict])
    twitter_idx_to_str = dict([(tok_id, tok)
                               for tok, tok_id, _, _ in twitter_dict])

    # Get data, for Twitter
    train_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Train.dialogues.pkl'
    test_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Test.dialogues.pkl'
    output_file = './output.csv'

    with open(train_file) as f1:
        train_data = pickle.load(f1)
    with open(test_file) as f1:
        test_data = pickle.load(f1)
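This snippet is Python 2 (the pickled dictionaries are read in text mode). A Python 3 variant would open the pickle files in binary mode while keeping the codes file in text mode; a sketch with the same placeholder paths:

with open(twitter_bpe_dictionary, 'r', encoding='utf-8') as f:
    twitter_bpe = BPE(f.readlines(), twitter_bpe_separator)
with open(twitter_model_dictionary, 'rb') as f:  # pickle needs bytes on Py3
    twitter_dict = pickle.load(f)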
Example #23
###############################################################################
#
# Main program
#
###############################################################################

parser = create_parser()
args = parser.parse_args()

# all models are grouped into one structure
model = namedtuple("model", ["htable", "bpe", "net"])
print("\nLoading models")
loaded = torch.load(args.model.name)
model.htable = LoadHashTable(args.hash_table, args.verbose)
model.bpe = BPE(args.bpe_codes, separator=args.separator)
model.net = encoders.BLSTM(args.model.name, gpu=args.gpu, verbose=args.verbose)

# all data structures are grouped into one structure
data = namedtuple("data", [
    "text_bpe", "text_slen", "text_bin", "text_enc",
    "file_bpe", "file_bin", "file_enc",
    "idx_pad", "idx_unk"
])
data.text_slen = np.empty(args.bsize, dtype=np.int32)
data.text_bin = np.empty((args.max_len, args.bsize), dtype=np.int32)
data.text_enc = np.empty((args.bsize, model.net.nembed), dtype=np.float32) \
                         if model.net else 0
data.idx_unk = model.htable['<UNK>']
data.idx_pad = model.htable['<PAD>']
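Aside: namedtuple is used here as a bare attribute container (fields are assigned on the class itself and no instance is ever created). A more conventional equivalent would be:

from types import SimpleNamespace

model = SimpleNamespace(htable=None, bpe=None, net=None)
data = SimpleNamespace(text_bpe=None, text_slen=None, text_bin=None, text_enc=None,
                       file_bpe=None, file_bin=None, file_enc=None,
                       idx_pad=None, idx_unk=None)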
Example #24
class VHRED(object):
    def __init__(self, config):
        self.config = config
        self.f_dict = config['vhred_dict']
        # Load the VHRED model.
        self.model, self.enc_fn, self.dec_fn = self._build_vhred_model()
        # Load in Twitter dictionaries for BPE conversion.
        f_bpe_dictionary = config['vhred_bpe_file']
        with open(f_bpe_dictionary, 'r') as handle:
            self.bpe = BPE(handle.readlines(), '@@')
        with open(self.f_dict, 'r') as handle:
            twitter_dict = cPickle.load(handle)
        self.str_to_idx = dict([(tok, tok_id)
                                for tok, tok_id, _, _ in twitter_dict])
        self.idx_to_str = dict([(tok_id, tok)
                                for tok, tok_id, _, _ in twitter_dict])
        self.MODELS = ['hred', 'human', 'tfidf', 'de']

    def _convert_text_to_bpe(self,
                             contexts,
                             gt_responses,
                             model_responses,
                             ignore_models=False):
        # Files needed for BPE conversions.
        context_ids = self._strs_to_idxs(contexts)
        gt_response_ids = self._strs_to_idxs(gt_responses)

        longest = 0
        for res in gt_response_ids:
            if len(res) > longest:
                longest = len(res)
        print 'Longest Response:', longest

        if not ignore_models:
            model_response_ids = self._strs_to_idxs(model_responses)
        else:
            model_response_ids = None
        return context_ids, gt_response_ids, model_response_ids

    def _strs_to_idxs(self, data):
        out = []
        for row in data:
            bpe_segmented = self.bpe.segment(row.strip())
            out.append([
                self.str_to_idx[word] for word in bpe_segmented.split()
                if word in self.str_to_idx
            ])
        return out

    def _idxs_to_strs(self, data):
        out = []
        for row in data:
            s = ' '.join([self.idx_to_str[word] for word in row])
            out.append(s.replace('@@ ', ''))
        return out

    def _build_vhred_model(self):
        # Update the state dictionary.
        state = VHRED_prototype_state()
        model_prefix = self.config['vhred_prefix']
        state_path = model_prefix + "_state.pkl"
        model_path = model_prefix + "_model.npz"
        with open(state_path, 'rb') as handle:
            state.update(cPickle.load(handle))
        # Update the bs for the current data.
        state['bs'] = 100
        state['dictionary'] = self.f_dict

        # Create the model:
        model = VHRED_DialogEncoderDecoder(state)
        model.bs = 100
        enc_fn = model.build_encoder_function()
        dec_fn = model.build_decoder_encoding()

        return model, enc_fn, dec_fn

    def _extract_text(self, dataset, ignore_models=False):
        cs, gt_rs, m_rs = [], [], []
        for entry in dataset:
            cs.append(entry['c'])
            gt_rs.append(entry['r_gt'])
            # Extract in this order so we don't mix up which responses came from which models.
            if not ignore_models:
                for m_name in self.MODELS:
                    m_rs.append(entry['r_models'][m_name][0])

        # Add </s> token to beginning of each.
        cs = [
            '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
            for c in cs
        ]
        gt_rs = [
            '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
            for c in gt_rs
        ]
        if not ignore_models:
            m_rs = [
                '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
                for c in m_rs
            ]

        return cs, gt_rs, m_rs

    # Compute model embeddings for contexts or responses
    # Embedding type can be 'CONTEXT' or 'DECODER'
    def _compute_embeddings(self, data):
        embeddings = []
        context_ids_batch = []
        batch_index = 0
        batch_total = int(math.ceil(float(len(data)) / float(self.model.bs)))

        counter = 0
        max_len = 0
        for context_ids in data:
            counter += 1
            context_ids_batch.append(context_ids)

            # If we have filled up a batch, or reached the end of our data:
            if len(context_ids_batch) == self.model.bs or counter == len(data):
                batch_index += 1
                length = len(context_ids_batch)
                if len(context_ids_batch) < self.model.bs:
                    # Pad the data to get a full batch.
                    while len(context_ids_batch) < self.model.bs:
                        context_ids_batch.append(context_ids_batch[0])
                print 'Computing embeddings for batch %d/%d' % (batch_index,
                                                                batch_total)
                encs = VHRED_compute_encodings(context_ids_batch, self.model,
                                               self.enc_fn, self.dec_fn,
                                               self.config['embedding_type'])
                if length < self.model.bs:
                    encs = encs[:length]
                for i in range(len(encs)):
                    embeddings.append(encs[i, :].tolist())
                context_ids_batch = []

        return embeddings

    def _add_embeddings_to_dataset(self,
                                   dataset,
                                   c_embs,
                                   r_gt_embs,
                                   r_model_embs,
                                   ignore_models=False):
        for ix in xrange(len(dataset)):
            dataset[ix]['c_emb'] = c_embs[ix]
            dataset[ix]['r_gt_emb'] = r_gt_embs[ix]
            if not ignore_models:
                dataset[ix]['r_model_embs'] = {}
                for jx, m_name in enumerate(self.MODELS):
                    dataset[ix]['r_model_embs'][m_name] = r_model_embs[
                        ix * len(self.MODELS) + jx]
        return dataset

    def get_embeddings(self, dataset, new_models=None, ignore_models=False):
        ''' Dataset should be a list of dictionaries. Each dictionary should have
            keys: c, r_gt, r_models = {'model_name': [r, score, length], ...}
        '''
        if not new_models is None:
            self.MODELS = new_models
        if 'r_models' not in dataset[0]:
            ignore_models = True

        contexts, gt_responses, model_responses = self._extract_text(
            dataset, ignore_models=ignore_models)
        context_ids, gt_response_ids, model_response_ids = self._convert_text_to_bpe(
            contexts,
            gt_responses,
            model_responses,
            ignore_models=ignore_models)

        print 'Computing context embeddings...'
        context_embs = self._compute_embeddings(context_ids)
        print 'Computing ground truth response embeddings...'
        gt_response_embs = self._compute_embeddings(gt_response_ids)
        if not ignore_models:
            print 'Computing model response embeddings...'
            model_response_embs = self._compute_embeddings(model_response_ids)
        else:
            model_response_embs = None

        # Update our dataset with each of the embeddings.
        dataset = self._add_embeddings_to_dataset(dataset,
                                                  context_embs,
                                                  gt_response_embs,
                                                  model_response_embs,
                                                  ignore_models=ignore_models)

        return dataset

    def use_saved_embeddings(self):
        with open(self.config['vhred_embeddings_file'], 'rb') as handle:
            dataset = cPickle.load(handle)
        return dataset
Example #25
class ContentProcessor():
    def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None,
                 sourcespm=None, targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences=[]
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)

    def preprocess(self, srctxt):
        normalized_text = '\n'.join(self.normalizer(line) for line in srctxt.split('\n'))   # the normalizer does not accept '\n'
        sentSource = self.sentence_splitter([normalized_text])
        self.sentences=[]
        for s in sentSource:
            if self.tokenizer:
                # print('raw sentence: ' + s, flush=True)
                tokenized = ' '.join(self.tokenizer(s))
                # print('tokenized sentence: ' + tokenized, flush=True)
                segmented = self.bpe_source.process_line(tokenized)
            elif self.sp_processor_source:
                print('raw sentence: ' + s, flush=True)
                segmented = ' '.join(self.sp_processor_source.EncodeAsPieces(s))
                # print(segmented, flush=True)
            else:
                raise RuntimeError("No tokenization / segmentation method defined, can't preprocess")
            self.sentences.append(segmented)
        return self.sentences

    def postprocess(self, recievedsentences):
        sentTranslated = []
        for index, s in enumerate(recievedsentences):
            received = s.strip().split(' ||| ')
            # print(received, flush=True)

            # undo segmentation
            if self.bpe_source:
                translated = received[0].replace('@@ ','')
            elif self.sp_processor_target:
                translated = self.sp_processor_target.DecodePieces(received[0].split(' '))
            else:
                translated = received[0].replace(' ','').replace('▁',' ').strip()

            alignment = ''
            if len(received) == 2:
                alignment = received[1]
                links = alignment.split(' ')
                fixedLinks = []
                outputLength = len(received[0].split(' '))
                for link in links:
                    ids = link.split('-')
                    if ids[0] != '-1' and int(ids[0])<len(self.sentences[index]):
                        if int(ids[1])<outputLength:
                            fixedLinks.append('-'.join(ids))
                alignment = ' '.join(fixedLinks)

            if self.detokenizer:
                detokenized = self.detokenizer(translated.split())
            else:
                detokenized = translated

            sentTranslated.append(detokenized)
        return sentTranslated
Example #26
 def setUp(self):
     
     amock = mock.MagicMock()
     amock.readline.return_value = 'something'
     glossaries = ['like', 'Manuel', 'USA']
     self.bpe = BPE(amock, glossaries=glossaries)
Example #27
 def __init__(self, codes):
     self.bpe = BPE(codecs.open(codes, encoding='utf-8'))
Example #28
def Eval(sess):
    """Builds and evaluates a network."""
    logging.set_verbosity(logging.INFO)
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"

    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)

    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))

    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")

    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")

    feature_sizes = [
        8, 8, 2, 4
    ]  # num of features for each feature group: capitalization, words, other, prefix_2, suffix_2, previous_tags
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]

    t = time.time()
    hidden_layer_sizes = map(int, FLAGS.hidden_layer_sizes.split(','))
    logging.info(
        'Building training network with parameters: feature_sizes: %s '
        'domain_sizes: %s', feature_sizes, domain_sizes)

    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    logging.info("loading data and precomputing features...")
    test_data = ConllData(test_data_path, wordMap, tagMap, pMap, sMap, bpe)

    tagger = GreedyTagger(num_actions,
                          feature_sizes,
                          domain_sizes,
                          embedding_dims,
                          hidden_layer_sizes,
                          gate_gradients=True)

    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    sess.run(tagger.inits.values())
    tagger.saver.restore(sess, FLAGS.model_path)

    t = time.time()
    num_epochs = None
    num_tokens = 0
    num_correct = 0
    index = 0
    epochs = 0

    epochs, sent_batch = loadBatch(FLAGS.batch_size, epochs, test_data)
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = get_current_features(
            sent_batch, epochs, test_data, wordMap, tagMap, pMap, sMap)
        tf_eval_metrics = sess.run(
            tagger.evaluation['logits'],
            feed_dict={tagger.test_input: feature_endpoints})
        for i in range(FLAGS.batch_size):
            best_action = 0
            best_score = float("-inf")
            for j in range(45):
                if tf_eval_metrics[i][j] > best_score:
                    best_score = tf_eval_metrics[i][j]
                    best_action = j
            sent_batch[i].set_tag(tagMap[best_action])
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break

    test_data.reset_index()
    while test_data.has_next_sent():
        sent = test_data.get_next_sent()
        output_tags = sent.get_tag_output()
        gold_tags = sent.origin_tag_list
        word_list, output_tags = combine_seg(sent.seg_word_list, output_tags)
        for idx, tag in enumerate(gold_tags):
            num_tokens += 1
            if tag == output_tags[idx]:
                num_correct += 1
        sent.reset_state()

    eval_metric = 0 if num_tokens == 0 else (100.0 * num_correct / num_tokens)

    logging.info(
        'Number of Tokens: %d, Seconds elapsed in evaluation: %.2f, '
        'eval metric: %.2f%%', num_tokens,
        time.time() - t, eval_metric)
    logging.info('num correct tokens: %d', num_correct)
Example #29
parser.add_argument('-c',
                    '--cache',
                    type=str,
                    default='opusMT-cache.db',
                    help='cache database file')

args = parser.parse_args()

if not args.deftrg:
    args.deftrg = args.trglangs[0]

## load BPE model for pre-processing
if args.bpe:
    print("load BPE codes from " + args.bpe, flush=True)
    BPEcodes = codecs.open(args.bpe, encoding='utf-8')
    bpe = BPE(BPEcodes)

## load SentencePiece model for pre-processing
if args.spm:
    print("load sentence piece model from " + args.spm, flush=True)
    spm = spm.SentencePieceProcessor()
    spm.Load(args.spm)

## open the cache DB
print("open cache at " + args.cache, flush=True)
cache = SqliteDict(args.cache, autocommit=True)


## add signal handler for SIGINT to properly close
## the DB when interrupting
def signal_handler(sig, frame):
Example #30
def main(args):
    setup_logger(args)

    args.interactive = sys.stdin.isatty() and not args.file  # just to make the code more understandable

    if args.file:
        data_descriptor = open(args.file, 'r')
    else:
        data_descriptor = sys.stdin

    if args.interactive:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1
    if args.buffer_size > 50000:
        print(
            "WARNING: To prevent memory exhaustion buffer size is set to 50000",
            file=sys.stderr)
        args.buffer_size = 50000

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args, file=sys.stderr)

    use_cuda = torch.cuda.is_available() and not args.cpu

    processing_start = time.time()

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path), file=sys.stderr)
    model_paths = args.path.split(':')
    models, model_args, src_dict, tgt_dict = load_ensemble_for_inference(
        model_paths)
    if args.fp16:
        for model in models:
            model.half()

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(need_attn=args.print_alignment)

    # Initialize generator
    translator = SequenceGenerator(
        models,
        tgt_dict.get_metadata(),
        maxlen=args.max_target_positions,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
        sampling_temperature=args.sampling_temperature)

    if use_cuda:
        translator.cuda()

    # Load BPE codes file
    if args.bpe_codes:
        codes = open(args.bpe_codes, 'r')
        bpe = BPE(codes)
    else:
        bpe = None  # bpe is passed to make_batches() below; avoid a NameError
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str=src_str,
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu()
                if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de').strip()
            result.hypos.append((hypo['score'], hypo_str))
            result.pos_scores.append('P\t' + ' '.join(
                f'{x:.4f}' for x in hypo['positional_scores'].tolist()))
            result.alignments.append('A\t' + ' '.join(
                str(utils.item(x))
                for x in alignment) if args.print_alignment else None)

        return result

    gen_timer = StopwatchMeter()

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        translation_start = time.time()
        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop(sum(len(h[0]['tokens']) for h in translations))
        dllogger.log(step='infer',
                     data={'latency': time.time() - translation_start})

        return [
            make_result(batch.srcs[i], t) for i, t in enumerate(translations)
        ]

    if args.interactive:
        print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size, data_descriptor):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, src_dict,
                                                 args.max_positions, bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str, file=sys.stderr)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores,
                                               result.alignments):
                print(f'Score {hypo[0]}', file=sys.stderr)
                print(hypo[1])
                print(pos_scores, file=sys.stderr)
                if align is not None:
                    print(align, file=sys.stderr)

    if args.file:
        data_descriptor.close()

    log_dict = {
        'throughput': 1. / gen_timer.avg,
        'latency_avg': sum(gen_timer.intervals) / len(gen_timer.intervals),
        'latency_p90': gen_timer.p(90),
        'latency_p95': gen_timer.p(95),
        'latency_p99': gen_timer.p(99),
        'total_inference_time': gen_timer.sum,
        'total_run_time': time.time() - processing_start,
    }
    print('Translation time: {} s'.format(log_dict['total_inference_time']),
          file=sys.stderr)
    print('Model throughput (beam {}): {} tokens/s'.format(
        args.beam, log_dict['throughput']),
          file=sys.stderr)
    print(
        'Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'
        .format(log_dict['latency_avg'], log_dict['latency_p90'],
                log_dict['latency_p95'], log_dict['latency_p99']),
        file=sys.stderr)
    print('End to end time: {} s'.format(log_dict['total_run_time']),
          file=sys.stderr)
    dllogger.log(step=(), data=log_dict)
Example #31
def main(models,
         saveto,
         bpe_file,
         save_alignment=None,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False,
         verbose=False,
         nbest=False,
         suppress_unk=False,
         a_json=False,
         print_word_probabilities=False,
         return_hyp_graph=False):
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))

        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO A SERVER?

    ###### The following functions should already be part of the serverisation

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [
                    word_dicts[i][f] if f in word_dicts[i] else 1
                    for (i, f) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                  n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        #print(source_file_t[i])
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # print source_file_t[i].decode('utf-8')
                #pipe = subprocess.Popen("echo " + source_file_t[i] + "| perl truecase.perl --model en-truecase.mdl", shell=True)
                #pipe = subprocess.Popen(["echo", '"' + source_file_t[i] + '"', "|", "perl", "truecase.perl", "--model",
                #                         "en-truecase.mdl"], stdout=subprocess.PIPE)
                #result = pipe.stdout.read()
                #print pipe.communicate()
                #print pipe
                #print pipe.stdout
                #print pipe.stdout.read()
                #print pipe.
                #print "Here"
                #print result
                #source_file_t[i] = subprocess.check_output()
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i],
                                       return_str=True)).strip()
            #print "Passed"
            print source_file_t
            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes,
                                                     queue)
            _finish_processes(queue)
            #### The model loading takes place in the head of the for loop, probably in _retrieve_jobs
            for i, trans in enumerate(
                    _retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos
                        # translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(
                                    alignment[j], source_sentences[i],
                                    _seqs2words(samples[j]).split(), i, i + j,
                                    save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'
                                    .format(i, _seqs2words(samples[j]),
                                            scores[j],
                                            ' '.join(source_sentences[i]),
                                            len(source_sentences[i]) + 1,
                                            len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    ## TODO: Handle the output here
                    x = _seqs2words(samples)
                    # detokenize the subword output and keep accumulating it,
                    # capitalizing the first character of the running text
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(), i,
                                              i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.
                                format(i, _seqs2words(trans[0]), 0,
                                       ' '.join(source_sentences[i]),
                                       len(source_sentences[i]) + 1,
                                       len(trans[0])))
                            print_matrix(alignment, save_alignment)
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:  # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()  # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345  # Reserve a port for your service.
    s.bind((host, port))  # Bind to the port and wait for client connections

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams,
                                       option,
                                       use_noise,
                                       trng,
                                       return_alignment=save_alignment
                                       is not None)

        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()
    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, separator='@@')
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer,
                                       detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
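
A minimal client sketch for the server above (hypothetical, not part of the original example): the port number and the initial request/"okay" acknowledgement mirror the `_listen` code, and the final send/recv pair mirrors the tail of `_parallelized_main`; everything else about the turn-taking is an assumption.

import socket

# Hypothetical client for the translation server sketched above.
client = socket.socket()
client.connect((socket.gethostname(), 12345))  # same host/port the server binds
client.send('request'.encode('utf-8'))         # read by the server as `fname`
print(client.recv(4096).decode('utf-8'))       # expect the "okay" acknowledgement
client.send(u'Hello world .'.encode('utf-8'))  # text for _parallelized_main
print(client.recv(4096).decode('utf-8'))       # detokenized translation
client.close()
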
Example #32
0
import argparse
import os
import re

import kenlm
from apply_bpe import BPE
from common_text_features_functions import cut_xml

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Applying LM for the gazette.")
    parser.add_argument('-vw', help="VW file without calculated lm -> gazettetitle.without_lm.vw")
    parser.add_argument('--codes', '-c', type=argparse.FileType('r'), help="File with BPE codes (created by learn_bpe.py).")
    args = parser.parse_args()

    # Define the language models
    pages_lm = kenlm.LanguageModel("LM/necrologies_lm.klm")
    necrologues_lm = kenlm.LanguageModel("LM/necrologies_lm.klm")

    bpe = BPE(args.codes, separator="@@")

    vw_file = args.vw
    file_name = os.path.basename(vw_file)
    gazette_title = vw_file.replace(file_name, "")

    with open(vw_file) as rectangles_to_check:
        for rectangle in rectangles_to_check.readlines():
            page = re.search(r"PAGE:\d\d?", rectangle).group(0).replace("PAGE:","")
            x1 = re.search(r"X1:\d{1,4}", rectangle).group(0).replace("X1:","")
            x2 = re.search(r"X2:\d{1,4}", rectangle).group(0).replace("X2:","")
            y1 = re.search(r"Y1:\d{1,4}", rectangle).group(0).replace("Y1:","")
            y2 = re.search(r"Y2:\d{1,4}", rectangle).group(0).replace("Y2:","")

            xml_coord = gazette_title + "/page_" + page + ".xml_coord"
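
The example above is cut off mid-loop. As a hedged sketch of the step that presumably follows (reusing the `bpe` and KenLM objects defined above; the placeholder text is invented for illustration), a rectangle's text would be BPE-segmented and then scored by the language model:

# Hedged sketch, not the original continuation of the example above.
rectangle_text = u"text extracted for this rectangle"  # hypothetical placeholder
segmented = bpe.process_line(rectangle_text)           # subword segmentation
lm_score = necrologues_lm.score(segmented)             # KenLM log10 probability
print(lm_score)
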
    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)
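
The glossary entries in this test are regular expressions; in subword-nmt, words that fully match a glossary pattern are protected from segmentation. A self-contained sketch of that behavior (the two merge rules and the input line are toy values invented for illustration):

import io
from apply_bpe import BPE

# Toy in-memory codes file: a version header plus two merge operations.
codes = io.StringIO(u'#version: 0.2\ne s\nes t</w>')
bpe = BPE(codes, glossaries=[r'<name>\w*</name>', r'\d+'])

# Words matching a glossary pattern pass through unsegmented; the rest are
# split according to the merge operations.
print(bpe.process_line(u'test <name>Anna</name> 1234'))
# expected: t@@ est <name>Anna</name> 1234
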
def main(args):
    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides=eval(args.model_overrides))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models,
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
        sampling_temperature=args.sampling_temperature)

    if use_cuda:
        translator.cuda()

    # Load BPE codes file (guard so that `bpe` is always defined below)
    if args.bpe_codes:
        codes = open(args.bpe_codes, 'r', encoding='utf-8')
        bpe = BPE(codes)
    else:
        bpe = None
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str='O\t{}'.format(src_str),
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu()
                if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
            result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
            result.pos_scores.append('P\t{}'.format(' '.join(
                map(
                    lambda x: '{:.4f}'.format(x),
                    hypo['positional_scores'].tolist(),
                ))))
            result.alignments.append(
                'A\t{}'.format(' '.join(str(utils.item(x)) for x in alignment))
                if args.print_alignment else None)
        return result

    gen_timer = StopwatchMeter()
    end2end_timer = StopwatchMeter()

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop()

        return [
            make_result(batch.srcs[i], t) for i, t in enumerate(translations)
        ]

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        end2end_timer.start()
        for batch, batch_indices in make_batches(inputs, args, src_dict,
                                                 models[0].max_positions(),
                                                 bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores,
                                               result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)

        print('Model latency: {} s'.format(gen_timer.sum))
        gen_timer.reset()
        end2end_timer.stop()
        print('End-to-end translation time: {} s'.format(end2end_timer.sum))
        end2end_timer.reset()
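
make_batches is not shown in this excerpt, but the `bpe` object it receives is an apply_bpe.BPE instance, so its natural role there is to segment each raw input line into subwords before dictionary encoding. A hedged sketch of that step (the helper name is invented; process_line is the apply_bpe API used in the other examples):

def segment_inputs(inputs, bpe):
    # Apply BPE segmentation to each buffered input line, if codes were given.
    return [bpe.process_line(line) if bpe is not None else line
            for line in inputs]
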