def __init__(self, bpe_codes, bpe_vocab=None, bpe_threshold=None,
             bpe_glossaries=None):
    """Initialize a BPE-based tokenizer.

    Args:
        bpe_codes: path to the learned BPE merge-codes file (UTF-8).
        bpe_vocab: optional path to a vocabulary file; when given, merges
            producing out-of-vocabulary subwords are reverted by ``_BPE``.
        bpe_threshold: frequency threshold passed to ``_rv`` when reading
            the vocabulary (only used together with ``bpe_vocab``).
        bpe_glossaries: optional iterable of glossary entries; each is
            preprocessed with ``self._parse_glossary``.
    """
    super().__init__('bpe')

    # Use identity comparison with None (PEP 8), not `== None`.
    if bpe_glossaries is None:
        _glossaries = []
    else:
        _glossaries = [self._parse_glossary(g) for g in bpe_glossaries]

    codes_file = _codecs.open(bpe_codes, encoding='utf-8')
    if bpe_vocab:
        _vocab = _rv(_codecs.open(bpe_vocab, encoding='utf-8'),
                     bpe_threshold)
        self._bpe = _BPE(codes_file, vocab=_vocab, glossaries=_glossaries)
    else:
        # BUG FIX: glossaries were previously dropped on this path —
        # _BPE was built without `glossaries=`, so glossaries only took
        # effect when a vocab file was also supplied. Passing an empty
        # list is equivalent to the _BPE default, so this is safe.
        self._bpe = _BPE(codes_file, glossaries=_glossaries)
# --- Example no. 2 (0 votes) ---
    def __init__(self, bpe_path, span, max_span=10, constrain_chunks=False):
        """Set up the chunker: record span settings and load the BPE codes.

        Args:
            bpe_path: path to the BPE merge-codes file.
            span: chunk span to use.
            max_span: upper bound on the chunk span.
            constrain_chunks: whether chunk boundaries are constrained.
        """
        # Record configuration on the instance.
        self.constrain_chunks = constrain_chunks
        self.max_span = max_span
        self.span = span

        # Load the BPE segmenter from the codes file; the file handle is
        # only needed during construction, so close it promptly.
        with open(bpe_path, 'rt') as codes_file:
            self.bpe = _BPE(codes_file)
# --- Example no. 3 (0 votes) ---
def _apply_bpe(bpe_path, path, output_path):
    """Apply BPE segmentation to every line of a file.

    Reads ``path`` line by line, segments each line's space-separated
    tokens with the BPE codes loaded from ``bpe_path``, and writes the
    result to ``output_path``, preserving each line's original leading
    and trailing whitespace (including the newline).

    Args:
        bpe_path: path to the BPE merge-codes file (read as UTF-8).
        path: input text file to segment.
        output_path: destination file for the segmented text.

    Returns:
        set: the set of all subwords produced across the file.
    """
    with ExitStack() as stack:
        input_file = stack.enter_context(open(path, 'rt'))
        output_file = stack.enter_context(open(output_path, 'wt'))
        bpe_file = stack.enter_context(codecs.open(bpe_path, encoding='utf-8'))

        vocab = set()
        bpe = _BPE(bpe_file)
        for line in input_file:
            core = line.strip('\r\n ')
            leading_whitespace = line[:len(line) - len(line.lstrip('\r\n '))]
            # BUG FIX: was `line[len(line.rstrip('\r\n ')) - len(line):]`.
            # For a line with no trailing whitespace that index is 0, so
            # the slice was the ENTIRE line and the whole original line
            # got appended after the segmented output. Slicing from
            # len(rstripped) yields exactly the stripped suffix in all cases.
            trailing_whitespace = line[len(line.rstrip('\r\n ')):]
            subwords = bpe.segment_tokens(core.split(' '))

            vocab.update(subwords)
            output_file.write(
                leading_whitespace + ' '.join(subwords) + trailing_whitespace)
        return vocab