def learn_joint_bpe_and_vocab(args):
    """Learn a joint BPE model over all input corpora and write one
    frequency-sorted vocabulary file per corpus.

    args.input / args.vocab are parallel lists of file objects; args.output
    receives the learned BPE codes. Exits with status 1 if the two lists
    differ in length.

    Fixes over the original: input handles are now closed after use (they
    were previously leaked), and temp-file I/O uses context managers.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list,
                            output,
                            args.symbols,
                            args.min_frequency,
                            args.verbose,
                            is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        # segment the corpus into a temp file, then count subword frequencies
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
            train_file.seek(0)
            for line in train_file:
                tmpout.write(bpe.segment(line).strip())
                tmpout.write('\n')

        with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
            vocab = learn_bpe.get_vocabulary(tmpin)
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(),
                                key=lambda x: x[1],
                                reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()  # fix: input handle was previously leaked
        vocab_file.close()
# Example 2
    def learn_bpe(self, item_list, from_filenames=True):
        """Learn BPE merge codes and save them to ``self.codes_file``.

        ``item_list`` is either a filename / list of filenames (when
        ``from_filenames`` is true) or an iterable of text lines.
        Afterwards the learned codes are loaded via ``self.set_bpe``.
        """
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)

        full_vocab = OrderedCounter()
        if from_filenames:
            # accept a single filename as well as a list of filenames
            names = [item_list] if isinstance(item_list, str) else item_list
            # combined vocabulary of all input files
            for name in names:
                with codecs.open(name, encoding='UTF-8') as handle:
                    full_vocab += learn_bpe.get_vocabulary(handle)
        else:
            # combined vocabulary of the in-memory text
            full_vocab += learn_bpe.get_vocabulary(item_list)

        dict_lines = [
            '{0} {1}'.format(word, count)
            for word, count in full_vocab.items()
        ]

        # learn BPE on combined vocabulary
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as out:
            learn_bpe.main(dict_lines,
                           out,
                           self.num_symbols,
                           self.min_frequency,
                           False,
                           is_dict=True)
        self.set_bpe(self.codes_file)
# Example 3
def main(input, output_name, vocab, symbols, separator='@@', min_frequency=2, verbose=False):
    """Learn a joint BPE model over the corpora in ``input`` and write one
    frequency-sorted vocabulary file per entry of ``vocab``.

    ``input``/``vocab`` are parallel lists of file paths (parameter names are
    kept for caller compatibility even though they shadow builtins); the BPE
    codes are written to ``output_name``.

    Fixes over the original: the inner subword Counter no longer rebinds the
    ``vocab`` parameter (a confusing shadow), input handles are closed, and
    temp-file I/O uses context managers.
    """
    # read/write files as UTF-8
    input = [codecs.open(f, encoding='UTF-8') for f in input]
    vocab = [codecs.open(f, 'w', encoding='UTF-8') for f in vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(output_name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, symbols, min_frequency, verbose, is_dict=True)

    with codecs.open(output_name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator, None)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(input, vocab):

        # segment the corpus into a temp file, then count subword frequencies
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
            train_file.seek(0)
            for line in train_file:
                tmpout.write(bpe.segment(line).strip())
                tmpout.write('\n')

        with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
            # fix: was assigned to `vocab`, shadowing the parameter
            seg_vocab = learn_bpe.get_vocabulary(tmpin)
        os.remove(tmp.name)

        for key, freq in sorted(seg_vocab.items(), key=lambda x: x[1], reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()  # fix: input handle was previously leaked
        vocab_file.close()
# Example 4
def main(args):
    """Restrict a BPE model to the subwords reachable from a corpus
    vocabulary, optionally writing the restricted codes and vocabulary.
    """
    corpus_vocab = learn_bpe.get_vocabulary(args.input, args.dict_input,
                                            args.mincount)
    assert isinstance(corpus_vocab, Counter)

    bpe = apply_bpe.BPE(args.codes,
                        args.separator,
                        vocab=None,
                        unkchar=args.unkchar,
                        unktag=args.unktag)

    # subword vocabulary actually reachable under this BPE model
    restricted = learn_bpe.restricted_vocabulary(bpe, corpus_vocab)

    if args.outcodes is not None:
        bpe.write_subset(args.outcodes, restricted)
    if args.bpevocab is not None:
        learn_bpe.write_vocabulary(restricted, args.bpevocab)
# Example 5
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list,
                       output,
                       args.symbols,
                       args.min_frequency,
                       args.verbose,
                       is_dict=True)
    args = parser.parse_args()

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    vocabs = []
    for f in args.input:
        v = learn_bpe.get_vocabulary(f, args.dict_input, args.mincount)
        vocabs.append(v)
        full_vocab += v
        f.seek(0)

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main_args(args, full_vocab, output, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, args.separator, None)
        # apply BPE to each training corpus and get vocabulary
        learn_bpe.make_vocabularies(bpe, vocabs, args.vocab)
# Example 7
def learn_joint_bpe_and_vocab(args):
    """Learn a joint BPE model over all input corpora and write one
    frequency-sorted vocabulary file per corpus.

    Supports dictionary-style input (``args.dict_input``: "<word> <count>"
    lines), a file of special vocabulary items guaranteed a merge
    (``args.special_vocab``), and optional inclusion of all single
    characters in the output vocabulary (``args.character_vocab``).
    Exits with status 1 on mismatched input/vocab counts or malformed lines.

    Fixes over the original: the special-vocab file is opened once instead
    of twice (the second handle was leaked), bare ``except:`` clauses are
    narrowed to ``Exception`` (they also caught SystemExit), and the
    special-vocab error message reports the offending ``word`` instead of a
    stale ``line`` variable.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    if args.special_vocab:
        # fix: file was previously opened twice (once via `with`, once
        # inside the comprehension), leaking the second handle
        with codecs.open(args.special_vocab, encoding='UTF-8') as f:
            args.special_vocab = [line.strip('\r\n ') for line in f]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f, args.dict_input)
        f.seek(0)
    if args.special_vocab:
        for word in args.special_vocab:
            full_vocab[word] += 1  # integrate special vocab to full_vocab

    vocab_list = yield_dict_lines(full_vocab)

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list,
                            output,
                            args.symbols,
                            args.min_frequency,
                            args.verbose,
                            is_dict=True,
                            total_symbols=args.total_symbols,
                            is_postpend=args.postpend,
                            special_vocab=args.special_vocab)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes,
                            separator=args.separator,
                            is_postpend=args.postpend)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):
        if args.dict_input:
            # input is already "<word> <count>" lines: segment each word and
            # credit its count to every resulting subword
            vocab = Counter()
            for i, line in enumerate(train_file):
                try:
                    word, count = line.strip('\r\n ').split(' ')
                    segments = bpe.segment_tokens([word])
                except Exception:  # fix: was a bare except
                    print('Failed reading vocabulary file at line {0}: {1}'.
                          format(i, line))
                    sys.exit(1)
                for seg in segments:
                    vocab[seg] += int(count)
        else:
            # plain-text corpus: segment into a temp file, then count
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()

            with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                train_file.seek(0)
                for line in train_file:
                    tmpout.write(bpe.process_line(line).strip())
                    tmpout.write('\n')

            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                vocab = learn_bpe.get_vocabulary(tmpin)
            os.remove(tmp.name)

        # if special vocab is defined, include them
        if args.special_vocab:
            for i, word in enumerate(args.special_vocab):
                try:
                    segments = bpe.segment_tokens([word])
                except Exception:  # fix: was a bare except
                    # fix: reported stale `line` from the corpus loop
                    print(
                        'Failed reading special vocabulary file at line {0}: {1}'
                        .format(i, word))
                    sys.exit(1)
                if len(segments) != 1:
                    sys.stderr.write(
                        'WARNING: special vocab \'{0}\' not captured by merges, split into \'{1}\'\n'
                        .format(word, ' '.join(segments)))
                for seg in segments:
                    vocab[seg] += 1

        sys.stderr.write('Vocabulary got {0:d} unique items\n'.format(
            len(vocab)))

        # if character vocab is to be included
        if args.character_vocab:
            char_internal, char_terminal = learn_bpe.extract_uniq_chars(
                full_vocab, args.postpend)
            sys.stderr.write(
                'Got {0:d} non-terminal and {1:d} terminal characters\n'.
                format(len(char_internal), len(char_terminal)))
            # pseudo-counts place characters ahead of all real items when
            # sorting by descending frequency below
            pseudo_count_terminal = max(
                vocab.values()) + 2  # always precedes non-terminal
            pseudo_count_internal = max(
                vocab.values()) + 1  # always precedes other items
            for c in char_terminal:
                vocab[c] = pseudo_count_terminal
            for c in char_internal:
                # attach the separator on the side dictated by --postpend
                c = '{0}{1}'.format(args.separator,
                                    c) if args.postpend else '{0}{1}'.format(
                                        c, args.separator)
                vocab[c] = pseudo_count_internal
        # sort by descending frequency, ties broken alphabetically
        for key, freq in sorted(vocab.items(), key=lambda x: (-x[1], x[0])):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()
        vocab_file.close()
    parser = create_parser()
    args = parser.parse_args()

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)