Code example #1
0
    def learn_bpe(self, item_list, from_filenames=True):
        """Learn BPE merge operations and write them to ``self.codes_file``.

        item_list is either a filename / list of filenames
        (``from_filenames=True``) or an iterable of text lines
        (``from_filenames=False``). After learning, the resulting codes
        file is loaded via ``self.set_bpe``.
        """
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)
        full_vocab = OrderedCounter()
        if from_filenames:
            # a single path is treated as a one-element list
            filenames = [item_list] if isinstance(item_list, str) else item_list
            # accumulate the vocabulary of every input file
            for fname in filenames:
                with codecs.open(fname, encoding='UTF-8') as f:
                    full_vocab += learn_bpe.get_vocabulary(f)
        else:
            # the items themselves are the text lines to count
            full_vocab += learn_bpe.get_vocabulary(item_list)

        vocab_list = ['{0} {1}'.format(word, count)
                      for (word, count) in full_vocab.items()]

        # learn BPE merges on the combined vocabulary
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.main(vocab_list,
                           output,
                           self.num_symbols,
                           self.min_frequency,
                           False,
                           is_dict=True)
        self.set_bpe(self.codes_file)
Code example #2
0
File: preprocess.py  Project: elmines/VocabTuner
def learn_joint_codes(source_corp,
                      dest_corp,
                      joint_codes,
                      num_sequences,
                      verbose=False):
    """Learn one joint BPE code set over the concatenated corpora.

    source_corp - file path to source corpus
    dest_corp - file path to destination corpus
    joint_codes - file path to write joint codes
    num_sequences - maximum number of sequences to learn
    verbose - print all messages
    """
    import tempfile  # stdlib; local import keeps the snippet self-contained

    # Use a unique temp file instead of the fixed name "temp_cat_file" in
    # the CWD, so concurrent runs cannot clobber each other's data.
    fd, temp_file = tempfile.mkstemp(prefix="temp_cat_file")
    os.close(fd)
    with open(temp_file, "w", encoding="utf-8") as t:
        # Stream line by line rather than readlines()+readlines(), which
        # held both corpora in memory at once.
        for corp_path in (source_corp, dest_corp):
            with open(corp_path, "r", encoding="utf-8") as corp:
                t.writelines(corp)
    if verbose:
        print("Concatenated %s and %s to temporary file %s" %
              (source_corp, dest_corp, temp_file))
    try:
        with open(temp_file, "r",
                  encoding="utf-8") as temp, open(joint_codes,
                                                  "w",
                                                  encoding="utf-8") as joint:
            learn_bpe.main(temp, joint, num_sequences, verbose=verbose)
    finally:
        # Remove the temp file even if learn_bpe.main raises.
        os.remove(temp_file)
    if verbose:
        print("Deleted temporary file %s" % temp_file)
        print("Wrote joint codes file %s" % joint_codes)
Code example #3
0
File: preprocess.py  Project: elmines/VocabTuner
def _learn_one_codes(corpus_path, codes_path, num_sequences, verbose):
    """Learn BPE codes for a single corpus and write them to codes_path."""
    with open(corpus_path, "r",
              encoding="utf-8") as corp, open(codes_path,
                                              "w",
                                              encoding="utf-8") as codes:
        learn_bpe.main(corp, codes, num_sequences, verbose=verbose)
    if verbose:
        print("Wrote codes file %s" % codes_path)


def learn_codes(source_corp,
                dest_corp,
                source_codes,
                dest_codes,
                num_sequences,
                verbose=False):
    """Learn separate BPE code sets for the source and destination corpora.

    source_corp - file path to source corpus
    dest_corp - file path to destination corpus
    source_codes - file path to write source codes
    dest_codes - file path to write destination codes
    num_sequences - maximum number of sequences to learn
    verbose - print all messages
    """
    # The source and destination paths go through the exact same
    # open/learn/report sequence; factor it into one helper.
    _learn_one_codes(source_corp, source_codes, num_sequences, verbose)
    _learn_one_codes(dest_corp, dest_codes, num_sequences, verbose)
Code example #4
0
File: msra_bakeoff3_dataio.py  Project: safpla/t2t
def build_subword_vocab(operation_num=100):
    """Learn a subword (BPE) vocabulary from the generated word file.

    operation_num - number of BPE merge operations to learn.

    Reads ``<datagen_path>/word`` and writes the learned codes to
    ``<datagen_path>/word_subword``; ``datagen_path`` and ``vocab_file``
    are presumably module-level globals — confirm against the module.
    """
    infile = os.path.join(datagen_path, 'word')
    outfile = infile + '_subword'
    # Context managers ensure both handles are closed even if
    # learn_bpe.main raises (the original never closed either file).
    with codecs.open(infile, encoding='utf-8') as fin, \
            codecs.open(outfile, 'w', encoding='utf-8') as fout:
        learn_bpe.main(fin,
                       fout,
                       operation_num,
                       2,
                       verbose=True,
                       vocab_name=vocab_file)
Code example #5
0
    def test_learn_bpe(self):
        """Learn 1000 BPE operations from corpus.en and compare the
        produced codes file against the reference output line by line."""
        infile_path = os.path.join(currentdir, 'data', 'corpus.en')
        outfile_path = os.path.join(currentdir, 'data', 'bpe.out')
        ref_path = os.path.join(currentdir, 'data', 'bpe.ref')

        # Context managers close the handles even if learn_bpe.main or an
        # assertion raises (the original leaked all four handles on failure).
        with codecs.open(infile_path, encoding='utf-8') as infile, \
                codecs.open(outfile_path, 'w', encoding='utf-8') as outfile:
            learn_bpe.main(infile, outfile, 1000)

        # Read both files explicitly as UTF-8: the platform's default
        # locale encoding may differ from the UTF-8 the codes were written in.
        with open(outfile_path, encoding='utf-8') as outlines, \
                open(ref_path, encoding='utf-8') as reflines:
            for line, line2 in zip(outlines, reflines):
                self.assertEqual(line, line2)
Code example #6
0
def main(input, output_name, vocab, symbols, separator='@@', min_frequency=2, verbose=False):
    """Learn a joint BPE model over all corpora, then apply it to each
    corpus and write the per-corpus subword vocabulary.

    input - list of training-corpus file paths
    output_name - path to write the learned BPE codes
    vocab - list of output vocabulary paths, parallel to ``input``
    symbols - number of BPE merge operations to learn
    min_frequency - minimum pair frequency for a merge
    verbose - passed through to learn_bpe.main
    (``input`` shadows the builtin, but the name is part of the public
    signature and is kept for backward compatibility.)
    """
    # read/write files as UTF-8
    input = [codecs.open(f, encoding='UTF-8') for f in input]
    vocab = [codecs.open(f, 'w', encoding='UTF-8') for f in vocab]

    try:
        # get combined vocabulary of all input texts
        full_vocab = Counter()
        for f in input:
            full_vocab += learn_bpe.get_vocabulary(f)
            f.seek(0)

        vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

        # learn BPE on combined vocabulary
        with codecs.open(output_name, 'w', encoding='UTF-8') as output:
            learn_bpe.main(vocab_list, output, symbols, min_frequency, verbose, is_dict=True)

        with codecs.open(output_name, encoding='UTF-8') as codes:
            bpe = apply_bpe.BPE(codes, separator, None)

        # apply BPE to each training corpus and get vocabulary
        for train_file, vocab_file in zip(input, vocab):

            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()

            with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                train_file.seek(0)
                for line in train_file:
                    tmpout.write(bpe.segment(line).strip())
                    tmpout.write('\n')

            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                # Fresh name: the original rebound the ``vocab`` parameter
                # here, shadowing the list being iterated by this loop.
                corpus_vocab = learn_bpe.get_vocabulary(tmpin)
            os.remove(tmp.name)

            for key, freq in sorted(corpus_vocab.items(), key=lambda x: x[1], reverse=True):
                vocab_file.write("{0} {1}\n".format(key, freq))
            vocab_file.close()
    finally:
        # Close every handle we opened; the original leaked all of the
        # input handles and, on error, the vocab handles too.
        for f in input:
            f.close()
        for vf in vocab:
            if not vf.closed:
                vf.close()
Code example #7
0
    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list,
                       output,
                       args.symbols,
                       args.min_frequency,
                       args.verbose,
                       is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
Code example #8
0
    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with io.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list,
                       output,
                       args.symbols,
                       args.min_frequency,
                       args.verbose,
                       is_dict=True,
                       case_insensitive=args.case_insensitive)

    separator = '@@'
    if args.opennmt_separator: separator = '■'
    with io.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes,
                            separator,
                            None,
                            case_feature=args.case_insensitive)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):
Code example #9
0
    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        for line in train_file:
            tmpout.write(bpe.segment(line).strip())
Code example #10
0
def main(args_list: list = None):
    """Command-line entry point: learn a joint BPE model over all input
    corpora, then apply it to each corpus and write a per-corpus
    tab-separated subword vocabulary.

    args_list - optional argv-style list; None reads sys.argv (argparse default).
    """
    parser = create_parser()
    args = parser.parse_args(args_list)

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # Reopen the argparse file arguments explicitly as UTF-8.
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    try:
        # get combined vocabulary of all input texts
        full_vocab = Counter()
        for f in args.input:
            full_vocab += learn_bpe.get_vocabulary(f)
            f.seek(0)

        vocab_list = [
            '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
        ]

        # learn BPE on combined vocabulary
        with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
            learn_bpe.main(vocab_list,
                           output,
                           args.symbols,
                           args.min_frequency,
                           args.verbose,
                           is_dict=True)

        with codecs.open(args.output.name, encoding='UTF-8') as codes:
            bpe = apply_bpe.BPE(codes, separator=args.separator)

        # apply BPE to each training corpus and get vocabulary
        for train_file, vocab_file in zip(args.input, args.vocab):

            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()

            # Context managers close the temp handles even if bpe.segment
            # or get_vocabulary raises (the originals leaked on error).
            with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                train_file.seek(0)
                for line in train_file:
                    tmpout.write(bpe.segment(line).strip())
                    tmpout.write('\n')

            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                vocab = learn_bpe.get_vocabulary(tmpin)
            os.remove(tmp.name)

            for key, freq in sorted(vocab.items(),
                                    key=lambda x: x[1],
                                    reverse=True):
                vocab_file.write("{0}\t{1}\n".format(key, freq))
            vocab_file.close()
    finally:
        # Close all reopened handles; the original never closed args.input.
        for f in args.input:
            f.close()
        for vf in args.vocab:
            if not vf.closed:
                vf.close()