Example #1
def build_experimental_torchtext_pipeline(hf_vocab_file):
    tokenizer = basic_english_normalize()
    with open(hf_vocab_file, 'r') as f:
        vocab = load_vocab_from_file(f)
        pipeline = TextSequentialTransforms(tokenizer, vocab)
        jit_pipeline = torch.jit.script(pipeline.to_ivalue())
        print('jit experimental torchtext pipeline success!')
        return pipeline, pipeline.to_ivalue(), jit_pipeline
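All of the examples on this page chain a tokenizer and a vocab (or vector) lookup through TextSequentialTransforms and then script the result. As a rough illustration of the chaining idea only, here is a minimal stand-in module; the real class lives in torchtext's experimental transforms and additionally supports to_ivalue() for TorchScript.

import torch.nn as nn

class ToySequentialTransforms(nn.Module):
    """Stand-in for the chaining idea only: apply each stage to the running value, in order."""
    def __init__(self, *transforms):
        super().__init__()
        self.transforms = nn.ModuleList(transforms)  # register each stage as a sub-module

    def forward(self, txt):
        for transform in self.transforms:
            txt = transform(txt)
        return txt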
Example #2
def test_text_sequential_transform(self):
    asset_name = 'vocab_test2.txt'
    asset_path = get_asset_path(asset_name)
    with open(asset_path, 'r') as f:
        pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(f))
        jit_pipeline = torch.jit.script(pipeline.to_ivalue())
        self.assertEqual(pipeline('of that new'), [7, 18, 24])
        self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
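For orientation only: the assertions above suggest the test asset maps each token to an index derived from its line in the vocab file. A minimal sketch of that mapping, using a plain dict and a hypothetical in-memory file instead of torchtext's vocab:

from io import StringIO

# Hypothetical vocab file: one token per line, index given by line position.
fake_vocab_file = StringIO("a\nof\nthat\nnew\n")
stoi = {token.rstrip(): index for index, token in enumerate(fake_vocab_file)}

tokens = 'of that new'.split()
print([stoi[token] for token in tokens])  # [1, 2, 3] for this toy file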
Example #3
def build_experimental_fasttext_vector_pipeline():
    tokenizer = basic_english_normalize()
    vector = FastTextExperimental()

    pipeline = TextSequentialTransforms(tokenizer, vector)
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())

    print('jit experimental fasttext pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #4
def build_fasttext_vector_pipeline():
    tokenizer = basic_english_normalize()
    vector = FastText()

    # Chain the tokenizer with a FastText vector lookup transform
    pipeline = TextSequentialTransforms(tokenizer, VectorTransform(vector))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit fasttext pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
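Conceptually, the vector stage in the two FastText pipelines above maps each token to a dense embedding. A rough, self-contained sketch of that lookup, using a random toy table instead of the pretrained FastText vectors:

import torch

# Toy embedding table standing in for pretrained vectors.
toy_vectors = {'of': torch.randn(300), 'that': torch.randn(300), 'new': torch.randn(300)}

def lookup(tokens):
    return torch.stack([toy_vectors[token] for token in tokens])

print(lookup(['of', 'that', 'new']).shape)  # torch.Size([3, 300])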
Example #5
def build_sp_pipeline(spm_file):
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
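One reason these pipelines are passed through torch.jit.script is that a scripted module can be saved and reloaded without the Python class definitions. A minimal sketch of that round trip with a trivial stand-in module, not the torchtext pipeline itself:

from typing import List

import torch
import torch.nn as nn

class ToyTransform(nn.Module):
    def forward(self, tokens: List[str]) -> int:
        return len(tokens)

jit_module = torch.jit.script(ToyTransform())
torch.jit.save(jit_module, 'toy_transform.pt')
reloaded = torch.jit.load('toy_transform.pt')
print(reloaded(['of', 'that', 'new']))  # 3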
Example #6
def build_text_vocab_pipeline(hf_vocab_file):
    tokenizer = basic_english_normalize()
    f = open(hf_vocab_file, 'r')
    vocab = vocab_from_file_object(f)

    # Map tokens to vocab indices and convert the result to a LongTensor
    pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit text vocab pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
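The final ToLongTensor stage above presumably converts the list of vocab indices into a torch.long tensor. A small sketch of that conversion in plain PyTorch:

import torch

indices = [7, 18, 24]
tensor = torch.tensor(indices, dtype=torch.long)
print(tensor)  # tensor([ 7, 18, 24])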
Example #7
def build_pytext_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    f = open(vocab_file, 'r')
    vocab_list = [line.rstrip() for line in f]

    # Wrap the PyText ScriptVocabulary in a transform and convert indices to a tensor
    pipeline = TextSequentialTransforms(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #8
def test_text_sequential_transform(self):
    asset_name = 'vocab_test2.txt'
    asset_path = get_asset_path(asset_name)
    pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path))
    jit_pipeline = torch.jit.script(pipeline)
    self.assertEqual(pipeline('of that new'), [7, 18, 24])
    self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
Example #9
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    f = open(vocab_file, 'r')

    vocab_counter = Counter([token for line in f for token in line.rstrip().split()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = TextSequentialTransforms(tokenizer_func(tokenizer),
                                        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
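The frequency-sorted vocab construction above can be tried in isolation. A small self-contained sketch with a toy corpus in place of the vocab file:

from collections import Counter

corpus = ['of that new', 'of of that']
counter = Counter(token for line in corpus for token in line.split())
sorted_by_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)
vocab_list = ['<unk>'] + [token for token, _ in sorted_by_freq]
print(vocab_list)  # ['<unk>', 'of', 'that', 'new']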
Example #10
def build_experimental_pytext_script_vocab_pipeline(vocab_file):
    import os
    import sys
    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
    # `sys.path` variable in order to import the pytext_vocab (since its not a module)
    sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "vocab"))
    from pytext_vocab import script_vocab

    tokenizer = basic_english_normalize()
    f = open(vocab_file, 'r')
    vocab_counter = Counter([token for line in f for token in line.rstrip().split()])
    ordered_dict = OrderedDict(sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True))

    # Wrap the experimental script_vocab in a PyText transform
    pipeline = TextSequentialTransforms(tokenizer,
                                        PyTextScriptVocabTransform(script_vocab(ordered_dict)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #11
def build_sp_pipeline(spm_file):
    tokenizer = PretrainedSPTokenizer(spm_file)
    vocab = PretrainedSPVocab(spm_file)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab, ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, jit_pipeline
Example #12
def build_sp_pipeline(args):
    spm_file = args.spm_filename
    if spm_file in PRETRAINED_SP_MODEL:
        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Chain the sentencepiece tokenizer with the pretrained sentencepiece vocab
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline, jit_pipeline
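A hedged usage sketch for the builder above, assuming its imports and definitions are in scope. The key 'text_unigram_25000' is only a guess at an entry of PRETRAINED_SP_MODEL and may need to be replaced.

from argparse import Namespace

# 'text_unigram_25000' is a hypothetical key; substitute any key available in your
# torchtext version, or a path to a local sentencepiece .model file.
pipeline, _, jit_pipeline = build_sp_pipeline(Namespace(spm_filename='text_unigram_25000'))
print(jit_pipeline('of that new'))  # sentencepiece token ids for the input string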