Beispiel #1
0
def build_legacy_torchtext_vocab_pipeline(vocab_file):
    """Build the legacy torchtext "tokenize, then vocab lookup" pipeline.

    Args:
        vocab_file: Path of the text file the vocabulary is built from.

    Returns:
        A 3-tuple ``(iterate_batch(pipeline), None, None)``; only the first
        slot is populated by this builder.
    """
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(vocab_file):
        # The context manager guarantees the file handle is released once the
        # generator is exhausted (or closed early); the bare ``open`` used
        # previously leaked the handle.
        with open(vocab_file, 'r') as f:
            for line in f:
                # NOTE(review): iterating a str yields single characters
                # (including the trailing newline), so the vocabulary is
                # built from characters rather than whole lines. Kept as-is
                # to preserve behavior -- confirm this is intended.
                yield from line

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer_func(tokenizer), vocab_func(vocab))
    return iterate_batch(pipeline), None, None
Beispiel #2
0
def build_legacy_pytext_vocab_pipeline(vocab_file):
    """Build a tokenize + PyText ``Vocabulary`` lookup pipeline.

    The vocabulary entries are ordered by descending frequency (ties keep
    first-seen order) with ``"<unk>"`` placed at index 0.

    Args:
        vocab_file: Path of the text file the vocabulary is built from.

    Returns:
        A 3-tuple ``(pipeline, None, None)``; only the first slot is
        populated by this builder.
    """
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")

    # Use a context manager so the file handle is closed as soon as counting
    # is done (the handle was previously never closed).
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating ``line.rstrip()`` yields single characters,
        # so the counter is character-level. Kept as-is to preserve behavior
        # -- confirm this is intended.
        vocab_counter = Counter(token for line in f for token in line.rstrip())

    # ``most_common()`` sorts items by count, descending, with a stable sort;
    # this is the same ordering as the explicit sorted(..., reverse=True).
    vocab_list = ["<unk>"] + [token for token, _ in vocab_counter.most_common()]

    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")),
    )
    return pipeline, None, None
Beispiel #3
0
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    """Build a tokenize + PyText ``ScriptVocabulary`` pipeline and script it.

    The vocabulary entries are ordered by descending frequency (ties keep
    first-seen order) with ``"<unk>"`` placed at index 0.

    Args:
        vocab_file: Path of the text file the vocabulary is built from.

    Returns:
        A 3-tuple ``(pipeline, pipeline.to_ivalue(), jit_pipeline)`` where
        ``jit_pipeline`` is the result of ``torch.jit.script`` applied to
        ``pipeline.to_ivalue()``.
    """
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()

    # Use a context manager so the file handle is closed as soon as counting
    # is done (the handle was previously never closed).
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating ``line.rstrip()`` yields single characters,
        # so the counter is character-level. Kept as-is to preserve behavior
        # -- confirm this is intended.
        vocab_counter = Counter(token for line in f for token in line.rstrip())

    # ``most_common()`` sorts items by count, descending, with a stable sort;
    # this is the same ordering as the explicit sorted(..., reverse=True).
    vocab_list = ["<unk>"] + [token for token, _ in vocab_counter.most_common()]

    pipeline = TextSequentialTransforms(
        tokenizer_func(tokenizer),
        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)),
    )
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    # ``to_ivalue()`` is deliberately called a second time so the caller gets
    # a separate object from the one handed to ``torch.jit.script`` above.
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Beispiel #4
0
def build_legacy_fasttext_vector_pipeline():
    """Build a pipeline that tokenizes text and looks tokens up in FastText.

    Returns:
        A 3-tuple ``(pipeline, None, None)``; only the first slot is
        populated by this builder.
    """
    basic_english = get_tokenizer("basic_english")
    fasttext_vectors = FastText()

    tokenize_step = tokenizer_func(basic_english)
    lookup_step = vector_func(fasttext_vectors)
    return sequential_transforms(tokenize_step, lookup_step), None, None