Example #1
def test_spacy_model_with_regex_example_tokenize(tokenizers):
    text = "In 1096, Crusaders passing by the siege of Amalfi were joined by Bohemond of Taranto and his nephew Tancred with an army of Italo-Normans. Bohemond was the de facto leader of the Crusade during its passage through Asia Minor. After the successful Siege of Antioch in 1097, Bohemond began carving out an independent principality around that city. Tancred was instrumental in the conquest of Jerusalem and he worked for the expansion of the Crusader kingdom in Transjordan and the region of Galilee.[citation needed]"

    # Build a regex-aware word tokenizer on top of a Punkt sentence tokenizer.
    sent_tokenizer = SentTokenizer("punkt")
    word_tokenizer = WordTokenizer("spacy_en",
                                   sent_tokenizer,
                                   split_with_regex=True)

    # Load a minimal spaCy pipeline and give it a tokenizer built from the same
    # extra-split regex, so both sides should produce identical tokens.
    disables = ["vectors", "textcat", "parser"]
    spacy_model = spacy.load("en_core_web_sm", disable=disables)
    spacy_model.tokenizer = load_spacy_model_for_tokenizer(
        word_tokenizer.extra_split_chars_re)

    sentences = sent_tokenizer.tokenize(text)

    # Tokenize each sentence with the raw spaCy model and compare the
    # concatenated tokens against the WordTokenizer output for the full text.
    spacy_model_results = []
    for sentence in sentences:
        spacy_model_results += [token.text for token in spacy_model(sentence)]

    assert word_tokenizer.tokenize(text) == spacy_model_results

    text = "20th Century Fox, Lionsgate, Paramount Pictures, Universal Studios and Walt Disney Studios paid for movie trailers to be aired during the Super Bowl. Fox paid for Deadpool, X-Men: Apocalypse, Independence Day: Resurgence and Eddie the Eagle, Lionsgate paid for Gods of Egypt, Paramount paid for Teenage Mutant Ninja Turtles: Out of the Shadows and 10 Cloverfield Lane, Universal paid for The Secret Life of Pets and the debut trailer for Jason Bourne and Disney paid for Captain America: Civil War, The Jungle Book and Alice Through the Looking Glass.[citation needed]"
    sentences = sent_tokenizer.tokenize(text)

    spacy_model_results = []
    for sentence in sentences:
        spacy_model_results += [token.text for token in spacy_model(sentence)]

    assert word_tokenizer.tokenize(text) == spacy_model_results
Example #2
    def __init__(
        self,
        file_paths,
        lang_code,
        tokenizers,
        max_seq_length=384,
        context_stride=128,
        max_question_length=64,
    ):

        super(SQuADBertReader, self).__init__(file_paths, SQuADBertDataset)
        self.lang_code = lang_code
        self.max_seq_length = max_seq_length
        self.context_stride = context_stride
        self.max_question_length = max_question_length

        self.text_columns = ["bert_input", "context", "question"]

        if "subword" not in tokenizers:
            raise ValueError("WordTokenizer and SubwordTokenizer is required.")

        sent_tokenizer = SentTokenizer("punkt", {})
        # Choose the word tokenizer by language: MeCab for Korean, Penn
        # Treebank-style for everything else, both with regex-based splitting.
        if lang_code == "ko":
            self.word_tokenizer = WordTokenizer("mecab_ko",
                                                sent_tokenizer,
                                                split_with_regex=True)
        else:
            self.word_tokenizer = WordTokenizer("treebank_en",
                                                sent_tokenizer,
                                                split_with_regex=True)
        self.subword_tokenizer = tokenizers["subword"]
Example #3
def tokenizers(request):
    # Unpack the parametrized (name, config) pairs for each tokenizer level.
    sent_name, sent_config, word_name, word_config, \
        subword_name, subword_config, char_name, char_config = request.param

    sent_tokenizer = SentTokenizer(sent_name, config=sent_config)
    word_tokenizer = WordTokenizer(word_name,
                                   sent_tokenizer,
                                   config=word_config)
    subword_tokenizer = SubwordTokenizer(subword_name,
                                         word_tokenizer,
                                         config=subword_config)
    char_tokenizer = CharTokenizer(char_name,
                                   word_tokenizer,
                                   config=char_config)

    return {
        "sent": sent_tokenizer,
        "word": word_tokenizer,
        "subword": subword_tokenizer,
        "char": char_tokenizer,
    }
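
The fixture above unpacks request.param but does not show the @pytest.fixture decorator that supplies it. A minimal sketch of how such a fixture could be parametrized; the tokenizer names "wordpiece" and "character" and the empty configs are hypothetical stand-ins for the project's real parameter tuples:

import pytest

# Hypothetical 8-tuples: (sent_name, sent_config, word_name, word_config,
#                         subword_name, subword_config, char_name, char_config).
TOKENIZER_PARAMS = [
    ("punkt", {}, "treebank_en", {}, "wordpiece", {}, "character", {}),
    ("punkt", {}, "spacy_en", {}, "wordpiece", {}, "character", {}),
]

@pytest.fixture(params=TOKENIZER_PARAMS)
def tokenizers(request):
    # request.param carries one tuple per run; the body shown in Example #3
    # unpacks it and builds the sent/word/subword/char tokenizer stack.
    ...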
Example #4
    def __init__(
        self,
        file_paths,
        lang_code,
        tokenizers,
        max_seq_length=384,
        context_stride=128,
        max_question_length=64,
        cls_token="[CLS]",
        sep_token="[SEP]",
    ):

        super(SQuADBertReader, self).__init__(file_paths, SQuADBertDataset)
        self.lang_code = lang_code
        self.max_seq_length = max_seq_length
        self.context_stride = context_stride
        self.max_question_length = max_question_length
        self.cls_token = cls_token
        self.sep_token = sep_token

        self.text_columns = ["bert_input", "context", "question"]

        sent_tokenizer = SentTokenizer("punkt", {})
        if lang_code == "ko":
            self.word_tokenizer = WordTokenizer("mecab_ko",
                                                sent_tokenizer,
                                                split_with_regex=True)
        else:
            self.word_tokenizer = WordTokenizer("treebank_en",
                                                sent_tokenizer,
                                                split_with_regex=True)

        if tokenizers["bpe"] is not None:
            self.sub_level_tokenizer = tokenizers["bpe"]  # RoBERTa
        elif tokenizers["subword"] is not None:
            self.sub_level_tokenizer = tokenizers["subword"]  # BERT
        else:
            raise ValueError("'bpe' or 'subword' tokenizer is required.")