Code example #1
 def test_sentence_with_single_letter_at_end(self):
     # Sadly, this one cannot be split if we want to capture author abbreviations
     tokens = Tokenizer().split("got an A. Mathematics was")
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens], result)
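
Most of the snippets on this page omit their imports and shared helpers: they call syntok's Tokenizer and segmenter directly, and the tokenizer tests further down compare tokens against plain strings through a small s(...) helper. Below is a minimal sketch of that shared harness; the body of s is inferred from how it is used and is an assumption, not copied from the source projects.

# Presumed common setup for the test snippets on this page; the s() helper is
# reconstructed from its usage below, not taken verbatim from any project.
from unittest import TestCase

from syntok import segmenter
from syntok.tokenizer import Tokenizer


def s(tokens):
    # Reduce syntok Token objects to their plain string values for comparison.
    return [t.value for t in tokens]
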
Code example #2
 def test_sentence_ends_in_abbreviation(self):
     tokens = Tokenizer().split("operating at 2.4 GHz. Its power stage")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #3
 def test_split_with_dot_following_abbreviation(self):
     tokens = Tokenizer().split("in the E.U.. But they are")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #4
 def test_sentences_with_Roman_enumerations(self):
     tokens = Tokenizer().split('I. This goes first. II. And here thereafter.')
     sep = 6
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #5
 def test_brackets_before_the_terminal(self):
     tokens = Tokenizer().split("Brackets before the terminal [2]. You know I told you so.")
     sep = 8
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #6
 def test_two_sentences_with_quotes_in_first(self):
     tokens = Tokenizer().split('"This is a sentence." This is another sentence.')
     sep = 7
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #7
 def test_sentences_with_nasty_abbreviations(self):
     tokens = Tokenizer().split('This is Capt. Motto here. And here is Sra. Smithers.')
     sep = 7
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #8
 def test_clean_text(self):
     self.assertEqual(
         "He3ll#o",
         Tokenizer.join_hyphenated_words_across_linebreaks("He3l- \n  l#o"))
Code example #9
 def test_clean_text_Unicode(self):
     for h in Tokenizer._hyphens:
         self.assertEqual(
             "Hello",
             Tokenizer.join_hyphenated_words_across_linebreaks("Hel" + h +
                                                               " \n  lo"))
Code example #10
class TestTokenizer(TestCase):
    def setUp(self) -> None:
        self.tokenizer = Tokenizer()

        with open(os.path.dirname(__file__) + '/tokenizer_test.txt',
                  'rt') as examples:
            self.examples = examples.readlines()

    def test_lines(self) -> None:
        error = False

        for i in range(0, len(self.examples), 2):
            line = self.examples[i].strip()
            output = self.examples[i + 1].split()
            result = s(self.tokenizer.split(line))

            if result != output:
                print("expected:", output)
                print("received:", result)
                error = True

        self.assertFalse(error)

    def test_clean_text(self):
        self.assertEqual(
            "He3ll#o",
            Tokenizer.join_hyphenated_words_across_linebreaks("He3l- \n  l#o"))

    def test_clean_text_Unicode(self):
        for h in Tokenizer._hyphens:
            self.assertEqual(
                "Hello",
                Tokenizer.join_hyphenated_words_across_linebreaks("Hel" + h +
                                                                  " \n  lo"))

    def test_split_dot(self):
        self.assertListEqual(s(self.tokenizer.split("abc.")), ["abc", "."])

    def test_split_camel_case(self):
        self.assertListEqual(s(self.tokenizer.split("abCd EFG")),
                             ["ab", "Cd", "EFG"])

    def test_hyphens(self):
        for h in Tokenizer._hyphens:
            self.assertListEqual(s(self.tokenizer.split("ab" + h + "cd")),
                                 ["ab", "cd"])

        self.tokenizer = Tokenizer(True)

        for h in Tokenizer._hyphens:
            self.assertListEqual(s(self.tokenizer.split("ab" + h + "cd")),
                                 ["ab", h, "cd"])

    def test_apostrophes(self):
        for a in Tokenizer._apostrophes:
            self.assertListEqual(s(self.tokenizer.split("ab" + a + "cd")),
                                 ["ab", a + "cd"])

    def test_emit_dash(self):
        self.assertListEqual(s(self.tokenizer.split("ab-cd")), ["ab", "cd"])
        self.tokenizer = Tokenizer(True)
        self.assertListEqual(s(self.tokenizer.split("ab-cd")),
                             ["ab", "-", "cd"])

    def test_emit_underscore(self):
        self.assertListEqual(s(self.tokenizer.split("ab_cd")), ["ab", "cd"])
        self.tokenizer = Tokenizer(True)
        self.assertListEqual(s(self.tokenizer.split("ab_cd")),
                             ["ab", "_", "cd"])

    def test_spacing_prefix(self):
        text = " Hi man,  spaces !! "
        output = self.tokenizer.split(text)
        reconstruction = "".join(map(str, output))
        self.assertEqual(text, reconstruction)
Code example #11
    def setUp(self) -> None:
        self.tokenizer = Tokenizer()

        with open(os.path.dirname(__file__) + '/tokenizer_test.txt',
                  'rt') as examples:
            self.examples = examples.readlines()
Code example #12
File: tokenizer_test.py  Project: Gamemaster-007/VATC
 def setUp(self) -> None:
     self.tokenizer = Tokenizer()
Code example #13
File: tokenizer_test.py  Project: fnl/syntok
class TestTokenizer(TestCase):

    def setUp(self) -> None:
        self.tokenizer = Tokenizer()

    def test_lines(self) -> None:
        with open(os.path.dirname(__file__) + '/tokenizer_test.txt', 'rt', encoding='utf-8') as examples:
            self.examples = examples.readlines()

        error = False

        for i in range(0, len(self.examples), 2):
            line = self.examples[i].strip()
            output = self.examples[i + 1].split()
            result = s(self.tokenizer.split(line))

            if result != output:
                print("expected:", output)
                print("received:", result)
                error = True

        self.assertFalse(error)

    def test_clean_text(self):
        self.assertEqual("He3ll#o", Tokenizer.join_hyphenated_words_across_linebreaks("He3l- \n  l#o"))

    def test_clean_text_Unicode(self):
        for h in Tokenizer._hyphens:
            self.assertEqual("Hello", Tokenizer.join_hyphenated_words_across_linebreaks("Hel" + h + " \n  lo"))

    def test_split_dot(self):
        self.assertListEqual(s(self.tokenizer.split("abc.")), ["abc", "."])

    def test_split_camel_case(self):
        self.assertListEqual(s(self.tokenizer.split("abCd EFG")), ["ab", "Cd", "EFG"])

    def test_hyphens(self):
        for h in Tokenizer._hyphens:
            self.assertListEqual(s(self.tokenizer.split("ab" + h + "cd")), ["ab", "cd"])

        self.tokenizer = Tokenizer(True)

        for h in Tokenizer._hyphens:
            self.assertListEqual(s(self.tokenizer.split("ab" + h + "cd")), ["ab", h, "cd"])

    def test_apostrophes(self):
        for a in Tokenizer._apostrophes:
            self.assertListEqual(s(self.tokenizer.split("ab" + a + "cd")), ["ab", a + "cd"])

    def test_emit_dash(self):
        self.assertListEqual(s(self.tokenizer.split("ab-cd")), ["ab", "cd"])
        self.tokenizer = Tokenizer(True)
        self.assertListEqual(s(self.tokenizer.split("ab-cd")), ["ab", "-", "cd"])

    def test_emit_underscore(self):
        self.assertListEqual(s(self.tokenizer.split("ab_cd")), ["ab", "cd"])
        self.tokenizer = Tokenizer(True)
        self.assertListEqual(s(self.tokenizer.split("ab_cd")), ["ab", "_", "cd"])

    def test_spacing_prefix(self):
        text = " Hi man,  spaces of \u200Ball  kinds!! "
        output = self.tokenizer.split(text)
        reconstruction = "".join(map(str, output))
        self.assertEqual(text, reconstruction)

    def test_inner_ellipsis(self):
        text = "Lalala...or Lala Land...."
        result = self.tokenizer.split(text)
        self.assertListEqual(s(result), ["Lalala", "...", "or", "Lala", "Land", "...", "."])

    def test_nonword_prefix(self):
        text = "..A"
        result = self.tokenizer.split(text)
        self.assertListEqual(s(result), [".", ".", "A"])
        self.assertListEqual([t.offset for t in result], [0, 1, 2])

    def test_nonword_high_prefix(self):
        text = "\U0001F64C.A"
        result = self.tokenizer.split(text)
        self.assertListEqual(s(result), ["\U0001F64C", ".", "A"])
        self.assertListEqual([t.offset for t in result], [0, 1, 2])  # requires Py3.3+

    def test_apostrophe_offset_without_replace_not_contraction(self):
        # NOTE: in this case nothing is replaced, so the offsets should remain identical
        # to those in the original text
        text = "don't"
        self.tokenizer = Tokenizer(replace_not_contraction=False)
        result = self.tokenizer.split(text)
        self.assertListEqual([t.offset for t in result], [0, 2])

    def test_apostrophe_offset_with_replace_not_contraction(self):
        # NOTE: in this case, "n't" is replaced with "not", so a space is introduced.
        # e.g. "don't" -> "do not", "can't" -> "can not"
        text = "don't"
        self.tokenizer = Tokenizer(replace_not_contraction=True)
        result = self.tokenizer.split(text)
        self.assertListEqual([t.offset for t in result], [0, 2])
        self.assertListEqual([t.value for t in result], ["do", "not"])
Code example #14
File: model.py  Project: kongriley/rounce
After eight people — including six people of Asian descent and seven women — were shot to death in Georgia this week, a deputy sheriff chalked the killings up to the suspect’s confessed “sex addiction,” adding that “yesterday was a really bad day” for the alleged shooter. That diagnosis was met with the skepticism it deserved: The same deputy promoted the sale of anti-Asian T-shirts that referred to the coronavirus as an import from “Chy-na.”

It’s difficult to disentangle the vile pathologies that lead a man to take so many innocent lives. It’s also impossible to ignore the context in which the murders were committed and the impact that the tragedy has had on communities across America. In an analysis of nearly 4,000 hate-related incidents targeting Asian-Americans documented this year and last, nearly 70 percent of the victims were women, according to a report by the group Stop AAPI Hate. New York was the second state behind California in the total number of incidents documented by the group.

“Among large American cities,” The Times reports, “New York City had the largest increase in reported hate crimes against Asians last year, according to an analysis of police data by a center at the California State University, San Bernardino. There were 28 such incidents in 2020, up from three in 2019, according to New York Police Department data.”

A bill currently before the New York State Legislature, sponsored by State Senator Brad Hoylman and Assemblywoman Karines Reyes, would mandate better collection of data about hate crimes.

After a year of vitriol and violence against Asian-Americans amid the coronavirus pandemic, it’s long past time to admit that the country has a problem. “The Asian-American community has reached a crisis point that cannot be ignored,” Representative Judy Chu, a Democrat from California, told a congressional hearing on Thursday. It was the first such hearing on anti-Asian discrimination in three decades.

A year ago this month, after the pandemic had already established a beachhead in the United States, this board wrote that there was a long history of diseases triggering waves of violence — against Jews during the Black Death right through the animus linked to Ebola, SARS and Zika. “Chinese-Americans and other Asians lumped together with them by racists are being beaten, spat on, yelled at and insulted from coast to coast, driving some members of the maligned minority to purchase firearms in the fear of worse to come as the pandemic deepens,” the board wrote.

The president then was Donald Trump, who spent his term exploiting anti-immigrant hostility for political gain. He seized every opportunity to cast the pandemic in bigoted terms, portraying China in particular as the villain. Mr. Trump said “China virus” again this week during a Fox News interview on the very night of the Georgia shootings.

President Biden and Vice President Kamala Harris are scheduled on Friday to meet with leaders from the Asian-American and Pacific Islander communities in Atlanta, a welcome display of presidential civility.

It’s impossible not to acknowledge the nation’s history of maltreatment of Asian-Americans, nor how it has manifested over the past year — from political stump speeches, to xenophobic merchandise, to the rise in hate crimes. It falls to Americans living in the shadow of this history to demand more from ourselves — more compassion, more dignity, more grace — as we work to heal our society of its many ills.
'''

tok = Tokenizer()

split_text = segmenter.split(tok.tokenize(text))
parsed_text = []
for sentence in split_text:
    parsed_text.append(''.join([str(token) for token in sentence]))

print(parsed_text)

summary = lxr.get_summary(parsed_text, threshold=None)

print(summary)
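
Code example #14 begins mid-way through its document string and omits its setup. Judging from the lxr.get_summary(...) call, lxr is presumably a LexRank summarizer such as the one in the lexrank package; only Tokenizer and segmenter are certainly syntok. A hedged sketch of what the missing preamble could look like follows; the LexRank import, the toy background corpus, and the constructor call are assumptions.

# Hypothetical preamble for the snippet above; everything except the syntok
# imports is an assumption (lexrank package, toy background corpus).
from lexrank import LexRank, STOPWORDS
from syntok import segmenter
from syntok.tokenizer import Tokenizer

# LexRank needs a background corpus for its IDF statistics; a real script
# would load full documents from disk. This tiny corpus is only illustrative.
documents = [
    ["Background sentence one.", "Background sentence two."],
    ["Another document, another sentence."],
]
lxr = LexRank(documents, stopwords=STOPWORDS['en'])

text = '''...'''  # the editorial text quoted above
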
Code example #15
 def test_two_sentences_with_parenthesis_in_second(self):
     tokens = Tokenizer().split("This is a sentence. (This is another sentence.)")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #16
 def test_emit_dash(self):
     self.assertListEqual(s(self.tokenizer.split("ab-cd")), ["ab", "cd"])
     self.tokenizer = Tokenizer(True)
     self.assertListEqual(s(self.tokenizer.split("ab-cd")),
                          ["ab", "-", "cd"])
Code example #17
 def test_sentence_with_single_quotes(self):
     tokens = Tokenizer().split("This is a sentence. 'This is another sentence.'")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #18
 def test_emit_underscore(self):
     self.assertListEqual(s(self.tokenizer.split("ab_cd")), ["ab", "cd"])
     self.tokenizer = Tokenizer(True)
     self.assertListEqual(s(self.tokenizer.split("ab_cd")),
                          ["ab", "_", "cd"])
Code example #19
 def test_two_sentences_with_quotes_and_prenthesis_in_both(self):
     tokens = Tokenizer().split('"{This is a sentence."} ["This is another sentence."]')
     sep = 9
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #20
    def automated_spellcheck(self, df):
        """
        Automated spellcheck that looks for unpredictable OCR errors in the
        data and replaces them with hopefully correct tokens.

        Args:
            df (pandas.Dataframe): the dataframe that will get spellchecked
        """
        wikidata_client = Client()
        tokenizer = Tokenizer(replace_not_contraction=False)
        ignore_regex_str = "^[0-9.!?*„_\-\—,;:<>='|\[\]\"()^«»/°•©>]+"
        ignore_regex = re.compile(ignore_regex_str)

        for index, row in df.iterrows():
            # Get the token
            token = row["TOKEN"]
            wiki_metadata = row["NEL-LIT"]

            # Autocorrect
            suggestions = self.sym_spell.lookup(
                token,
                Verbosity.TOP,
                transfer_casing=True,
                include_unknown=True,
                ignore_token=ignore_regex_str,
                max_edit_distance=Spellchecker.MAX_EDIT_DISTANCE)
            # Save the first suggestion if we have one
            if suggestions and suggestions[0].term != token.lower():
                if wiki_metadata.startswith('Q'):
                    # 1. 'Qxxxx' - Use the Wikidata column value to spellcheck
                    if ignore_regex.match(token):
                        # token should be ignored
                        continue

                    wikidata_entity = wikidata_client.get(wiki_metadata)
                    try:
                        wikidata_label = wikidata_entity.attributes['labels'][
                            'de']['value']
                    except KeyError:
                        # the wikidata has no 'de' entry for the label, ignore spellcorrection
                        continue

                    wikidata_labels = tokenizer.tokenize(wikidata_label)
                    wikidata_labels = map(lambda t: t.value, wikidata_labels)
                    wikidata_labels = filter(
                        lambda t: not ignore_regex.match(t), wikidata_labels)
                    wikidata_labels = list(wikidata_labels)

                    # Check if the token is not an abbreviation
                    is_abbreviation = False
                    for sublabel in wikidata_labels:
                        if sublabel.startswith(token):
                            print(token, "(abbrev) ->", sublabel, " | ",
                                  wiki_metadata)
                            df.at[index, 'TOKEN'] = sublabel
                            is_abbreviation = True
                            break

                    if is_abbreviation:
                        continue

                    try:
                        best_match = sorted(
                            wikidata_labels,
                            key=lambda t: distance(t, token))[0]
                    except IndexError:
                        continue

                    if distance(
                            best_match,
                            token) <= Spellchecker.MAX_LEVENSHTEIN_DISTANCE:
                        print(token, "(best_match) ->", best_match, " | ",
                              wiki_metadata)
                        df.at[index, 'TOKEN'] = best_match
                else:
                    # 2. 'NIL' / '_' - Use symspell
                    suggestion = suggestions[0].term
                    print(token, "(symspell) ->", suggestion, " | ",
                          wiki_metadata)
                    df.at[index, 'TOKEN'] = suggestion
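
The automated_spellcheck excerpt references several names it never defines: self.sym_spell, Client, Verbosity, distance, and the Spellchecker distance constants. Below is a hedged sketch of the setup it appears to assume; the library choices are guesses based on the call signatures, and the dictionary path and constant values are placeholders.

# Presumed companion setup for automated_spellcheck(); libraries are inferred
# from the calls above, and all paths and constant values are placeholders.
from Levenshtein import distance          # edit distance used to rank candidates
from symspellpy import SymSpell, Verbosity
from wikidata.client import Client


class Spellchecker:
    MAX_EDIT_DISTANCE = 2                 # placeholder value
    MAX_LEVENSHTEIN_DISTANCE = 3          # placeholder value

    def __init__(self, frequency_dictionary="de_frequency_dictionary.txt"):
        # SymSpell must load a frequency dictionary before lookup() is usable.
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=Spellchecker.MAX_EDIT_DISTANCE)
        self.sym_spell.load_dictionary(
            frequency_dictionary, term_index=0, count_index=1)
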
Code example #21
 def test_sentences_with_nasty_special_abbreviations(self):
     tokens = Tokenizer().split('This f. e. here. And here is unknwn. help.')
     sep = 7
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #22
import blingfire
import nltk
import pysbd
import spacy
import stanza

from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    segments = pysbd_segmenter.segment(text)
    return [s.strip() for s in segments]

def spacy_tokenize(text):
    return [sent.text for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text for sent in nlp_dep(text).sents]
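
The benchmark excerpt breaks off before the syntok-based splitter that syntok_tokenizer is presumably prepared for. A sketch of what such a function might look like, mirroring how the segmenter tests on this page rebuild sentence strings from tokens; the function name is an assumption.

def syntok_tokenize(text):
    # Assumed counterpart to the helpers above: tokenize with syntok, segment,
    # and rebuild each sentence string from its tokens.
    tokens = syntok_tokenizer.tokenize(text)
    return ["".join(str(token) for token in sentence).strip()
            for sentence in syntok_segmenter.split(iter(tokens))]
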
Code example #23
 def test_one_word_sentences(self):
     tokens = Tokenizer().split('Who did this? I. No! Such a shame.')
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:4], tokens[4:8], tokens[8:]], result)
Code example #24
Effects larger (eta(2) = .53), with cognition (eta(2) = .14) and neurocognition (eta(2) = .16).
All validations show a good approximation of the behavior of the DMFC.
In addition, a simulated application of a circuit system is explained.
Conclusions: Our data suggest CK5/6, CK7, and CK18 in the subclassification of NSCLC.
Copyright (C) 2018 S. Korgur AG, Basel.
Gelatin degradation by MMP-9.
ConclusionThis study provides clear evidence.
A sampling frequency of 780 MHz.
The figure-of-merit of the modulator is there.
Patients with prodromal DLB.
In line with the literature on DLB.
Always last, clear closing example."""

SENTENCES = OSPL.split('\n')
TEXT = ' '.join(SENTENCES)
TOKENIZER = Tokenizer()
SEGMENTED_TOKENS = [TOKENIZER.split(t) for t in SENTENCES]


class TestSegmenter(TestCase):

    def test_segmenter(self):
        def make_sentences(segmented_tokens):
            for sentence in segmented_tokens:
                yield "".join(str(token) for token in sentence).strip()

        self.maxDiff = None
        expected = "\n".join(make_sentences(SEGMENTED_TOKENS))
        received = "\n".join(make_sentences(segmenter.split(TOKENIZER.tokenize(TEXT))))
        assert expected == OSPL
        assert expected == received
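
For comparison, the same round trip can be driven through syntok's higher-level entry point instead of wiring the tokenizer and segmenter together by hand; a short sketch, assuming the segmenter.process API described in syntok's README.

# Sketch of the same reconstruction via segmenter.process, which tokenizes and
# splits paragraphs itself (based on syntok's documented usage).
import syntok.segmenter as segmenter

document = "A sampling frequency of 780 MHz. Patients with prodromal DLB."
for paragraph in segmenter.process(document):
    for sentence in paragraph:
        print("".join(str(token) for token in sentence).strip())
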
Code example #25
 def test_sentence_marker_after_abbreviation(self):
     tokens = Tokenizer().split("Let's meet at 14.10 in N.Y.. This happened in the U.S. last week.")
     sep = 9
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #26
 def test_two_exclamations(self):
     tokens = Tokenizer().split("This is a sentence! This is another sentence!")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #27
 def test_sentence_ends_in_single_letter_and_starts_with_starter_word(self):
     tokens = Tokenizer().split("got an A. And then he")
     sep = 4
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #28
 def test_two_questions(self):
     tokens = Tokenizer().split("Is this a sentence? Is this another sentence?")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #29
 def test_split_with_complext_abbreviation_pattern(self):
     tokens = Tokenizer().split("resp.). Indicate")
     sep = 4
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Code example #30
 def test_sentence_with_single_letter_abbreviation(self):
     tokens = Tokenizer().split(
         "The basis for Lester B. Pearson's policy was later.")
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens], result)