def test_clause_tokenize(self):
    self.assertEqual(clause_tokenize(None), [])
    self.assertEqual(clause_tokenize(""), [])
    self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"]))
    self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list)
    self.assertIsNotNone(crfcls.segment(["ฉัน", "ทดสอบ"]))
    self.assertIsInstance(crfcls.segment(["ฉัน", "ทดสอบ"]), list)
Example #2
def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Clause tokenizer (or clause segmenter).

    Tokenizes a running word list into a list of clauses (lists of strings),
    split by a CRF model trained on the LST20 corpus.

    :param list[str] doc: word list to be tokenized into clauses
    :return: list of clauses
    :rtype: list[list[str]]

    :Example:

    Clause tokenizer::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
        # [['ฉัน', 'นอน'],
        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]

    """
    from pythainlp.tokenize.crfcls import segment

    return segment(doc)
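
As the test at the top of this page suggests, clause_tokenize is a thin wrapper around the CRF clause segmenter. A minimal sketch (added for illustration, not part of the original listing), assuming pythainlp.tokenize.crfcls is importable as a submodule:

    from pythainlp.tokenize import clause_tokenize
    from pythainlp.tokenize import crfcls

    words = ["ฉัน", "นอน", "และ", "คุณ", "เล่น", "มือถือ"]

    # clause_tokenize simply forwards the word list to crfcls.segment,
    # so both calls should produce the same list of clauses.
    assert clause_tokenize(words) == crfcls.segment(words)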
Example #3
def word_tokenize(
    text: str,
    custom_dict: Trie = None,
    engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Word tokenizer.

    Tokenizes running text into words (list of strings).

    :param str text: text to be tokenized
    :param str engine: name of the tokenizer to be used
    :param pythainlp.util.Trie custom_dict: dictionary trie
    :param bool keep_whitespace: True to keep whitespaces, a common mark
                                 for end of phrase in Thai.
                                 Otherwise, whitespaces are omitted.
    :return: list of words
    :rtype: list[str]
    **Options for engine**
        * *newmm* (default) - dictionary-based, Maximum Matching +
          Thai Character Cluster
        * *newmm-safe* - newmm, with a mechanism to help avoid long
          processing time for text with continuous ambiguous breaking points
        * *longest* - dictionary-based, Longest Matching
        * *icu* - wrapper for ICU (International Components for Unicode,
          using PyICU), dictionary-based
        * *attacut* - wrapper for
          `AttaCut <https://github.com/PyThaiNLP/attacut>`_,
          learning-based approach
        * *deepcut* - wrapper for
          `DeepCut <https://github.com/rkcosmos/deepcut>`_,
          learning-based approach
        * *nercut* - Dictionary-based maximal matching word segmentation,
          constrained with Thai Character Cluster (TCC) boundaries,
          and combining tokens that are parts of the same named-entity.
        * *sefr_cut* - wrapper for
          `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_
        * *tltk* - wrapper for
          `TLTK <https://pypi.org/project/tltk/>`_
        * *oskut* - wrapper for
          `OSKut <https://github.com/mrpeerat/OSKut>`_

    :Note:
        - The parameter **custom_dict** can be provided as an argument \
          only for the *newmm*, *newmm-safe*, *longest*, *mm* (*multi_cut*), \
          and *deepcut* engines.
    :Example:

    Tokenize text with different tokenizer::

        from pythainlp.tokenize import word_tokenize

        text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"

        word_tokenize(text, engine="newmm")
        # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

        word_tokenize(text, engine='attacut')
        # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

    Tokenize text while omitting whitespaces::

        text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "

        word_tokenize(text, engine="newmm")
        # output:
        # ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' ']

        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']

    Tokenize with default and custom dictionary::

        from pythainlp.corpus.common import thai_words
        from pythainlp.tokenize import dict_trie

        text = 'ชินโซ อาเบะ เกิด 21 กันยายน'

        word_tokenize(text, engine="newmm")
        # output:
        # ['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ',
        #  'เกิด', ' ', '21', ' ', 'กันยายน']

        custom_dict_japanese_name = set(thai_words())
        custom_dict_japanese_name.add('ชินโซ')
        custom_dict_japanese_name.add('อาเบะ')

        trie = dict_trie(dict_source=custom_dict_japanese_name)

        word_tokenize(text, engine="newmm", custom_dict=trie)
        # output:
        # ['ชินโซ', ' ', 'อาเบะ',
        #   ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
    """
    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine == "newmm" or engine == "onecut":
        from pythainlp.tokenize.newmm import segment

        segments = segment(text, custom_dict)
    elif engine == "newmm-safe":
        from pythainlp.tokenize.newmm import segment

        segments = segment(text, custom_dict, safe_mode=True)
    elif engine == "attacut":
        from pythainlp.tokenize.attacut import segment

        segments = segment(text)
    elif engine == "longest":
        from pythainlp.tokenize.longest import segment

        segments = segment(text, custom_dict)
    elif engine == "mm" or engine == "multi_cut":
        from pythainlp.tokenize.multi_cut import segment

        segments = segment(text, custom_dict)
    elif engine == "deepcut":  # deepcut can optionally use dictionary
        from pythainlp.tokenize.deepcut import segment

        if custom_dict:
            custom_dict = list(custom_dict)
            segments = segment(text, custom_dict)
        else:
            segments = segment(text)
    elif engine == "icu":
        from pythainlp.tokenize.pyicu import segment

        segments = segment(text)
    elif engine == "nercut":
        from pythainlp.tokenize.nercut import segment

        segments = segment(text)
    elif engine == "sefr_cut":
        from pythainlp.tokenize.sefr_cut import segment

        segments = segment(text)
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import segment

        segments = segment(text)
    elif engine == "oskut":
        from pythainlp.tokenize.oskut import segment

        segments = segment(text)
    else:
        raise ValueError(f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation.""")

    if not keep_whitespace:
        segments = [token.strip(" ") for token in segments if token.strip(" ")]

    return segments
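
A short usage sketch for the dispatch above (added here, not from the original listing): an engine name that is not in the if/elif chain raises ValueError, so callers that accept user-supplied engine names may want to guard the call. The name "newmmm" is a deliberate typo used only for illustration.

    from pythainlp.tokenize import word_tokenize

    text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"

    # A known engine returns a list of word tokens.
    print(word_tokenize(text, engine="newmm"))

    # An unknown engine name falls through to the final else branch
    # and raises ValueError.
    try:
        word_tokenize(text, engine="newmmm")  # deliberate typo
    except ValueError as err:
        print("unknown tokenizer:", err)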
Example #4
def syllable_tokenize(
    text: str,
    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Syllable tokenizer.

    **syllable_tokenize is deprecated, use subword_tokenize instead**

    Tokenizes text into syllables (Thai: พยางค์), units of pronunciation
    having one vowel sound each. For example, the word 'รถไฟ'
    contains two syllables: 'รถ' and 'ไฟ'.

    Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
    with *newmm* as the tokenizer. It first tokenizes the text with
    the dictionary of Thai words from
    :func:`pythainlp.corpus.common.thai_words`,
    and then re-tokenizes each word with the dictionary of Thai syllables from
    :func:`pythainlp.corpus.common.thai_syllables`.
    As a result, only syllables are obtained.

    :param str text: input string to be tokenized
    :param str engine: name of the syllable tokenizer
    :return: list of syllables where whitespaces in the text **are included**
    :rtype: list[str]
    **Options for engine**
        * *dict* (default) - newmm word tokenizer with a syllable dictionary
        * *ssg* - CRF syllable segmenter for Thai
    :Example:
    ::

        from pythainlp.tokenize import syllable_tokenize

        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
        syllable_tokenize(text)
        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'จะ', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
    """
    warnings.warn(
        """syllable_tokenize will be deprecated in PyThaiNLP version 2.4,
        use subword_tokenize instead""", PendingDeprecationWarning)

    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine == "dict" or engine == "default":  # use syllable dictionary
        words = word_tokenize(text)
        for word in words:
            segments.extend(
                word_tokenize(text=word,
                              custom_dict=DEFAULT_SYLLABLE_DICT_TRIE))
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment

        segments = segment(text)
    else:
        raise ValueError(f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation.""")

    if not keep_whitespace:
        segments = [token.strip(" ") for token in segments if token.strip(" ")]

    return segments
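
A minimal sketch of the two-pass "dict" strategy the docstring describes (added for illustration, not from the original listing): split the text into words first, then re-tokenize each word against a syllable dictionary. It assumes dict_trie and thai_syllables are importable as shown in the docstrings above; the function itself uses the prebuilt DEFAULT_SYLLABLE_DICT_TRIE instead of rebuilding the trie.

    from pythainlp.corpus.common import thai_syllables
    from pythainlp.tokenize import dict_trie, word_tokenize

    # Build a trie from the Thai syllable dictionary.
    syllable_trie = dict_trie(dict_source=thai_syllables())

    def syllables_via_dict(text: str) -> list:
        """Pass 1: Thai words (newmm); pass 2: syllables inside each word."""
        segments = []
        for word in word_tokenize(text):
            segments.extend(word_tokenize(text=word, custom_dict=syllable_trie))
        return segments

    print(syllables_via_dict("รถไฟสมัยใหม่"))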
Example #5
def subword_tokenize(
    text: str,
    engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Subword tokenizer. Can be smaller than syllable.

    Tokenizes text into inseparable units of
    Thai contiguous characters namely
    `Thai Character Clusters (TCCs) \
    <https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval>`_
    TCCs are units based on Thai spelling features that cannot be
    separated into any smaller characters, such as 'ก็', 'จะ', 'ไม่', and 'ฝา'.
    If these units were split further, they could not be spelled out.
    This function applies the TCC rules to tokenize the text into
    the smallest such units.

    For example, the word 'ขนมชั้น' would be tokenized
    into 'ข', 'น', 'ม', and 'ชั้น'.

    :param str text: text to be tokenized
    :param str engine: the name of the subword tokenizer
    :return: list of subwords
    :rtype: list[str]
    **Options for engine**
        * *tcc* (default) -  Thai Character Cluster (Theeramunkong et al. 2000)
        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
        * *wangchanberta* - SentencePiece from wangchanberta model.
        * *dict* - newmm word tokenizer with a syllable dictionary
        * *ssg* - CRF syllable segmenter for Thai
        * *tltk* - syllable tokenizer from tltk

    :Example:

    Tokenize text into subword based on *tcc*::

        from pythainlp.tokenize import subword_tokenize

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='tcc')
        # output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก',
        #   'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง',
        #   'ศ', '์', 'ห', 'มิ', 'ง']

        subword_tokenize(text_2, engine='tcc')
        # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
        #   'และ', 'พัฒ', 'นา', 'กา', 'ร']

    Tokenize text into subword based on *etcc*::

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='etcc')
        # output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง']

        subword_tokenize(text_2, engine='etcc')
        # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']

    Tokenize text into subword based on *wangchanberta*::

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='wangchanberta')
        # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']

        subword_tokenize(text_2, engine='wangchanberta')
        # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
    """
    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine == "tcc":
        from pythainlp.tokenize.tcc import segment
    elif engine == "etcc":
        from pythainlp.tokenize.etcc import segment
    elif engine == "wangchanberta":
        from pythainlp.wangchanberta import segment
    elif engine == "dict":  # use syllable dictionary
        words = word_tokenize(text)
        for word in words:
            segments.extend(
                word_tokenize(text=word,
                              custom_dict=DEFAULT_SYLLABLE_DICT_TRIE))
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import syllable_tokenize as segment
    else:
        raise ValueError(f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation.""")

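    # Only the "dict" branch filled `segments` directly; the other branches
    # merely imported an engine-specific `segment` callable, so call it here.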
    if segments == []:
        segments = segment(text)

    if not keep_whitespace:
        segments = [token.strip(" ") for token in segments if token.strip(" ")]

    return segments
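
A small usage sketch (added here, not part of the original listing) combining the *tcc* engine with keep_whitespace=False: based on the whitespace-stripping step at the end of the function, the ' ' token shown in the docstring example for text_1 is simply dropped.

    from pythainlp.tokenize import subword_tokenize

    text = "ยุคเริ่มแรกของ ราชวงศ์หมิง"

    with_space = subword_tokenize(text, engine="tcc")
    no_space = subword_tokenize(text, engine="tcc", keep_whitespace=False)

    # The second list contains the same TCC units minus the ' ' token,
    # because whitespace-only tokens are filtered out before returning.
    print(with_space)
    print(no_space)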
Example #6
def sent_tokenize(
    text: str,
    engine: str = DEFAULT_SENT_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Sentence tokenizer.

    Tokenizes running text into "sentences".

    :param str text: the text to be tokenized
    :param str engine: choose among *'crfcut'*, *'whitespace'*, \
    *'whitespace+newline'*
    :return: list of split sentences
    :rtype: list[str]
    **Options for engine**
        * *crfcut* - (default) split by CRF trained on TED dataset
        * *whitespace+newline* - split by whitespaces and newline.
        * *whitespace* - split by whitespaces. Specifically, with \
                         :class:`regex` pattern  ``r" +"``
        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_
    :Example:

    Split the text based on *whitespace*::

        from pythainlp.tokenize import sent_tokenize

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="whitespace")
        # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']

        sent_tokenize(sentence_2, engine="whitespace")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
        #   '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']

    Split the text based on *whitespace* and *newline*::

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="whitespace+newline")
        # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
        sent_tokenize(sentence_2, engine="whitespace+newline")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
        #   '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']

    Split the text using CRF trained on TED dataset::

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="crfcut")
        # output: ['ฉันไปประชุมเมื่อวันที่ 11 มีนาคม']

        sent_tokenize(sentence_2, engine="crfcut")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ ',
        #   'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
    """

    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine == "crfcut":
        from pythainlp.tokenize.crfcut import segment

        segments = segment(text)
    elif engine == "whitespace":
        segments = re.split(r" +", text, flags=re.U)
    elif engine == "whitespace+newline":
        segments = text.split()
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment

        segments = segment(text)
    else:
        raise ValueError(f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation.""")

    if not keep_whitespace:
        segments = [token.strip(" ") for token in segments if token.strip(" ")]

    return segments
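
A final usage sketch (added for illustration, not from the original listing): like the other tokenizers on this page, sent_tokenize short-circuits to an empty list for empty or non-string input before any engine is selected.

    from pythainlp.tokenize import sent_tokenize

    # The guard at the top of the function returns [] for empty or
    # non-string input.
    assert sent_tokenize("") == []
    assert sent_tokenize(None) == []

    # Normal use, as in the docstring example above.
    print(sent_tokenize("ฉันไปประชุมเมื่อวันที่ 11 มีนาคม", engine="whitespace"))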