Esempio n. 1
0
def get_corpus(filename: str) -> frozenset:
    """
    Load a corpus file and return its lines as a frozenset.

    The file is looked up inside the directory returned by
    ``corpus_path()`` and decoded with the ``utf-8-sig`` codec.
    Every line of the file (without its line terminator) becomes one
    member of the result; duplicate lines collapse because the result
    is a set.

    Available file names are listed in
    `db.json
    <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_.

    :param str filename: filename of the corpus to be read

    :return: the distinct lines of the file
    :rtype: :class:`frozenset`

    :Example:
    ::

        from pythainlp.corpus import get_corpus

        get_corpus('negations_th.txt')
        # output:
        # frozenset({'แต่', 'ไม่'})

        get_corpus('ttc_freq.txt')
        # output:
        # frozenset({'โดยนัยนี้\\t1',
        #    'ตัวบท\\t10',
        #    'หยิบยื่น\\t3',
        #     ...})
    """
    corpus_file = os.path.join(corpus_path(), filename)
    with open(corpus_file, "r", encoding="utf-8-sig") as handle:
        content = handle.read()

    # splitlines() drops the line terminators; the set removes duplicates.
    return frozenset(content.splitlines())
Esempio n. 2
0
def path_pythainlp_corpus(filename: str) -> str:
    """
    Build the path of a file inside the pythainlp corpus directory.

    The path is only composed, by joining ``corpus_path()`` and
    ``filename``; this function does not check that the file exists.

    :param str filename: filename of the corpus to be read

    :return: path of the corpus file
    :rtype: str
    """
    corpus_dir = corpus_path()
    return os.path.join(corpus_dir, filename)
Esempio n. 3
0
def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
    """
    Load a corpus file and return its lines as a frozenset or a list.

    Every line of the file becomes one member of the result.

    With the default ``as_is=False`` a frozenset is returned: each line
    has surrounding whitespace stripped, and empty values and duplicates
    are removed.

    With ``as_is=True`` a list is returned: the lines are left unmodified
    and keep the order they have in the file.

    Available file names are listed in
    `db.json
    <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_.

    :param str filename: filename of the corpus to be read
    :param bool as_is: if True, return the raw lines as a list instead
        of a cleaned frozenset (default: False)

    :return: :class:`frozenset` or :class:`list` of the lines in the file
    :rtype: :class:`frozenset` or :class:`list`

    :Example:
    ::

        from pythainlp.corpus import get_corpus

        get_corpus('negations_th.txt')
        # output:
        # frozenset({'แต่', 'ไม่'})

        get_corpus('ttc_freq.txt')
        # output:
        # frozenset({'โดยนัยนี้\\t1',
        #    'ตัวบท\\t10',
        #    'หยิบยื่น\\t3',
        #     ...})
    """
    corpus_file = os.path.join(corpus_path(), filename)
    with open(corpus_file, "r", encoding="utf-8-sig") as handle:
        raw_lines = handle.read().splitlines()

    if not as_is:
        # Strip each line, then keep only the non-empty results.
        stripped = (entry.strip() for entry in raw_lines)
        return frozenset(entry for entry in stripped if entry)

    return raw_lines
Esempio n. 4
0
# -*- coding: utf-8 -*-
"""
Perceptron part-of-speech tagger
"""
import os
import pickle
from typing import List, Tuple

from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import PerceptronTagger, lst20, orchid

# ORCHID perceptron model: file name and its full path under corpus_path().
_ORCHID_FILENAME = "pos_orchid_perceptron.pkl"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

# PUD perceptron model: file name and its full path under corpus_path().
_PUD_FILENAME = "pos_ud_perceptron.pkl"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

# Name of the LST20 perceptron tagger. No path is built from it here;
# presumably it is resolved elsewhere (e.g. via get_corpus_path) - confirm.
_LST20_TAGGER_NAME = "pos_lst20_perceptron"

# Module-level tagger slots, all starting as None.
_ORCHID_TAGGER = None
_PUD_TAGGER = None
_LST20_TAGGER = None


def _orchid_tagger():
    """
    Return the module-level ORCHID perceptron tagger, creating it on demand.

    When ``_ORCHID_TAGGER`` is falsy, a ``PerceptronTagger`` is built from
    ``_ORCHID_PATH`` and stored in the global before being returned.
    Otherwise the stored tagger is returned as is.
    """
    global _ORCHID_TAGGER
    if _ORCHID_TAGGER:
        return _ORCHID_TAGGER

    _ORCHID_TAGGER = PerceptronTagger(path=_ORCHID_PATH)
    return _ORCHID_TAGGER

Esempio n. 5
0
def _load_tagger(filename):
    """
    Unpickle a tagger model stored in the corpus directory.

    :param filename: name of the pickle file, relative to ``corpus_path()``
    :return: the object deserialized from the file
    """
    model_path = os.path.join(corpus_path(), filename)
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # presumably the corpus directory only holds trusted data - confirm.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
Esempio n. 6
0
def _load_tagger(filename):
    """
    Load a dill-serialized tagger model stored in the corpus directory.

    :param filename: name of the dill file, relative to ``corpus_path()``
    :return: the object deserialized from the file
    """
    model_path = os.path.join(corpus_path(), filename)
    # NOTE(review): dill, like pickle, can execute arbitrary code on load;
    # presumably the corpus directory only holds trusted data - confirm.
    with open(model_path, "rb") as model_file:
        return dill.load(model_file)
Esempio n. 7
0
                feature_position = f"{n_gram}_{j-i}_{j-i+n_gram}"
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f"word_{feature_position}={word_}"]
                ender_ = f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f"ender_{feature_position}={ender_}"]
                starter_ = f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f"starter_{feature_position}={starter_}"]
        # append to feature per word
        doc_features.append(word_features)

    return doc_features


# File name of the CRF model, looked up under corpus_path().
_CRFCUT_DATA_FILENAME = "sentenceseg-ted.model"
# Module-level pycrfsuite tagger. These statements run at import time, so
# importing this module opens the model file.
_tagger = pycrfsuite.Tagger()
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))


def segment(text: str) -> List[str]:
    """
    CRF-based sentence segmentation.

    :param str text: text to be tokenized to sentences
    :return: list of words, tokenized from the text
    """
    toks = word_tokenize(text)
    feat = extract_features(toks)
    labs = _tagger.tag(feat)

    sentences = []
    sentence = ""
Esempio n. 8
0
# -*- coding: utf-8 -*-
"""
Unigram Part-Of-Speech tagger
"""
import json
import os
from typing import List, Tuple

from pythainlp.corpus import corpus_path
from pythainlp.tag.orchid import tag_signs, tag_to_text

# ORCHID POS data (JSON): file name and its full path under corpus_path().
_THAI_POS_ORCHID_FILENAME = "orchid_pos_th.json"
_THAI_POS_ORCHID_PATH = os.path.join(corpus_path(), _THAI_POS_ORCHID_FILENAME)
# PUD unigram tagger data (JSON): file name and its full path under corpus_path().
_THAI_POS_PUD_FILENAME = "ud_thai_pud_unigram_tagger.json"
_THAI_POS_PUD_PATH = os.path.join(corpus_path(), _THAI_POS_PUD_FILENAME)


def _find_tag(words: List[str], dictdata: dict) -> List[Tuple[str, str]]:
    _temp = []
    _word = list(dictdata.keys())
    for word in words:
        if word in _word:
            _temp.append((word, dictdata[word]))
        else:
            _temp.append((word, None))
    return _temp


def _orchid_tagger():
    with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f:
        model = json.load(f)
Esempio n. 9
0
# -*- coding: utf-8 -*-
"""
Perceptron part-of-speech tagger
"""
import os
import pickle
from typing import List, Tuple

from pythainlp.corpus import corpus_path, get_corpus_path
from pythainlp.tag import PerceptronTagger, lst20, orchid

# ORCHID perceptron model (JSON): file name and its full path under corpus_path().
_ORCHID_FILENAME = "pos_orchid_perceptron.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

# PUD perceptron model (JSON): file name and its full path under corpus_path().
_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

# LST20 perceptron model (JSON): file name and its full path under corpus_path().
# NOTE(review): the path constant is spelled "_LST20_TAGGERD_PATH" (extra "D");
# left unchanged because other code may reference this name.
_LST20_TAGGER_NAME = "pos_lst20_perceptron-v0.2.3.json"
_LST20_TAGGERD_PATH = os.path.join(corpus_path(), _LST20_TAGGER_NAME)

# Module-level tagger slots, all starting as None.
_ORCHID_TAGGER = None
_PUD_TAGGER = None
_LST20_TAGGER = None


def _orchid_tagger():
    """
    Return the module-level ORCHID perceptron tagger, creating it on demand.

    When ``_ORCHID_TAGGER`` is falsy, a ``PerceptronTagger`` is built from
    ``_ORCHID_PATH`` and stored in the global before being returned.
    Otherwise the stored tagger is returned as is.
    """
    global _ORCHID_TAGGER
    if _ORCHID_TAGGER:
        return _ORCHID_TAGGER

    _ORCHID_TAGGER = PerceptronTagger(path=_ORCHID_PATH)
    return _ORCHID_TAGGER
Esempio n. 10
0
# -*- coding: utf-8 -*-
"""
Unigram Part-Of-Speech Tagger
"""
import json
import os
from typing import List, Tuple

import dill
import nltk.tag
from pythainlp.corpus import corpus_path

# ORCHID POS data (JSON): file name and its full path under corpus_path().
_THAI_POS_ORCHID_FILENAME = "orchid_pos_th.json"
_THAI_POS_ORCHID_PATH = os.path.join(corpus_path(), _THAI_POS_ORCHID_FILENAME)
# PUD unigram tagger (dill file): file name and its full path under corpus_path().
_THAI_POS_PUD_FILENAME = "ud_thai_pud_unigram_tagger.dill"
_THAI_POS_PUD_PATH = os.path.join(corpus_path(), _THAI_POS_PUD_FILENAME)


def _orchid_tagger():
    """
    Load the ORCHID POS data from its JSON file.

    The file at ``_THAI_POS_ORCHID_PATH`` is read on every call, decoded
    with the ``utf-8-sig`` codec, and parsed as JSON.

    :return: the parsed JSON content
    """
    with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as orchid_file:
        return json.load(orchid_file)


def _pud_tagger():
    """
    Load the PUD unigram tagger from its dill file.

    The file at ``_THAI_POS_PUD_PATH`` is opened in binary mode and
    deserialized with ``dill.load`` on every call.

    :return: the object deserialized from the file
    """
    # NOTE(review): dill, like pickle, can execute arbitrary code on load;
    # presumably the corpus directory only holds trusted data - confirm.
    with open(_THAI_POS_PUD_PATH, "rb") as tagger_file:
        return dill.load(tagger_file)


def tag(words: List[str], corpus: str) -> List[Tuple[str, str]]: