def get_corpus(filename: str) -> frozenset:
    """
    Read corpus data from file and return a frozenset.

    Each line of the file becomes one member of the returned set.

    (Please see the filename from
    `this file
    <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_

    :param str filename: filename of the corpus to be read

    :return: :mod:`frozenset` consist of lines in the file
    :rtype: :mod:`frozenset`

    :Example:
    ::

        from pythainlp.corpus import get_corpus

        get_corpus('negations_th.txt')
        # output:
        # frozenset({'แต่', 'ไม่'})

        get_corpus('ttc_freq.txt')
        # output:
        # frozenset({'โดยนัยนี้\\t1',
        #    'ตัวบท\\t10',
        #    'หยิบยื่น\\t3',
        #    ...})
    """
    corpus_file = os.path.join(corpus_path(), filename)
    # "utf-8-sig" transparently strips a UTF-8 BOM if the file carries one.
    with open(corpus_file, "r", encoding="utf-8-sig") as fh:
        members = fh.read().splitlines()
    return frozenset(members)
def path_pythainlp_corpus(filename: str) -> str:
    """
    Get path pythainlp.corpus data

    :param str filename: filename of the corpus to be read

    :return: : path of corpus
    :rtype: str
    """
    # Resolve relative to the installed pythainlp corpus directory.
    base_dir = corpus_path()
    return os.path.join(base_dir, filename)
def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
    """
    Read corpus data from file and return a frozenset or a list.

    Each line in the file will be a member of the set or the list.

    By default, a frozenset will be return, with whitespaces stripped,
    and empty values and duplicates removed.

    If as_is is True, a list will be return, with no modifications
    in member values and their orders.

    (Please see the filename from
    `this file
    <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_)

    :param str filename: filename of the corpus to be read
    :param bool as_is: if True, return a list of lines exactly as read
        (no stripping, no removal of empty values or duplicates);
        default is False

    :return: :class:`frozenset` or :class:`list` consists of lines in the file
    :rtype: :class:`frozenset` or :class:`list`

    :Example:
    ::

        from pythainlp.corpus import get_corpus

        get_corpus('negations_th.txt')
        # output:
        # frozenset({'แต่', 'ไม่'})

        get_corpus('ttc_freq.txt')
        # output:
        # frozenset({'โดยนัยนี้\\t1',
        #    'ตัวบท\\t10',
        #    'หยิบยื่น\\t3',
        #    ...})
    """
    path = os.path.join(corpus_path(), filename)
    # "utf-8-sig" strips a leading UTF-8 BOM if present.
    with open(path, "r", encoding="utf-8-sig") as fh:
        lines = fh.read().splitlines()

    if as_is:
        return lines

    # Normalize: strip surrounding whitespace, then drop empty lines and
    # duplicates by building a frozenset.
    lines = [line.strip() for line in lines]
    return frozenset(filter(None, lines))
# -*- coding: utf-8 -*- """ Perceptron part-of-speech tagger """ import os import pickle from typing import List, Tuple from pythainlp.corpus import corpus_path, get_corpus_path from pythainlp.tag import PerceptronTagger, lst20, orchid _ORCHID_FILENAME = "pos_orchid_perceptron.pkl" _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME) _PUD_FILENAME = "pos_ud_perceptron.pkl" _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME) _LST20_TAGGER_NAME = "pos_lst20_perceptron" _ORCHID_TAGGER = None _PUD_TAGGER = None _LST20_TAGGER = None def _orchid_tagger(): global _ORCHID_TAGGER if not _ORCHID_TAGGER: _ORCHID_TAGGER = PerceptronTagger(path=_ORCHID_PATH) return _ORCHID_TAGGER
def _load_tagger(filename):
    """Load a pickled tagger model from the pythainlp corpus directory."""
    model_path = os.path.join(corpus_path(), filename)
    # NOTE(review): pickle.load runs arbitrary code from the file; acceptable
    # only because the model ships with the trusted pythainlp corpus.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
def _load_tagger(filename):
    """Load a dill-serialized tagger model from the pythainlp corpus directory."""
    model_path = os.path.join(corpus_path(), filename)
    # NOTE(review): dill.load executes arbitrary code from the file; safe only
    # because the model is distributed with the trusted pythainlp corpus.
    with open(model_path, "rb") as model_file:
        return dill.load(model_file)
feature_position = f"{n_gram}_{j-i}_{j-i+n_gram}" word_ = f'{"|".join(doc[j:(j+n_gram)])}' word_features += [f"word_{feature_position}={word_}"] ender_ = f'{"|".join(doc_ender[j:(j+n_gram)])}' word_features += [f"ender_{feature_position}={ender_}"] starter_ = f'{"|".join(doc_starter[j:(j+n_gram)])}' word_features += [f"starter_{feature_position}={starter_}"] # append to feature per word doc_features.append(word_features) return doc_features _CRFCUT_DATA_FILENAME = "sentenceseg-ted.model" _tagger = pycrfsuite.Tagger() _tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME)) def segment(text: str) -> List[str]: """ CRF-based sentence segmentation. :param str text: text to be tokenized to sentences :return: list of words, tokenized from the text """ toks = word_tokenize(text) feat = extract_features(toks) labs = _tagger.tag(feat) sentences = [] sentence = ""
# -*- coding: utf-8 -*-
"""
Unigram Part-Of-Speech tagger
"""
import json
import os
from typing import List, Tuple

from pythainlp.corpus import corpus_path
from pythainlp.tag.orchid import tag_signs, tag_to_text

# JSON lookup tables (word -> most frequent POS tag), one per training corpus,
# resolved inside the local pythainlp corpus directory.
_THAI_POS_ORCHID_FILENAME = "orchid_pos_th.json"
_THAI_POS_ORCHID_PATH = os.path.join(corpus_path(), _THAI_POS_ORCHID_FILENAME)
_THAI_POS_PUD_FILENAME = "ud_thai_pud_unigram_tagger.json"
_THAI_POS_PUD_PATH = os.path.join(corpus_path(), _THAI_POS_PUD_FILENAME)


def _find_tag(words: List[str], dictdata: dict) -> List[Tuple[str, str]]:
    """Tag each word via a word->tag lookup dict.

    Out-of-vocabulary words get the tag ``None``.

    :param words: tokenized words to tag
    :param dictdata: mapping from word to its POS tag
    :return: list of ``(word, tag)`` pairs, in input order
    """
    # dict.get gives O(1) lookup and None for missing keys — the original
    # built a list of keys and scanned it (O(n)) for every word.
    return [(word, dictdata.get(word)) for word in words]


def _orchid_tagger():
    # Load the ORCHID unigram model (JSON word->tag mapping).
    # NOTE(review): this chunk appears truncated here — the function body
    # continues past this view (presumably `return model`).
    with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f:
        model = json.load(f)
# -*- coding: utf-8 -*- """ Perceptron part-of-speech tagger """ import os import pickle from typing import List, Tuple from pythainlp.corpus import corpus_path, get_corpus_path from pythainlp.tag import PerceptronTagger, lst20, orchid _ORCHID_FILENAME = "pos_orchid_perceptron.json" _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME) _PUD_FILENAME = "pos_ud_perceptron-v0.2.json" _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME) _LST20_TAGGER_NAME = "pos_lst20_perceptron-v0.2.3.json" _LST20_TAGGERD_PATH = os.path.join(corpus_path(), _LST20_TAGGER_NAME) _ORCHID_TAGGER = None _PUD_TAGGER = None _LST20_TAGGER = None def _orchid_tagger(): global _ORCHID_TAGGER if not _ORCHID_TAGGER: _ORCHID_TAGGER = PerceptronTagger(path=_ORCHID_PATH) return _ORCHID_TAGGER
# -*- coding: utf-8 -*- """ Unigram Part-Of-Speech Tagger """ import json import os from typing import List, Tuple import dill import nltk.tag from pythainlp.corpus import corpus_path _THAI_POS_ORCHID_FILENAME = "orchid_pos_th.json" _THAI_POS_ORCHID_PATH = os.path.join(corpus_path(), _THAI_POS_ORCHID_FILENAME) _THAI_POS_PUD_FILENAME = "ud_thai_pud_unigram_tagger.dill" _THAI_POS_PUD_PATH = os.path.join(corpus_path(), _THAI_POS_PUD_FILENAME) def _orchid_tagger(): with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f: model = json.load(f) return model def _pud_tagger(): with open(_THAI_POS_PUD_PATH, "rb") as handle: model = dill.load(handle) return model def tag(words: List[str], corpus: str) -> List[Tuple[str, str]]: