Ejemplo n.º 1
0
def _download_install(name: str) -> None:
    """
    Make sure the archive for ``name`` is downloaded and unpacked.

    If ``get_corpus_path(name)`` reports nothing, the corpus is downloaded
    (forced, version "1.0") and the gzip archive is extracted. Then, if the
    data directory for ``name`` does not exist yet, it is created and the
    archive is extracted into it.

    :param str name: corpus name
    """
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(get_corpus_path(name), "r:gz") as tar:
            # NOTE(review): no target path is given, so this extracts into
            # the current working directory - confirm that is intended.
            # NOTE(review): extractall() trusts member paths of a downloaded
            # archive; consider restricting them (e.g. extraction filters).
            tar.extractall()

    data_path = get_full_data_path(name)
    if not os.path.exists(data_path):
        os.mkdir(data_path)
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=data_path)
Ejemplo n.º 2
0
def _download(url: str, dst: str) -> int:
    """
    Download ``url`` into the data directory, resuming a partial file.

    @param: url to download file
    @param: dst place to put the file (resolved with get_full_data_path)
    @return: size from the server's Content-Length header, -1 if not sent
    """
    # Close the probe response instead of leaking the connection.
    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    # Look for a partial download where the file is actually written.
    path = get_full_data_path(dst)
    first_byte = os.path.getsize(path) if os.path.exists(path) else 0

    # Already complete. Only trust the comparison when the size is known;
    # an unknown size (-1) must not be mistaken for "nothing to do".
    if 0 <= file_size <= first_byte:
        return file_size

    # Resume only when there is a partial file and the total size is known.
    resume = first_byte > 0 and file_size > 0
    headers = {"Range": "bytes=%s-" % first_byte} if resume else {}
    req = requests.get(url, headers=headers, stream=True)
    if resume and req.status_code != 206:
        # Server ignored the Range header and sends the whole body.
        resume = False
    if not resume:
        first_byte = 0

    pbar = tqdm(
        total=file_size if file_size >= 0 else None,
        initial=first_byte,
        unit="B",
        unit_scale=True,
        desc=url.split("/")[-1],
    )
    try:
        # Append when resuming so the bytes already on disk are kept.
        with open(path, "ab" if resume else "wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    # The last chunk is usually shorter than chunk_size.
                    pbar.update(len(chunk))
    finally:
        pbar.close()
        req.close()

    return file_size
Ejemplo n.º 3
0
 def test_path(self):
     """Check the data-path helpers: full path keeps the file name, dirs are str."""
     filename = "ttc_freq.txt"
     full_path = get_full_data_path(filename)
     # the resolved path must still end with the requested file name
     self.assertTrue(full_path.endswith(filename))
     self.assertIsInstance(get_pythainlp_data_path(), str)
     self.assertIsInstance(get_pythainlp_path(), str)
Ejemplo n.º 4
0
def _download(url: str, dst: str) -> int:
    """
    Download helper.

    Streams ``url`` into the data directory in 64 KiB chunks. A tqdm
    progress bar is shown when tqdm is installed; otherwise "Done." is
    printed once the download finishes.

    @param: url to download file
    @param: dst place to put the file (resolved with get_full_data_path)
    @return: size from the server's Content-Length header, -1 if not sent
    """
    _CHUNK_SIZE = 64 * 1024  # 64 KiB

    # Close the probe response instead of leaking the connection.
    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    r = requests.get(url, stream=True)
    # Content-Length may be absent (e.g. chunked transfer); tqdm accepts
    # total=None for an unknown total.
    content_length = r.headers.get("Content-Length")
    total = int(content_length) if content_length is not None else None

    try:
        from tqdm import tqdm

        pbar = tqdm(total=total)
    except ImportError:
        pbar = None

    try:
        with open(get_full_data_path(dst), "wb") as f:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
                    # Compare with None: a tqdm bar's truth value depends on
                    # its total, so "if pbar" is unreliable.
                    if pbar is not None:
                        pbar.update(len(chunk))
    finally:
        if pbar is not None:
            pbar.close()

    if pbar is None:
        print("Done.")
    return file_size
Ejemplo n.º 5
0
def download_(url: str, dst: str) -> int:
    """
    Download ``url`` into the data directory, resuming a partial file.

    @param: url to download file
    @param: dst place to put the file (resolved with get_full_data_path)
    @return: size from the server's Content-Length header, -1 if not sent
    """
    # Close the probe response instead of leaking the connection.
    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    # Look for a partial download where the file is actually written.
    path = get_full_data_path(dst)
    first_byte = os.path.getsize(path) if os.path.exists(path) else 0

    # Already complete. Only trust the comparison when the size is known;
    # an unknown size (-1) must not be mistaken for "nothing to do".
    if 0 <= file_size <= first_byte:
        return file_size

    # Resume only when there is a partial file and the total size is known.
    resume = first_byte > 0 and file_size > 0
    headers = {"Range": "bytes=%s-" % first_byte} if resume else {}
    req = requests.get(url, headers=headers, stream=True)
    if resume and req.status_code != 206:
        # Server ignored the Range header and sends the whole body.
        resume = False
    if not resume:
        first_byte = 0

    pbar = tqdm(
        total=file_size if file_size >= 0 else None,
        initial=first_byte,
        unit="B",
        unit_scale=True,
        desc=url.split("/")[-1],
    )
    try:
        # Append when resuming so the bytes already on disk are kept.
        with open(path, "ab" if resume else "wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    # The last chunk is usually shorter than chunk_size.
                    pbar.update(len(chunk))
    finally:
        pbar.close()
        req.close()

    # Return the size on every path, matching the early return above.
    return file_size
Ejemplo n.º 6
0
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path.

    Looks the corpus up in the local catalog, downloading it when the
    catalog entry or the file itself is missing.

    :param str name: corpus name
    :return: path to the corpus, or **None** if the corpus still does not
             exist on the device after the download attempt
    :rtype: str

    :Example:

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    detail = get_corpus_db_detail(name)

    # An entry that carries "file_name" or "file" but no "filename" triggers
    # a full catalog update (presumably entries from an older catalog
    # layout - confirm against _update_all).
    has_other_key = (
        detail.get("file_name") is not None
        or detail.get("file") is not None
    )
    if has_other_key and detail.get("filename") is None:
        _update_all()

    # not in the local catalog (or no file recorded): download, then re-read
    if not detail or not detail.get("filename"):
        download(name)
        detail = get_corpus_db_detail(name)

    if not detail or not detail.get("filename"):
        return None

    # catalog knows the corpus; make sure the file is really on disk
    path = get_full_data_path(detail.get("filename"))
    if not os.path.exists(path):
        download(name)

    return path if os.path.exists(path) else None
Ejemplo n.º 7
0
def _check_hash(dst: str, md5: str) -> NoReturn:
    """
    @param: dst place to put the file
    @param: md5 place to hash the file (MD5)
    """
    if md5 and md5 != "-":
        with open(get_full_data_path(dst), "rb") as f:
            content = f.read()
            file_md5 = hashlib.md5(content).hexdigest()

            if md5 != file_md5:
                raise Exception("Hash does not match expected.")
Ejemplo n.º 8
0
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path

    :param string name: corpus name
    :return: path recorded for the corpus in the local catalog, or **None**
             if the catalog has no entry for ``name``
    :rtype: str

    :Example:

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    db = TinyDB(corpus_db_path())
    try:
        # Query once and reuse the result.
        records = db.search(Query().name == name)
    finally:
        # Close the catalog on every path, including errors, and before
        # download() runs (which presumably touches the same catalog file).
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    # The catalog knows the corpus but the file is missing: fetch it.
    if not os.path.exists(path):
        download(name)

    return path
Ejemplo n.º 9
0
def _download(url: str, dst: str) -> int:
    """
    Stream ``url`` into the data directory in 64 KiB chunks.

    @param: url to download file
    @param: dst place to put the file (resolved with get_full_data_path)
    @return: size from the server's Content-Length header, -1 if not sent
    """
    _CHUNK_SIZE = 1024 * 64

    # Close the probe response instead of leaking the connection.
    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    r = requests.get(url, stream=True)
    # Content-Length may be absent (e.g. chunked transfer); tqdm accepts
    # total=None for an unknown total.
    content_length = r.headers.get("Content-Length")
    pbar = tqdm(
        total=int(content_length) if content_length is not None else None
    )
    try:
        with open(get_full_data_path(dst), "wb") as f:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    finally:
        # Release the bar even when the transfer or the write fails.
        pbar.close()
    return file_size
Ejemplo n.º 10
0
def word_freqs():
    """
    Get word frequency from Thai Textbook Corpus (TTC)

    Reads ``ttc_freq.txt`` from the data directory, downloading the "ttc"
    corpus first when the file is not there.

    :return: list of ``(word, frequency)`` tuples, in file order
    """
    path = get_full_data_path("ttc_freq.txt")  # try local copy first
    if not os.path.exists(path):  # if fail, download from internet
        download_data("ttc")

    # The with-block closes the file; no explicit close() is needed.
    with open(path, "r", encoding="utf8") as f:
        lines = f.read().splitlines()

    listword = []
    for line in lines:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing.
        if not line.strip():
            continue
        fields = line.split("\t")
        listword.append((fields[0], int(fields[1])))

    return listword
Ejemplo n.º 11
0
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path

    :param string name: corpus name
    :return: path recorded for the corpus in the local catalog, or **None**
             if the catalog has no entry for ``name``
    """
    db = TinyDB(corpus_db_path())
    try:
        # Query once and reuse the result.
        records = db.search(Query().name == name)
    finally:
        # Close the catalog on every path, including "not found" and errors.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    # The catalog knows the corpus but the file is missing: fetch it.
    if not os.path.exists(path):
        download(name)

    return path
Ejemplo n.º 12
0
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path

    :param string name: corpus name
    :return: path recorded for the corpus in the local catalog, or **None**
             if the catalog has no entry for ``name``
    """
    db = TinyDB(corpus_db_path())
    try:
        # Query once and reuse the result.
        records = db.search(Query().name == name)
    finally:
        # Close the catalog on every path, including "not found" and errors.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    # The catalog knows the corpus but the file is missing: fetch it.
    if not os.path.exists(path):
        download(name)

    return path
Ejemplo n.º 13
0
from pythainlp.tools import get_full_data_path, get_pythainlp_path
from tinydb import TinyDB

# Locations of the remote corpus catalog and of its local copy

_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# URL of the remote corpus catalog
_CORPUS_DB_URL = "https://pythainlp.github.io/pythainlp-corpus/db.json"

# file name of the local corpus catalog
_CORPUS_DB_FILENAME = "db.json"

# full path of the local corpus catalog
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# first use: open a TinyDB at the catalog path and close it straight away
# (presumably this leaves an empty catalog file behind)
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()


def corpus_path() -> str:
    """
    Get path where corpus files are kept locally.

    :return: the module-level ``_CORPUS_PATH`` value
    :rtype: str
    """
    return _CORPUS_PATH


def corpus_db_url() -> str:
    """
Ejemplo n.º 14
0
def _get_translate_path(model: str, *path: str) -> str:
    """Join the ``path`` components onto the data path of ``model``."""
    model_root = get_full_data_path(model)
    return os.path.join(model_root, *path)
Ejemplo n.º 15
0
def get_corpus_path(
    name: str, version: Union[str, None] = None
) -> Union[str, None]:
    """
    Get corpus path.

    :param str name: corpus name
    :param str version: corpus version, passed through to ``download()``
                        whenever the corpus has to be fetched
    :return: path to the corpus, or **None** if the corpus does not exist
             on the device after the download attempt
    :rtype: str

    :Example:

    (Please see the filename in
    `this file
    <https://pythainlp.github.io/pythainlp-corpus/db.json>`_)

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    # Optional hard-coded overrides: map a corpus name to a path here to
    # return that path directly and skip the catalog lookup below.
    _CUSTOMIZE = {
        # "the corpus name":"path"
    }
    if name in _CUSTOMIZE:
        return _CUSTOMIZE[name]

    # check if the corpus is in local catalog, download if not
    corpus_db_detail = get_corpus_db_detail(name)

    if not corpus_db_detail or not corpus_db_detail.get("filename"):
        download(name, version=version)
        corpus_db_detail = get_corpus_db_detail(name)

    if corpus_db_detail and corpus_db_detail.get("filename"):
        # corpus is in the local catalog, get full path to the file
        path = get_full_data_path(corpus_db_detail.get("filename"))
        # check if the corpus file actually exists, download if not;
        # request the same version as the first download attempt above
        if not os.path.exists(path):
            download(name, version=version)
        if os.path.exists(path):
            return path

    return None
Ejemplo n.º 16
0
import requests
from pythainlp.tools import get_full_data_path, get_pythainlp_path
from tinydb import Query, TinyDB
from tqdm import tqdm

# Remote and local corpus databases

_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# remote corpus catalog URL
_CORPUS_DB_URL = (
    "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/2.0/db.json"
)

# local corpus catalog file name and full path
_CORPUS_DB_FILENAME = "db.json"
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Open a TinyDB at the catalog path when no file exists there yet, and close
# it right away so the handle is not left open for the life of the process.
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()


def corpus_path() -> str:
    """Return the module-level ``_CORPUS_PATH`` (local corpus directory path)."""
    return _CORPUS_PATH


def corpus_db_url() -> str:
    """Return the module-level ``_CORPUS_DB_URL`` (remote ``db.json`` URL)."""
    return _CORPUS_DB_URL


def corpus_db_path() -> str:
    """Return the module-level ``_CORPUS_DB_PATH`` (local ``db.json`` path)."""
    return _CORPUS_DB_PATH