Beispiel #1
0
def _update_all():
    print("Update Corpus...")
    with TinyDB(corpus_db_path()) as local_db:
        item_all = local_db.all()
        query = Query()
        for item in item_all:
            name = item["name"]
            if "file_name" in item.keys():
                local_db.update({"filename": item["file_name"]},
                                query.name == name)
            elif "file" in item.keys():
                local_db.update({"filename": item["file"]}, query.name == name)
    local_db.close()
Beispiel #2
0
def get_corpus_db_detail(name: str) -> dict:
    """
    Get details about a corpus, using information from local catalog.

    :param str name: name corpus
    :return: details about a corpus
    :rtype: dict
    """
    local_db = TinyDB(corpus_db_path())
    query = Query()
    res = local_db.search(query.name == name)
    local_db.close()

    if res:
        return res[0]

    return dict()
Beispiel #3
0
def remove(name: str) -> bool:
    """
    Remove corpus

    :param str name: corpus name
    :return: **True** if the corpus is found and succesfully removed.
             Otherwise, it returns **False**.
    :rtype: bool

    :Example:
    ::

        from pythainlp.corpus import remove, get_corpus_path, get_corpus

        print(remove('ttc'))
        # output: True

        print(get_corpus_path('ttc'))
        # output: None

        get_corpus('ttc')
        # output:
        # FileNotFoundError: [Errno 2] No such file or directory:
        # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
    """
    db = TinyDB(corpus_db_path())
    query = Query()
    data = db.search(query.name == name)

    if data:
        path = get_corpus_path(name)
        os.remove(path)
        db.remove(query.name == name)
        db.close()
        return True

    db.close()
    return False
Beispiel #4
0
def download(name: str,
             force: bool = False,
             url: str = None,
             version: str = None) -> bool:
    """
    Download corpus.

    The available corpus names can be seen in this file:
    https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json

    :param str name: corpus name
    :param bool force: force download
    :param str url: URL of the corpus catalog
    :param str version: Version of the corpus
    :return: **True** if the corpus is found and succesfully downloaded.
             Otherwise, it returns **False**.
    :rtype: bool

    :Example:
    ::

        from pythainlp.corpus import download

        download('wiki_lm_lstm', force=True)
        # output:
        # Corpus: wiki_lm_lstm
        # - Downloading: wiki_lm_lstm 0.1
        # thwiki_lm.pth:  26%|██▌       | 114k/434k [00:00<00:00, 690kB/s]

    By default, downloaded corpus and model will be saved in
    ``$HOME/pythainlp-data/``
    (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
    """
    if not url:
        url = corpus_db_url()

    corpus_db = get_corpus_db(url)
    if not corpus_db:
        print(f"Cannot download corpus catalog from: {url}")
        return False

    corpus_db = corpus_db.json()

    # check if corpus is available
    if name in list(corpus_db.keys()):
        local_db = TinyDB(corpus_db_path())
        query = Query()

        corpus = corpus_db[name.lower()]
        print("Corpus:", name)
        if version is None:
            version = corpus["latest_version"]
        corpus_versions = corpus["versions"][version]
        file_name = corpus_versions["filename"]
        found = local_db.search((query.name == name)
                                & (query.version == version))

        # If not found in local, download
        if force or not found:
            print(f"- Downloading: {name} {version}")
            _download(
                corpus_versions["download_url"],
                file_name,
            )
            _check_hash(
                file_name,
                corpus_versions["md5"],
            )

            if found:
                local_db.update({"version": version}, query.name == name)
            else:
                local_db.insert({
                    "name": name,
                    "version": version,
                    "filename": file_name
                })
        else:
            if local_db.search(query.name == name
                               and query.version == version):
                # Already has the same version
                print("- Already up to date.")
            else:
                # Has the corpus but different version
                current_ver = local_db.search(query.name == name)[0]["version"]
                print(f"- Existing version: {current_ver}")
                print(f"- New version available: {version}")
                print("- Use download(data_name, force=True) to update")

        local_db.close()
        return True

    print("Corpus not found:", name)
    return False