Example #1
import os

from ckiptagger import WS, data_utils


def ckip_cut(text_list):
    # Download the CKIP model data on first use
    if not os.path.isdir("./data"):
        print('CKIP data not found, starting download')
        data_utils.download_data_gdown("./")  # gdrive-ckip, !pip install gdown
    ws = WS("./data")
    ckip_corpus = ws(text_list)
    return ckip_corpus
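
A minimal usage sketch (the sample sentence is illustrative):

tokens = ckip_cut(["傍晚小街路面淹水"])  # illustrative input
print(tokens)  # one token list per input sentence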
Example #2
import pandas as pd
from ckiptagger import data_utils, WS, POS

# read file
clean_txt = pd.read_csv("clean-txt.csv")

# print(clean_txt.columns)
# print(clean_txt)

data_utils.download_data_gdown("./")  # only needs to run once
ws = WS("./data")
clean_txt['token_text'] = ws(clean_txt["0"].tolist())  # WS expects a list of strings

print(clean_txt[:5])

# save
clean_txt.to_csv("clean-txt-tokenized.csv")
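
Note that to_csv stringifies the token lists; a hedged sketch for reading them back from the file written above:

import ast

import pandas as pd

df = pd.read_csv("clean-txt-tokenized.csv")
df['token_text'] = df['token_text'].apply(ast.literal_eval)  # parse back into real lists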
Example #3
def download_model(from_gd=False):
    from ckiptagger import data_utils
    if from_gd:
        data_utils.download_data_gdown("./")  # gdrive-ckip
    else:
        data_utils.download_data_url("./")  # iis-ckip
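
Usage is then a single call (a minimal sketch; from_gd=True goes through Google Drive and needs the gdown package, while the default pulls from the IIS URL):

download_model(from_gd=True)  # via Google Drive
# download_model()            # via the IIS mirror instead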
Example #4
import nltk  # usually used for English
# nltk.download() is huge and takes forever
import numpy as np
import pandas as pd
# stanfordnlp
import stanfordnlp
# stanfordnlp.download('en')
# stanfordnlp.download('zh')

import os
from pathlib import Path
from ckiptagger import data_utils

path = os.path.join(str(Path.home()), 'ckip/')
if not os.path.exists(path):
    os.mkdir(path)
data_utils.download_data_gdown(path)  # gdrive-ckip, ~2 GB

from stanfordnlp.utils.resources import DEFAULT_MODEL_DIR

chinese_sentence = '中華郵政未來智慧物流服務,將取之大眾智慧,帶給民眾更好的便利生活'

zh_pipeline = stanfordnlp.Pipeline(processors="tokenize",
                                   models_dir=DEFAULT_MODEL_DIR,
                                   lang="zh",
                                   use_gpu=False)
zh_doc = zh_pipeline(chinese_sentence)

for i, sentence in enumerate(zh_doc.sentences):
    print("sentence {}:".format(i))
    print("index\ttxt")
    for word in sentence.words:
        print("{}\t{}".format(word.index, word.text))
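
For comparison, the same sentence can be segmented with the CKIP tagger downloaded above (a hedged sketch; download_data_gdown extracts the models into a data/ folder under the target directory):

from ckiptagger import WS

ws = WS(os.path.join(path, "data"))
print(ws([chinese_sentence]))  # one token list for the single input sentence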
Example #5
from ckiptagger import data_utils
data_utils.download_data_gdown("./")  # gdrive-ckip
Example #6
from ckiptagger import data_utils

if __name__ == '__main__':
    data_utils.download_data_gdown("./")
Example #7
import os

from ckiptagger import WS, data_utils
from rouge import Rouge

cache_dir = os.environ.get("XDG_CACHE_HOME", os.path.join(os.getenv("HOME"), ".cache"))
download_dir = os.path.join(cache_dir, "ckiptagger")
data_dir = os.path.join(cache_dir, "ckiptagger/data")
os.makedirs(download_dir, exist_ok=True)
if not os.path.exists(os.path.join(data_dir, "model_ws")):
    data_utils.download_data_gdown(download_dir)

ws = WS(data_dir)


def tokenize_and_join(sentences):
    return [" ".join(toks) for toks in ws(sentences)]


rouge = Rouge()


def get_rouge(preds, refs, avg=True, ignore_empty=False):
    """wrapper around: from rouge import Rouge
    Args:
        preds: string or list of strings
        refs: string or list of strings
        avg: bool, return the average metrics if set to True
        ignore_empty: bool, ignore empty pairs if set to True
    """
    if not isinstance(preds, list):
        preds = [preds]
    if not isinstance(refs, list):
        refs = [refs]
    preds = tokenize_and_join(preds)
    refs = tokenize_and_join(refs)
    return rouge.get_scores(preds, refs, avg=avg, ignore_empty=ignore_empty)
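
A short usage sketch (the strings are illustrative; both sides are CKIP-tokenized and space-joined before scoring):

scores = get_rouge("今天天氣很好", "今天天氣不錯")
print(scores["rouge-l"]["f"])  # averaged ROUGE-L F1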
Example #8
from ckiptagger import data_utils


def download():
    # Download the model data from Google Drive
    data_utils.download_data_gdown("./")