def ensure_data(data_name, data_url):
    """Return the local path of a test data item, fetching it if absent.

    The item lives at ``data_name`` under the directory returned by
    ``test_data_path()``. If that path already exists it is returned
    untouched; otherwise ``data_url`` is downloaded.

    Args:
        data_name: Name of the file or directory under the test data root.
        data_url: Location to download from. When it ends with ``.zip`` the
            download is treated as an archive: it is saved as
            ``<data_name>.zip``, extracted into the test data root, and the
            archive file is then removed.

    Returns:
        The path ``<test data root>/<data_name>``.
    """
    base_dir = test_data_path()
    target = os.path.join(base_dir, data_name)

    # Already present locally: nothing to fetch.
    if os.path.exists(target):
        return target

    # Plain file: download straight to its final location.
    if not data_url.endswith('.zip'):
        download(data_url, target)
        return target

    # Archive: download next to the target, unpack into the data root, and
    # discard the archive. The archive is expected to produce ``target``;
    # that is not verified here.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(base_dir)
    remove_file(archive_path)
    return target
def ensure_data(data_name, data_url):
    """Ensure a data item is available locally and return its path.

    If the item already exists under ``test_data_path()`` its path is
    returned immediately; otherwise it is downloaded from ``data_url`` and,
    for URLs ending in ``.zip``, extracted into the data root with the
    downloaded archive removed afterwards.

    The resolved path is printed on entry and again before returning after
    a download.

    Args:
        data_name: Name of the file or directory under the test data root.
        data_url: Location to download the data from.

    Returns:
        The path ``<test data root>/<data_name>``.
    """
    data_dir = test_data_path()
    local_path = os.path.join(data_dir, data_name)
    print("111dest_path:" + local_path)
    if os.path.exists(local_path):
        return local_path

    # Archives are saved beside the target under a ".zip" suffix; anything
    # else is downloaded directly to its final location.
    is_zip = data_url.endswith('.zip')
    fetch_path = local_path + '.zip' if is_zip else local_path
    download(data_url, fetch_path)

    if is_zip:
        with zipfile.ZipFile(fetch_path, "r") as archive:
            archive.extractall(data_dir)
        # The archive is no longer needed once its contents are unpacked.
        remove_file(fetch_path)

    print("dest_path:" + local_path)
    return local_path
# Demo script: load the hanlp-wiki-vec-zh word vectors through pyhanlp, then
# run a nearest-word query and a nearest-document query.
#
# ``os`` and ``sys`` are imported explicitly rather than relying on names
# leaked by the star import or on the interactive-only ``exit`` helper.
import os
import sys
import zipfile

from pyhanlp import *
from pyhanlp.static import download, remove_file

# All paths below are relative, so the script only works when launched from
# the project root (the directory that contains tests/data/).
if not os.path.isdir('tests/data/'):
    print('请在项目根目录下运行本脚本')
    sys.exit(1)

# Fetch and unpack the model on first use; later runs reuse the extracted
# directory and skip the download.
if not os.path.isdir('tests/data/hanlp-wiki-vec-zh'):
    model_path = 'tests/data/hanlp-wiki-vec-zh.zip'
    download('http://hanlp.linrunsoft.com/release/model/hanlp-wiki-vec-zh.zip',
             model_path)
    with zipfile.ZipFile(model_path, "r") as archive:
        archive.extractall('tests/data/')
    remove_file(model_path)

# Java classes resolved through the JClass helper from the pyhanlp star import.
WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')
DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')

word2vec = WordVectorModel(
    'tests/data/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt')
doc2vec = DocVectorModel(word2vec)

docs = ["山东苹果丰收", "农民在江苏种水稻", "奥运会女排夺冠", "世界锦标赛胜出", "中国足球失败"]

# Register each document under its list index so query results can be mapped
# back to the original text.
for idx, doc in enumerate(docs):
    doc2vec.addDocument(idx, doc)

print(word2vec.nearest('语言'))

# Each result's key is used as an index into ``docs``; its value is printed
# with two decimals (presumably a similarity score -- confirm against the
# HanLP DocVectorModel API).
for res in doc2vec.nearest('我要看比赛'):
    print('%s = %.2f' % (docs[res.getKey().intValue()],
                         res.getValue().floatValue()))