def ensure_data(data_name, data_url):
    """Return the local path of a test data item, downloading it if absent.

    The item is looked up as ``data_name`` under the directory returned by
    ``test_data_path()``. If that path already exists it is returned
    without any network access.

    Otherwise ``data_url`` is downloaded. When the URL ends with ``.zip``
    the archive is saved next to the target as ``<data_name>.zip``,
    extracted into the test data directory and then removed.

    :param data_name: file or directory name relative to the test data root
    :param data_url: URL to fetch the data from when it is not present
    :return: path of ``data_name`` joined onto the test data root
    """
    data_root = test_data_path()
    target = os.path.join(data_root, data_name)
    if os.path.exists(target):
        return target

    if not data_url.endswith('.zip'):
        # Plain file: it is stored directly under its final name.
        download(data_url, target)
        return target

    # Archive: fetch it beside the target, unpack into the root, clean up.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as zf:
        zf.extractall(data_root)
    remove_file(archive_path)
    return target
def ensure_data(data_name, data_url):
    """Ensure a data item is available locally and return its path.

    If the local file already exists it is returned as is; otherwise the
    data is downloaded and, for ``.zip`` URLs, extracted into the directory
    returned by ``test_data_path()`` (the downloaded archive is removed
    afterwards).

    The candidate path is printed before the existence check, and the
    final path is printed again after a download.

    :param data_name: file or directory name relative to the test data root
    :param data_url: URL to fetch the data from when it is not present
    :return: path of ``data_name`` joined onto the test data root
    """
    data_root = test_data_path()
    target = os.path.join(data_root, data_name)
    print("111dest_path:" + target)
    if os.path.exists(target):
        return target

    if data_url.endswith('.zip'):
        # Archive: save as "<target>.zip", unpack into the root, delete it.
        archive_path = target + '.zip'
        download(data_url, archive_path)
        with zipfile.ZipFile(archive_path, "r") as zf:
            zf.extractall(data_root)
        remove_file(archive_path)
    else:
        download(data_url, target)

    print("dest_path:" + target)
    return target
def install_jar(name, filepath, url):
    """Return the path of ``name`` inside ``filepath``, downloading if needed.

    :param name: file name of the jar
    :param filepath: directory the jar is expected to live in
    :param url: URL passed to ``download`` when the jar is not a regular
        file at the expected location
    :return: ``os.path.join(filepath, name)``
    """
    jar_path = os.path.join(filepath, name)
    if not os.path.isfile(jar_path):
        download(url, jar_path)
    return jar_path
def install_jar(name, url):
    """Return the path of ``name`` under ``STATIC_ROOT``, downloading if needed.

    :param name: file name of the jar
    :param url: URL passed to ``download`` when the jar is not a regular
        file at the expected location
    :return: ``os.path.join(STATIC_ROOT, name)``
    """
    jar_path = os.path.join(STATIC_ROOT, name)
    if not os.path.isfile(jar_path):
        download(url, jar_path)
    return jar_path
# -*- coding:utf-8 -*- # Author:hankcs # Date: 2018-04-28 10:07 import zipfile from pyhanlp import * from pyhanlp.static import download if not os.path.isdir('tests/data/'): print('请在项目根目录下运行本脚本') exit(1) if not os.path.isdir('tests/data/hanlp-wiki-vec-zh'): model_path = 'tests/data/hanlp-wiki-vec-zh.zip' download('http://hanlp.linrunsoft.com/release/model/hanlp-wiki-vec-zh.zip', model_path) with zipfile.ZipFile(model_path, "r") as archive: archive.extractall('tests/data/') WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel') DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel') word2vec = WordVectorModel( 'tests/data/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt') doc2vec = DocVectorModel(word2vec) docs = ["山东苹果丰收", "农民在江苏种水稻", "奥运会女排夺冠", "世界锦标赛胜出", "中国足球失败"] for idx, doc in enumerate(docs): doc2vec.addDocument(idx, doc) print(word2vec.nearest('语言')) for res in doc2vec.nearest('我要看比赛'):