def install_hanlp_data(the_jar_version=None):
    """Download and unpack the HanLP data package that matches a jar version.

    Args:
        the_jar_version: Jar version whose data package should be installed.
            When falsy, ``HANLP_JAR_VERSION`` is used if it is set, otherwise
            the first element returned by ``hanlp_latest_version()``.

    Returns:
        ``False`` if the matching data version is already installed,
        ``True`` after the data was downloaded, extracted and recorded, and
        ``None`` (implicitly) if no release from ``hanlp_releases()`` matches
        the requested jar version.

    The process exits with status 1 when the downloaded archive is not a
    valid zip file (the broken archive is removed first).
    """
    global HANLP_DATA_VERSION
    if not the_jar_version:
        the_jar_version = HANLP_JAR_VERSION if HANLP_JAR_VERSION else hanlp_latest_version()[0]
    for jar_version, jar_url, data_version, data_url in hanlp_releases():
        if jar_version != the_jar_version:
            continue
        if data_version == hanlp_installed_data_version():
            return False
        # Join with STATIC_ROOT exactly once: joining a second time doubles
        # the directory prefix whenever STATIC_ROOT is a relative path, so the
        # archive would be downloaded to a different place than it is opened.
        data_zip = os.path.join(STATIC_ROOT, 'data-for-{}.zip'.format(data_version))
        download(data_url, data_zip)
        sys.stderr.write('\r' + ' ' * 100 + '\r')  # clear the previous line
        sys.stderr.flush()
        eprint('解压 data.zip...')
        try:
            with zipfile.ZipFile(data_zip, "r") as zip_ref:
                zip_ref.extractall(STATIC_ROOT)
        except zipfile.BadZipFile:
            # Drop the corrupt archive so the next attempt downloads it again.
            remove_file(data_zip)
            eprint('解压失败,请重试')
            exit(1)
        os.remove(data_zip)
        write_config(root=STATIC_ROOT)
        # Record the installed data version on disk and in the module global.
        with open_(PATH_DATA_VERSION, 'w', encoding='utf-8') as f:
            f.write(data_version)
        HANLP_DATA_VERSION = data_version
        return True
def download(url, path):
    """Fetch ``url`` into ``path`` unless a file already exists at ``path``.

    The payload is first written to ``<path>.downloading`` and renamed onto
    ``path`` only after the transfer finished.

    Args:
        url: Address to download from.
        path: Destination file path.

    Returns:
        ``True`` when ``path`` already existed or the download succeeded.

    On any failure the partial file is deleted, manual-install hints are
    printed, the documentation page is opened and the process exits with
    status 1.
    """
    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True

    print('下载 {} 到 {}'.format(url, path))
    partial_path = '{}.downloading'.format(path)
    remove_file(partial_path)
    try:
        user_agent = 'pyhanlp (' + platform.platform() + ')'
        fetcher = Downloader(url, partial_path, 4,
                             headers={'User-agent': user_agent})
        progress = DownloadCallback(show_header=False, out=sys.stdout)
        fetcher.subscribe(progress)
        fetcher.start_sync()
    except BaseException as error:
        eprint('下载失败 {} 由于 {}'.format(url, repr(error)))
        doc_url = 'https://od.hankcs.com/book/intro_nlp/'
        eprint('请参考 %s 执行手动安装.' % doc_url)
        eprint('或手动下载 {} 到 {}'.format(url, path))
        # Do not leave a half-written file behind.
        if os.path.isfile(partial_path):
            os.remove(partial_path)
        browser_open(doc_url)
        exit(1)

    # Replace any stale destination with the completed download.
    remove_file(path)
    os.rename(partial_path, path)
    return True
def install_hanlp_jar():
    """Download the latest HanLP release zip and install its jar.

    The release described by ``hanlp_latest_version()`` is downloaded into
    ``STATIC_ROOT``, the single member
    ``hanlp-<version>-release/hanlp-<version>.jar`` is extracted, moved
    directly under ``STATIC_ROOT``, and the extracted folder and the zip are
    removed. ``HANLP_JAR_VERSION`` is updated to the installed version.

    The process exits with status 1 when the downloaded archive is not a
    valid zip file (the broken archive is removed first).
    """
    global HANLP_JAR_VERSION
    jar_version, jar_url, _data_version, _data_url = hanlp_latest_version()
    release_name = 'hanlp-{}-release'.format(jar_version)
    jar_file_name = 'hanlp-{}.jar'.format(jar_version)
    jar_zip = os.path.join(STATIC_ROOT, '{}.zip'.format(release_name))
    download(jar_url, jar_zip)
    try:
        with zipfile.ZipFile(jar_zip, "r") as archive:
            archive.extract('{}/{}'.format(release_name, jar_file_name),
                            STATIC_ROOT)
    except zipfile.BadZipFile:
        # Drop the corrupt archive so the next attempt downloads it again.
        remove_file(jar_zip)
        eprint('解压失败,请重试')
        exit(1)
    zip_folder = os.path.join(STATIC_ROOT, release_name)
    installed_jar = os.path.join(STATIC_ROOT, jar_file_name)
    # os.rename refuses to overwrite an existing file on Windows, so clear the
    # destination first -- the same remove-then-rename sequence download() uses.
    remove_file(installed_jar)
    os.rename(os.path.join(zip_folder, jar_file_name), installed_jar)
    shutil.rmtree(zip_folder)
    remove_file(jar_zip)
    HANLP_JAR_VERSION = jar_version
def download(url, path):
    """Fetch ``url`` into ``path`` with ``urlretrieve``, showing progress.

    A browser-like User-agent opener is installed globally and the default
    socket timeout is set to 10 seconds before the transfer. The payload is
    written to ``<path>.downloading`` and renamed onto ``path`` only after
    the transfer finished.

    Args:
        url: Address to download from.
        path: Destination file path.

    Returns:
        ``True`` when ``path`` already existed or the download succeeded.

    On any failure the partial file is deleted, manual-install hints are
    printed, the documentation page is opened and the process exits with
    status 1.
    """
    opener = urllib.build_opener()
    opener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    )]
    urllib.install_opener(opener)
    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True

    print('下载 {} 到 {}'.format(url, path))
    tmp_path = '{}.downloading'.format(path)
    remove_file(tmp_path)
    try:
        # Start time lives in a closure cell rather than in module globals,
        # so nothing leaks into (or is read from) the module namespace.
        started = [time.time()]

        def reporthook(count, block_size, total_size):
            if count == 0:
                started[0] = time.time()
                return
            duration = max(1e-8, time.time() - started[0])  # avoid division by zero
            progress_size = int(count * block_size)
            if total_size <= 0:
                # The server reported no usable size (urlretrieve passes -1,
                # or 0 for an empty body): percent and ETA cannot be computed.
                sys.stdout.write("\r%d MB, %d KB/s " %
                                 (progress_size / (1024 * 1024),
                                  int(progress_size / (1024 * duration))))
                sys.stdout.flush()
                return
            if progress_size > total_size:
                progress_size = total_size
            speed = int(progress_size / (1024 * duration))
            # float() keeps this a true division on interpreters where `/`
            # between two ints truncates.
            ratio = max(1e-8, float(progress_size) / total_size)
            percent = ratio * 100
            eta = duration / ratio * (1 - ratio)
            minutes = eta / 60
            seconds = eta % 60
            sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒 " %
                             (percent, progress_size / (1024 * 1024), speed,
                              minutes, seconds))
            sys.stdout.flush()

        import socket
        socket.setdefaulttimeout(10)
        urllib.urlretrieve(quote(url, safe='/:?='), tmp_path, reporthook)
        print()
    except BaseException as e:
        eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
        doc_url = 'https://od.hankcs.com/book/intro_nlp/'
        eprint('请参考 %s 执行手动安装.' % doc_url)
        eprint('或手动下载 {} 到 {}'.format(url, path))
        # Do not leave a half-written file behind.
        if os.path.isfile(tmp_path):
            os.remove(tmp_path)
        browser_open(doc_url)
        exit(1)

    # Replace any stale destination with the completed download.
    remove_file(path)
    os.rename(tmp_path, path)
    return True
def download(url, path):
    """Fetch ``url`` into ``path`` unless a file already exists at ``path``.

    A browser-like User-agent opener is installed globally first; the same
    User-agent is passed to the ``Downloader`` that performs the transfer.
    The payload is written to ``<path>.downloading`` and renamed onto
    ``path`` only after the transfer finished.

    Args:
        url: Address to download from.
        path: Destination file path.

    Returns:
        ``True`` when ``path`` already existed or the download succeeded.

    On any failure the partial file is deleted, manual-install hints are
    printed, the documentation page is opened and the process exits with
    status 1.
    """
    browser_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    url_opener = urllib.build_opener()
    url_opener.addheaders = [('User-agent', browser_agent)]
    urllib.install_opener(url_opener)

    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True

    print('下载 {} 到 {}'.format(url, path))
    partial_path = '{}.downloading'.format(path)
    remove_file(partial_path)
    try:
        fetcher = Downloader(url, partial_path, 4,
                             headers={'User-agent': browser_agent})
        progress = DownloadCallback(show_header=False, out=sys.stdout)
        fetcher.subscribe(progress)
        fetcher.start_sync()
    except BaseException as error:
        eprint('下载失败 {} 由于 {}'.format(url, repr(error)))
        doc_url = 'https://od.hankcs.com/book/intro_nlp/'
        eprint('请参考 %s 执行手动安装.' % doc_url)
        eprint('或手动下载 {} 到 {}'.format(url, path))
        # Do not leave a half-written file behind.
        if os.path.isfile(partial_path):
            os.remove(partial_path)
        browser_open(doc_url)
        exit(1)

    # Replace any stale destination with the completed download.
    remove_file(path)
    os.rename(partial_path, path)
    return True
def _start_jvm_for_hanlp():
    """Resolve HanLP locations, validate them and start the JVM.

    Environment variables consulted:
        HANLP_VERBOSE: integer; non-zero enables progress messages.
        HANLP_STATIC_ROOT: overrides ``STATIC_ROOT``; the data directory is
            then ``<root>/data`` and the installed data version is reported
            as manually installed.
        HANLP_JAR_PATH: overrides the jar location.
        HANLP_JVM_XMS / HANLP_JVM_XMX: JVM heap settings (defaults ``512m``
            and ``8g``).

    Values not overridden are imported from ``pyhanlp.static``. The module
    globals ``STATIC_ROOT``, ``hanlp_installed_data_version``,
    ``HANLP_JAR_PATH``, ``PATH_CONFIG``, ``HANLP_JAR_VERSION`` and
    ``HANLP_DATA_PATH`` are (re)bound by this function.

    Raises:
        ValueError: if the jar, the static root, the data directory or
            ``hanlp.properties`` is missing or malformed.

    The process exits with status 1 when no suitable JVM is found, or when
    the HanLP main class cannot be loaded (in which case the jar is deleted
    and downloaded again first).
    """
    global STATIC_ROOT, hanlp_installed_data_version, HANLP_JAR_PATH, PATH_CONFIG, HANLP_JAR_VERSION, HANLP_DATA_PATH
    # Work on a snapshot of the environment.
    ENVIRON = os.environ.copy()
    # Load settings from the environment.
    HANLP_VERBOSE = int(ENVIRON.get("HANLP_VERBOSE", 0))

    if "HANLP_STATIC_ROOT" in ENVIRON:
        STATIC_ROOT = ENVIRON["HANLP_STATIC_ROOT"]
        if HANLP_VERBOSE:
            print('使用环境变量 HANLP_STATIC_ROOT={}'.format(STATIC_ROOT))
        HANLP_DATA_PATH = os.path.join(STATIC_ROOT, 'data')

        # Bound to the module global declared above.
        def hanlp_installed_data_version():
            return '手动安装'
    else:
        from pyhanlp.static import STATIC_ROOT, hanlp_installed_data_version, HANLP_DATA_PATH

    if "HANLP_JAR_PATH" in ENVIRON:
        HANLP_JAR_PATH = ENVIRON["HANLP_JAR_PATH"]
        if HANLP_VERBOSE:
            print('使用环境变量 HANLP_JAR_PATH={}'.format(HANLP_JAR_PATH))
    else:
        from pyhanlp.static import HANLP_JAR_PATH

    HANLP_JVM_XMS = ENVIRON.get("HANLP_JVM_XMS", "512m")
    # Upper bound of memory available to the JVM; usually not reached.
    HANLP_JVM_XMX = ENVIRON.get("HANLP_JVM_XMX", "8g")
    PATH_CONFIG = os.path.join(STATIC_ROOT, 'hanlp.properties')

    if not os.path.exists(HANLP_JAR_PATH):
        raise ValueError("配置错误: HANLP_JAR_PATH=%s 不存在" % HANLP_JAR_PATH)
    elif not os.path.isfile(HANLP_JAR_PATH) or not HANLP_JAR_PATH.endswith('.jar'):
        raise ValueError("配置错误: HANLP_JAR_PATH=%s 不是jar文件" % HANLP_JAR_PATH)
    elif not os.path.exists(STATIC_ROOT):
        raise ValueError("配置错误: STATIC_ROOT=%s 不存在" % STATIC_ROOT)
    elif not os.path.isdir(HANLP_DATA_PATH):
        if HANLP_DATA_PATH.startswith(STATIC_ROOT):
            raise ValueError("配置错误: STATIC_ROOT=%s 目录下没有data文件夹" % STATIC_ROOT)
        else:
            raise ValueError("配置错误: 数据包 %s 不存在,请修改配置文件中的root" % HANLP_DATA_PATH)
    elif not os.path.isfile(PATH_CONFIG):
        raise ValueError("配置错误: STATIC_ROOT=%s 目录下没有hanlp.properties" % STATIC_ROOT)
    else:
        # Jar files are named hanlp-<version>.jar; slice out the version.
        HANLP_JAR_VERSION = os.path.basename(HANLP_JAR_PATH)[len('hanlp-'):-len('.jar')]

    if HANLP_VERBOSE:
        print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
        print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
        print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))

    java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
    pathsep = os.pathsep
    jvm_path = None
    try:
        jvm_path = getDefaultJVMPath()
    except JVMNotFoundException:
        eprint('找不到Java,请安装JDK8:%s' % java_url)
        browser_open(java_url)
        exit(1)
    except JVMNotSupportedException:
        eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
        browser_open(java_url)
        exit(1)

    if platform.system().startswith('CYGWIN'):
        # Cygwin is using the host machine's JVM, so the paths must be
        # translated into real host paths and joined with ';'.
        if not jvm_path.startswith('/cygdrive'):
            pathsep = ';'
            if STATIC_ROOT.startswith('/usr/lib'):
                cygwin_root = os.popen('cygpath -w /').read().strip().replace('\\', '/')
                STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
                HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
                PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
            elif STATIC_ROOT.startswith('/cygdrive'):
                # /cygdrive/<letter>/... -> <LETTER>:/...
                driver = STATIC_ROOT.split('/')
                cygwin_driver = '/'.join(driver[:3])
                win_driver = driver[2].upper() + ':'
                HANLP_JAR_PATH = HANLP_JAR_PATH.replace(cygwin_driver, win_driver)
                STATIC_ROOT = STATIC_ROOT.replace(cygwin_driver, win_driver)
                PATH_CONFIG = PATH_CONFIG.replace(cygwin_driver, win_driver)

    JAVA_JAR_CLASSPATH = "-Djava.class.path=%s%s%s" % (HANLP_JAR_PATH, pathsep, STATIC_ROOT)
    # Add plugin jars found directly under STATIC_ROOT.
    for jar in glob.glob(os.path.join(STATIC_ROOT, '*.jar')):
        if HANLP_JAR_PATH.endswith(jar):
            continue
        # glob already returns each match prefixed with STATIC_ROOT; joining
        # STATIC_ROOT onto it again would double the prefix whenever
        # STATIC_ROOT is not an absolute path.
        JAVA_JAR_CLASSPATH = JAVA_JAR_CLASSPATH + pathsep + jar
    if HANLP_VERBOSE:
        print("设置 JAVA_JAR_CLASSPATH [%s]" % JAVA_JAR_CLASSPATH)

    # Start the JVM.
    startJVM(jvm_path,
             JAVA_JAR_CLASSPATH,
             "-Xms%s" % HANLP_JVM_XMS,
             "-Xmx%s" % HANLP_JVM_XMX,
             convertStrings=True)
    # Make sure the JVM can actually see the HanLP classes.
    try:
        JClass('com.hankcs.hanlp.HanLP')
    except java.lang.NoClassDefFoundError:
        from pyhanlp.static import install_hanlp_jar
        eprint('找不到jar,可能由于安装路径含有中文,或者你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
        os.remove(HANLP_JAR_PATH)
        install_hanlp_jar()
        eprint('下载成功,请重新启动程序。如果问题依然存在,请不要安装到中文路径。')
        exit(1)