Ejemplo n.º 1
0
def install_hanlp_data(the_jar_version=None):
    """Download and unpack the data package paired with a HanLP jar version.

    Args:
        the_jar_version: Jar version whose data package should be installed.
            When falsy, ``HANLP_JAR_VERSION`` is used; if that is falsy as
            well, the first element of ``hanlp_latest_version()`` is used.

    Returns:
        ``False`` when the matching data version equals
        ``hanlp_installed_data_version()`` (nothing is downloaded),
        ``True`` after the archive was downloaded and extracted, and
        ``None`` when no entry of ``hanlp_releases()`` matches the
        requested jar version.

    Raises:
        SystemExit: with status 1 when the downloaded archive is not a valid
            zip file; ``remove_file`` is called on the archive beforehand.
    """
    global HANLP_DATA_VERSION
    if not the_jar_version:
        the_jar_version = HANLP_JAR_VERSION or hanlp_latest_version()[0]
    for jar_version, _jar_url, data_version, data_url in hanlp_releases():
        if jar_version != the_jar_version:
            continue
        if data_version == hanlp_installed_data_version():
            return False
        # Build the archive path once and use the same path for both the
        # download and the extraction. Joining STATIC_ROOT a second time
        # produces a different (doubled) path whenever STATIC_ROOT is
        # relative, so the file that was downloaded could not be found.
        data_zip = os.path.join(STATIC_ROOT,
                                'data-for-{}.zip'.format(data_version))
        download(data_url, data_zip)
        sys.stderr.write('\r' + ' ' * 100 + '\r')  # wipe the previous line
        sys.stderr.flush()
        eprint('解压 data.zip...')
        try:
            with zipfile.ZipFile(data_zip, "r") as zip_ref:
                zip_ref.extractall(STATIC_ROOT)
        except zipfile.BadZipFile:
            remove_file(data_zip)
            eprint('解压失败,请重试')
            exit(1)
        os.remove(data_zip)
        write_config(root=STATIC_ROOT)
        # Record the installed data version on disk and in the module global.
        with open_(PATH_DATA_VERSION, 'w', encoding='utf-8') as f:
            f.write(data_version)
        HANLP_DATA_VERSION = data_version
        return True
    # No release matched the requested jar version.
    return None
Ejemplo n.º 2
0
def download(url, path):
    """Fetch ``url`` into ``path`` unless ``path`` already exists.

    The payload is first written to ``<path>.downloading`` and only renamed
    to ``path`` once the transfer finished, so an interrupted run never
    leaves a partial file under the final name.

    Args:
        url: Address of the resource to fetch.
        path: Destination file path.

    Returns:
        ``True`` both when an existing local file is reused and when the
        download completed.

    Raises:
        SystemExit: with status 1 when anything (including
            ``KeyboardInterrupt``) is raised during the transfer; manual
            install hints are printed and the documentation page is opened
            via ``browser_open`` first.
    """
    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True

    print('下载 {} 到 {}'.format(url, path))
    partial_path = '{}.downloading'.format(path)
    remove_file(partial_path)  # discard leftovers from an earlier attempt
    try:
        user_agent = 'pyhanlp (' + platform.platform() + ')'
        # NOTE(review): the positional 4 is presumably a connection/thread
        # count for Downloader -- confirm against its definition.
        fetcher = Downloader(url,
                             partial_path,
                             4,
                             headers={'User-agent': user_agent})
        progress = DownloadCallback(show_header=False, out=sys.stdout)
        fetcher.subscribe(progress)
        fetcher.start_sync()
    except BaseException as err:
        eprint('下载失败 {} 由于 {}'.format(url, repr(err)))
        doc_url = 'https://od.hankcs.com/book/intro_nlp/'
        eprint('请参考 %s 执行手动安装.' % doc_url)
        eprint('或手动下载 {} 到 {}'.format(url, path))
        if os.path.isfile(partial_path):
            os.remove(partial_path)
        browser_open(doc_url)
        exit(1)
    # Promote the finished temporary file to its final name.
    remove_file(path)
    os.rename(partial_path, path)
    return True
Ejemplo n.º 3
0
def install_hanlp_jar():
    """Download the latest HanLP release zip and install its jar.

    The release described by ``hanlp_latest_version()`` is downloaded into
    ``STATIC_ROOT``, the single jar member is extracted, moved directly under
    ``STATIC_ROOT``, and the extracted folder and the zip are cleaned up.
    On success the module global ``HANLP_JAR_VERSION`` is set to the
    installed version.

    Raises:
        SystemExit: with status 1 when the downloaded archive is not a valid
            zip file; ``remove_file`` is called on the archive beforehand.
    """
    global HANLP_JAR_VERSION
    jar_version, jar_url, _data_version, _data_url = hanlp_latest_version()

    archive_name = 'hanlp-{}-release.zip'.format(jar_version)
    jar_zip = os.path.join(STATIC_ROOT, archive_name)
    download(jar_url, jar_zip)

    # Only the jar itself is pulled out of the release archive.
    member = 'hanlp-{}-release/hanlp-{}.jar'.format(jar_version, jar_version)
    try:
        with zipfile.ZipFile(jar_zip, "r") as archive:
            archive.extract(member, STATIC_ROOT)
    except zipfile.BadZipFile:
        remove_file(jar_zip)
        eprint('解压失败,请重试')
        exit(1)

    # The member is extracted inside its release folder; lift it up one level
    # and drop the now-empty folder together with the downloaded zip.
    release_dir = os.path.join(STATIC_ROOT,
                               'hanlp-{}-release'.format(jar_version))
    jar_name = 'hanlp-{}.jar'.format(jar_version)
    extracted_jar = os.path.join(release_dir, jar_name)
    installed_jar = os.path.join(STATIC_ROOT, jar_name)
    os.rename(extracted_jar, installed_jar)
    shutil.rmtree(release_dir)
    remove_file(jar_zip)

    HANLP_JAR_VERSION = jar_version
Ejemplo n.º 4
0
def download(url, path):
    """Fetch ``url`` into ``path`` with ``urlretrieve``, showing progress.

    A process-wide urllib opener with a browser-like ``User-agent`` header is
    installed on every call, and the default socket timeout is set to 10
    seconds before the transfer starts. The payload is written to
    ``<path>.downloading`` and renamed to ``path`` only after the transfer
    finished.

    NOTE(review): ``urllib`` is presumably an alias of ``urllib.request`` on
    Python 3 -- confirm against the file's imports.

    Args:
        url: Address of the resource to fetch.
        path: Destination file path.

    Returns:
        ``True`` both when an existing local file is reused and when the
        download completed.

    Raises:
        SystemExit: with status 1 when anything (including
            ``KeyboardInterrupt``) is raised during the transfer; manual
            install hints are printed and the documentation page is opened
            via ``browser_open`` first.
    """
    opener = urllib.build_opener()
    opener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    )]
    urllib.install_opener(opener)
    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True
    else:
        print('下载 {} 到 {}'.format(url, path))
        tmp_path = '{}.downloading'.format(path)
        remove_file(tmp_path)
        try:

            def reporthook(count, block_size, total_size):
                """Progress callback for ``urlretrieve``."""
                # Hook state lives in module globals, as in the original.
                global start_time, progress_size
                if count == 0:
                    # First call happens before any data arrived.
                    start_time = time.time()
                    progress_size = 0
                    return
                duration = time.time() - start_time
                duration = max(1e-8, duration)  # avoid division by zero
                progress_size = int(count * block_size)
                if total_size <= 0:
                    # The server did not report a size (urlretrieve passes
                    # -1 in that case), so a percentage and an ETA cannot be
                    # computed; report transferred bytes and speed only.
                    speed = int(progress_size / (1024 * duration))
                    sys.stdout.write("\r%d MB, %d KB/s   " %
                                     (progress_size /
                                      (1024 * 1024), speed))
                    sys.stdout.flush()
                    return
                # count * block_size can overshoot on the final block.
                if progress_size > total_size:
                    progress_size = total_size
                speed = int(progress_size / (1024 * duration))
                ratio = progress_size / total_size
                ratio = max(1e-8, ratio)
                percent = ratio * 100
                eta = duration / ratio * (1 - ratio)
                minutes = eta / 60
                seconds = eta % 60
                sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒   " %
                                 (percent, progress_size /
                                  (1024 * 1024), speed, minutes, seconds))
                sys.stdout.flush()

            import socket
            socket.setdefaulttimeout(10)
            urllib.urlretrieve(quote(url, safe='/:?='), tmp_path, reporthook)
            print()
        except BaseException as e:
            eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
            doc_url = 'https://od.hankcs.com/book/intro_nlp/'
            eprint('请参考 %s 执行手动安装.' % doc_url)
            eprint('或手动下载 {} 到 {}'.format(url, path))
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)
            browser_open(doc_url)
            exit(1)
        # Promote the finished temporary file to its final name.
        remove_file(path)
        os.rename(tmp_path, path)
    return True
Ejemplo n.º 5
0
def download(url, path):
    """Fetch ``url`` into ``path`` unless ``path`` already exists.

    A process-wide urllib opener with a browser-like ``User-agent`` header is
    installed on every call; the same header is handed to ``Downloader``.
    The payload is written to ``<path>.downloading`` and renamed to ``path``
    only after the transfer finished.

    Args:
        url: Address of the resource to fetch.
        path: Destination file path.

    Returns:
        ``True`` both when an existing local file is reused and when the
        download completed.

    Raises:
        SystemExit: with status 1 when anything (including
            ``KeyboardInterrupt``) is raised during the transfer; manual
            install hints are printed and the documentation page is opened
            via ``browser_open`` first.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    url_opener = urllib.build_opener()
    url_opener.addheaders = [('User-agent', user_agent)]
    urllib.install_opener(url_opener)

    if os.path.isfile(path):
        print('使用本地 {}, 忽略 {}'.format(path, url))
        return True

    print('下载 {} 到 {}'.format(url, path))
    partial_path = '{}.downloading'.format(path)
    remove_file(partial_path)  # discard leftovers from an earlier attempt
    try:
        # NOTE(review): the positional 4 is presumably a connection/thread
        # count for Downloader -- confirm against its definition.
        fetcher = Downloader(url,
                             partial_path,
                             4,
                             headers={'User-agent': user_agent})
        progress = DownloadCallback(show_header=False, out=sys.stdout)
        fetcher.subscribe(progress)
        fetcher.start_sync()
    except BaseException as err:
        eprint('下载失败 {} 由于 {}'.format(url, repr(err)))
        doc_url = 'https://od.hankcs.com/book/intro_nlp/'
        eprint('请参考 %s 执行手动安装.' % doc_url)
        eprint('或手动下载 {} 到 {}'.format(url, path))
        if os.path.isfile(partial_path):
            os.remove(partial_path)
        browser_open(doc_url)
        exit(1)
    # Promote the finished temporary file to its final name.
    remove_file(path)
    os.rename(partial_path, path)
    return True
Ejemplo n.º 6
0
def _start_jvm_for_hanlp():
    """Resolve HanLP paths, validate them and start the JVM.

    Configuration is read from environment variables, falling back to values
    imported from ``pyhanlp.static``:

    * ``HANLP_VERBOSE``      -- integer; non-zero prints progress messages.
    * ``HANLP_STATIC_ROOT``  -- directory holding ``data`` and
      ``hanlp.properties``.
    * ``HANLP_JAR_PATH``     -- path of the HanLP jar.
    * ``HANLP_JVM_XMS`` / ``HANLP_JVM_XMX`` -- JVM heap options, defaulting
      to ``512m`` and ``8g``.

    The module globals listed in the ``global`` statement below are rebound
    as a side effect. After the JVM has started, ``com.hankcs.hanlp.HanLP``
    is loaded as a sanity check; if that fails the jar is deleted and
    re-downloaded via ``install_hanlp_jar`` and the process exits.

    Raises:
        ValueError: when the jar, the static root, the data directory or
            ``hanlp.properties`` is missing or of the wrong kind.
        SystemExit: with status 1 when no usable JVM is found, or after the
            jar had to be re-downloaded.
    """
    global STATIC_ROOT, hanlp_installed_data_version, HANLP_JAR_PATH, PATH_CONFIG, HANLP_JAR_VERSION, HANLP_DATA_PATH
    # Get ENV
    ENVIRON = os.environ.copy()
    # Load variables in Environment
    if "HANLP_VERBOSE" in ENVIRON:
        HANLP_VERBOSE = int(ENVIRON["HANLP_VERBOSE"])
    else:
        HANLP_VERBOSE = 0

    if "HANLP_STATIC_ROOT" in ENVIRON:
        STATIC_ROOT = ENVIRON["HANLP_STATIC_ROOT"]
        if HANLP_VERBOSE:
            print('使用环境变量 HANLP_STATIC_ROOT={}'.format(STATIC_ROOT))
        HANLP_DATA_PATH = os.path.join(STATIC_ROOT, 'data')

        # With a user-supplied root the data version is reported as a fixed
        # marker string instead of being looked up.
        def hanlp_installed_data_version():
            return '手动安装'
    else:
        from pyhanlp.static import STATIC_ROOT, hanlp_installed_data_version, HANLP_DATA_PATH
    if "HANLP_JAR_PATH" in ENVIRON:
        HANLP_JAR_PATH = ENVIRON["HANLP_JAR_PATH"]
        if HANLP_VERBOSE:
            print('使用环境变量 HANLP_JAR_PATH={}'.format(HANLP_JAR_PATH))
    else:
        from pyhanlp.static import HANLP_JAR_PATH
    if "HANLP_JVM_XMS" in ENVIRON:
        HANLP_JVM_XMS = ENVIRON["HANLP_JVM_XMS"]
    else:
        HANLP_JVM_XMS = "512m"
    if "HANLP_JVM_XMX" in ENVIRON:
        HANLP_JVM_XMX = ENVIRON["HANLP_JVM_XMX"]
    else:
        # Upper bound on the memory available to the JVM; usually the limit
        # is never actually reached.
        HANLP_JVM_XMX = "8g"
    PATH_CONFIG = os.path.join(STATIC_ROOT, 'hanlp.properties')
    # Validate the resolved layout before touching the JVM.
    if not os.path.exists(HANLP_JAR_PATH):
        raise ValueError("配置错误: HANLP_JAR_PATH=%s 不存在" % HANLP_JAR_PATH)
    elif not os.path.isfile(HANLP_JAR_PATH) or not HANLP_JAR_PATH.endswith(
            '.jar'):
        raise ValueError("配置错误: HANLP_JAR_PATH=%s 不是jar文件" % HANLP_JAR_PATH)
    elif not os.path.exists(STATIC_ROOT):
        raise ValueError("配置错误: STATIC_ROOT=%s 不存在" % STATIC_ROOT)
    elif not os.path.isdir(HANLP_DATA_PATH):
        if HANLP_DATA_PATH.startswith(STATIC_ROOT):
            raise ValueError("配置错误: STATIC_ROOT=%s 目录下没有data文件夹" % STATIC_ROOT)
        else:
            raise ValueError("配置错误: 数据包 %s 不存在,请修改配置文件中的root" %
                             HANLP_DATA_PATH)
    elif not os.path.isfile(PATH_CONFIG):
        raise ValueError("配置错误: STATIC_ROOT=%s 目录下没有hanlp.properties" %
                         STATIC_ROOT)
    else:
        # The version is the jar file name with the 'hanlp-' prefix and the
        # '.jar' suffix stripped.
        HANLP_JAR_VERSION = os.path.basename(
            HANLP_JAR_PATH)[len('hanlp-'):-len('.jar')]

        if HANLP_VERBOSE:
            print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
            print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
            print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))

    java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
    pathsep = os.pathsep
    jvm_path = None
    try:
        jvm_path = getDefaultJVMPath()
    except JVMNotFoundException as e:
        eprint('找不到Java,请安装JDK8:%s' % java_url)
        browser_open(java_url)
        exit(1)
    except JVMNotSupportedException as e:
        eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
        browser_open(java_url)
        exit(1)
    if platform.system().startswith('CYGWIN'):
        # Cygwin is running the host machine's JVM, so every path has to be
        # translated into the real (Windows) path.
        if not jvm_path.startswith(
                '/cygdrive'):
            pathsep = ';'
            if STATIC_ROOT.startswith('/usr/lib'):
                cygwin_root = os.popen('cygpath -w /').read().strip().replace(
                    '\\', '/')
                STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
                HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
                PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
            elif STATIC_ROOT.startswith('/cygdrive'):
                # '/cygdrive/c/...' -> 'C:/...'
                driver = STATIC_ROOT.split('/')
                cygwin_driver = '/'.join(driver[:3])
                win_driver = driver[2].upper() + ':'
                HANLP_JAR_PATH = HANLP_JAR_PATH.replace(
                    cygwin_driver, win_driver)
                STATIC_ROOT = STATIC_ROOT.replace(cygwin_driver, win_driver)
                PATH_CONFIG = PATH_CONFIG.replace(cygwin_driver, win_driver)
    JAVA_JAR_CLASSPATH = "-Djava.class.path=%s%s%s" % (HANLP_JAR_PATH, pathsep,
                                                       STATIC_ROOT)
    # Load plugin jars
    for jar in glob.glob(os.path.join(STATIC_ROOT, '*.jar')):
        if HANLP_JAR_PATH.endswith(jar):
            continue
        # glob already returns every match prefixed with STATIC_ROOT, so the
        # match is appended as is. Joining STATIC_ROOT onto it again yields a
        # doubled, non-existent path whenever STATIC_ROOT is not an absolute
        # POSIX path (a relative root, or a translated 'C:/...' root).
        JAVA_JAR_CLASSPATH = JAVA_JAR_CLASSPATH + pathsep + jar
    if HANLP_VERBOSE: print("设置 JAVA_JAR_CLASSPATH [%s]" % JAVA_JAR_CLASSPATH)
    # Start the JVM
    startJVM(jvm_path,
             JAVA_JAR_CLASSPATH,
             "-Xms%s" % HANLP_JVM_XMS,
             "-Xmx%s" % HANLP_JVM_XMX,
             convertStrings=True)
    # Make sure startup actually worked
    try:
        JClass('com.hankcs.hanlp.HanLP')
    except java.lang.NoClassDefFoundError as e:
        from pyhanlp.static import install_hanlp_jar
        eprint('找不到jar,可能由于安装路径含有中文,或者你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
        os.remove(HANLP_JAR_PATH)
        install_hanlp_jar()
        eprint('下载成功,请重新启动程序。如果问题依然存在,请不要安装到中文路径。')
        exit(1)