Exemple #1
0
    def install_jar(name, url):
        """Return the local path of jar ``name``, downloading it if absent.

        The target path is ``os.path.join(STATIC_ROOT, name)``. When a file
        already exists there it is returned untouched; otherwise
        ``download(url, dst)`` is called first.

        Args:
            name: File name of the jar.
            url: Location the jar is downloaded from when it is missing.

        Returns:
            The path of the jar under ``STATIC_ROOT``.
        """
        dst = os.path.join(STATIC_ROOT, name)
        if os.path.isfile(dst):
            return dst
        download(url, dst)
        return dst


    install_jar('text-classification-svm-1.0.2.jar', 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
    install_jar('liblinear-1.95.jar', 'http://file.hankcs.com/bin/liblinear-1.95.jar')

    # 载入分类器
    LinearSVMClassifier = SafeJClass(
        'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
    # 保存模型的工具
    IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
    return LinearSVMClassifier(IOUtil.readObjectFrom(model_file_name))
def url_recognition():
    """Run HanLP's URLTokenizer over a sample text and print the results.

    The complete term list is printed first, followed by the ``word`` of
    every term whose ``nature`` equals ``Nature.xu``, one per line.
    """
    sample_text = """HanLP的项目地址是https://github.com/hankcs/HanLP,
            发布地址是https://github.com/hankcs/HanLP/releases,
            我有时候会在www.hankcs.com上面发布一些消息,
            我的微博是http://weibo.com/hankcs/,会同步推送hankcs.com的新闻。
            听说.中国域名开放申请了,但我并没有申请hankcs.中国,因为穷……
                """

    nature_cls = SafeJClass("com.hankcs.hanlp.corpus.tag.Nature")
    tokenizer_cls = SafeJClass("com.hankcs.hanlp.tokenizer.URLTokenizer")

    terms = tokenizer_cls.segment(sample_text)
    print(terms)

    # Keep only the terms tagged with Nature.xu, then print their surface form.
    matched_words = [item.word for item in terms if item.nature == nature_cls.xu]
    for word in matched_words:
        print(word)
Exemple #3
0
def divisionTrainData(trainDataPath, classificationPath):
    """Split a tab-separated training file into one text file per sample.

    The first line of ``trainDataPath`` is treated as a header and skipped.
    Every following non-blank line must hold at least three tab-separated
    fields: an id, the text and a label. The text is appended to
    ``<id>.txt`` inside ``classificationPath/positive`` when the label is
    ``'0'`` and inside ``classificationPath/negetive`` for any other label.

    Args:
        trainDataPath: Path of the UTF-8 encoded training file.
        classificationPath: Directory receiving the two category folders;
            it and the folders are created when missing.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    # Create the category directories. The 'negetive' spelling is kept
    # because the folder name is part of the on-disk layout.
    positivePath = os.path.join(classificationPath, 'positive')
    negetivePath = os.path.join(classificationPath, 'negetive')
    for directory in (positivePath, negetivePath):
        os.makedirs(directory, exist_ok=True)

    # Route every sample into the folder that matches its label.
    with open(trainDataPath, 'r', encoding='utf-8') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            if not line.strip():
                # A blank (e.g. trailing) line carries no sample.
                continue
            fields = line.strip('\n').split('\t')
            label = fields[2]
            targetDir = positivePath if label == '0' else negetivePath
            targetFile = os.path.join(targetDir, fields[0] + '.txt')
            # Append so that several rows sharing an id end up in one file.
            with open(targetFile, 'a', encoding='utf-8') as fout:
                fout.write(fields[1])
    print('成功加载训练集。')


##########################################################################################
# Load the classifier classes
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# Load the tokenizer classes
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

##########################################################################################

if __name__ == '__main__':
    # Split the raw training file into per-label folders, train a Naive
    # Bayes classifier on those folders, then classify one sample sentence
    # and print the result.
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = NaiveBayesClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    print(classifier.classify("我去挂机了"))
Exemple #4
0
def install_jar(name, filepath, url):
    """Return the path of jar ``name`` inside ``filepath``, fetching it if needed.

    Args:
        name: File name of the jar.
        filepath: Directory the jar is stored in.
        url: Location passed to ``download`` when the jar is not on disk yet.

    Returns:
        ``os.path.join(filepath, name)``.
    """
    target = os.path.join(filepath, name)
    # Only hit the network when the jar is not already present.
    if not os.path.isfile(target):
        download(url, target)
    return target


# Make sure both SVM classifier jars exist under PROJECT_PATH; install_jar
# only downloads a jar when the file is not already there.
install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# Load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# Utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

# Load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

##########################################################################################

if __name__ == '__main__':
    # Split the raw training file into per-label folders, then train a
    # linear SVM classifier on those folders.
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    # Save the model
    # Java method names are case-sensitive: the accessor on the classifier
    # is getModel(), so the lower-case spelling fails with AttributeError.
    model = classifier.getModel()
Exemple #5
0
        fin.readline()
        for sentence in fin.readlines():
            sentence = sentence.strip('\n')
            sentence = sentence.split('\t')
            if (sentence[2] == '0'):
                pf = open(os.path.join(positivePath, sentence[0] + '.txt'),
                          'a+',
                          encoding='utf-8')
                pf.write(sentence[1])
                pf.close()
            else:
                nf = open(os.path.join(negetivePath, sentence[0] + '.txt'),
                          'a+',
                          encoding='utf-8')
                nf.write(sentence[1])
                nf.close()
    print('成功加载训练集。')

##########################################################################################
# Load the classifier classes
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

##########################################################################################

if __name__ == '__main__':
    # Nothing is executed when the module is run as a script.
    pass
Exemple #6
0
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# Load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

# Load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# Utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

# Data set and evaluation classes
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')


##########################################################################################
# Preprocess the data set
def dataPreprocessing(dataPath):
    """Start preprocessing the UTF-8 text file at ``dataPath``.

    NOTE(review): only the first statements of this function are visible
    here; what is done with the opened file, and whether it is closed,
    needs to be confirmed against the full source.
    """
    print("开始修正数据......")
    fi = open(dataPath, "r", encoding="utf-8")
Exemple #7
0
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# Load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

# Load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# Utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

# Data set and evaluation classes
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')


##########################################################################################
# Preprocess the data set
def dataPreprocessing(dataPath):
    """Open the UTF-8 text file at ``dataPath`` and consume its first line.

    NOTE(review): only the first statements of this function are visible
    here; the rest of the body, including where the file is closed, needs
    to be confirmed against the full source.

    Args:
        dataPath: Path of the data file to preprocess.
    """
    # The encoding has to be passed by keyword: the third positional
    # parameter of open() is ``buffering``, and a str there raises TypeError.
    fi = open(dataPath, "r", encoding="utf-8")
    fi.readline()
Exemple #8
0
    print("dest_path:" + dest_path)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
        print("dest_path:" + dest_path)
    return dest_path


# Java classes used below: the Naive Bayes classifier, and IOUtil, whose
# readObjectFrom is used to load a previously saved model.
NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
#                                 'http://hanlp.linrunsoft.com/release/corpus/sogou-text-classification-corpus-mini.zip')

# Local path of the ChnSentiCorp corpus, as returned by ensure_data.
ChnSentiCorp_path = ensure_data('酒店评论情感分析', \
              'http://hanlp.linrunsoft.com/release/corpus/ChnSentiCorp.zip')


def train_or_load_classifier(path):
    """Load a NaiveBayesClassifier from ``path + '.ser'`` or train a new one.

    When the ``.ser`` file exists, the classifier is built from the object
    read by ``IOUtil.readObjectFrom``. Otherwise a fresh classifier is
    trained on the module-level ``ChnSentiCorp_path``; ``path`` itself is
    only used to derive the model file name.

    NOTE(review): the function is cut off after ``getModel()`` in this view;
    presumably the model is then saved and the classifier returned, which
    should be confirmed against the full source.
    """
    model_path = path + '.ser'
    if os.path.isfile(model_path):
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    classifier = NaiveBayesClassifier()
    classifier.train(ChnSentiCorp_path)
    model = classifier.getModel()
Exemple #9
0
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# Neither the ``reload`` builtin nor ``sys.setdefaultencoding`` is available
# on Python 3, hence the version guard.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"

from absl import flags  # absl-py
from absl import logging  # absl-py

FLAGS = flags.FLAGS
import unittest
import threading
import time
from pyhanlp import HanLP, SafeJClass

# Import the class name thread-safely with SafeJClass, outside the thread body
CRFLexicalAnalyzer = SafeJClass(
    "com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer")


class MyThread(threading.Thread):
    def __init__(self, name, counter, analyzer):
        threading.Thread.__init__(self)
        self.thread_name = name
        self.counter = counter
        self.analyzer = analyzer

    def run(self):
        print("Starting " + self.thread_name)
        while self.counter:
            time.sleep(1)
            sentence = self.analyzer.analyze("商品和服务")
            print("%s: %s, seg: %s" %