コード例 #1
0
    def tag_ltp(self, inp, seged):
        """
        POS-tag text through the remote LTP service and yield the result.

        The request parameters start as a shallow copy of ``self.data`` and
        are POSTed to ``self.ltp_url``.  The response is expected to be
        whitespace-separated ``word_tag`` tokens.  The size of ``inp`` should
        be kept below 10k.

        :param inp: when ``seged`` is true, an iterable of already segmented
            words; otherwise a single piece of text (``unicode`` input is
            encoded as UTF-8, anything else is sent as given)
        :param seged: true if ``inp`` is pre-segmented; the words are then
            wrapped in an LTML document and sent with ``xml_input=true``
        :return: generator of ``(word, tag)`` pairs; ``word`` is decoded from
            UTF-8, ``tag`` is left exactly as the service returned it

        Errors are not raised to the caller: a socket timeout prints
        ``time out``; any other exception (including a token without an
        underscore) prints the input and the exception.  In both cases the
        generator simply stops.
        """
        params = copy.copy(self.data)
        # NOTE(review): this sets the default timeout for every socket created
        # afterwards in the whole process, not only for this request.
        socket.setdefaulttimeout(10)

        if seged:
            inp = [unicode(x).encode('utf-8') for x in inp]
            ltml = LTML()
            ltml.build_from_words(inp)
            params.update({'text': ltml.tostring(), 'xml_input': 'true'})
        else:
            inp = inp.encode('utf-8') if isinstance(inp, unicode) else inp
            # NOTE(review): urlencode() below percent-encodes the value again,
            # so the text goes out double-encoded - confirm the service
            # really expects that before changing it.
            params.update({'text': urllib.quote(inp)})

        params = urllib.urlencode(params)
        try:
            request = urllib2.Request(self.ltp_url)
            response = urllib2.urlopen(request, params)
            try:
                content = response.read().strip()
            finally:
                # Release the connection even if reading fails.
                response.close()
            for token in content.split():
                # Split on the LAST underscore only, so a word that itself
                # contains '_' keeps its full text and the tag stays correct.
                word, tag = token.rsplit('_', 1)
                yield word.decode('utf-8'), tag
        except socket.timeout:
            print('time out')
        except Exception as e:
            print(inp)
            print(e)
コード例 #2
0
def ParsingWithCustomPostags():
    """Submit a sentence with caller-supplied POS tags to the analysis API.

    The (word, tag) pairs are wrapped in an LTML document, POSTed with
    ``pattern=dp`` and ``format=conll``, and the stripped response body is
    printed.  On ``urllib2.HTTPError`` the error's reason is written to
    stderr instead; any other exception propagates.
    """
    tagged_words = [
        ("这", "r"),
        ("是", "v"),
        ("自定义", "a"),
        ("分词", "n"),
        ("结果", "n"),
        ("的", "u"),
        ("示例", "n"),
    ]
    document = LTML()
    document.build_from_words(tagged_words)

    endpoint = "http://ltpapi.voicecloud.cn/analysis/?"
    payload = urllib.urlencode({
        "api_key": "YourApiKey",
        "text": document.tostring(),
        "format": "conll",
        "pattern": "dp",
        "xml_input": "true",
    })

    try:
        reply = urllib2.urlopen(urllib2.Request(endpoint), payload)
        print(reply.read().strip())
    except urllib2.HTTPError as err:
        print >> sys.stderr, err.reason
コード例 #3
0
def ParsingWithCustomPostags():
    """Run the remote analysis on a sentence whose POS tags are given up front.

    Builds an LTML document from fixed (word, tag) pairs, sends it as the
    ``text`` field of a POST request (``xml_input`` is ``"true"``), then
    prints the whitespace-stripped reply.  Only ``urllib2.HTTPError`` is
    handled: its ``reason`` goes to stderr.
    """
    sentence = LTML()
    sentence.build_from_words([
        ("这", "r"),
        ("是", "v"),
        ("自定义", "a"),
        ("分词", "n"),
        ("结果", "n"),
        ("的", "u"),
        ("示例", "n"),
    ])
    ltml_text = sentence.tostring()

    service_url = "http://ltpapi.voicecloud.cn/analysis/?"
    fields = {"api_key": "YourApiKey",
              "text": ltml_text,
              "format": "conll",
              "pattern": "dp",
              "xml_input": "true"}
    body = urllib.urlencode(fields)

    try:
        req = urllib2.Request(service_url)
        raw = urllib2.urlopen(req, body).read()
        print(raw.strip())
    except urllib2.HTTPError as http_err:
        print >> sys.stderr, http_err.reason
コード例 #4
0
def POSTagWithCustomSegmentation():
    """Request POS tags for a sentence that has already been segmented.

    The word list is wrapped in an LTML document and POSTed to the
    ltp-cloud analysis endpoint with ``pattern=pos`` and ``format=plain``.
    The stripped response is printed; on ``urllib2.HTTPError`` the reason
    is written to stderr.
    """
    segments = ["自定义", "分词", "结果", "的", "示例"]
    document = LTML()
    document.build_from_words(segments)
    ltml_text = document.tostring()

    endpoint = "http://api.ltp-cloud.com/analysis/?"
    payload = urllib.urlencode({
        "api_key": "YourApiKey",
        "text": ltml_text,
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",
    })

    try:
        reply = urllib2.urlopen(urllib2.Request(endpoint), payload)
        print(reply.read().strip())
    except urllib2.HTTPError as err:
        print >> sys.stderr, err.reason
コード例 #5
0
def POSTagWithCustomSegmentation():
    """POS-tag a pre-segmented sentence through the remote analysis API.

    Sends the LTML form of a fixed word list as the ``text`` field
    (``xml_input`` set to ``"true"``, ``pattern`` ``"pos"``, ``format``
    ``"plain"``) and prints the reply with surrounding whitespace removed.
    An ``urllib2.HTTPError`` is reported on stderr via its ``reason``.
    """
    sentence = LTML()
    sentence.build_from_words(["自定义", "分词", "结果", "的", "示例"])

    service_url = "http://ltpapi.voicecloud.cn/analysis/?"
    fields = {"api_key": "YourApiKey",
              "text": sentence.tostring(),
              "format": "plain",
              "pattern": "pos",
              "xml_input": "true"}
    body = urllib.urlencode(fields)

    try:
        req = urllib2.Request(service_url)
        raw = urllib2.urlopen(req, body).read()
        print(raw.strip())
    except urllib2.HTTPError as http_err:
        print >> sys.stderr, http_err.reason
コード例 #6
0
ファイル: tfidfname.py プロジェクト: qingyunpkdd/pylib
    def fenci(self):
        """POS-tag a fixed, pre-segmented sample sentence and print the reply.

        The words are wrapped in an LTML document and POSTed with
        ``pattern=pos`` / ``format=plain``.  No instance state is read.  On
        ``urllib2.HTTPError`` the error's reason is written to stderr.
        """
        sample_words = ["自定义", "分词", "结果", "的", "示例"]
        document = LTML()
        document.build_from_words(sample_words)
        ltml_text = document.tostring()

        endpoint = "http://ltpapi.voicecloud.cn/analysis/?"
        # NOTE(review): the API key is hard-coded in source; consider moving
        # it to configuration.
        payload = urllib.urlencode({
            "api_key": "11x3Q768B0mY9KGZ2sZlinNc1n0jFVwfSW2GVVPx",
            "text": ltml_text,
            "format": "plain",
            "pattern": "pos",
            "xml_input": "true",
        })

        try:
            reply = urllib2.urlopen(urllib2.Request(endpoint), payload)
            print(reply.read().strip())
        except urllib2.HTTPError as err:
            print >> sys.stderr, err.reason
コード例 #7
0
# Segment one sentence locally with a Segmentor (model plus an extra lexicon
# file), wrap the resulting words in an LTML document, and prepare the request
# arguments for the LTP cloud analysis endpoint.
LTP_DATA_DIR = '../data/ltp_data'  # path to the LTP model directory
# Path to the word-segmentation model; the model file is named `cws.model`.
cws_model_path = os.path.join(LTP_DATA_DIR,
                              'cws.model')
segmentor = Segmentor()  # create the segmenter instance
# Load the model; per the original note, the second argument is the path to
# your incremental model (here a dictionary text file).
segmentor.load_with_lexicon(
    cws_model_path, '../data/new_dictionary.txt')

line = '虚拟化驱动不正常时网络、存储性能降低。'
words = segmentor.segment(line)  # segment the sentence into words
words_list = list(words)
print(words_list)

# LTML builds the XML for the custom segmentation, which is passed to the
# LTP cloud through `args` below.
ltml = LTML()
ltml.build_from_words(words_list)
xml = ltml.tostring()
#print(xml)

url_get_base = "https://api.ltp-cloud.com/analysis/"

# Request parameters for the input segmented with the custom dictionary.
# NOTE(review): the API key is hard-coded in source; consider moving it to
# configuration.
args = {
    'api_key': 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC',
    'pattern': 'sdp',
    'format': 'json',
    'xml_input': 'true',
    'text': xml
}
# args_others = {
#     'api_key' : 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC',
#     'pattern' : 'sdp',
コード例 #8
0
def CustomSegmentation():
    """Print the LTML string built from a hand-segmented word list."""
    segments = ["天安门", "上", "太阳升"]
    document = LTML()
    document.build_from_words(segments)
    rendered = document.tostring()
    print(rendered)
コード例 #9
0
def CustomPOSTags():
    """Print the LTML string built from words paired with custom POS tags."""
    tagged_words = [
        ("天安门", "N"),
        ("上", "P"),
        ("太阳升", "V"),
    ]
    document = LTML()
    document.build_from_words(tagged_words)
    rendered = document.tostring()
    print(rendered)
コード例 #10
0
def CustomSegmentation():
    """Build an LTML document from pre-segmented words and print it."""
    builder = LTML()
    words = [
        "天安门",
        "上",
        "太阳升",
    ]
    builder.build_from_words(words)
    print(builder.tostring())
コード例 #11
0
def CustomPOSTags():
    """Build an LTML document from (word, tag) pairs and print it."""
    builder = LTML()
    pairs = [("天安门", "N"), ("上", "P"), ("太阳升", "V")]
    builder.build_from_words(pairs)
    output = builder.tostring()
    print(output)