def ParsingWithCustomPostags():
    """Request a dependency parse for a sentence with caller-supplied POS tags.

    Builds an LTML document from a fixed list of ``(word, tag)`` pairs, POSTs
    its XML to the remote analysis endpoint with ``pattern=dp`` and
    ``format=conll``, and prints the stripped response body to stdout.

    An ``urllib2.HTTPError`` is reported on stderr (its ``reason``) instead of
    being raised; any other exception propagates to the caller.
    """
    ltml = LTML()
    ltml.build_from_words([("这",       "r"),
                           ("是",       "v"),
                           ("自定义",   "a"),
                           ("分词",     "n"),
                           ("结果",     "n"),
                           ("的",       "u"),
                           ("示例",     "n")])
    xml = ltml.tostring()

    uri_base = "http://ltpapi.voicecloud.cn/analysis/?"

    data = {
        "api_key": "YourApiKey",   # placeholder, replace with a real key
        "text": xml,
        "format": "conll",
        "pattern": "dp",
        "xml_input": "true",       # tells the service that "text" is LTML XML
    }

    # Passing a body to urlopen() makes this a POST request.
    params = urllib.urlencode(data)

    try:
        response = urllib2.urlopen(urllib2.Request(uri_base), params)
        try:
            content = response.read().strip()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        print(content)
    except urllib2.HTTPError as e:
        sys.stderr.write("%s\n" % (e.reason,))
def ParsingWithCustomPostags():
    """Request a dependency parse for a sentence with caller-supplied POS tags.

    Builds an LTML document from a fixed list of ``(word, tag)`` pairs, POSTs
    its XML to the remote analysis endpoint with ``pattern=dp`` and
    ``format=conll``, and prints the stripped response body to stdout.

    An ``urllib2.HTTPError`` is reported on stderr (its ``reason``) instead of
    being raised; any other exception propagates to the caller.
    """
    tagged_words = [("这", "r"), ("是", "v"), ("自定义", "a"), ("分词", "n"),
                    ("结果", "n"), ("的", "u"), ("示例", "n")]
    ltml = LTML()
    ltml.build_from_words(tagged_words)
    xml = ltml.tostring()

    uri_base = "http://ltpapi.voicecloud.cn/analysis/?"

    data = {
        "api_key": "YourApiKey",   # placeholder, replace with a real key
        "text": xml,
        "format": "conll",
        "pattern": "dp",
        "xml_input": "true",       # tells the service that "text" is LTML XML
    }

    # Passing a body to urlopen() makes this a POST request.
    params = urllib.urlencode(data)

    try:
        response = urllib2.urlopen(urllib2.Request(uri_base), params)
        try:
            content = response.read().strip()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        print(content)
    except urllib2.HTTPError as e:
        sys.stderr.write("%s\n" % (e.reason,))
    def analysis(self, input, opt = None):
        """Send raw text or an LTML document to the service and parse the reply.

        :param input: either a ``str`` holding raw text, or an ``LTML``
            instance whose XML (serialised with ``self.encoding``) is sent.
        :param opt: value sent as the ``t`` form field; ``LTPOption.ALL`` is
            used when it is ``None``.
        :return: an ``LTML`` built from the stripped response body, or
            ``None`` when ``input`` is neither a ``str`` nor an ``LTML``
            (note that in Python 2 a ``unicode`` object is not a ``str``).
        """
        # `addr` and `uris` are defined outside this method.
        request = urllib2.Request(addr + uris)
        request.add_header(
            "Authorization",
            "Basic %s" % (base64.encodestring(self.au).replace("\n", "")))

        # The two supported input kinds differ only in the payload text and
        # in the 'x' flag ('n' for plain text, 'y' for LTML XML input).
        if isinstance(input, str):
            text, xml_flag = input, 'n'
        elif isinstance(input, LTML):
            text, xml_flag = input.tostring(self.encoding), 'y'
        else:
            # Unsupported input types keep yielding None, as before.
            return None

        params = urllib.urlencode({
            's': text,
            'x': xml_flag,
            'c': self.encoding,
            't': (opt if opt is not None else LTPOption.ALL),
        })

        result = urllib2.urlopen(request, params)
        try:
            content = result.read().strip()
        finally:
            # Release the connection even if reading the body fails.
            result.close()
        return LTML(content)
Exemple #4
0
    def tag_ltp(self, inp, seged):
        """
        POS-tag ``inp`` through the web service at ``self.ltp_url``.

        This is a generator: nothing is sent until it is first iterated.
        The size of ``inp`` should be kept below 10k.

        :param inp: when ``seged`` is true, an iterable of already segmented
            words; otherwise a single text string (a ``unicode`` value is
            encoded as UTF-8 before sending).
        :param seged: whether ``inp`` is already segmented into words.
        :return: yields ``(word, tag)`` pairs, one per whitespace-separated
            ``word_tag`` token of the response; ``word`` is decoded from
            UTF-8. On a socket timeout or any other ``Exception`` a message
            is printed and the generator simply stops.
        """

        params = copy.copy(self.data)
        # NOTE(review): this sets the timeout for every socket in the
        # process, not just this request.
        socket.setdefaulttimeout(10)

        if seged:
            inp = [unicode(x).encode('utf-8') for x in inp]
            ltml = LTML()
            ltml.build_from_words(inp)
            params.update({'text': ltml.tostring(), 'xml_input': 'true'})

        else:
            inp = inp.encode('utf-8') if isinstance(inp, unicode) else inp
            # NOTE(review): urlencode() below quotes values again, so this
            # text is percent-encoded twice -- confirm the service expects it.
            params.update({'text': urllib.quote(inp)})

        params = urllib.urlencode(params)
        try:
            request = urllib2.Request(self.ltp_url)
            response = urllib2.urlopen(request, params)
            try:
                content = response.read().strip()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
            for token in content.split():
                fields = token.split('_')
                yield fields[0].decode('utf-8'), fields[1]
        except socket.timeout:
            print('time out')
        except Exception as e:
            print(inp)
            print(e)
def POSTagWithCustomSegmentation():
    """Request POS tags for a sentence that is already segmented into words.

    Builds an LTML document from a fixed word list, POSTs its XML to the
    remote analysis endpoint with ``pattern=pos`` and ``format=plain``, and
    prints the stripped response body to stdout.

    An ``urllib2.HTTPError`` is reported on stderr (its ``reason``) instead of
    being raised; any other exception propagates to the caller.
    """
    ltml = LTML()
    ltml.build_from_words(["自定义", "分词", "结果", "的", "示例"])
    xml = ltml.tostring()

    uri_base = "http://api.ltp-cloud.com/analysis/?"

    data = {
        "api_key": "YourApiKey",   # placeholder, replace with a real key
        "text": xml,
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",       # tells the service that "text" is LTML XML
    }

    # Passing a body to urlopen() makes this a POST request.
    params = urllib.urlencode(data)

    try:
        response = urllib2.urlopen(urllib2.Request(uri_base), params)
        try:
            content = response.read().strip()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        print(content)
    except urllib2.HTTPError as e:
        sys.stderr.write("%s\n" % (e.reason,))
Exemple #6
0
def pos(input):
    """POS-tag ``input`` via the service at ``ServiceURL``.

    Posts ``input`` with the ``LTPOption.POS`` task, parses the response as
    LTML, prints each sentence's items joined by ``|`` and returns all items
    as one flat list.

    :param input: text sent as the ``s`` form field.
    :return: flat list of UTF-8 encoded items returned by
        ``get_words_by_pos`` for every sentence, in document order. An empty
        list is returned when the HTTP request itself fails (the error is
        printed).
    """
    payload = {
        's': input,
        'x': 'n',
        'c': 'utf-8',
        't': LTPOption.POS,
    }
    try:
        r = requests.post(ServiceURL, data=payload)
    except Exception as e:
        # Without a response there is nothing to parse; previously execution
        # continued and failed with a NameError on `r`.
        print(e)
        return []

    result = LTML(r.content)
    pos_result = []
    for pid in xrange(result.count_paragraph()):
        for sid in xrange(result.count_sentence(pid)):
            # Query each sentence once and reuse it for printing and output.
            tagged = [word.encode('utf8')
                      for word in result.get_words_by_pos(pid, sid)]
            print("|".join(tagged))
            pos_result.extend(tagged)
    return pos_result
Exemple #7
0
def tokenizer(input):
    """Segment ``input`` into words via the service at ``ServiceURL``.

    Posts ``input`` with the ``LTPOption.WS`` task and parses the response
    as LTML.

    :param input: text sent as the ``s`` form field.
    :return: a 3-tuple ``(original_sentences, tokenizer_sentences,
        ws_result)``: the value of ``get_sentences`` for each sentence, the
        UTF-8 encoded word list of each sentence, and all words flattened
        into one list. Three empty lists are returned when the HTTP request
        itself fails (the error is printed).
    """
    payload = {
        's': input,
        'x': 'n',
        'c': 'utf-8',
        't': LTPOption.WS,
    }
    try:
        r = requests.post(ServiceURL, data=payload)
    except Exception as e:
        # Without a response there is nothing to parse; previously execution
        # continued and failed with a NameError on `r`.
        print(e)
        return ([], [], [])

    result = LTML(r.content)
    original_sentences = []
    tokenizer_sentences = []
    ws_result = []
    for pid in xrange(result.count_paragraph()):
        for sid in xrange(result.count_sentence(pid)):
            words = [word.encode('utf8')
                     for word in result.get_words(pid, sid)]
            tokenizer_sentences.append(words)
            original_sentences.append(result.get_sentences(pid, sid))
            # extend() copies the items, so the flat list stays independent
            # of the per-sentence list.
            ws_result.extend(words)
    return (original_sentences, tokenizer_sentences, ws_result)
Exemple #8
0
def POSTagWithCustomSegmentation():
    """Request POS tags for a sentence that is already segmented into words.

    Builds an LTML document from a fixed word list, POSTs its XML to the
    remote analysis endpoint with ``pattern=pos`` and ``format=plain``, and
    prints the stripped response body to stdout.

    An ``urllib2.HTTPError`` is reported on stderr (its ``reason``) instead of
    being raised; any other exception propagates to the caller.
    """
    segmented_words = ["自定义", "分词", "结果", "的", "示例"]
    ltml = LTML()
    ltml.build_from_words(segmented_words)
    xml = ltml.tostring()

    uri_base = "http://ltpapi.voicecloud.cn/analysis/?"

    data = {
        "api_key": "YourApiKey",   # placeholder, replace with a real key
        "text": xml,
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",       # tells the service that "text" is LTML XML
    }

    # Passing a body to urlopen() makes this a POST request.
    params = urllib.urlencode(data)

    try:
        response = urllib2.urlopen(urllib2.Request(uri_base), params)
        try:
            content = response.read().strip()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        print(content)
    except urllib2.HTTPError as e:
        sys.stderr.write("%s\n" % (e.reason,))
Exemple #9
0
    def fenci(self):
        """POS-tag a fixed, pre-segmented sentence and print the result.

        Builds an LTML document from a hard-coded word list, POSTs its XML to
        the remote analysis endpoint with ``pattern=pos`` and
        ``format=plain``, and prints the stripped response body to stdout.
        No instance state is read.

        An ``urllib2.HTTPError`` is reported on stderr (its ``reason``)
        instead of being raised; any other exception propagates.
        """
        ltml = LTML()
        ltml.build_from_words(["自定义", "分词", "结果", "的", "示例"])
        xml = ltml.tostring()

        uri_base = "http://ltpapi.voicecloud.cn/analysis/?"

        data = {
            # NOTE(review): API key is hard-coded in source; consider loading
            # it from configuration or the environment instead.
            "api_key": "11x3Q768B0mY9KGZ2sZlinNc1n0jFVwfSW2GVVPx",
            "text": xml,
            "format": "plain",
            "pattern": "pos",
            "xml_input": "true",   # tells the service that "text" is LTML XML
        }

        # Passing a body to urlopen() makes this a POST request.
        params = urllib.urlencode(data)

        try:
            response = urllib2.urlopen(urllib2.Request(uri_base), params)
            try:
                content = response.read().strip()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
            print(content)
        except urllib2.HTTPError as e:
            sys.stderr.write("%s\n" % (e.reason,))
# Run the custom word segmentation first
LTP_DATA_DIR = '../data/ltp_data'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR,
                              'cws.model')  # path to the segmentation model; the model file is named `cws.model`
segmentor = Segmentor()  # create the segmentor instance
segmentor.load_with_lexicon(
    cws_model_path, '../data/new_dictionary.txt')  # load the model; the second argument is the path to your incremental model

line = '虚拟化驱动不正常时网络、存储性能降低。'
words = segmentor.segment(line)  # segment the sentence into words
words_list = list(words)
print(words_list)

# LTML builds the XML for the custom segmentation, which is passed to the LTP cloud as a request argument
ltml = LTML()
ltml.build_from_words(words_list)
xml = ltml.tostring()
#print(xml)

url_get_base = "https://api.ltp-cloud.com/analysis/"

# Request parameters that carry the custom-dictionary segmentation
# NOTE(review): the API key below is hard-coded in source; consider loading it
# from configuration or the environment instead.
args = {
    'api_key': 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC',
    'pattern': 'sdp',
    'format': 'json',
    'xml_input': 'true',
    'text': xml
}
# args_others = {
Exemple #11
0
def CustomSegmentation():
    """Build an LTML document from pre-segmented words and print its XML."""
    segmented_words = ["天安门", "上", "太阳升"]
    document = LTML()
    document.build_from_words(segmented_words)
    serialized = document.tostring()
    print(serialized)
Exemple #12
0
def CustomPOSTags():
    """Build an LTML document from (word, POS tag) pairs and print its XML."""
    tagged_words = [
        ("天安门", "N"),
        ("上", "P"),
        ("太阳升", "V"),
    ]
    document = LTML()
    document.build_from_words(tagged_words)
    serialized = document.tostring()
    print(serialized)
def CustomSegmentation():
    """Build an LTML document from pre-segmented words and print its XML."""
    segmented_words = ["天安门", "上", "太阳升"]
    document = LTML()
    document.build_from_words(segmented_words)
    serialized = document.tostring()
    print(serialized)
def CustomPOSTags():
    """Build an LTML document from (word, POS tag) pairs and print it prettified."""
    tagged_words = [("天安门", "N"), ("上", "P"), ("太阳升", "V")]
    document = LTML()
    document.build_from_words(tagged_words)
    pretty = document.prettify()
    print(pretty)