def ParsingWithCustomPostags(): ltml = LTML() ltml.build_from_words([("这", "r"), ("是", "v"), ("自定义", "a"), ("分词", "n"), ("结果", "n"), ("的", "u"), ("示例", "n")]) xml = ltml.tostring() uri_base = "http://ltpapi.voicecloud.cn/analysis/?" data = { "api_key" : "YourApiKey", "text" : xml, "format" : "conll", "pattern" : "dp", "xml_input" : "true" } params = urllib.urlencode(data) try: request = urllib2.Request(uri_base) response = urllib2.urlopen(request, params) content = response.read().strip() print content except urllib2.HTTPError, e: print >> sys.stderr, e.reason
def ParsingWithCustomPostags(): ltml = LTML() ltml.build_from_words([("这", "r"), ("是", "v"), ("自定义", "a"), ("分词", "n"), ("结果", "n"), ("的", "u"), ("示例", "n")]) xml = ltml.tostring() uri_base = "http://ltpapi.voicecloud.cn/analysis/?" data = { "api_key": "YourApiKey", "text": xml, "format": "conll", "pattern": "dp", "xml_input": "true" } params = urllib.urlencode(data) try: request = urllib2.Request(uri_base) response = urllib2.urlopen(request, params) content = response.read().strip() print content except urllib2.HTTPError, e: print >> sys.stderr, e.reason
def analysis(self, input, opt = None):
    """POST *input* to the LTP analysis service and return the parsed result.

    :param input: raw text (``str``) or an already-built :class:`LTML` document
    :param opt: analysis target passed as the ``t`` field; defaults to
        ``LTPOption.ALL`` when ``None``
    :return: an :class:`LTML` built from the server's response body
    :raises TypeError: if *input* is neither ``str`` nor ``LTML``
        (previously this case silently returned ``None``)
    """
    # NOTE(review): `addr` and `uris` are free names, presumably module-level
    # constants defined elsewhere in this file — confirm.
    request = urllib2.Request(addr + uris)
    # HTTP Basic auth; encodestring() inserts newlines that must be stripped.
    token = base64.encodestring(self.au).replace("\n", "")
    request.add_header("Authorization", "Basic %s" % token)

    # The two original branches differed only in the text payload and the
    # xml-input flag, so the request logic is shared below.
    if isinstance(input, str):
        text, xml_flag = input, 'n'
    elif isinstance(input, LTML):
        text, xml_flag = input.tostring(self.encoding), 'y'
    else:
        raise TypeError("input must be str or LTML, got %r" % type(input))

    data = {
        's': text,
        'x': xml_flag,
        'c': self.encoding,
        't': (opt if opt is not None else LTPOption.ALL),
    }
    params = urllib.urlencode(data)
    result = urllib2.urlopen(request, params)
    content = result.read().strip()
    return LTML(content)
def tag_ltp(self, inp, seged): """ pos tag using ltp, size of inp should be limited to lower than 10k :param inp: :param seged: :return: """ params = copy.copy(self.data) socket.setdefaulttimeout(10) if seged: inp = map(lambda x: unicode(x).encode('utf-8'), inp) ltml = LTML() ltml.build_from_words(inp) params.update({'text': ltml.tostring(), 'xml_input': 'true'}) else: inp = inp.encode('utf-8') if isinstance(inp, unicode) else inp params.update({'text': urllib.quote(inp)}) params = urllib.urlencode(params) try: request = urllib2.Request(self.ltp_url) content = urllib2.urlopen(request, params).read().strip() for r in content.split(): yield r.split('_')[0].decode('utf-8'), r.split('_')[1] except socket.timeout: print 'time out' except Exception, e: print inp print e
def POSTagWithCustomSegmentation(): ltml = LTML() ltml.build_from_words(["自定义", "分词", "结果", "的", "示例"]) xml = ltml.tostring() uri_base = "http://api.ltp-cloud.com/analysis/?" data = {"api_key": "YourApiKey", "text": xml, "format": "plain", "pattern": "pos", "xml_input": "true"} params = urllib.urlencode(data) try: request = urllib2.Request(uri_base) response = urllib2.urlopen(request, params) content = response.read().strip() print content except urllib2.HTTPError, e: print >> sys.stderr, e.reason
def pos(input): payload = { 's': input, 'x': 'n', 'c': 'utf-8', 't': LTPOption.POS } try: r = requests.post(ServiceURL, data=payload) except Exception as e: print e result = LTML(r.content) pids = result.count_paragraph() pos_result = [] for pid in xrange(pids): for sid in xrange(result.count_sentence(pid)): print "|".join( [word.encode('utf8') for word in result.get_words_by_pos(pid, sid)]) pos_result = pos_result + [word.encode('utf8') for word in result.get_words_by_pos(pid, sid)] return pos_result
def tokenizer(input): payload = { 's': input, 'x': 'n', 'c': 'utf-8', 't': LTPOption.WS } try: r = requests.post(ServiceURL, data=payload) except Exception as e: print e result = LTML(r.content) tokenizer_sentences = [] pids = result.count_paragraph() ws_result = [] original_sentences = [] for pid in xrange(pids): for sid in xrange(result.count_sentence(pid)): tokenizer_sentences.append( [word.encode('utf8') for word in result.get_words(pid, sid)]) original_sentences.append(result.get_sentences(pid, sid)) ws_result = ws_result + [word.encode( 'utf8') for word in result.get_words(pid, sid)] return (original_sentences, tokenizer_sentences, ws_result)
def POSTagWithCustomSegmentation(): ltml = LTML() ltml.build_from_words(["自定义", "分词", "结果", "的", "示例"]) xml = ltml.tostring() uri_base = "http://ltpapi.voicecloud.cn/analysis/?" data = { "api_key": "YourApiKey", "text": xml, "format": "plain", "pattern": "pos", "xml_input": "true" } params = urllib.urlencode(data) try: request = urllib2.Request(uri_base) response = urllib2.urlopen(request, params) content = response.read().strip() print content except urllib2.HTTPError, e: print >> sys.stderr, e.reason
def fenci(self): ltml = LTML() ltml.build_from_words(["自定义", "分词", "结果", "的", "示例"]) xml = ltml.tostring() uri_base = "http://ltpapi.voicecloud.cn/analysis/?" data = { "api_key": "11x3Q768B0mY9KGZ2sZlinNc1n0jFVwfSW2GVVPx", "text": xml, "format": "plain", "pattern": "pos", "xml_input": "true" } params = urllib.urlencode(data) try: request = urllib2.Request(uri_base) response = urllib2.urlopen(request, params) content = response.read().strip() print content except urllib2.HTTPError, e: print >> sys.stderr, e.reason
# 先进行自定义的分词 LTP_DATA_DIR = '../data/ltp_data' # ltp模型目录的路径 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` segmentor = Segmentor() # 初始化实例 segmentor.load_with_lexicon( cws_model_path, '../data/new_dictionary.txt') # 加载模型,第二个参数是您的增量模型路径 line = '虚拟化驱动不正常时网络、存储性能降低。' words = segmentor.segment(line) # 分词 words_list = list(words) print(words_list) # LTML用于构建自定义分词的xml,用于向LTP云传入args ltml = LTML() ltml.build_from_words(words_list) xml = ltml.tostring() #print(xml) url_get_base = "https://api.ltp-cloud.com/analysis/" # 这个是加入自定义词典的参数 args = { 'api_key': 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC', 'pattern': 'sdp', 'format': 'json', 'xml_input': 'true', 'text': xml } # args_others = {
def CustomSegmentation(): ltml = LTML() ltml.build_from_words(["天安门", "上", "太阳升"]) print ltml.tostring()
def CustomPOSTags(): ltml = LTML() ltml.build_from_words([("天安门", "N"), ("上", "P"), ("太阳升", "V")]) print ltml.tostring()
def CustomPOSTags(): ltml = LTML() ltml.build_from_words([("天安门", "N"), ("上", "P"), ("太阳升", "V")]) print ltml.prettify()