def tag_ltp(self, inp, seged):
    """POS-tag text through the LTP web service, yielding results lazily.

    The size of ``inp`` should be kept below 10k.

    :param inp: when ``seged`` is true, an iterable of already segmented
        words; otherwise one piece of text (``unicode`` or byte ``str``).
    :param seged: true if ``inp`` is pre-segmented; the words are then sent
        as LTML with ``xml_input`` set to ``'true'``.
    :return: generator of ``(word, tag)`` pairs, one per
        whitespace-separated token of the response. ``word`` is the part
        before the last ``'_'`` decoded from UTF-8, ``tag`` the part after.

    Errors raised while requesting or parsing the response are printed to
    stdout and end the iteration instead of propagating to the caller.
    """
    params = copy.copy(self.data)
    # NOTE: this sets the default timeout for every new socket in the
    # process, not only for the request issued below.
    socket.setdefaulttimeout(10)
    if seged:
        # Byte strings are passed through untouched (as in the unsegmented
        # branch); calling unicode() on them would fail for non-ASCII bytes.
        inp = [
            word if isinstance(word, str) else unicode(word).encode('utf-8')
            for word in inp
        ]
        ltml = LTML()
        ltml.build_from_words(inp)
        params.update({'text': ltml.tostring(), 'xml_input': 'true'})
    else:
        if isinstance(inp, unicode):
            inp = inp.encode('utf-8')
        # NOTE(review): the text is percent-quoted here and then encoded
        # again by urlencode() below; kept as-is because what the service
        # expects is not visible from this code -- confirm.
        params.update({'text': urllib.quote(inp)})
    params = urllib.urlencode(params)
    try:
        request = urllib2.Request(self.ltp_url)
        response = urllib2.urlopen(request, params)
        try:
            content = response.read().strip()
        finally:
            # Release the connection even when reading fails.
            response.close()
        for token in content.split():
            # Split on the LAST underscore only, so a word that itself
            # contains '_' keeps its full text and the tag stays intact.
            word, tag = token.rsplit('_', 1)
            yield word.decode('utf-8'), tag
    except socket.timeout:
        print('time out')
    except Exception as e:
        # Best effort: report the offending input and the error, then stop.
        print(inp)
        print(e)
def ParsingWithCustomPostags():
    """Post a hand-tagged example sentence to the LTP web API and print the reply.

    The sentence is given as ``(word, tag)`` pairs, serialised to LTML and
    sent with ``pattern`` "dp" and ``format`` "conll". The stripped response
    body goes to stdout; on an HTTP error its reason goes to stderr.
    """
    tagged_words = [
        ("这", "r"),
        ("是", "v"),
        ("自定义", "a"),
        ("分词", "n"),
        ("结果", "n"),
        ("的", "u"),
        ("示例", "n"),
    ]
    document = LTML()
    document.build_from_words(tagged_words)

    endpoint = "http://ltpapi.voicecloud.cn/analysis/?"
    form = {
        "api_key": "YourApiKey",
        "text": document.tostring(),
        "format": "conll",
        "pattern": "dp",
        "xml_input": "true",
    }
    body = urllib.urlencode(form)

    try:
        # Supplying a body makes urlopen issue a POST request.
        reply = urllib2.urlopen(urllib2.Request(endpoint), body)
        print(reply.read().strip())
    except urllib2.HTTPError as err:
        print >> sys.stderr, err.reason
def ParsingWithCustomPostags():
    """Send a pre-tagged sample sentence as LTML to the LTP service.

    Builds the LTML document from ``(word, tag)`` tuples, posts it with
    ``xml_input`` "true", ``pattern`` "dp" and ``format`` "conll", and prints
    the stripped response. An ``HTTPError`` is reported on stderr by reason.
    """
    sample = [("这", "r"), ("是", "v"), ("自定义", "a"), ("分词", "n"),
              ("结果", "n"), ("的", "u"), ("示例", "n")]
    builder = LTML()
    builder.build_from_words(sample)
    ltml_text = builder.tostring()

    service_url = "http://ltpapi.voicecloud.cn/analysis/?"
    encoded = urllib.urlencode({
        "api_key": "YourApiKey",
        "text": ltml_text,
        "format": "conll",
        "pattern": "dp",
        "xml_input": "true",
    })

    try:
        result = urllib2.urlopen(urllib2.Request(service_url), encoded)
        output = result.read().strip()
        print(output)
    except urllib2.HTTPError as http_err:
        print >> sys.stderr, http_err.reason
def POSTagWithCustomSegmentation():
    """Post a pre-segmented example sentence to the LTP API and print the reply.

    The word list is serialised to LTML and sent with ``pattern`` "pos" and
    ``format`` "plain". The stripped response body goes to stdout; on an
    HTTP error its reason goes to stderr.
    """
    segments = ["自定义", "分词", "结果", "的", "示例"]
    document = LTML()
    document.build_from_words(segments)

    endpoint = "http://api.ltp-cloud.com/analysis/?"
    form = {
        "api_key": "YourApiKey",
        "text": document.tostring(),
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",
    }
    body = urllib.urlencode(form)

    try:
        # Supplying a body makes urlopen issue a POST request.
        reply = urllib2.urlopen(urllib2.Request(endpoint), body)
        print(reply.read().strip())
    except urllib2.HTTPError as err:
        print >> sys.stderr, err.reason
def POSTagWithCustomSegmentation():
    """Send an already segmented sample sentence as LTML to the LTP service.

    Posts the LTML text with ``xml_input`` "true", ``pattern`` "pos" and
    ``format`` "plain", then prints the stripped response. An ``HTTPError``
    is reported on stderr by reason.
    """
    builder = LTML()
    builder.build_from_words(["自定义", "分词", "结果", "的", "示例"])
    ltml_text = builder.tostring()

    service_url = "http://ltpapi.voicecloud.cn/analysis/?"
    encoded = urllib.urlencode({
        "api_key": "YourApiKey",
        "text": ltml_text,
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",
    })

    try:
        result = urllib2.urlopen(urllib2.Request(service_url), encoded)
        output = result.read().strip()
        print(output)
    except urllib2.HTTPError as http_err:
        print >> sys.stderr, http_err.reason
def fenci(self):
    """Post a fixed, pre-segmented sentence to the LTP API and print the reply.

    The word list is serialised to LTML and sent with ``pattern`` "pos" and
    ``format`` "plain". The stripped response body goes to stdout; on an
    HTTP error its reason goes to stderr. ``self`` is not used.
    """
    segments = ["自定义", "分词", "结果", "的", "示例"]
    document = LTML()
    document.build_from_words(segments)

    endpoint = "http://ltpapi.voicecloud.cn/analysis/?"
    form = {
        # NOTE(review): the API key is embedded in the source.
        "api_key": "11x3Q768B0mY9KGZ2sZlinNc1n0jFVwfSW2GVVPx",
        "text": document.tostring(),
        "format": "plain",
        "pattern": "pos",
        "xml_input": "true",
    }
    body = urllib.urlencode(form)

    try:
        reply = urllib2.urlopen(urllib2.Request(endpoint), body)
        print(reply.read().strip())
    except urllib2.HTTPError as err:
        print >> sys.stderr, err.reason
LTP_DATA_DIR = '../data/ltp_data' # ltp模型目录的路径 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` segmentor = Segmentor() # 初始化实例 segmentor.load_with_lexicon( cws_model_path, '../data/new_dictionary.txt') # 加载模型,第二个参数是您的增量模型路径 line = '虚拟化驱动不正常时网络、存储性能降低。' words = segmentor.segment(line) # 分词 words_list = list(words) print(words_list) # LTML用于构建自定义分词的xml,用于向LTP云传入args ltml = LTML() ltml.build_from_words(words_list) xml = ltml.tostring() #print(xml) url_get_base = "https://api.ltp-cloud.com/analysis/" # 这个是加入自定义词典的参数 args = { 'api_key': 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC', 'pattern': 'sdp', 'format': 'json', 'xml_input': 'true', 'text': xml } # args_others = { # 'api_key' : 'a1R923E7s37daeNz7dsMeXiTexWGoookJX2HONwC', # 'pattern' : 'sdp',
def CustomSegmentation():
    """Print the LTML text built from a fixed list of pre-segmented words."""
    segments = ["天安门", "上", "太阳升"]
    document = LTML()
    document.build_from_words(segments)
    rendered = document.tostring()
    print(rendered)
def CustomPOSTags():
    """Print the LTML text built from a fixed list of ``(word, tag)`` pairs."""
    tagged_words = [
        ("天安门", "N"),
        ("上", "P"),
        ("太阳升", "V"),
    ]
    document = LTML()
    document.build_from_words(tagged_words)
    rendered = document.tostring()
    print(rendered)
def CustomSegmentation():
    """Build an LTML document from three sample words and print it."""
    builder = LTML()
    sample_words = ["天安门", "上", "太阳升"]
    builder.build_from_words(sample_words)
    ltml_text = builder.tostring()
    print(ltml_text)
def CustomPOSTags():
    """Build an LTML document from three sample ``(word, tag)`` tuples and print it."""
    builder = LTML()
    sample_pairs = [("天安门", "N"), ("上", "P"), ("太阳升", "V")]
    builder.build_from_words(sample_pairs)
    ltml_text = builder.tostring()
    print(ltml_text)