# Shared imports assumed by the analyze() variants in this file (Python 2):
#   import json
#   from google.protobuf import json_format, text_format
#   import nlp_pb2
#   import lang_pb2

def analyze(self, target_text):
    in_text = nlp_pb2.InputText()
    try:
        in_text.text = target_text
    except Exception:
        # Fall back to EUC-KR input and re-encode it as UTF-8.
        target_text = unicode(target_text, 'euc-kr').encode('utf-8')
        in_text.text = target_text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.use_space = self.args.space
    in_text.level = 0
    in_text.keyword_frequency_level = 0
    ret = self.stub.Analyze(in_text)
    # Result to JSON format
    # json_text = json_format.MessageToJson(ret, True, True)
    # data = json.loads(json_text)
    # self.json_printer.pprint(data)
    result_list = list()
    for idx in range(len(ret.sentences)):
        nlp_word = u''
        morph_word = u''
        # text = ret.sentences[idx].text
        analysis = ret.sentences[idx].morps
        for ana_idx in range(len(analysis)):
            if analysis[ana_idx].type in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                # Restore the citation form of predicates by appending '다'.
                nlp_word += u' {0}다'.format(analysis[ana_idx].lemma)
                morph_word += u' {0}다/{1}'.format(analysis[ana_idx].lemma,
                                                   analysis[ana_idx].type)
            else:
                nlp_word += u' {0}'.format(analysis[ana_idx].lemma)
                morph_word += u' {0}/{1}'.format(analysis[ana_idx].lemma,
                                                 analysis[ana_idx].type)
        nlp_sent = nlp_word.encode('utf-8').strip()
        morph_sent = morph_word.encode('utf-8').strip()
        result_list.append((target_text, nlp_sent, morph_sent))
    return result_list
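
# Hedged usage sketch: the class wiring is assumed, not shown in the source.
# 'NlpServiceStub' and the address below are illustrative placeholders for
# whatever gRPC stub this class stores as self.stub.
#
#   channel = grpc.insecure_channel('localhost:9999')    # hypothetical address
#   client.stub = nlp_pb2_grpc.NlpServiceStub(channel)   # hypothetical stub name
#   for original, nlp_sent, morph_sent in client.analyze('오늘은 맑다'):
#       print nlp_sent    # space-joined lemmas, predicates restored to '...다'
#       print morph_sent  # lemma/TAG pairs, e.g. '맑다/VA'
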
def analyze(self, text, level, keyword_level):
    in_text = nlp_pb2.InputText()
    in_text.text = text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.level = level
    in_text.keyword_frequency_level = keyword_level
    ret = self.stub.Analyze(in_text)
    # Build a JSON object from the response.
    # (_Printer is a private json_format helper; MessageToJson below is the
    # public API.)
    printer = json_format._Printer(True, True)
    doc = printer._MessageToJsonObject(ret)
    print doc
    # Build JSON text from the response.
    json_text = json_format.MessageToJson(ret, True, True)
    print json_text
    for i in range(len(ret.sentences)):
        text = ret.sentences[i].text
        analysis = ret.sentences[i].morps
        morp = ""
        for j in range(len(analysis)):
            morp = morp + " " + analysis[j].lemma + "/" + analysis[j].type
        morp = morp.encode('utf-8').strip()
        addstr = 'morp -> ' + morp
        print addstr
        ner = ret.sentences[i].nes
        for j in range(len(ner)):
            ne = ner[j].text + "/" + ner[j].type
            ne = ne.encode('utf-8').strip()
            addNE = 'NE -> ' + ne
            print addNE
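
# Illustrative output shape of the loop above (values are examples only,
# not taken from the source):
#   morp -> 오늘/NNG 은/JX 맑/VA 다/EF ./SF
#   NE -> 오늘/DT_DAY
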
def analyze(self, target_text):
    in_text = nlp_pb2.InputText()
    try:
        in_text.text = target_text
    except Exception:
        # Fall back to EUC-KR input and re-encode it as UTF-8.
        target_text = unicode(target_text, 'euc-kr').encode('utf-8')
        in_text.text = target_text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.use_space = self.args.space
    in_text.level = 1
    in_text.keyword_frequency_level = 0
    ret = self.stub.Analyze(in_text)
    # Result to JSON format
    # json_text = json_format.MessageToJson(ret, True, True)
    # data = json.loads(json_text)
    # self.json_printer.pprint(data)
    result_list = list()
    for idx in range(len(ret.sentences)):
        nlp_word_list = list()
        morph_word_list = list()
        # text = ret.sentences[idx].text
        analysis = ret.sentences[idx].morps
        for ana_idx in range(len(analysis)):
            morphs_word = analysis[ana_idx].lemma
            morphs_type = analysis[ana_idx].type
            if morphs_type in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                # Restore the citation form of predicates by appending '다'.
                nlp_word_list.append(u'{0}다'.format(morphs_word))
                morph_word_list.append(u'{0}다/{1}'.format(morphs_word, morphs_type))
            elif ana_idx > 0 and morph_word_list[-1].split('/')[1] == 'SL' \
                    and morphs_type in ['SN', 'SW']:
                # Merge a foreign word followed by a number/symbol into one NNG noun.
                before_word = nlp_word_list.pop()
                morph_word_list.pop()
                nlp_word_list.append(u'{0}{1}'.format(before_word, morphs_word))
                morph_word_list.append(u'{0}{1}/NNG'.format(before_word, morphs_word))
            elif ana_idx > 0 and morph_word_list[-1].split('/')[1] == 'SN' \
                    and morphs_type in ['SL', 'SW']:
                # Merge a number followed by a foreign word/symbol likewise.
                before_word = nlp_word_list.pop()
                morph_word_list.pop()
                nlp_word_list.append(u'{0}{1}'.format(before_word, morphs_word))
                morph_word_list.append(u'{0}{1}/NNG'.format(before_word, morphs_word))
            elif ana_idx > 2 and morphs_type == 'SN':
                if morph_word_list[-2].split('/')[1] == 'SN' \
                        and morph_word_list[-1].split('/')[1] == 'SP':
                    # Merge number-punctuation-number (e.g. '1.5') into one noun.
                    middle_word = nlp_word_list.pop()
                    head_word = nlp_word_list.pop()
                    morph_word_list.pop()
                    morph_word_list.pop()
                    nlp_word_list.append(u'{0}{1}{2}'.format(
                        head_word, middle_word, morphs_word))
                    morph_word_list.append(u'{0}{1}{2}/NNG'.format(
                        head_word, middle_word, morphs_word))
                else:
                    # Keep the number as a normal token when the merge
                    # pattern does not match.
                    nlp_word_list.append(u'{0}'.format(morphs_word))
                    morph_word_list.append(u'{0}/{1}'.format(morphs_word, morphs_type))
            else:
                nlp_word_list.append(u'{0}'.format(morphs_word))
                morph_word_list.append(u'{0}/{1}'.format(morphs_word, morphs_type))
        nlp_sent = u' '.join(nlp_word_list).encode('utf-8').strip()
        morph_sent = u' '.join(morph_word_list).encode('utf-8').strip()
        result_list.append((target_text, nlp_sent, morph_sent))
    return result_list
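
# Illustrative merges performed by the branches above (examples only):
#   'A'/SL + '4'/SN          -> 'A4'/NNG
#   '1'/SN + '.'/SP + '5'/SN -> '1.5'/NNG
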
def analyze(self, text, level, keyword_level):
    in_text = nlp_pb2.InputText()
    in_text.text = text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.level = level
    in_text.keyword_frequency_level = keyword_level
    ret = self.stub.Analyze(in_text)
    # Build a JSON object from the response.
    printer = json_format._Printer(True, True)
    doc = printer._MessageToJsonObject(ret)
    # print doc
    # MessageToDict returns a plain dict, not JSON text.
    json_dict = json_format.MessageToDict(ret, True, True)
    return json_dict
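
# Hedged usage sketch: the dict mirrors the JSON layout used elsewhere in
# this file ('sentences' -> 'morps'); key spelling depends on the
# preserving_proto_field_name flag passed above.
#
#   doc = client.analyze(u'오늘은 맑다', level=1, keyword_level=0)
#   for sentence in doc['sentences']:
#       for morp in sentence['morps']:
#           print morp['lemma'], morp['type']
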
def analyze(self, target_text):
    # Health check: True when the NLP service answers, False otherwise.
    in_text = nlp_pb2.InputText()
    try:
        in_text.text = target_text
    except Exception:
        in_text.text = unicode(target_text, 'euc-kr').encode('utf-8')
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.use_space = False
    in_text.level = 1
    in_text.keyword_frequency_level = 0
    try:
        ret = self.stub.Analyze(in_text)
        return bool(ret)
    except Exception:
        return False
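
# Hedged usage sketch: gate startup on service availability.
#
#   if not client.analyze(u'연결 확인'):
#       raise RuntimeError('NLP service unreachable')  # illustrative handling
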
def analyze(self, text, level, keyword_level):
    in_text = nlp_pb2.InputText()
    in_text.text = text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.level = level
    in_text.keyword_frequency_level = keyword_level
    ret = self.stub.Analyze(in_text)
    # Build a JSON object from the response.
    printer = json_format._Printer(True, True)
    doc = printer._MessageToJsonObject(ret)
    # Plain-text rendering of the response (kept for debugging; unused below).
    ret_txt = text_format.MessageToString(ret, False, False)
    # print doc
    # Build JSON text from the response.
    json_text = json_format.MessageToJson(ret, True, True)
    # print json_text
    readable_text = ''
    for idx in range(len(ret.sentences)):
        text = ret.sentences[idx].text
        analysis = ret.sentences[idx].morps
        morp = ""
        for ana_idx in range(len(analysis)):
            morp += u" {0}/{1}".format(analysis[ana_idx].lemma,
                                       analysis[ana_idx].type)
        morp = morp.encode('utf-8').strip()
        add_morp = "morp -> {0}".format(morp)
        # print add_morp
        readable_text += add_morp + '\n'
        ner = ret.sentences[idx].nes
        for ner_idx in range(len(ner)):
            ne = u"{0}/{1}".format(ner[ner_idx].text, ner[ner_idx].type)
            ne = ne.encode('utf-8').strip()
            add_ne = 'NE -> ' + ne
            # print add_ne
            readable_text += add_ne + '\n'
    return readable_text, json_text, ret
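
# Hedged usage sketch: the triple return keeps the human-readable dump,
# the JSON text, and the raw protobuf response together.
#
#   readable, json_text, ret = client.analyze(u'오늘은 맑다', 1, 0)
#   print readable               # 'morp -> ...' / 'NE -> ...' lines
#   data = json.loads(json_text)
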
def analyze(self, text, level, keyword_level):
    in_text = nlp_pb2.InputText()
    in_text.text = text
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    in_text.level = level
    in_text.keyword_frequency_level = keyword_level
    ret = self.stub.Analyze(in_text)
    # Build a JSON object from the response.
    printer = json_format._Printer(True, True)
    doc = printer._MessageToJsonObject(ret)
    # Plain-text rendering of the response (kept for debugging; unused below).
    ret_txt = text_format.MessageToString(ret, False, False)
    # print doc
    # Build JSON text from the response.
    json_text = json_format.MessageToJson(ret, True, True)
    # print json_text
    readable_text = ''
    for idx in range(len(ret.sentences)):
        text = ret.sentences[idx].text
        analysis = ret.sentences[idx].morps
        morp = ""
        for ana_idx in range(len(analysis)):
            # Restore the citation form of predicates by appending '다'.
            if analysis[ana_idx].type in ['VV', 'VA', 'VX', 'VCP']:
                morp += u" {0}다/{1}".format(analysis[ana_idx].lemma,
                                             analysis[ana_idx].type)
            else:
                morp += u" {0}/{1}".format(analysis[ana_idx].lemma,
                                           analysis[ana_idx].type)
        morp = morp.encode('utf-8').strip()
        add_morp = "morp -> {0}".format(morp)
        # print add_morp
        readable_text += add_morp + '\n'
        ner = ret.sentences[idx].nes
        for ner_idx in range(len(ner)):
            if ner[ner_idx].type in ['VV', 'VA', 'VX', 'VCP']:
                ne = u"{0}다/{1}".format(ner[ner_idx].text, ner[ner_idx].type)
            else:
                ne = u"{0}/{1}".format(ner[ner_idx].text, ner[ner_idx].type)
            ne = ne.encode('utf-8').strip()
            add_ne = 'NE -> ' + ne
            # print add_ne
            readable_text += add_ne + '\n'
    # Make the nlp sentence: join the tagged words, restoring '다' on predicates.
    json_data = json.loads(json_text)
    word_list = list()
    for sentence in json_data['sentences']:
        for words in sentence['words']:
            tagged_text = words['tagged_text']
            tagged_text_list = tagged_text.split()
            for tagged_word in tagged_text_list:
                word = tagged_word.split("/")[0]
                tag = tagged_word.split("/")[1]
                if tag in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                    word += u"다"
                word_list.append(word)
    nlp_sent = " ".join(word_list)
    # Modify the JSON data: restore '다' in tagged_text and in the lemmas.
    for sentence in json_data['sentences']:
        for words in sentence['words']:
            for tag in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                words['tagged_text'] = words['tagged_text'].replace(
                    '/' + tag, u'다/' + tag)
    for sentence in json_data['sentences']:
        for morps in sentence['morps']:
            if morps['type'] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                morps['lemma'] += u"다"
    return nlp_sent, json.dumps(json_data), ret
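
# Illustrative effect of the '다' restoration above (examples only):
#   tagged_text before: u'먹/VV 었/EP 다/EF'
#   tagged_text after : u'먹다/VV 었/EP 다/EF'
# nlp_sent for the same words would contain u'먹다' rather than u'먹'.
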
def analyze(self, target_text):
    in_text = nlp_pb2.InputText()
    try:
        # Replace 'O' characters with spaces before analysis.
        in_text.text = target_text.replace('O', ' ').decode('utf-8')
        target_text = target_text.decode('utf-8')
    except Exception:
        in_text.text = unicode(target_text.replace('O', ' '), 'euc-kr')
        target_text = unicode(target_text, 'euc-kr')
    in_text.lang = lang_pb2.kor
    in_text.split_sentence = True
    in_text.use_tokenizer = False
    # in_text.use_space = False
    in_text.level = 1
    in_text.keyword_frequency_level = 0
    ret = self.stub.Analyze(in_text)
    result_list = list()
    for idx in range(len(ret.sentences)):
        new_nlp_word_list = list()
        morph_word_list = list()
        analysis = ret.sentences[idx].morps
        for ana_idx in range(len(analysis)):
            morphs_word = analysis[ana_idx].lemma
            morphs_type = analysis[ana_idx].type
            if morphs_type in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                # Restore the citation form of predicates by appending '다'.
                new_nlp_word_list.append(u'{0}다'.format(morphs_word))
                morph_word_list.append(u'{0}다/{1}'.format(morphs_word, morphs_type))
            elif ana_idx > 0 and morph_word_list[-1].split('/')[1] == 'SL' \
                    and morphs_type in ['SN', 'SW']:
                # Merge the previously appended token, as in the variant above.
                before_word = new_nlp_word_list.pop()
                morph_word_list.pop()
                new_nlp_word_list.append(u'{0}{1}'.format(before_word, morphs_word))
                morph_word_list.append(u'{0}{1}/NNG'.format(before_word, morphs_word))
            elif ana_idx > 0 and morph_word_list[-1].split('/')[1] == 'SN' \
                    and morphs_type in ['SL', 'SW']:
                before_word = new_nlp_word_list.pop()
                morph_word_list.pop()
                new_nlp_word_list.append(u'{0}{1}'.format(before_word, morphs_word))
                morph_word_list.append(u'{0}{1}/NNG'.format(before_word, morphs_word))
            elif ana_idx > 2 and morphs_type == 'SN':
                if morph_word_list[-2].split('/')[1] == 'SN' \
                        and morph_word_list[-1].split('/')[1] == 'SP':
                    # Merge number-punctuation-number (e.g. '1.5') into one noun.
                    middle_word = new_nlp_word_list.pop()
                    head_word = new_nlp_word_list.pop()
                    morph_word_list.pop()
                    morph_word_list.pop()
                    new_nlp_word_list.append(u'{0}{1}{2}'.format(
                        head_word, middle_word, morphs_word))
                    morph_word_list.append(u'{0}{1}{2}/NNG'.format(
                        head_word, middle_word, morphs_word))
                else:
                    new_nlp_word_list.append(u'{0}'.format(morphs_word))
                    morph_word_list.append(u'{0}/{1}'.format(morphs_word, morphs_type))
            else:
                new_nlp_word_list.append(u'{0}'.format(morphs_word))
                morph_word_list.append(u'{0}/{1}'.format(morphs_word, morphs_type))
        nlp_sent = u' '.join(new_nlp_word_list).encode('utf-8').strip()
        morph_sent = u' '.join(morph_word_list).encode('utf-8').strip()
        result_list.append((target_text, nlp_sent, morph_sent))
    return result_list
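
# Hedged usage sketch: 'O' appears to act as a mask character here; it is
# blanked out before analysis while the untouched target_text is returned
# in each result tuple.
#
#   results = client.analyze('서울은OO맑다')   # analyzed as u'서울은  맑다'
#   for original, nlp_sent, morph_sent in results:
#       print original, '->', nlp_sent
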