Beispiel #1
0
    def analyze(self, target_text):
        """Run level-0 morphological analysis on *target_text*.

        Sends the text to the remote Analyze RPC (sentence splitting on,
        tokenizer off) and, for every returned sentence, builds a
        space-joined lemma string plus a lemma/POS-tag string.  Predicate
        lemmas (verbs, adjectives, copulas) get the dictionary ending '다'.

        Args:
            target_text: raw input text; assumed UTF-8, with an EUC-KR
                fallback when the protobuf assignment rejects it.

        Returns:
            list of (target_text, nlp_sent, morph_sent) tuples, one per
            analyzed sentence; nlp_sent/morph_sent are UTF-8 byte strings.
        """
        in_text = nlp_pb2.InputText()
        try:
            in_text.text = target_text
        except Exception:
            # Input was not valid UTF-8 -- transcode from EUC-KR.
            target_text = unicode(target_text, 'euc-kr').encode('utf-8')
            in_text.text = target_text
        in_text.lang = lang_pb2.kor
        in_text.split_sentence = True
        in_text.use_tokenizer = False
        in_text.use_space = self.args.space
        in_text.level = 0
        in_text.keyword_frequency_level = 0
        ret = self.stub.Analyze(in_text)
        # POS tags whose lemmas take the '다' dictionary ending.
        predicate_tags = ('VV', 'VA', 'VX', 'VCP', 'VCN')
        result_list = []
        for sentence in ret.sentences:
            nlp_tokens = []
            morph_tokens = []
            for morph in sentence.morps:
                if morph.type in predicate_tags:
                    nlp_tokens.append('{0}다'.format(morph.lemma))
                    morph_tokens.append('{0}다/{1}'.format(morph.lemma, morph.type))
                else:
                    nlp_tokens.append('{0}'.format(morph.lemma))
                    morph_tokens.append('{0}/{1}'.format(morph.lemma, morph.type))
            nlp_sent = ' '.join(nlp_tokens).encode('utf-8').strip()
            morph_sent = ' '.join(morph_tokens).encode('utf-8').strip()
            result_list.append((target_text, nlp_sent, morph_sent))
        return result_list
Beispiel #2
0
    def analyze(self, text, level, keyword_level):
        """Analyze *text* and print the JSON result, morphemes and NEs.

        Args:
            text: input text (Korean).
            level: analysis level passed through to the service.
            keyword_level: keyword frequency level for the service.

        Side effects:
            Prints the JSON object, the JSON text, one "morp -> ..." line
            per sentence and one "NE -> ..." line per named entity.
        """
        in_text = nlp_pb2.InputText()
        in_text.text = text
        in_text.lang = lang_pb2.kor
        in_text.split_sentence = True
        in_text.use_tokenizer = False
        in_text.level = level
        in_text.keyword_frequency_level = keyword_level

        ret = self.stub.Analyze(in_text)

        # Build a JSON object from the reply (uses protobuf's private
        # printer, as elsewhere in this file).
        printer = json_format._Printer(True, True)
        doc = printer._MessageToJsonObject(ret)
        print(doc)

        # Build JSON text from the reply.
        json_text = json_format.MessageToJson(ret, True, True)
        print(json_text)
        for sentence in ret.sentences:
            morp = " ".join("{0}/{1}".format(m.lemma, m.type)
                            for m in sentence.morps)
            morp = morp.encode('utf-8').strip()
            print('morp -> ' + morp)
            for entity in sentence.nes:
                ne = (entity.text + "/" + entity.type).encode('utf-8').strip()
                print('NE -> ' + ne)
Beispiel #3
0
    def analyze(self, target_text):
        """Run level-1 analysis on *target_text* and merge compound tokens.

        Each sentence becomes a pair of strings: space-joined lemmas
        (predicates get the dictionary ending '다') and lemma/tag pairs.
        Adjacent foreign-word/number/symbol morphemes (SL, SN, SW) and
        number-separator-number runs are merged into single NNG tokens.

        Args:
            target_text: raw input text; assumed UTF-8, with an EUC-KR
                fallback when the protobuf assignment rejects it.

        Returns:
            list of (target_text, nlp_sent, morph_sent) tuples, one per
            analyzed sentence; nlp_sent/morph_sent are UTF-8 byte strings.
        """
        in_text = nlp_pb2.InputText()
        try:
            in_text.text = target_text
        except Exception:
            # Input was not valid UTF-8 -- transcode from EUC-KR.
            in_text.text = unicode(target_text, 'euc-kr').encode('utf-8')
            target_text = unicode(target_text, 'euc-kr').encode('utf-8')
        in_text.lang = lang_pb2.kor
        in_text.split_sentence = True
        in_text.use_tokenizer = False
        in_text.use_space = self.args.space
        in_text.level = 1
        in_text.keyword_frequency_level = 0
        ret = self.stub.Analyze(in_text)
        result_list = list()
        for sentence in ret.sentences:
            nlp_word_list = list()
            morph_word_list = list()
            analysis = sentence.morps
            for ana_idx in range(len(analysis)):
                morphs_word = analysis[ana_idx].lemma
                morphs_type = analysis[ana_idx].type
                # Tag of the most recently emitted token, if any.
                prev_tag = (morph_word_list[-1].split('/')[1]
                            if morph_word_list else None)
                if morphs_type in ('VV', 'VA', 'VX', 'VCP', 'VCN'):
                    # Predicate: append the dictionary ending '다'.
                    nlp_word_list.append('{0}다'.format(morphs_word))
                    morph_word_list.append('{0}다/{1}'.format(morphs_word, morphs_type))
                elif ana_idx > 0 and (
                        (prev_tag == 'SL' and morphs_type in ('SN', 'SW')) or
                        (prev_tag == 'SN' and morphs_type in ('SL', 'SW'))):
                    # Foreign-word/number/symbol pair: merge the previous
                    # token with this morpheme into a single NNG noun.
                    # (Two byte-identical branches of the original merged.)
                    before_word = nlp_word_list.pop()
                    morph_word_list.pop()
                    nlp_word_list.append('{0}{1}'.format(before_word, morphs_word))
                    morph_word_list.append('{0}{1}/NNG'.format(before_word, morphs_word))
                elif (ana_idx > 2 and morphs_type == 'SN'
                      and morph_word_list[-2].split('/')[1] == 'SN'
                      and prev_tag == 'SP'):
                    # Number-separator-number run (e.g. a date or fraction):
                    # merge all three tokens into a single NNG noun.
                    middle_word = nlp_word_list.pop()
                    head_word = nlp_word_list.pop()
                    morph_word_list.pop()
                    morph_word_list.pop()
                    nlp_word_list.append('{0}{1}{2}'.format(head_word, middle_word, morphs_word))
                    morph_word_list.append('{0}{1}{2}/NNG'.format(head_word, middle_word, morphs_word))
                else:
                    # BUG FIX: the original dropped an SN morpheme entirely
                    # when ana_idx > 2 but the SN/SP/SN merge did not apply
                    # (its inner `if` had no else); it now falls through to
                    # normal handling, matching Beispiel #8's behavior.
                    nlp_word_list.append('{0}'.format(morphs_word))
                    morph_word_list.append('{0}/{1}'.format(morphs_word, morphs_type))
            nlp_sent = ' '.join(nlp_word_list).encode('utf-8').strip()
            morph_sent = ' '.join(morph_word_list).encode('utf-8').strip()
            result_list.append((target_text, nlp_sent, morph_sent))
        return result_list
Beispiel #4
0
    def analyze(self, text, level, keyword_level):
        """Analyze *text* with the NLP service and return the result dict.

        Args:
            text: input text (Korean).
            level: analysis level passed through to the service.
            keyword_level: keyword frequency level for the service.

        Returns:
            dict produced by ``json_format.MessageToDict`` from the
            Analyze reply (including default values and original proto
            field names).  Note: a dict, not JSON text.
        """
        in_text = nlp_pb2.InputText()
        in_text.text = text
        in_text.lang = lang_pb2.kor
        in_text.split_sentence = True
        in_text.use_tokenizer = False
        in_text.level = level
        in_text.keyword_frequency_level = keyword_level

        ret = self.stub.Analyze(in_text)

        # Removed the dead `_Printer`/`_MessageToJsonObject` locals the
        # original computed and never used (their only consumer was a
        # commented-out print).
        return json_format.MessageToDict(ret, True, True)
Beispiel #5
0
 def analyze(self, target_text):
     """Probe the NLP service with *target_text*.

     Builds a level-1 analysis request (sentence splitting on, tokenizer
     and spacing off) and fires it at the Analyze RPC.

     Returns:
         True when the RPC returns a truthy reply; False on a falsy
         reply or on any failure (encoding or RPC).
     """
     request = nlp_pb2.InputText()
     try:
         request.text = target_text
     except Exception:
         # Fall back to EUC-KR when the text is not valid UTF-8.
         request.text = unicode(target_text, 'euc-kr').encode('utf-8')
     request.lang = lang_pb2.kor
     request.split_sentence = True
     request.use_tokenizer = False
     request.use_space = False
     request.level = 1
     request.keyword_frequency_level = 0
     try:
         reply = self.stub.Analyze(request)
     except Exception:
         # Best-effort probe: any RPC failure just reads as "not OK".
         return False
     return True if reply else False
Beispiel #6
0
 def analyze(self, text, level, keyword_level):
     """Analyze *text*; return readable text, JSON text and the raw reply.

     Args:
         text: input text (Korean).
         level: analysis level passed through to the service.
         keyword_level: keyword frequency level for the service.

     Returns:
         (readable_text, json_text, ret): a line-per-item summary with
         "morp -> lemma/tag ..." per sentence and "NE -> text/type" per
         named entity, the JSON rendering of the reply, and the reply
         message itself.
     """
     in_text = nlp_pb2.InputText()
     in_text.text = text
     in_text.lang = lang_pb2.kor
     in_text.split_sentence = True
     in_text.use_tokenizer = False
     in_text.level = level
     in_text.keyword_frequency_level = keyword_level
     ret = self.stub.Analyze(in_text)
     # JSON text rendering of the reply (includes default values and
     # original proto field names).  The dead `_Printer` object, its
     # JSON-object result and the unused text_format dump were removed.
     json_text = json_format.MessageToJson(ret, True, True)
     readable_lines = []
     for sentence in ret.sentences:
         morp = " ".join("{0}/{1}".format(m.lemma, m.type)
                         for m in sentence.morps)
         morp = morp.encode('utf-8').strip()
         readable_lines.append("morp -> {0}".format(morp))
         for entity in sentence.nes:
             ne = "{0}/{1}".format(entity.text, entity.type)
             readable_lines.append('NE -> ' + ne.encode('utf-8').strip())
     # Trailing newline preserved: the original appended '\n' per line.
     readable_text = '\n'.join(readable_lines) + '\n' if readable_lines else ''
     return readable_text, json_text, ret
Beispiel #7
0
 def analyze(self, text, level, keyword_level):
     """Analyze *text*; return the NLP sentence, rewritten JSON and reply.

     Runs the Analyze RPC, then post-processes the JSON result so that
     predicate lemmas (verb/adjective/copula tags) carry the dictionary
     ending '다' (U+B2E4) both in the flat token sentence and inside the
     JSON structure ('tagged_text' and every 'morps' lemma).

     Args:
         text: input text (Korean).
         level: analysis level passed through to the service.
         keyword_level: keyword frequency level for the service.

     Returns:
         (nlp_sent, json_text, ret): space-joined dictionary-form tokens,
         the modified JSON dumped back to text, and the raw reply.
     """
     # POS tags whose lemmas take the '다' dictionary ending.  The
     # original repeated this set in five copy-pasted branches; it also
     # built a `readable_text` block, a `_Printer` JSON object and a
     # text_format dump that were never returned -- all removed.
     predicate_tags = ('VV', 'VA', 'VX', 'VCP', 'VCN')
     in_text = nlp_pb2.InputText()
     in_text.text = text
     in_text.lang = lang_pb2.kor
     in_text.split_sentence = True
     in_text.use_tokenizer = False
     in_text.level = level
     in_text.keyword_frequency_level = keyword_level
     ret = self.stub.Analyze(in_text)
     # JSON text rendering of the reply.
     json_text = json_format.MessageToJson(ret, True, True)
     json_data = json.loads(json_text)
     # Build the NLP sentence from all tagged words (dictionary forms).
     word_list = list()
     for sentence in json_data['sentences']:
         for words in sentence['words']:
             for tagged_word in words['tagged_text'].split():
                 word = tagged_word.split("/")[0]
                 tag = tagged_word.split("/")[1]
                 if tag in predicate_tags:
                     word += u"\ub2e4"
                 word_list.append(word)
     nlp_sent = " ".join(word_list)
     # Rewrite tagged_text so predicate tags carry the '다' ending.
     for sentence in json_data['sentences']:
         for words in sentence['words']:
             tagged_text = words['tagged_text']
             for tag in predicate_tags:
                 marker = "/" + tag
                 # Membership is tested against the pre-rewrite snapshot,
                 # as in the original; replacements accumulate.
                 if marker in tagged_text:
                     words['tagged_text'] = words['tagged_text'].replace(
                         marker, u"\ub2e4" + marker)
     # Append '다' to predicate lemmas in the morpheme list as well.
     for sentence in json_data['sentences']:
         for morps in sentence['morps']:
             if morps['type'] in predicate_tags:
                 morps['lemma'] += u"\ub2e4"
     return nlp_sent, json.dumps(json_data), ret
Beispiel #8
0
 def analyze(self, target_text):
     """Run level-1 analysis on *target_text* and merge compound tokens.

     'O' characters in the input are replaced with spaces before the
     text is sent for analysis (presumably a placeholder marker --
     TODO confirm with callers), while the returned tuples keep the
     unmodified decoded text.  Foreign-word/number/symbol morpheme
     pairs (SL, SN, SW) and number-SP-number runs are merged into
     single NNG tokens; predicate lemmas get the dictionary ending '다'.

     Returns:
         list of (target_text, nlp_sent, morph_sent) tuples, one per
         analyzed sentence; nlp_sent/morph_sent are UTF-8 byte strings.
     """
     in_text = nlp_pb2.InputText()
     try:
         in_text.text = target_text.replace('O', ' ').decode('utf-8')
         target_text = target_text.decode('utf-8')
     except Exception:
         # Input was not valid UTF-8 -- fall back to EUC-KR decoding.
         in_text.text = unicode(target_text.replace('O', ' '), 'euc-kr')
         target_text = unicode(target_text, 'euc-kr')
     in_text.lang = lang_pb2.kor
     in_text.split_sentence = True
     in_text.use_tokenizer = False
     # in_text.use_space = False
     in_text.level = 1
     in_text.keyword_frequency_level = 0
     ret = self.stub.Analyze(in_text)
     result_list = list()
     for idx in range(len(ret.sentences)):
         # NOTE(review): nlp_word_list holds the individual CHARACTERS of
         # target_text, yet the merge branches below pop from it as if it
         # held previously emitted tokens (cf. Beispiel #3, which pops
         # from the token list).  Looks suspicious -- confirm intended.
         nlp_word_list = [text for text in target_text]
         new_nlp_word_list = list()
         morph_word_list = list()
         analysis = ret.sentences[idx].morps
         for ana_idx in range(len(analysis)):
             morphs_word = analysis[ana_idx].lemma
             morphs_type = analysis[ana_idx].type
             if morphs_type in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                 # Predicate: append the dictionary ending '다'.
                 new_nlp_word_list.append('{0}다'.format(morphs_word))
                 morph_word_list.append('{0}다/{1}'.format(
                     morphs_word, morphs_type))
             elif ana_idx > 0 and morph_word_list[-1].split(
                     '/')[1] == 'SL' and morphs_type in ['SN', 'SW']:
                 # Foreign word followed by number/symbol: merge into NNG.
                 before_word = nlp_word_list.pop()
                 morph_word_list.pop()
                 new_nlp_word_list.append('{0}{1}'.format(
                     before_word, morphs_word))
                 morph_word_list.append('{0}{1}/NNG'.format(
                     before_word, morphs_word))
             elif ana_idx > 0 and morph_word_list[-1].split(
                     '/')[1] == 'SN' and morphs_type in ['SL', 'SW']:
                 # Number followed by foreign word/symbol: merge into NNG.
                 before_word = nlp_word_list.pop()
                 morph_word_list.pop()
                 new_nlp_word_list.append('{0}{1}'.format(
                     before_word, morphs_word))
                 morph_word_list.append('{0}{1}/NNG'.format(
                     before_word, morphs_word))
             elif ana_idx > 2 and morphs_type == 'SN':
                 if morph_word_list[-2].split(
                         '/')[1] == 'SN' and morph_word_list[-1].split(
                             '/')[1] == 'SP':
                     # Number-separator-number run: merge all three
                     # tokens into a single NNG noun.
                     middle_word = nlp_word_list.pop()
                     head_word = nlp_word_list.pop()
                     morph_word_list.pop()
                     morph_word_list.pop()
                     new_nlp_word_list.append('{0}{1}{2}'.format(
                         head_word, middle_word, morphs_word))
                     morph_word_list.append('{0}{1}{2}/NNG'.format(
                         head_word, middle_word, morphs_word))
                 else:
                     # No merge applies: keep the SN morpheme as-is.
                     new_nlp_word_list.append('{0}'.format(morphs_word))
                     morph_word_list.append('{0}/{1}'.format(
                         morphs_word, morphs_type))
             else:
                 # Default: emit lemma and lemma/tag unchanged.
                 new_nlp_word_list.append('{0}'.format(morphs_word))
                 morph_word_list.append('{0}/{1}'.format(
                     morphs_word, morphs_type))
         nlp_sent = ' '.join(new_nlp_word_list).encode('utf-8').strip()
         morph_sent = ' '.join(morph_word_list).encode('utf-8').strip()
         result_list.append((target_text, nlp_sent, morph_sent))
     return result_list