def generate_per_file(self, file_name, path_list, all_reg):
     kkma = Kkma()
     with open(file_name, 'w') as file_write:
         for path in path_list:
             print path + "-start"
             for item in self.corpus_generator(path):
                 '''
                 item = item.decode('mbcs').encode('utf-8')
                 sliced = item[:len(item)-1]
                 subed = all_reg.sub('', sliced)
                 if len(subed) != 0:
                     continue
                 file_write.write((item + '\n'))
                 '''
                 for sub_item in kkma.sentences(item.decode('utf-8')):
                     sliced = sub_item[:len(sub_item)-1]
                     subed = all_reg.sub('', sliced.encode('utf-8'))
                     if len(subed) != 0:
                         continue
                     if '.' in sub_item:
                         if self.whatisthis(sub_item) == "not":
                             file_write.write(sub_item.encode('utf-8') + '\n')
                         else:
                             file_write.write(sub_item + '\n')
             print path + "-end"
Example #2
def tagPOS(filename):
    try:
        # Read file
        f = open(filename, 'r')
        text = f.read().decode('utf-8') # read file as utf8 decoded
        f.close()
        
        # tagging
        from konlpy.tag import Kkma
        #from konlpy.utils import pprint
        kkma = Kkma()
        print ('now tagging...')
        tagged = kkma.pos(text)
        
        # Write tagged file
        (path,fnameExt) = os.path.split(filename)
        (fname,fext) = os.path.splitext(fnameExt)
        tagged_file = fname+'_'+'tagged'+fext
        fw = open(tagged_file,'w')
        for line in tagged:
            strs="\t".join(x for x in line).encode('utf-8')
            fw.write(strs+"\n")
        fw.close()
        print '%s is created' % (tagged_file)
    except:
        print '\nERROR:'
        print '"%s" is not a valid text\nCheck your text file\nor file path\n' % (filename)
        sys.exit(1)
Example #3
def get_tags(text, ntags=50, multiplier=30):           # adjust multiplier to control the font size
    # h = Hannanum()
    r = lambda: random.randint(0,255)
    color = lambda: (r(), r(), r())
    h = Kkma()
    text = unicode(text, 'utf-8')
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
Example #4
    def _kkma_parse(self, str_arr, tag_combine=True):
        """

        :param h5file:
        :return:
        """
        kkma = Kkma()
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(kkma.pos(str(data)), tag_combine=tag_combine)
        return return_arr
Example #5
class AnalysisDiction:
    """
    This class is for analysis of korean texts using kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_kkma: kkma instance
        :param on_twitter: twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It acts differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
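The class above only wraps the analyzers; a minimal, hypothetical driver might look like the sketch below. The sample sentence and the import line are assumptions and are not part of the original example (in recent konlpy releases Twitter is deprecated in favor of Okt).

from konlpy.tag import Kkma, Twitter

analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
print(analyzer.analyzer_kkma('안녕하세요. 반갑습니다.', 'nouns'))       # noun extraction via Kkma
print(analyzer.analyzer_twitter('안녕하세요. 반갑습니다.', 'posmore'))  # detailed POS tagging via Twitter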
Example #6
    def __init__(self):
        self.kkma = Kkma()
        self.conn = sqlite3.connect('yebi.db')
        self.cursor = self.conn.cursor()
        self.count = 20

        reload(sys)
        sys.setdefaultencoding('utf-8')
Example #7
 def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
     """
     Allocate kkma or twitter diction instance
     :param on_kkma: kkma instance
     :param on_twitter: twitter instance
     """
     if on_kkma is True:
         self.kkma = Kkma()
     if on_twitter is True:
         self.twitter = Twitter()
Example #8
def getKeywords(src):
    kkma = Kkma()

    words = kkma.nouns(src)
    words = list(set(words))
    words_calc = []

    words_num = len(words)
    for word in words:
        if not word.isdigit() and not u'서울' in word and re.match('(.*)?\d+(.*)?', word) is None:
            word_count = src.count(word)
            word_idf = word_count * math.log(len(word))
            if word_idf > 1:
                words_calc.append((word, word_idf))

    words_sort = sorted(words_calc, key = lambda w: w[1], reverse = True)
    words_real = []

    for word in words_sort:
        words_real.append(word[0])

    print (" / ".join(words_real[:5])).encode('utf-8')
Example #9
def SortSentence(filename):
    # Read file
    f = open(filename, 'r')
    text = f.read().decode('utf-8') # read file as utf8 decoded
    f.close()
        
    # tagging
    from konlpy.tag import Kkma
    #from konlpy.utils import pprint
    kkma = Kkma()
    print ('now dividing sentences...')
    tagged = kkma.sentences(text)
    
    # Write tagged file
    (path,fnameExt) = os.path.split(filename)
    (fname,fext) = os.path.splitext(fnameExt)
    tagged_file = fname+'_'+'sentence'+fext
    fw = open(tagged_file,'w')
    for line in tagged:
        strs = line.encode('utf-8')
        fw.write(strs+"\n")
    fw.close()
    print '%s is created' % (tagged_file)    
Example #10
#!/usr/bin/python
# vim: set fileencoding=utf8 :
from konlpy.tag import Kkma
from konlpy.utils import pprint
from convert import convert
import fileinput

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
#poss = kkma.pos(u'(오류보고는) "실행환경", 에러메세지와함께 설명을 최대한상세히!^^')


for line in fileinput.input():
    poss = kkma.pos(unicode(line, "utf-8"))
    for tup in poss:
        print tup[0],
        print convert(tup[1])
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'

pprint(kkma.nouns(string))
Example #12
def excel_noun():

	def excel_write(row_val, column_val, data):
		new_sheet.cell(row = row_val, column = column_val, value="%s" %data)

	wb=load_workbook('reference.xlsx')
	sheetList = wb.get_sheet_names()
	sheet = wb.get_sheet_by_name(sheetList[0])
	row_count = sheet.get_highest_row()
	
	new_sheet = wb.create_sheet(title='extraction')
	
	for i in range(2, row_count):
		if sheet.row_dimensions[i].visible :
			pass
		else :
			excel_write(i,1,'')
			new_sheet.row_dimensions[i].hidden = True
			#new_sheet.row_dimensions[i].outlineLevel = 1
			continue
	
		noun_val = ""
		full_qua = ""

		cellValue_name = sheet.cell(row=i, column=1).value
		cellValue = sheet.cell(row=i, column=2).value

		try :
			QUA = cellValue.count(u'\u201c')
		except :
			continue 

		if QUA != -1:
			if QUA == 1 :
				START_QUA = cellValue.find(u"\u201c") + 1 # position of first quotation mark
				CELL_VALUE_LEN = len(cellValue)

				cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN]
				END_QUA = cellValue_re.find(u"\u201d") # position of the closing quotation mark

				cellValue_final = cellValue_re[0:END_QUA]
				print str(i) + "  "+ cellValue_name + "  "  + cellValue_final

				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, cellValue_final)
				excel_write(i, 3, noun_val)

			elif QUA == 0 :
				#print str(i) + " " + cellValue
				ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quotation mark
				ANOTHER_QUA_LEN = len(cellValue)

				another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN]
				ANOTHER_END_QUA = another_cellValue.find("\"")

				another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
				#print str(i) + "  " + cellValue_name + "  " + another_cellValue_final
				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(another_cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, another_cellValue_final)
				excel_write(i, 3, noun_val)

			elif QUA > 1 :
				#print str(i) + " " + str(QUA)
				for q in range(0,QUA):
					arr = cellValue.split(u"\u201d")
					arr_start_qua = arr[q].find(u"\u201c") + 1
					arr_len = len(arr[q]) 

					arr_cellValue = arr[q][arr_start_qua:arr_len]

					full_qua = full_qua + arr_cellValue

					kkma = Kkma()
					#pprint (kkma.nouns(cellValue_final))
					s = (kkma.nouns(arr_cellValue))

					for j in range(0,len(s)):
						noun_val = noun_val + s[j].encode('utf-8') + ','
						#print str(i) + " " + arr_cellValue

					excel_write(i, 1, cellValue_name)
					excel_write(i, 2, full_qua)
					excel_write(i, 3, noun_val)

	wb.save('reference.xlsx')
Example #13
# 3. install google cloud language
# 4. set your creds and java directories in the os.environ calls
# 5. set the in and out filenames below

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "g_creds.json"
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-14.0.2"

in_filename = "singihan hangari.txt"
out_filename = "singihan out.txt"


translate_client = translate_v2.Client()
translate = translate_client.translate

okt = Okt()
kkma = Kkma()

vocab_list = []

def extract_korean_vocab(in_filename, out_filename):
    with open(in_filename, "r", encoding="utf-8") as korean_text_file_object:
        with open(out_filename, "w", encoding="utf-8") as outfile:
            story = korean_text_file_object.read()
            print("Splitting sentences...")
            sentences = kkma.sentences(story)
            print(f"Parsing words in {len(sentences)} sentences...")
            for sentence in sentences:
                tags = okt.pos(sentence, norm=True, stem=True)
                for t in tags:
                    if t[1] not in ['Foreign',
                                    'Punctuation',
Example #14
# -*- coding: utf-8 -*-

import csv, json, time
from konlpy.tag import Kkma


origdata = open('seoul2015.tsv', 'r')
data = csv.reader(origdata, delimiter='\t')

output = []
kkma = Kkma()
i = 0

for line in data:
    i += 1

    if(line[8].strip().isdigit()):
        obj = {
            'name': line[7].strip(),
            'sum': int(line[8].strip()) * 1000,
            'categories': [
                line[2].strip(),
                line[3].strip(),
                line[4].strip(),
                line[5].strip()
            ]
        }

        words = kkma.nouns(line[7].strip().decode('utf-8'))
        for j, word in enumerate(words):
            words[j] = word.encode('utf-8')
Example #15
            # print('직전일자 : ', prevDate)
            print()

            # print(soup.prettify())
            # print(soup)
            newsbody = soup.find(id="articleBodyContents")
            # print(newsbody.contents)
            bodystr = ""
            try:
                for child in newsbody.children:
                    if (isinstance(child, NavigableString) and not isinstance(child, Comment)):
                        # print(child.string.strip())
                        bodystr += child.string.strip()

                # morphological analysis
                kkma = Kkma()
                # pprint(kkma.nouns(bodystr))
                # pprint(kkma.pos(bodystr))
                wordList = kkma.nouns(bodystr)
                print('k : ', k)
                if k == 0:
                    testEntry = wordList
                    testIssueDate = issueDate
                    testTitle = soup.title.string
                    k = k + 1
                else:
                    if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) > int(df[df['날짜'] < issueDate].head(1)['종가'])):
                        print('up')
                        docList.append(wordList)
                        classList.append(1)
                    else:
Example #16
# -*- coding: utf-8 -*-

# POS tagging practice
# Tokenize with the Kkma and Twitter morphological analyzers and extract nouns, etc.

from konlpy.tag import Kkma
kkma = Kkma()

print('kkma 문장분리 : ', kkma.sentences(u'안녕하세요. 반갑습니다. 저는 인공지능입니다.'))
# sentences : sentence splitting
print('kkma 명사만추출 : ', kkma.nouns(u'을지로 3가역 주변 첨단빌딩숲 사이에 자리 잡은 커피집'))
# nouns : noun extraction

print('='*80)

from konlpy.tag import Twitter
tagger = Twitter()

print('Twitter 명사만 추출 : ', tagger.nouns(u'을지로 3가역 주변 첨단빌딩숲 사이에 자리 잡은 커피집'))
print('Twitter 품사 추출 : ', tagger.pos(u'이것도 처리되나욕ㅋㅋ')) # pos : part-of-speech tagging
print('Twitter 오타와 원형처리 : ', tagger.pos(u'이것도되나욕ㅋㅋ', norm=True, stem=True))  # norm=True auto-corrects typos, stem=True returns the base form (e.g. '이다'); both default to False
Example #17
# -*- coding: utf-8 -*-
"""
Konlpy : 한글 형태소 분석을 제공하는 패키지 
pip install konlpy
"""

#import konlpy
from konlpy.tag import Kkma  # class

kkma = Kkma()  # constructor -> create the object

# paragraph -> sentences
para = "나는 홍길동 입니다. 나이는 23세 입니다. 대한민국 만세 입니다."
ex_sent = kkma.sentences(para)
ex_sent  # list
# ['나는 홍길동 입니다.', '나이는 23세 입니다.', '대한민국 만세 입니다.']
len(ex_sent)  # 3

# paragraph -> words (nouns)
ex_nouns = kkma.nouns(para)
ex_nouns
# ['나', '홍길동', '나이', '23', '23세', '세', '대한', '대한민국', '민국', '만세']

# paragraph -> POS (morphemes)
ex_pos = kkma.pos(para)
ex_pos
type(ex_pos)  # list [(word, tag), (word, tag)]

# NNG common noun, NNP proper noun, NP pronoun
nouns = []  # collected nouns
    def test(self, keyword):

        FLAGS = tf.flags.FLAGS
        #print("\nParameters:")
        #for attr, value in sorted(FLAGS.__flags.items()):
        #    print("{}={}".format(attr.upper(), value))
        #print("")

        #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
        #print(x_raw)
        #print(y_test)
        x_raw = [keyword]
        print(keyword)
        y = [[1,0]]
        y_test = np.concatenate([y], 0)
        print(y_test)
        y_test = np.argmax(y_test, axis=1)

        kkma=Kkma() 
        x_raw=[" ".join(kkma.morphs(x2)) for x2 in x_raw]

        print("x_raw",x_raw)
        # Map data into vocabulary
        vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
        vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
        x_test = np.array(list(vocab_processor.transform(x_raw)))

        print("\nEvaluating...\n")

        # Evaluation
        # ==================================================
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        graph = tf.Graph()



        with graph.as_default():
            session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x = graph.get_operation_by_name("input_x").outputs[0]
                # input_y = graph.get_operation_by_name("input_y").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name("output/predictions").outputs[0]

                # Generate batches for one epoch
                batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

                # Collect the predictions here
                all_predictions = []

                for x_test_batch in batches:
                    batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                    all_predictions = np.concatenate([all_predictions, batch_predictions])
                    print(all_predictions[0])
                    return all_predictions[0]

        return "text"
Example #19
        links =html.select('a[class="link_txt"]')
        
        one_page_data = []
        
        for link in links:
            link_str = str(link.string)
            one_page_data.append(link_str.strip())
            
        one_day_data.extend(one_page_data[:10])
    print(date) 
    return one_day_data[:5]

data = [Crawling(date)[0]  for date in sdate]   


kkma = Kkma()
nouns_word = []
for sent in data:
    for noun in kkma.nouns(sent):
        nouns_word.append(noun)

from re import match
nouns_count = {}

for noun in nouns_word:
    if len(noun) > 1 and not(match('^[0-9]', noun)):
        nouns_count[noun] = nouns_count.get(noun, 0) + 1


from collections import Counter # class
Example #20
def save_news_words(s_date=None, e_date=None):
    nouns_tools = [Kkma(), Twitter()]  # analyzers
    token = NewsTokenization(nouns_tools=nouns_tools)

    mongo = MongoManager()
    news = [
        DaumNews(s_date.strftime('%Y%m%d')),
        NaverNews(s_date.strftime('%Y%m%d'))
    ]  # list of news sources to load

    while s_date.strftime('%Y%m%d') < e_date.strftime('%Y%m%d'):
        day = s_date.strftime('%Y%m%d')
        con = {'date': day}

        if not mongo.find_one(NewsTokenization.collection, con):
            articles = token.load_news(day,
                                       repository_manager=mongo,
                                       news_list=news)
            print('[NewsTokenization][day: %s][article len: %d]' %
                  (day, len(articles)))
            # articles = articles[2:3]

            datas = list()
            for article in articles:
                data = dict()
                data['_id'] = article['_id']

                # title(0) + contents(1~)
                lines = list()
                lines.append(article['article']['title'])
                lines.extend(article['article']['contents'])

                data['raw_data'] = lines
                data['data_words'] = [[item for item in token.get_words(line)]
                                      for line in data['raw_data']]
                data['data_frequency'] = token.get_term_frequency(
                    data['data_words'])

                if 'summary' in article['article']:
                    data['raw_label'] = article['article']['summary']
                    data['label_words'] = [[
                        item for item in token.get_words(line)
                    ] for line in data['raw_label']]
                    data['label_frequency'] = token.get_term_frequency(
                        data['label_words'])

                print(data)
                datas.append(data)
            else:
                #save
                obj = dict()
                obj['date'] = day
                obj['article'] = datas
                mongo.save_one(collection=NewsTokenization.collection,
                               data=obj)
        else:
            news_words_list = mongo.find(NewsTokenization.collection, con)
            for news_words in news_words_list:
                print('news_words len:', len(news_words))

        # day + 1
        s_date = s_date + datetime.timedelta(days=1)
Example #21
content = ""
docList = []

for line in reader:
    content = re.sub('[^0-9a-zA-Zㄱ-힗 .]', ' ', line[2])

    reversed_content = ''.join(reversed(content))
    for i in range(0, len(content)):
        if reversed_content[i:i + 2] == '.다':
            content = ''.join(reversed(reversed_content[i:]))
            break

    content = content.replace('.', '. ')

    kkma = Kkma()

    noun = ' '.join(kkma.nouns(content))

    docList.append(noun)

    print(noun)

tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(docList)

document_distances = (tfidf_matrix * tfidf_matrix.T)
print(document_distances)

print("Result: " + str(document_distances.get_shape()[0]) +\
      'x' + str(document_distances.get_shape()[1]))
Example #22
x = word_tokenize(text)
pos_tag(x)
# [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]
# In the Penn Treebank POS tags, PRP is a personal pronoun, VBP a verb, RB an adverb, VBG a present participle, IN a preposition, NNP a proper noun, NNS a plural noun, CC a conjunction, and DT a determiner.

from konlpy.tag import Okt
okt = Okt()
print(okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
print(okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
print(okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']

from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']
'''
Mecab, the fastest of the Korean morphological analyzers, is not bundled with the konlpy engine.
Below is an example of using mecab from Python via the eunjeon package.
'''
from eunjeon import Mecab  # KoNLPy style mecab wrapper
tagger = Mecab()
print(tagger.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '봐요']
print(tagger.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
Example #23
    Sorted_Dict_Values = sorted(wordInfo.values(), reverse=True)
    Sorted_Dict_Keys = sorted(wordInfo, key=wordInfo.get, reverse=True)

    plt.bar(range(len(wordInfo)), Sorted_Dict_Values, align='center')
    plt.xticks(range(len(wordInfo)), list(Sorted_Dict_Keys), rotation='70')

    plt.show()


file = open("test.txt", mode='r', encoding='utf-8')
doc = file.read()
file.close()
print(doc)

kkma = Kkma()

ex_sent = kkma.sentences(doc)

kkma.pos

nouns = []
for sent in ex_sent:
    for noun in kkma.nouns(sent):
        # word preprocessing: keep nouns of 2+ syllables, exclude numerals
        if len(str(noun)) >= 2 and not (match('^[0-9]', noun)):
            nouns.append(noun)

print(nouns)

word_count = {}
Example #24
    poets = get_reviews()

    for poet in poets:
        sentences = poet.split('\n')

        for sentence in sentences:
            try:
                c += Counter(kkma.nouns(sentence))
            except NameError:
                c = Counter(kkma.nouns(sentence))
            except:
                pass

#poets = get_poets()
poets = get_reviews()
kkma = Kkma()

for idx, poet in enumerate(poets):
    tags = []
    for noun in kkma.nouns(poet):
        if noun in TAGS:
            tags.append(noun)

    hash_object = hashlib.sha1(poet.encode('utf-8', 'ignore'))
    hex_dig = hash_object.hexdigest()

    results = collection.find_one({'hex':hex_dig})
    if not results:
        document = {'text': poet, 'index': idx, 'tags': tags, 'hex': hex_dig, 'like': 0, 'date': datetime.datetime.utcnow()}
        collection.insert(document)
Example #25
ip = 'localhost'
id = 'testuser'
pw = 'AsDf1234!'
db = 'qnaboard'

conn = pymysql.connect(ip, id, pw, db, charset="utf8")

curs = conn.cursor()
sql = "select * from board"

curs.execute(sql)
result = curs.fetchall()
i = 0
sen = []

kkma = Kkma()
w_count = {}

for t, c in result:
    s = t + " " + c
    kk = kkma.nouns(s)
    for lst in kk:
        try:
            w_count[lst] += 1
        except:
            w_count[lst] = 1

sorted_w = sorted(w_count.items(), key=operator.itemgetter(1))
print(sorted_w)

conn.close()
Example #26
def excel_noun():

	def excel_write(row_val, column_val, data):
		new_sheet.cell(row = row_val, column = column_val, value="%s" %data)

	wb=load_workbook(REFERENCE_EXCEL)

	sheetList = wb.get_sheet_names()
	sheet = wb.get_sheet_by_name(sheetList[0])
	row_count = sheet.get_highest_row()
	
	new_sheet = wb.create_sheet(title='extraction')
	
	news_info = {}
	
	for i in range(1, row_count):
		noun_val = ""
		full_qua = ""

		cellValue_name = sheet.cell(row=i, column=1).value
		cellValue = sheet.cell(row=i, column=2).value
		cellValue_id = sheet.cell(row=i, column=3).value

		# u201c 'LEFT DOUBLE QUOTATION MARK'
		# u201d 'RIGHT DOUBLE QUOTATION MARK'

		try :
			QUA = cellValue.count(u'\u201c')  # u201c 'LEFT DOUBLE QUOTATION MARK'
		except :
			continue 

		if QUA != -1:
			if QUA == 1 :
				START_QUA = cellValue.find(u"\u201c") + 1 # position of first quotation mark
				CELL_VALUE_LEN = len(cellValue)

				cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN]
				END_QUA = cellValue_re.find(u"\u201d") # position of the closing quotation mark

				cellValue_final = cellValue_re[0:END_QUA]
				#print str(i) + "  "+ cellValue_name + "  "  + cellValue_final

				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
				news_info[i]={news_tuple}

				MyPrettyPrinter().pprint(news_info[i])

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, cellValue_final)
				excel_write(i, 3, noun_val)
				excel_write(i, 4, cellValue_id)

			elif QUA == 0 :
				#print str(i) + " " + cellValue
				ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quotation mark
				ANOTHER_QUA_LEN = len(cellValue)

				another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN]
				ANOTHER_END_QUA = another_cellValue.find("\"")

				another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
				#print str(i) + "  " + cellValue_name + "  " + another_cellValue_final
				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(another_cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
				news_info[i]={news_tuple}

				MyPrettyPrinter().pprint(news_info[i])

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, another_cellValue_final)
				excel_write(i, 3, noun_val)
				excel_write(i, 4, cellValue_id)

			elif QUA > 1 :
				#print str(i) + " " + str(QUA)
				for q in range(0,QUA):
					arr = cellValue.split(u"\u201d")

					if arr is not None:
						try :
							arr_start_qua = arr[q].find(u"\u201c") + 1
						except :
							continue

						arr_len = len(arr[q]) 

						arr_cellValue = arr[q][arr_start_qua:arr_len]
						full_qua = full_qua + arr_cellValue

						kkma = Kkma()
						#pprint (kkma.nouns(cellValue_final))
						s = (kkma.nouns(arr_cellValue))

						for j in range(0,len(s)):
							noun_val = noun_val + s[j].encode('utf-8') + ','
							#print str(i) + " " + arr_cellValue

						news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
						news_info[i]={news_tuple}

						MyPrettyPrinter().pprint(news_info[i])

						excel_write(i, 1, cellValue_name)
						excel_write(i, 2, full_qua)
						excel_write(i, 3, noun_val)
						excel_write(i, 4, cellValue_id)

	wb.save(REFERENCE_EXCEL)
	nt.saveObjectBinaryFast(news_info, DICT_NEWS_INFO) 
# -*- coding: utf-8 -*-
import os
import pickle
from konlpy.tag import Kkma
from codecs import open as copen

kkma = Kkma()

feature_list = set()

# extract noun and adjective features from the Q&A data
for i in os.listdir('./DataSet/Leave/'):
    print i
    f = copen('./DataSet/Leave/' + str(i), 'r', 'utf-8')
    temp = f.read().replace('\n', ' ')
    for j in kkma.pos(unicode(temp)):
        if j[1] in ['NNG', 'NNP', 'NNB', 'NP', 'VA']:
            feature_list.add(j[0])

    f.close()
print len(feature_list)

p = open('DataSet/feature_4.txt', 'wb')
pickle.dump(list(feature_list), p)
p.close()
Example #28
def Training():
    for article in article_list:
        # print(article)
        # title = article[0]
        # link = article[1]
        # newspaper = article[2]
        kkma = Kkma()

        try:
            content, issueDateTime = NateNews.get_content(article['link'])
            issueDateTime = pd.to_datetime(issueDateTime)
            # issueDate = time.strftime('%Y-%m-%d', issueDateTime)
            # issueTime = time.strftime('%H:%M:%S', issueDateTime)
            issueDate = issueDateTime.date()
            issueTime = issueDateTime.time()

            # morphological analysis
            # wordList = kkma.pos(content)

            # filter to [common nouns, verbs, adjectives, auxiliary verbs, probable-noun category]


            # print(title)
            # print('wordList : ', wordList)
            # print(issueDateTime)
            # print(link)
            # print(newspaper)
            # print(issueDate)
            # print('wordList : ', wordList)
            wordList = list(getWords(kkma.pos(content)))

            ws = set(wordList)
            print('ws : ', ws)
            dic = {}
            for word in ws:
                print('word : ', word)
                dic.update({word: wordList.count(word)})

            print('dic : ', dic)
            n = 10
            listdic = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)[:n]
            print('listdic : ', listdic)

            for l in listdic:
                print('l : ', l)
                wordList.append(l[0])

            baseDate = ''
            if issueTime > pd.to_datetime('15:30:00').time():
                # print('장 마감 이후')
                baseDate = stockDF[stockDF['datetime'] > issueDate].head(1)['datetime']
            else:
                # print('장 마감 이전')
                baseDate = stockDF[stockDF['datetime'] >= issueDate].head(1)['datetime']
            print('issueTime : ', issueTime)
            print('baseDate : ', baseDate)
            # print(type(baseDate))
            if issueDate > pd.to_datetime(testSetFromDate).date() or len(baseDate) == 0:
                # test set
                testEntry.append({'issueDateTime': issueDateTime, 'wordList': wordList})
            else:
                # trainning set
                baseDate = pd.Series(baseDate).values[0]
                # print('해당 일자 주식 확인 : ', baseDate)
                trainingSet.append({'issueDateTime': issueDateTime, 'wordList': wordList})
                print(trainingSet)
                # print(int(stockDF[stockDF['날짜'] == baseDate]['종가']))
                # print(int(stockDF[stockDF['날짜'] < baseDate].tail(1)['종가']))

                todayPrice = int(stockDF[stockDF['datetime'] == baseDate]['close'])
                prevPrice = int(stockDF[stockDF['datetime'] < baseDate].tail(1)['close'])
                if (todayPrice > prevPrice):
                    # print(baseDate, ' : up')
                    classList.append(1)
                else:
                    if (todayPrice < prevPrice):
                        # print(baseDate, ' : down')
                        classList.append(0)
                    else:
                        # print(baseDate, ' : hold')
                        classList.append(0)
        except:
            pass
Example #29
from konlpy.tag import Kkma

#"C:/Program Files/Java/jre1.8.0_171/bin/server/jvm.dll"
kkma = Kkma()
print("asdasd")

sentences = kkma.sentences(u'네, 안녕하세요. 반갑습니다.')
print(sentences)

nouns = kkma.nouns(u'명사만을 추출하여 다빈도 분석을 합니다.')
print(nouns)

pos = kkma.pos(u'오류보고는 실행환경, 에러메세지와함께 설명을 최대한상세히!^^')
print(pos)
from konlpy.tag import Kkma

kkma = Kkma()
malist = kkma.pos("아버지 가방에 들어가신다.")
print(malist)
Example #31
#!/usr/bin/env python3

from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Okt

hannanum = Hannanum()
print('[Hannanum]')
print(hannanum.analyze('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

kkma = Kkma()
print('[Kkma]')
print(kkma.morphs('공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

komoran = Komoran()
print('[Komoran]')
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

mecab = Mecab()
print('[Mecab]')
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

okt = Okt()
print('[Okt]')
print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))
Example #32
    p = sum((first['counter'] & second['counter']).values())
    q = sum((first['counter'] | second['counter']).values())
    return p / q if q else 0


def build_graph(sentences):
    graph = networkx.Graph()
    graph.add_nodes_from(sentences)
    for first, second in combinations(sentences, 2):
        weight = occurrence(first[1], second[1])
        if weight:
            graph.add_edge(first[0], second[0], weight=weight)
    return graph


STOPWORDS = get_stopwords()
TAGGER = Kkma()


def get_summarize(text, count=3):
    sentences = [(num, {
        'text':
        line + '.',
        'counter':
        Counter(filter(lambda x: x not in STOPWORDS, TAGGER.nouns(line)))
    }) for num, line in enumerate(text.split('. '))]
    pagerank = networkx.pagerank(build_graph(sentences), weight='weight')
    reordered = sorted(pagerank, key=pagerank.get, reverse=True)
    for index in reordered[:count]:
        yield sentences[index][1]['text']
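A short, hypothetical driver for get_summarize above; the sample text and the count value are illustrative only and assume the imports and the STOPWORDS/TAGGER globals defined earlier in the snippet.

if __name__ == '__main__':
    sample_text = ('대한민국은 민주공화국이다. 대한민국의 주권은 국민에게 있다. '
                   '모든 권력은 국민으로부터 나온다. 국민은 법 앞에 평등하다.')
    # print the two highest-ranked sentences
    for summary_sentence in get_summarize(sample_text, count=2):
        print(summary_sentence)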
# -*- coding: utf-8 -*-
"""
 오전 수업때 만든 'new_data.pck' word cloud
 
1. pickle file 읽기
2. 명사 추출 : kkma.nouns()
3. 전처리 : 단어길이 제한, 숫자 제외 
4. WordCloud
"""
import pickle
import konlpy
from konlpy.tag import Kkma
from wordcloud import WordCloud  # class

# create the tagger object
kkma = Kkma()

# 1. read the pickle file : news_data.pck
file = open('../data/new_data.pck', mode='rb')
news_data = pickle.load(file)
news_data

file.close()

len(news_data)  # 11600
type(news_data)  # list

# docs -> sentence
# <error> news_sent = kkma.sentences(news_data)
news_sent = [kkma.sentences(sent)[0] for sent in news_data]
news_sent
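The snippet above stops after sentence splitting; the sketch below fills in the remaining steps from the docstring (noun extraction, preprocessing, WordCloud). The font path and output file name are assumptions, not taken from the original; a Korean-capable font is required to render Hangul.

from re import match

nouns = []
for sent in news_sent:
    for noun in kkma.nouns(sent):
        # step 3: keep nouns of 2+ characters and drop pure numbers
        if len(str(noun)) >= 2 and not match('^[0-9]', noun):
            nouns.append(noun)

# step 4: build the word cloud (font_path and output name are assumptions)
wc = WordCloud(font_path='malgun.ttf', width=800, height=600, background_color='white')
wc.generate(' '.join(nouns))
wc.to_file('news_wordcloud.png')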
from konlpy.tag import Kkma
from konlpy.utils import pprint
from openpyxl import load_workbook, Workbook

dic={}
kkma=Kkma()
wb=load_workbook(filename='menulist.xlsx',read_only=True)
ws=wb['Sheet1']
for i in range(1,5897):
    for l,k in kkma.pos(ws['A'+str(i)].value):
        if l not in dic:
            dic[l] = 1
        else:
            dic[l] += 1

wb=Workbook()

dest_filename="determine.xlsx"

ws1=wb.active
ws1.title="result"
num=1
for l,k in dic.items():
    ws1['A'+str(num)]=l
    ws1['B'+str(num)]=k
    num+=1

wb.save(filename=dest_filename)
Example #35
def get_intent_type(dbconn, cursor, u_text):
    try:
        cursor.execute(f"""
			SELECT 
				Q_TEXT, A_TEXT, Q_MORPHEMES, INTENT, ENTITIES
			FROM
				TBL_AP_QNA_CHAT_SET_LIST
		""")
        rows = cursor.fetchall()
    except Exception as e:
        print(f'error! >> select_ap_qna_content >> {e}')
    finally:
        intent = 0
        all_values = []
        for row in rows:
            all_values.append(row)

        match_intent_list = []
        for all_value in all_values:
            matchPer = round(
                (SequenceMatcher(None, u_text, all_value[0]).ratio() * 100), 2)
            if matchPer >= 65:
                # print(70)
                match_intent_list.append([matchPer, all_value[3]])
                # print(f'[{matchPer}% 일치][Q_type : {all_value[3]}] {all_value[0]} >> {all_value[1]}')

        # print('match_intent_list', match_intent_list)

        top = [0, 0]
        if len(match_intent_list) > 0:
            for idx, match_intent in enumerate(match_intent_list):
                if match_intent[0] > top[0]:
                    top = match_intent
            intent = top[1]

        # If no predefined dialogue corpus matches, guide the user to register an answer directly.
        # Compare the new question with the existing questions and surface the most similar one (at least 70% match).
        if intent == 0:
            kkma = Kkma()
            # nouns from the user's question
            u_text_nouns = kkma.nouns(u_text)
            q_text_nouns_group = []
            for all_value in all_values:
                # nouns from the stored question text
                if all_value[2] != '[]':
                    print('텍스트 뭉치 명사', all_value[2], '|||', all_value[3])
                    q_text_nouns_group.append(
                        [ast.literal_eval(all_value[2]), all_value[3]])

            point_list = []
            for q_text_nouns in q_text_nouns_group:
                match_point = 0
                for q_noun in q_text_nouns[0]:
                    for u_noun in u_text_nouns:
                        if q_noun == u_noun:
                            match_point += 1
                if match_point > 0:
                    point_list.append([match_point, q_text_nouns[1]])

            top = [0, 0]
            if len(point_list) > 0:
                for idx, point in enumerate(point_list):
                    if point[0] > top[0]:
                        top = point
                intent = top[1]

            # print('point_list', point_list)

        print('intent', intent)
        return [intent, all_values]
Example #36
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'));
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'));
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'));
Example #37
 def __init__(self):
     self.nlp_engine = Kkma()
     self.nlp_engine.pos('시작')
Example #38
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()
pprint(kkma.sentences(u'저는 대학생이구요. 소프트웨어 관련학과 입니다.'))
# [저는 대학생이구요., 소프트웨어 관련학과 입니다.]
Example #39
def get_vs(line):
    korean_analyzer = Kkma()
    return [word for word, tag in korean_analyzer.pos(line) if tag in ['VA', 'VX', 'VC']]
# 01 Korean law corpus
from konlpy.corpus import kolaw
c = kolaw.open('constitution.txt').read()
print(c[:10])

#%%
from konlpy.corpus import kobill
d = kobill.open('1809890.txt').read()
print(d[:15])

#%%
# Dictionary
## sentence splitting, nouns, POS tagging
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

#%%
pprint(kkma.sentences('네 안녕하세요. 반갑습니다'))
pprint(kkma.nouns('질문이나 건의사항은 여기에 남겨주세요.'))
pprint(kkma.pos('우리는 데이터 과학자입니다. 멋진 과학자입니다.'))

#%%
# 02. Document exploration
from collections import Counter

from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot
Example #41
def test2_kor():
    train_df = pd.read_table(
        'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_train.txt'
    )
    test_df = pd.read_table(
        'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_test.txt'
    )

    print(train_df.head())

    print('훈련 데이터 샘플의 개수 : {}'.format(len(train_df)))
    print('테스트 데이터 샘플의 개수 : {}'.format(len(test_df)))

    tokenizer = Kkma()  # .morphs() ---> quite slow.

    ID = torchtext.data.Field(
        sequential=False,
        use_vocab=False)  # not actually used ---> the txt file simply has an ID column...
    TEXT = torchtext.data.Field(
        sequential=True,
        include_lengths=True,
        use_vocab=True,
        tokenize=tokenizer.morphs,  # use Kkma as the tokenizer.
        lower=True,
        batch_first=True,  # batch_first=True ---> (N, fix_length); False ---> (fix_length, N)
        fix_length=20)

    LABEL = torchtext.data.Field(sequential=False,
                                 use_vocab=False,
                                 is_target=True)

    # tsv: Tab-separated values
    if False:
        train_data, test_data = torchtext.data.TabularDataset.splits(
            path=
            'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in',
            train='ratings_train.txt',
            test='ratings_test.txt',
            format='tsv',
            fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
            skip_header=True)
    else:
        train_data = torchtext.data.TabularDataset(
            path=
            'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_train_small.txt',
            format='tsv',
            fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
            skip_header=True)

        train_data, test_data = train_data.split(split_ratio=0.7,
                                                 random_state=random.seed(100))

    print('훈련 샘플의 개수 : {}'.format(len(train_data)))
    print('테스트 샘플의 개수 : {}'.format(len(test_data)))

    print(vars(train_data[0]))
    print(train_data.examples[0].id, train_data.examples[0].text,
          train_data.examples[0].label)

    TEXT.build_vocab(train_data, min_freq=10,
                     max_size=10000)  # words are mapped to integers only after build_vocab.
    print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))
    print(TEXT.vocab.stoi)  # word-to-index dict

    # build the DataLoader
    batch_size = 2

    if False:
        train_loader = torchtext.data.Iterator(
            dataset=train_data, batch_size=batch_size,
            shuffle=True)  # shuffle=True: whether to shuffle between epochs.
        test_loader = torchtext.data.Iterator(dataset=test_data,
                                              batch_size=batch_size)

    else:
        # data.BucketIterator ----> minimizes the amount of padding.
        train_loader, test_loader = torchtext.data.BucketIterator.splits(
            (train_data, test_data),
            batch_size=batch_size,
            device='cpu',
            shuffle=False,
            sort_key=lambda x: len(x.text))

    # access train_data directly & map words to integers manually
    for i, d in enumerate(train_data):
        # d: Example
        print(d.text, TEXT.numericalize(([d.text], 1)),
              d.label)  # a tuple of ([xx], batch_size) must be passed.
        if i >= 2: break

    for i, d in enumerate(train_loader):
        print(i, d.text, d.label
              )  # d.text[0], d.text[1] ----> because include_lengths=True was set on the Field.
        print(''.join([TEXT.vocab.itos[x] for x in d.text[0][0].numpy()]))
        if i >= 2: break

    print('=' * 20)
    it = iter(train_loader)
    for i in range(2):
        batch = next(it)
        print(batch.text, batch.label)
Example #42
    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()


# In[69]:


kkma = Kkma()

stopWord_Ingre = {"재료" , "계량법" , "안내" , "조금"}


# In[113]:


mystr = getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6879000_6880000")
mystr += getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6870000_6871000")


# In[ ]:


tokenized = kkma.pos(mystr)
Example #43
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = [
            '중인',
            '만큼',
            '마찬가지',
            '꼬집었',
            "연합뉴스",
            "데일리",
            "동아일보",
            "중앙일보",
            "조선일보",
            "기자",
            "아",
            "휴",
            "아이구",
            "아이쿠",
            "아이고",
            "어",
            "나",
            "우리",
            "저희",
            "따라",
            "의해",
            "을",
            "를",
            "에",
            "의",
            "가",
        ]

    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)

        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))

        return nouns
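A hypothetical usage of the SentenceTokenizer class above; the sample text is illustrative, and the class's own imports (Kkma, Twitter, newspaper.Article) are assumed to be in scope.

tokenizer = SentenceTokenizer()
sentences = tokenizer.text2sentences('저는 대학생입니다. 소프트웨어 관련 학과에 다니고 있습니다. 자연어 처리를 공부합니다.')
print(tokenizer.get_nouns(sentences))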
Example #44
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
Example #45
constitution = kolaw.open('./constitution.txt').read()

print(constitution)

# find which lines contain the word '민주'
r = concordance(u'민주', constitution, show=False)
print("show=False => ", r)

# show=False => results are returned instead of being printed as sentences
# show=True  => results are printed as sentences

# considerations for text mining work: accuracy and speed
from konlpy.tag import Kkma  # Kkma is chosen for accuracy; the very first call takes a while (longer if variables are cleared)
from konlpy.utils import pprint

kkma = Kkma()

text = u'네, 안녕하세요. 반갑습니다.'

# split into sentence units
text_s = kkma.sentences(text)
print("text_s => ", text_s)

# the result comes back as a list
print("type(text_s) => ", type(text_s))
print("text_s[0] => ", text_s[0])
print("text_s[-1] => ", text_s[-1])

# tagset : information about the tag formats
kkma = Kkma()
print(kkma.tagset)
Example #46
# -*- coding: utf-8 -*- 
import zmq
import time
import sys
reload(sys)
sys.setdefaultencoding('utf-8') #set utf8 as a default


#init KoNLPy
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

from multiprocessing import Pool

port = 46000

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
	print 'in the loop'
	# Wait for next request from client
	message = socket.recv()
	result = kkma.nouns(message);
	result = ', '.join(result)
	print '------'
	print result
	socket.send_string(result)  # unicode is not allowed with socket.send; use send_string
Example #47
from konlpy.tag import Kkma
from konlpy.corpus import kolaw
from threading import Thread
import jpype

def do_concurrent_tagging(start, end, lines, result):
    jpype.attachThreadToJVM()
    l = [k.pos(lines[i]) for i in range(start, end)]
    result.append(l)
    return

if __name__=="__main__":
    import time

    print('Number of lines in document:')
    k = Kkma()
    lines = kolaw.open('constitution.txt').read().splitlines()
    nlines = len(lines)
    print(nlines)

    print('Batch tagging:')
    s = time.clock()
    result = []
    l = [k.pos(line) for line in lines]
    result.append(l)
    t = time.clock()
    print(t - s)

    print('Concurrent tagging:')
    result = []
    t1 = Thread(target=do_concurrent_tagging, args=(0, int(nlines/2), lines, result))
Example #48
class Crawler:
    def __init__(self):
        self.kkma = Kkma()
        self.conn = sqlite3.connect('yebi.db')
        self.cursor = self.conn.cursor()
        self.count = 20

        reload(sys)
        sys.setdefaultencoding('utf-8')

    def do(self):
        print '트위터 타임라인 탐색 중.'

        for x in TwitterFetcher().get_time_line(self.count):
            user_id = x['user']['id']
            print ''
            print '=' * 80
            print '... @%s: %s' % (x['user']['name'],  x['text'])

            t = (user_id, )
            self.cursor.execute('select count(*) from users where id=?', t)
            count_user = self.cursor.fetchone()[0]

            if count_user == 0:  # if the user is not yet in the DB (count == 0)
                name = x['user']['name']
                screen_name = x['user']['screen_name']
                profile_image = x['user']['profile_image_url_https']
                t = (user_id, name, screen_name, profile_image)
                self.cursor.execute('insert into users values(?, ?, ?, ?)', t)
                self.conn.commit()
                print "... 유저 %s를 User 디비에 추가중" % x['user']['name']

            i = 1

            tweet_id = x['id']
            t = (tweet_id, )
            self.cursor.execute('select count(*) from tweets where id=?', t)
            count_tweets = self.cursor.fetchone()[0]

            print "... 트윗 디비를 검색중"

            if count_tweets == 0:
                print "... 아직 디비에 없어요."
                text = x['text']
                created_at = x['created_at']
                t = (tweet_id, text, created_at, user_id)
                self.cursor.execute('insert into tweets values(?, ?, ?, ?)', t)
                self.conn.commit()
                print '... %s 추가 중' % x['text']

                for n in self.kkma.nouns(x['text']):
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()

                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." % \
                              (screen_name, n, count_noun[0])

                    if count_noun is None:
                        print "... %s가 명사 \"%s\"를 처음 사용하였습니다." % (screen_name, n)
                        #t = (user_id, n)
                        self.cursor.execute('insert into user_nouns values(?, ?, 1)', t)
                    else:
                        self.cursor.execute('update user_nouns set count=count+1 where user_id=? and noun=?',
                                            t)
            else:
                print "... 이미 디비에 있어요. (그래도 명사를 분석하겠습니다.)"
                for n in self.kkma.nouns(x['text']):
                #     print "...... %s" % n
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()

                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." \
                              % (screen_name, n, count_noun[0])

            i += 1
         for link in links :
             cont = link.string
             crawling_news.append(str(cont).strip())            
        

# call the crawler function
crawler_func(5, '20190505')

print('크롤링 news 수 =', len(crawling_news)) # 크롤링 news 수 = 380
print(crawling_news)
 

# morphological analysis
from konlpy.tag import Kkma

kkma = Kkma() # object

str_news = str(crawling_news)
print(str_news)

ex_sent = kkma.sentences(str_news)
print(ex_sent)

from re import match

# 1. sentences -> words -> preprocessing -> word count
news_count = {}
for sent in ex_sent :
    ex_noun = kkma.nouns(sent)
    for noun in ex_noun :
        if len(str(noun)) > 1 and not(match("^[0-9]", noun)) :
Example #50
#-*- coding: utf-8 -*-
__author__ = 'KC'

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
Example #51
'''
konlpy test
'''

from konlpy.tag import Kkma

# create the tagger object
kkma = Kkma()

# paragraph -> sentence extraction
para = "형태소 분석을 시작합니다. 나는 홍길동 이고 age는 28세 입니다."
ex_sen = kkma.sentences(para)
print(ex_sen)  # list

# paragraph -> noun extraction
ex_noun = kkma.nouns(para)
print(ex_noun)
# ['형태소', '분석', '나', '홍길동', '28', '28세', '세']

# paragraph -> morpheme (POS) extraction
ex_pos = kkma.pos(para)
print(ex_pos)
# [('형태소', 'NNG'), ('분석', 'NNG'), ('을', 'JKO'), ('시작하', 'VV'), ('ㅂ니다', 'EFN'), ('.', 'SF'), ('나', 'NP'), ('는', 'JX'), ('홍길동', 'NNG'), ('이', 'VCP'), ('고', 'ECE'), ('age', 'OL'), ('는', 'JX'), ('28', 'NR'), ('세', 'NNM'), ('이', 'VCP'), ('ㅂ니다', 'EFN'), ('.', 'SF')]
Example #52
    ],
    [
        """서민금융진흥원은 지난 18일 서울 청계천로 본원에서 제2차 서민금융 전문가 간담회를 개최했다소 19일 밝혔다.

이번 간담회는 서민금융, 복지, 자활사업 등 각 분야 전문가들이 참석한 가운데, 정책서민금융 지원의 방향성에 대해서 의견을 청취하기 위해 마련됐다. 이날 이 원장은 "소득양극화와 고용부진 심화 등으로 서민·취약계층, 자영업자들의 경제적 어려움이 커지는 가운데 사회안전망으로서 서민금융의 역할이 중요한 시점"이라며, "현재 8등급 이하자가 263만명이고 이들중 74%가 연체중인 상황에서 정상적인 금융 이용이 어려운 취약계층에게 꼭 필요한 서민금융 지원을 위해 노력해야 한다"고 강조했다.

이어서 이 원장은 "현장 전문가의 의견을 반영하여 취약계층을 위한 금융과 함께 금융교육, 컨설팅, 종합상담 등 자활기반을 구축하도록 힘쓰겠다"고 밝혔다. 이날 참석자들은 '정책서민금융지원에 대한 방향성'에 대하여 다양한 의견을 제시했다.

진흥원은 이날 간담회의 다양한 제언들을 바탕으로 수요자가 체감할 수 있는 실질적인 방안 마련을 위해 더욱 노력하고, 지속적으로 서민금융 현장의 폭넓은 의견을 청취할 계획이다.
"""
    ]
]

# In[3]:

kkma = Kkma()
sentences = []
list_vec = []
for da in data:
    print(da)
    sentences.append(kkma.sentences(da[0]))
    for s in sentences:
        for w in s:
            list_vec.append(kkma.nouns(w))

word_list = []
for l in list_vec:
    empty_vec = []
    for w in l:
        if len(w) >= 2:
            empty_vec.append(w)
Example #53
def main():
    #   Arguments  #
    parser = argparse.ArgumentParser(
        description='Pengtai Instagram RNN LSTM Model')
    parser.add_argument(
        '-t',
        '--type',
        type=str,
        help="run type Options: 'n' for new | 'o' for overwrite",
        default='o',
        nargs='+')
    # parser.add_argument('-d', '--dest_dir', type=str, help='CSV data file')
    parser.add_argument('-i',
                        '--input_dir',
                        type=str,
                        help='Input Raw CSV directory')
    parser.add_argument('-u', '--user_id', type=str, help='Instagram User ID')
    parser.add_argument('-v',
                        '--version',
                        help='current version',
                        action='store_true')

    args = parser.parse_args()
    #  End Argparse #

    # VERSION CONTROL #
    if args.version:
        with open(settings.VERSION_JSON, "r") as jsonFile:
            data = json.load(jsonFile)

        return print(data['version'])

    if args.type:
        if args.type[0] == 'n' and args.type[1]:
            with open(settings.VERSION_JSON, "r") as jsonFile:
                data = json.load(jsonFile)

            data["version"] = args.type[1]

            with open(settings.VERSION_JSON, "w") as jsonFile:
                json.dump(data, jsonFile)

            VERSION = args.type[1]

        elif args.type[0] == 'o':
            with open(settings.VERSION_JSON, "r") as jsonFile:
                data = json.load(jsonFile)

            VERSION = data["version"]

    # End VERSION CONTROL #

    with open('./dic/polarity.csv', 'r', encoding='UTF-8') as file:
        csvreader = csv.DictReader(file)
        kosac = [row for row in csvreader]

    total_arr = []
    rowI = 0
    rowDict = {}

    # File List in the directory from the arguments
    for filename in glob.glob(os.path.join(args.input_dir, '*.csv')):
        # i = ['id', 'img', 'text', 'has_tag', 'write_date', 'reg_date']
        with open(filename, 'r', encoding='UTF-8') as f:
            csvreader = csv.DictReader(f)
            # csvreader = csv.reader(f)
            for row in csvreader:
                if rowI == 0:
                    rowDict = {"user_id": row['user_id'], "posts": []}
                else:
                    # print(user_id, row['user_id'], rowDict)
                    if rowDict['user_id'] != row['user_id']:
                        total_arr.append(rowDict)
                        rowDict = {"user_id": row['user_id'], "posts": []}

                # text preprocess
                text = re.sub(r'@\w+', '', row['text'])
                text = re.sub(
                    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    '', text)
                text = re.sub(r'[\[]|[\]]', '', text)
                text = re.sub(r'[\r]|[\n]', ' ', text)
                text = re.sub(r'[.]|[ㆍ]', '', text)
                text = re.sub(r'#', ' ', text)

                rowDict['posts'].append({
                    "datetime": row['write_date'],
                    "text": text
                })
                rowI = rowI + 1

    # print(total_arr)
    trg_res = [item for item in total_arr if item["user_id"] == args.user_id]
    temp = []
    kkma = Kkma()
    t = Twitter()

    for post in trg_res[0]['posts']:
        date = datetime.datetime(int(post['datetime'][0:4]),
                                 int(post['datetime'][5:7]),
                                 int(post['datetime'][8:10]),
                                 int(post['datetime'][11:13]),
                                 int(post['datetime'][14:16]),
                                 int(post['datetime'][17:19]))
        text = post['text']
        temp.append((date, text))

    temp = sorted(temp, key=lambda t: t[0], reverse=False)

    sentArr = []
    newArr = []
    tokens_ko = []
    index = 0
    nounsArr = []

    for data in temp:
        sentPosArr = kkma.pos(data[1])
        # sentNouns = kkma.nouns(data[1])

        inArr = []
        for outA in sentPosArr:
            # for inA in outA:
            inArr.append("/".join(outA))

        morph_arr = t.morphs(data[1])
        morphWords = [word for word in morph_arr if word not in tokens_ko]
        for word in morphWords:
            if word not in nounsArr:
                nounsArr.append(word)

        tokens_ko.extend(morphWords)

        newArr.append({"sentence": "", "words": morph_arr, "score": 0})

        index = index + 1
        sentArr.append(";".join(inArr))

    index = 0
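    # score each POS-tagged sentence against the KOSAC polarity n-grams:
    # subtract the matched entry's probability for NEG, add it for POS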
    for eaSent in sentArr:
        sentiScore = 0
        for corp in kosac:
            if eaSent.find(corp['ngram']) > -1:
                if corp['max.value'] == 'NEG':
                    sentiScore = sentiScore - float(corp['max.prop'])
                elif corp['max.value'] == 'POS':
                    sentiScore = sentiScore + float(corp['max.prop'])

        newArr[index]["sentence"] = eaSent
        newArr[index]["score"] = sentiScore

        index = index + 1

    # ACO algorithm

    # doc_ko = " ".join([row[1] for row in temp])
    # text_arr = [row[1] for row in temp]
    # for text in text_arr:
    #     morph_arr = t.morphs(text)
    #     temp = [word for word in morph_arr if not word in tokens_ko]
    #     tokens_ko.extend(temp)

    print(tokens_ko)
    ko = nltk.Text(tokens_ko)  # For Python 2, input `name` as u'유니코드'

    # # print(len(set(ko.tokens)))  # returns number of unique tokens
    vocab = dict([(item[0], index + 1)
                  for index, item in enumerate(ko.vocab().items())])
    # pprint(vocab)  # returns number of tokens (document length)
    minTimeVal = int(temp[0][0].timestamp())
    maxTimeVal = int(temp[len(temp) - 1][0].timestamp() - minTimeVal)

    tenPow = len(str(int(temp[len(temp) - 1][0].timestamp() - minTimeVal)))
    tenPow = pow(10, tenPow)
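    # tenPow is 10 ** (number of digits in the largest time offset), so opt1
    # below scales every offset into the range [0, 1)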

    index = 0
    nodes = []

    for data in temp:
        # print(data[0].utctimetuple)
        # print(data[0].time())
        diffTimeVal = int(data[0].timestamp() - minTimeVal)

        opt1 = float(diffTimeVal / tenPow)
        opt2 = float(diffTimeVal / maxTimeVal)
        print(diffTimeVal, opt1, opt2)

        nodes.append((opt2, newArr[index]["words"]))
        index = index + 1

    # print(nounsArr)
    nodes2 = []
    for noun in nounsArr:
        for corp in kosac:
            hts = "%s/NNG" % (noun)
            if hts.find(corp['ngram']) > -1:
                if corp['max.value'] == 'NEG':
                    nodes2.append({
                        "noun": noun,
                        "score": -float(corp['max.prop'])
                    })
                elif corp['max.value'] == 'POS':
                    nodes2.append({
                        "noun": noun,
                        "score": float(corp['max.prop'])
                    })

    print()
    antCount = len(newArr)
    rhoVal = 0.3

    # ACO algorithm example (kept commented out)
    # nodes = []
    # for _ in range(20):
    #     x = random.uniform(-10, 10)
    #     y = random.uniform(-10, 10)
    #     nodes.append((x, y))
    #
    def euclidean(a, b):
        return math.sqrt(pow(a[1] - b[1], 2) + pow(a[0] - b[0], 2))

    #
    world = pants.World(nodes, euclidean)
    #
    solver = pants.Solver(rho=rhoVal)
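
    # --- Illustrative sketch (not from the original source) -------------------
    # The example stops after building the World and Solver and never calls
    # solve(). Assuming the ACO-Pants package API, a minimal end-to-end run on
    # purely numeric 2-D demo nodes (the `nodes` built above pair a time offset
    # with a word list, so they cannot be fed to `euclidean` as-is) might look
    # like this; the demo_* names are hypothetical.
    demo_nodes = [(0.0, 0.0), (1.0, 5.0), (3.0, 2.0), (6.0, 4.0), (8.0, 1.0)]
    demo_world = pants.World(demo_nodes, euclidean)
    demo_solver = pants.Solver(rho=rhoVal)
    demo_solution = demo_solver.solve(demo_world)  # best tour found by the ants
    print(demo_solution.distance)                  # total tour length
    print(demo_solution.tour)                      # nodes in visiting order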
Example #54
0
from re import match

from konlpy.tag import Kkma

file = open("text.txt", mode='r', encoding='utf-8')
doc = file.read()
file.close()
print(doc)

kkma = Kkma()

ex_sent = kkma.sentences(doc)

nouns = []
for sent in ex_sent:
    for noun in kkma.nouns(sent):
        # word preprocessing: keep nouns of two or more syllables, drop numerals
        if len(str(noun)) >= 2 and not match('^[0-9]', noun):
            nouns.append(noun)
#-----------------------------------------------------------------------------------------------------

para = "형태소 분석을 시작합니다. 저는 김유진 입니다."

ex_sent = kkma.sentences(para)
print(len(ex_sent))
print(ex_sent)

ex_nouns = kkma.nouns(para)
print(len(ex_nouns))
print(ex_nouns)

ex_pos = kkma.pos(para)
print(len(ex_pos))
Example #55
0
import cx_Oracle
import os
import re
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
import getpass

from konlpy.tag import Kkma

os.environ['NLS_LANG'] = '.UTF8'

f = open(r"D:\user\Desktop\testtest.txt", 'rb')
lines = f.read()
text = lines.decode(encoding='utf-8')

kkma = Kkma()
keyword = ' '.join(kkma.nouns(text))


def makeDictFactory(cursor):
    columnNames = [d[0] for d in cursor.description]

    def createRow(*args):
        return dict(zip(columnNames, args))

    return createRow


def OutputTypeHandler(cursor, name, defaultType, size, precision, scale):
    if defaultType == cx_Oracle.CLOB:
        return cursor.var(cx_Oracle.LONG_STRING, arraysize=cursor.arraysize)
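

# --- Illustrative sketch (not from the original source) ----------------------
# How the two helpers above are typically wired into a cx_Oracle connection.
# The credentials, DSN and query below are placeholders, not values taken from
# the original example.
conn = cx_Oracle.connect('scott', getpass.getpass('Password: '), 'localhost/XE')
conn.outputtypehandler = OutputTypeHandler     # fetch CLOB columns as long strings
cur = conn.cursor()
cur.execute("SELECT doc_id, content FROM documents")  # hypothetical table
cur.rowfactory = makeDictFactory(cur)          # rows come back as dicts
rows = cur.fetchall()
conn.close()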
Example #56
0
from collections import Counter

from konlpy.tag import Kkma


def get_tags(text, ntags=40, multiplier=1):
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'tag': n, 'count': c }\
                for n, c in count.most_common(ntags)]
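

# Hypothetical usage (not from the original source): list the most frequent
# nouns in a short Korean sentence together with their counts.
sample = u'형태소 분석을 시작합니다. 형태소 분석은 재미있습니다.'
for item in get_tags(sample, ntags=5):
    print(item['tag'], item['count'])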
Example #57
0
# -*- coding: utf8 -*-
import xlrd
from konlpy.utils import pprint
from konlpy.tag import Twitter
from konlpy.tag import Kkma
import json
import codecs

twitter = Twitter()
kkma = Kkma()

result_file = codecs.open('streaming_result.txt', 'w', 'utf-8')

dic_list = []
dic_count = 0
valence_list = []
with xlrd.open_workbook('dic.xlsx') as workbook:

    sheet = workbook.sheet_by_index(1)
    sheet.cell_value(0, 0)
    sheet.nrows
    sheet.ncols
    for row in range(sheet.nrows):
        verb = sheet.cell_value(row, 0)

        if verb in (u'', u'원형', u'ㄱ', u'ㄴ', u'ㄷ', u'ㅁ', u'ㅂ', u'ㅅ', u'ㅇ', u'ㅈ',
                    u'ㅊ', u'ㅋ', u'ㅌ', u'ㅍ', u'ㅎ', u'이모티콘', u'숫자'):
            continue
        #dic_list.append([])
        valence_list.append([])
        dic_count = dic_count + 1
Example #58
0
class Classifier(metaclass=Singleton):
    """
    Class that guesses a label's concept_id using a Convolutional Neural Networks model.

    Attributes
    ----------
    konlpy
        Morphological analyzer; Hannanum, Kkma, Komoran, Mecab or Okt can be configured.
        See http://konlpy.org/ for details.
    word_dict
        Word lookup table
    concept_dict
        concept_id lookup table
    model
        CNN model
    is_load
        Whether the CNN model and lookup tables have been loaded
    """
    def __init__(self):
        self.konlpy = Kkma()
        self._dataset = None
        self.word_dict = None
        self.concept_dict = None
        self._x_train = None
        self._y_train = None
        self.model = None
        self.is_load = False

    def extract_nouns(self, text):
        """
        Extract only the nouns from a sentence using KoNLPy.

        Parameters
        ----------
        text: str
            Sentence to analyze

        Returns
        -------
        list of str
            List of extracted nouns

        """
        return self.konlpy.nouns(text)

    def gen_dataset(self, reports):
        """

        Extract the XBRL files from the given Reports and build the concept_id / label
        dataset used to train the CNN model.

        Parameters
        ----------
        reports: list of Report
            Reports to extract from
        """
        self._extract_dataset(reports)
        self._gen_word_dict()
        self._gen_concept_dict()
        self._gen_x_train()
        self._gen_y_train()
        self.is_load = True

    def _extract_dataset(self, reports: List[Report]):
        """
        Extract concept_id and label values from the XBRL files contained in the Reports.

        Parameters
        ----------
        reports: list of Report
            Reports to extract from
        """
        if is_notebook():
            from tqdm import tqdm_notebook as tqdm
        else:
            from tqdm import tqdm

        dataset = []
        for report in tqdm(reports,
                           desc='Extracting concept_id and label_ko',
                           unit='report'):
            df_fs = analyze_xbrl(report)
            if df_fs is None:
                continue
            for tp in df_fs:
                df = df_fs[tp]
                if df is not None:
                    concept_column = find_all_columns(df, 'concept_id')[0]
                    label_ko_column = find_all_columns(df, 'label_ko')[0]
                    for idx in range(len(df)):
                        concept_id = df[concept_column].iloc[idx]
                        label_ko = df[label_ko_column].iloc[idx]
                        if concept_id and label_ko:
                            try:
                                label = self.extract_nouns(label_ko)
                                dataset.append((concept_id, label))
                            except BaseException:
                                continue

        self._dataset = dataset

    def _gen_word_dict(self):
        """ 단어 Lookup Table 생성 """
        word_index = dict()
        for _, nouns in self._dataset:
            for noun in nouns:
                if word_index.get(noun) is None:
                    word_index[noun] = 0
                word_index[noun] += 1

        word_dict = dict()
        for idx, (noun, _) in enumerate(
                sorted(word_index.items(), key=lambda x: x[1], reverse=True)):
            word_dict[noun] = idx + 1

        self.word_dict = word_dict

    def _gen_concept_dict(self):
        """ concept_id Lookup Table 생성 """
        concepts = set()
        for concept, _ in self._dataset:
            concepts.add(concept)

        concept_dict = dict()
        for idx, concept in enumerate(concepts):
            concept_dict[concept] = idx + 1
        self.concept_dict = concept_dict

    def _gen_x_train(self):
        """ 입력값 변환 """
        dataset = []
        for concept_id, label_ko in self._dataset:
            dataset.append([self.word_dict[x] for x in label_ko])
        x_train = self.vectorize_sequences(dataset)
        self._x_train = x_train

    def _gen_y_train(self):
        """ 결과값 변환 """
        dataset = [self.concept_dict[concept] for concept, _ in self._dataset]
        y_train = tf.keras.utils.to_categorical(dataset)
        self._y_train = y_train

    @property
    def input_length(self):
        return len(self.word_dict) + 1

    @property
    def output_length(self):
        return len(self.concept_dict) + 1

    def gen_model(self,
                  units: int = 256,
                  dropout: float = 0.2,
                  epochs: int = 50,
                  batch_size: int = 512):
        """
        Build and train the CNN model using Keras.

        Parameters
        ----------
        units: int
            Number of units per dense layer
        dropout: float
            Dropout rate
        epochs: int
            Number of training epochs
        batch_size: int
            Batch size
        """
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(units,
                                  activation='relu',
                                  input_shape=(self.input_length, )),
            tf.keras.layers.Dropout(rate=dropout),
            tf.keras.layers.Dense(units, activation='relu'),
            tf.keras.layers.Dense(self.output_length, activation='softmax')
        ])

        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        length = int(len(self._x_train) / 5)

        x_val = self._x_train[:length * 3]
        partial_x_train = self._x_train[length * 3:length * 4]
        x_test = self._x_train[length * 4:]

        y_val = self._y_train[:length * 3]
        partial_y_train = self._y_train[length * 3:length * 4]
        y_test = self._y_train[length * 4:]
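
        # note: the first 3/5 of the data (x_val / y_val) is used for fitting,
        # the next 1/5 as validation data and the final 1/5 for evaluation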

        print("\n==========Model Fit==========\n")
        model.fit(x_val,
                  y_val,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_data=(partial_x_train, partial_y_train))
        print("\n==========Model Evaluation==========\n")
        model.evaluate(x_test, y_test)
        self.model = model

    def vectorize_sequences(self,
                            sequences: List[List[str]]) -> List[List[int]]:
        """ Label에 포함된 단어를 0과 1의 리스트로 변환"""
        results = np.zeros((len(sequences), self.input_length))
        for i, sequence in enumerate(sequences):
            results[i, sequence] = 1.
        return results

    def save(self, path=None):
        """
        Save the Convolutional Neural Networks model and the dictionaries.

        Parameters
        ----------
        path: str
            Directory to save the data to
        """
        if path is None:
            path = pkg_resources.resource_filename('dart_fss_classifier',
                                                   'data/')
        create_folder(path)
        file = os.path.join(path, 'dict.json')

        config = {
            'word_dict': self.word_dict,
            'concept_dict': self.concept_dict,
        }
        model_file = os.path.join(path, 'model.h5')
        self.model.save(model_file)
        with open(file, 'w') as outfile:
            json.dump(config, outfile)

    def load(self, path: str = None):
        """
        Load the Convolutional Neural Networks model and the dictionaries.

        Parameters
        ----------
        path: str
            Directory to load the data from
        """
        if path is None:
            path = pkg_resources.resource_filename('dart_fss_classifier',
                                                   'data/')
        file = os.path.join(path, 'dict.json')
        if not os.path.isfile(file):
            raise FileExistsError(
                "The dictionary does not exist. Please run 'generate_default_dataset_and_cnn_model'."
            )

        model_file = os.path.join(path, 'model.h5')
        if not os.path.isfile(model_file):
            raise FileExistsError(
                "The Keras model does not exist. Please run 'generate_default_dataset_and_cnn_model'."
            )

        self.model = tf.keras.models.load_model(model_file)
        with open(file) as json_file:
            data = json.load(json_file)
            self.word_dict = data['word_dict']
            self.concept_dict = data['concept_dict']

        self.is_load = True

    def guess(self, text: str) -> str:
        """
        Guess the concept_id for a given label.

        Parameters
        ----------
        text: str
            Label name

        Returns
        -------
        str
            Guessed concept_id

        """
        if not self.is_load:
            self.load()
        data = []
        for noun in self.extract_nouns(text):
            try:
                word = self.word_dict[noun]
                data.append(word)
            except BaseException:
                pass

        d = self.vectorize_sequences([data])
        prediction = np.argmax(self.model.predict(d))
        for key, value in self.concept_dict.items():
            if value == prediction:
                return key
        return None
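

# --- Illustrative sketch (not from the original source) ----------------------
# Hypothetical usage of the Classifier above, assuming a model and dictionaries
# were previously saved with save(): load them, then guess the concept_id for a
# Korean account label. The label string is an arbitrary example.
clf = Classifier()
clf.load()                 # loads model.h5 and dict.json from the package data dir
print(clf.guess('매출액'))  # prints the concept_id the model considers most likely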
Example #59
0
# This is script to test KoNLPy.
# Project started at 01/18/2016. Author by Jaehyun Ahn([email protected])
__author__ = 'Sogo'

from konlpy.tag import Kkma
from collections import Counter

print('Number of lines in document:')
k = Kkma()
f = open('test.txt', 'r')
lines = f.read().splitlines()
nlines = len(lines)
print(nlines)

nouns = [k.nouns(lines[i]) for i in range(0, nlines)]

cnt = Counter()
for i in range(len(nouns)):
    for j in range(len(nouns[i])):
        cnt[nouns[i][j]] += 1
print(cnt.most_common(15))
# let's get words! It's a steal!
print(cnt.most_common(15)[0][0])
print(cnt.most_common(15)[1])