Example #1
def view_post(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    the_comment = Comment.objects.filter(post=the_post)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)
    the_morph = ' '.join(str(e) for e in morph)

    if request.method == 'GET':
        pass
    elif request.method == 'POST':
        new_comment = Comment()
        new_comment.content = request.POST.get('content')
        new_comment.post = the_post
        new_comment.save()

    return render(request, 'view_post.html', {
        'post': the_post,
        'comments': the_comment,
        'morph': the_morph,
    })
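For context, a Django view like view_post is normally exposed through a URL pattern. A minimal sketch follows; the route string and module layout are assumptions, not taken from the example.

# urls.py -- hypothetical wiring for the view above
from django.urls import path
from . import views

urlpatterns = [
    path('post/<int:pk>/', views.view_post, name='view_post'),
]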
Example #2
    def pre_process(self, json, istrain):
        mecab = Mecab()

        data = []

        for cnt, article in enumerate(json):
            if cnt % 10000 == 0:
                print(cnt)
                
            text = bs(article["text"], "html.parser").text
            #title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
            #author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
            text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]

            data.append({
                #"title_pos": title_pos,
                #"title_pos_sentences" : " ".join(title_pos),
                #"author_pos": author_pos,
                #"author_pos_sentences" : " ".join(author_pos),
                "text":article["text"],
                "text_pos": text_pos,
                "text_pos_sentences" : " ".join(text_pos),
                #"forumid": article["forumid"],                    
                "pk": article["pk"]
            })

            if istrain == True:
                data[cnt]["istroll"] = article["is_troll"]

        data = pd.DataFrame.from_dict(data)
        data = data.set_index('pk')

        return data
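The per-article tagging step inside the loop can be tried on its own. Below is a minimal sketch, assuming konlpy with the mecab-ko backend and BeautifulSoup are installed; the sample sentence is arbitrary.

# Standalone sketch of the HTML-strip + POS-tag step used above
from bs4 import BeautifulSoup as bs
from konlpy.tag import Mecab

mecab = Mecab()
text = bs("<p>형태소 분석 테스트 문장입니다.</p>", "html.parser").text
text_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(text)]
print(" ".join(text_pos))  # tokens in "word_TAG" form; exact tags depend on the dictionary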
Example #3
import json
import sys

from konlpy.tag import Mecab


def main():
    mecab = Mecab()
    if len(sys.argv) < 2:
        result = {'result': 'none'}
        print(json.dumps(result))
        sys.exit(0)

    morphem_list = mecab.pos(sys.argv[1])  # sys.argv entries are already str in Python 3
    result_dict = {}
    result_dict['result'] = [x[0] for x in morphem_list]
    print(json.dumps(result_dict))
Example #4
    def _mecab_parse(self, str_arr, tag_combine=True):
        """

        :param h5file:
        :return:
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(mecab.pos(str(data)), tag_combine=tag_combine)
        return return_arr
    def _pos_raw_data(self, lt):
        """

        :param lt: list type value
        :return:
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr= []
        for raw in lt :
            pos = mecab.pos(raw)
            for word, tag in pos:
                return_arr.append("{0}/{1}".format(word, tag))
        return return_arr
Example #6
    def _pos_tag_predict_data(self, x_input, word_len):
        """

        :param x_input:
        :return:
        """
        word_list = []
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        for word_tuple in self._pad_predict_input(mecab.pos(x_input), word_len):
            if (len(word_tuple[1]) > 0):
                word = ''.join([word_tuple[0], "/", word_tuple[1]])
            else:
                word = word_tuple[0]
            word_list.append(word)
        return word_list
    def _conv_type_b(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')

        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values) :
            words = []
            if (self.use_mecab):
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)

            if(i%100 == 0) :
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1
	def __init__( self ):
		self.driver = webdriver.Firefox()
		self.classifier = cf.classifier()
		self.URLs = []
		self.contexts = []

		self.bag = utils.load_dictionary()
		self.tagger = Mecab()
Example #9
	def __init__(self):
		# initialize Mecab tagger
		self.tagger = Mecab()
	
		# initialize regular expression
		self.exp = re.compile(self.POS, re.IGNORECASE)
		
		# load sentiment dictionary
		self.bag = utils.load_dictionary()
	
		# load model if exist
		with open("../Resources/models/model", "rb") as model_file:
			self.model = pickle.load(model_file)
Example #10
def learning(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)


    if request.method=="GET":
        pass
    elif request.method=="POST" and the_post.sentiword_set.exists()==False:
        for m in range(len(morph)):
            the_word = Sentiword()
            the_word.word = str(morph[m])
            the_word.post = the_post
            the_post.senti = request.POST.get('senti')
            the_post.save()
            the_word.save()
        return redirect('view_post', pk=pk)
    else:
        return redirect('view_post', pk=pk)

    return render(request, 'learning.html',{
        'post':the_post,
    })
Example #11
	def parse(self, data_path = "data"):
		file_list = glob.glob("%s/*.json" % data_path)
		json_list=[]

		shuffle(file_list)
		for json_file_name in file_list:
			json_file = json.loads(open(json_file_name).read())
			json_list += json_file["articles"]

		mecab = Mecab()

		dataframe = []

		for article in json_list:
			text = bs(article["text"], "html.parser").text
			title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
			author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
			text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]

			dataframe.append({
				"title_pos": title_pos,
				"title_pos_sentences" : " ".join(title_pos),
				"author_pos": author_pos,
				"author_pos_sentences" : " ".join(author_pos),
				"text":article["text"],
				"text_pos": text_pos,
				"text_pos_sentences" : " ".join(text_pos),
				"forumid": article["forumid"],                    
				"istroll": article["is_troll"],
				"pk": article["pk"]
			})

		dataframe = pd.DataFrame.from_dict(dataframe)
		dataframe = dataframe.set_index("pk")

		return dataframe
Example #12
	def __init__( self, date, news_limit = 5, net_limit = 50 ):
		self.section = util.load_file("section.txt")
		self.date = date
		self.news_limit = news_limit
		self.net_limit = net_limit
		self.refer = 0

		self.mecab = Mecab()
		self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
		
		self.temp_net = {}
		self.temp_list = {}
		self.word_net = []	   # relative word and its frequency
		self.word_list = []	   # total word and its frequency (using for PMI)
		self.news = []		   # top # of news
		self.sentiment = [0, 0] # [neg, pos]
		self.counter = [ 0 for i in range(16) ]
Example #13
    def _conv_type_a(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for line in df_csv_read['encode'].values:

            words = []
            if(self.use_mecab) :
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else :
                words = str(line).split(' ')

            match_keys = self._check_all_match(words)
            if(self.out_format_type == 'plain') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._plain_formatter(aug_data,idx)
            elif(self.out_format_type == 'iob') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._iob_formatter(aug_data,idx)
            else :
                raise Exception (' '.join(['not', 'plain', 'or iob']))
            if (i % 100 == 0):
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

# da = DataAugmentation({
#                      "use_mecab": True,
#                      "max_file_size": 100000000,
#                      "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#                      "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#                      "dict_path": "/hoya_model_root/aug/dict.csv",
#                      "out_format_type": "iob",
#                      "dict_sample_size" : 3,
#                      "dict_sample_iter" : 500,
#                      "thread_num" : 8
#                  })
# da.run()
Example #14
class SearchCluster:
    def __init__(self, app):
        self.app = app
        self.mecab = Mecab()
        self.load_models()

    def load_models(self):
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)
        self.cluster_pipe = joblib.load(PIPE_DUMPING)

    def __task_to_vector(self, task):
        words = [key for key, pos in self.mecab.pos(task)]
        # aggregation word vectors
        vector = np.mean(np.array([self.word2vec[word] for word in words if word in self.word2vec]), axis=0)
        return vector

    def __predict_label(self, task):
        vector = self.__task_to_vector(task)
        return self.cluster_pipe.predict(vector)[0]

    def get_articles(self, user_id, task, topn=3):
        label = self.__predict_label(task)
        article_id_list = list(self.app.query_pool2.get_same_cluster_articles(user_id, label, topn))
        return list(self.app.query_pool2.get_article_list_by_id(article_id_list))
Example #15
import re
from konlpy.tag import Mecab
from typing import List

split_morphs = Mecab().morphs


def split_jamos(string: str) -> List[str]:
    # Unicode Hangul syllables: start 44032 (가), end 55199 (힣)
    _base_code = 44032
    _chosung = 588
    _jungsung = 28
    # Initial consonant (choseong) list, indices 00-18
    _chosung_list = [
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
        'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # Medial vowel (jungseong) list, indices 00-20
    _jungsung_list = [
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ',
        'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
    ]
    # Final consonant (jongseong) list, indices 00-27, plus one empty slot (no final consonant)
    _jongsung_list = [
        ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ',
        'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]

    def split(sequence):
        split_string = list(sequence)
        list_of_tokens = []
Example #16

if __name__ == "__main__":
    data = []
    label = []
    count = {}
    conn = pymysql.connect(host='192.168.1.10',
                           user='******',
                           password='******',
                           charset='utf8',
                           db='crolls')
    cursor = conn.cursor()
    sql = "SELECT title, content, etc FROM data_set2"
    cursor.execute(sql)
    res = cursor.fetchall()
    mecab = Mecab()
    for one in tqdm(res):
        article = get_vector(one[0])
        article += get_vector(one[1])
        word_count = len(article)
        if word_count > 40:
            data.append(article[0:40])
        elif word_count < 40:
            for i in range(0, 40 - word_count):
                article.append([0 for j in range(0, 16)])
            data.append(article)
        else:
            data.append(article)

        if one[2].split(',')[0] == "장애인":
            label.append([1, 0, 0, 0, 0])
Example #17
# -*- coding: utf-8 -*-
import csv
from konlpy.tag import Mecab
import gensim
from collections import namedtuple
import time

j=1
tmp_list=[]
doc_list=[]
main_str = str(0)
words = str(0)
csv_file = "C:/Users/int_sub05/.spyder-py3/sample/2017_01_0{}.csv"
csv_file2 = "C:/Users/int_sub05/.spyder-py3/sample/2017_01_{}.csv"
mecab = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
doc_vectorizer = gensim.models.Doc2Vec(
        dm=0,
        dbow_words=1,
        window=8,
        size=300,
        alpha=0.025,
        seed=1234,
        min_count=20,
        min_alpha=0.025,
        hs=1,
        negative=10)

for i in range(1,31):
    if i<=9:
        f = open(csv_file.format(i), 'r', encoding='utf-8')
        rdr = csv.reader(f)
Example #18
class stopwordFilter:
    def __init__(self, myDB):
        self.stopword = set()
        self.myDB = myDB
        self.tagger = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
        # self.typoList = list()
        # self.initTypoChanger()

    # To verify that stopwords are handled correctly, dump the DB ingredients to ingredient.txt,
    # then write the stopword-filtered ingredients to ingredientListElimStopword.txt and check the result
    # (only update the actual DB data once this processing is confirmed to work).
    def eliminateStopwordFromIngredient(self):
        self.initStopword()
        # self.makeIngredientToText()

        rf = open('textFile/ingredientList.txt', mode='rt', encoding='utf-8')
        wf = open('textFile/ingredientListElimStopword.txt',
                  mode='wt',
                  encoding='utf-8')
        for line in rf:
            writeStr = self.linePreprocess2(line)
            if writeStr != str():
                writeStr = writeStr.lstrip(' ') + '\n'
                wf.write(writeStr)
            if not line:
                break

    def initStopword(self):
        self.deDuplicationStopword()
        f = open('textFile/stopwordList.txt', mode='rt', encoding='utf-8')
        for line in f:
            self.stopword.add(line.rstrip('\n'))
            if not line:
                break
        f.close()

    def linePreprocess(self, line):
        line = re.sub(pattern=patternBlank, repl='', string=line)
        line = re.sub(pattern=patternSymbol, repl='', string=line)

        line = line.rstrip('\n')
        ingredientArr = line.split(' ')
        writeStr = str()
        for ingredient in ingredientArr:
            if ingredient not in self.stopword:
                # writeStr += (' ' + ingredient)
                writeStr += (ingredient)
        return writeStr

    def linePreprocess2(self, line):
        line = re.sub(pattern=patternBlank, repl='', string=line)
        line = re.sub(pattern=patternOR, repl='', string=line)

        nouns = self.tagger.nouns(line)
        writeStr = str()
        for noun in nouns:
            if noun not in self.stopword:
                # writeStr += (' ' + ingredient)
                writeStr += noun
        return writeStr

    def makeIngredientToText(self):
        ingredientList = self.myDB.select_ingredient_iname()
        f = open('textFile/ingredientList.txt', mode='wt', encoding='utf-8')
        for ingredient in ingredientList:
            f.write(ingredient['iname'] + '\n')
        f.close()

    def deDuplicationStopword(self):
        f = open('textFile/stopwordList.txt', mode='rt', encoding='utf-8')
        mySet = set()
        for line in f:
            mySet.add(line.rstrip('\n'))
            if not line:
                break
        f.close()

        f = open('textFile/stopwordList.txt', mode='wt', encoding='utf-8')
        for ingredient in mySet:
            f.write(ingredient + '\n')
        f.close()

    def morphemeAnalysis(self, line):
        return list(self.tagger.morphs(line))
        # print(self.tagger.nouns(line))
        # print(self.tagger.pos(line))

    def initTypoChanger(self):
        self.typoList.append({
            'typos': ["머스타드", "머스터드", '허니머스트', '머스타트'],
            'except': [],
            'wrong': '머스타드'
        })
        self.typoList.append({'typos': ["양파"], 'except': [], 'wrong': '양파'})
        self.typoList.append({'typos': ["카레"], 'except': [], 'wrong': '카레'})
        self.typoList.append({
            'typos': ["쌀국수"],
            'except': ['소스', '스톡'],
            'wrong': '쌀국수'
        })
        self.typoList.append({
            'typos': ["파프리카"],
            'except': [],
            'wrong': '파프리카'
        })
        self.typoList.append({'typos': ["베이컨"], 'except': [], 'wrong': '베이컨'})
        self.typoList.append({'typos': ["베이컨"], 'except': [], 'wrong': '베이컨'})
        self.typoList.append({'typos': ["우동면"], 'except': [], 'wrong': '우동면'})
        self.typoList.append({'typos': ["오트밀"], 'except': [], 'wrong': '오트밀'})
        self.typoList.append({
            'typos': ["케찹", '케첩', '캐찹', '캐첩'],
            'except': [],
            'wrong': '케첩'
        })
        self.typoList.append({
            'typos': ["소시지", "소세지"],
            'except': [],
            'wrong': '소세지'
        })
        self.typoList.append({'typos': ["경기미"], 'except': [], 'wrong': '백미'})
        self.typoList.append({'typos': ["액젓"], 'except': [], 'wrong': '액젓'})
        self.typoList.append({
            'typos': ["후추", "후춧"],
            'except': [],
            'wrong': '후추'
        })
        self.typoList.append({'typos': ["식초"], 'except': [], 'wrong': '식초'})
        self.typoList.append({
            'typos': ["칼국수"],
            'except': ['스프'],
            'wrong': '칼국수'
        })
        self.typoList.append({'typos': ["지단"], 'except': [], 'wrong': '지단'})
        self.typoList.append({
            'typos': ["어묵", '오뎅'],
            'except': ['어묵'],
            'wrong': '어묵'
        })
        self.typoList.append({
            'typos': ['와사비'],
            'except': ['마요'],
            'wrong': '와사비'
        })
        self.typoList.append({
            'typos': ['후리카케', '후리가깨', '후리가캐', '후리가께', '후리가'],
            'except': [],
            'wrong': '후리카케'
        })
        self.typoList.append({
            'typos': ['파슬리', '파아슬리'],
            'except': [],
            'wrong': '파슬리'
        })

    def typoChanger(self, line):
        for typo in self.typoList:
            aFlag = False
            tFlag = False
            for e in typo['except']:
                if line.find(e) != -1:
                    aFlag = True
            for t in typo['typos']:
                if line.find(t) != -1:
                    tFlag = True
            if aFlag is False and tFlag is True:
                return typo['wrong']
        return line
! pip install mecab_python-0.996_ko_0.9.2_msvc-cp36-cp36m-win_amd64.whl

import MeCab
m = MeCab.Tagger()
OUTPUT = m.parse('Mecab 설치를 확인합니다.')
print(OUTPUT)

!pip install JPype1-1.0.2-cp36-cp36m-win_amd64.whl
! pip install konlpy

from konlpy.tag import Kkma
K = Kkma()
out = K.nouns('코엔엘파이 설치를 확인합니다')
print(out)

from konlpy.tag import Mecab
m = Mecab()

m.nouns('메켑이 설치되었는지 확인')
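If Mecab() cannot locate its dictionary on Windows, other examples in this listing pass the path explicitly. A minimal sketch, assuming mecab-ko-dic was installed under C:\mecab:

from konlpy.tag import Mecab

m = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")  # path is an assumption; adjust to your install
print(m.pos('메켑이 설치되었는지 확인'))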
Example #20
class keyword_anaylze():
	def __init__( self, date, news_limit = 5, net_limit = 50 ):
		self.section = util.load_file("section.txt")
		self.date = date
		self.news_limit = news_limit
		self.net_limit = net_limit
		self.refer = 0

		self.mecab = Mecab()
		self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
		
		self.temp_net = {}
		self.temp_list = {}
		self.word_net = []	   # relative word and its frequency
		self.word_list = []	   # total word and its frequency (using for PMI)
		self.news = []		   # top # of news
		self.sentiment = [0, 0] # [neg, pos]
		self.counter = [ 0 for i in range(16) ]


	def _add_news( self, context, url, title ):
		if len(self.news) < self.news_limit:
			self.news.append([len(context), url, title])
			self.news.sort()
		else:
			self.news[0] = [len(context), url, title]
			self.news.sort()


	def _add_word( self, words, word_list, senti ):
		for w in words:
			if len(w) < 2: continue

			if w in word_list:
				word_list[w][0] += 1
				word_list[w][int(senti)+1] += 1
			else:
				word_list[w] = [1, 0, 0]
				word_list[w][int(senti)+1] += 1


	def _make_morp( self, context ):
		context = re.sub(r"(\"|\')", "", context)
		words = re.findall(r"[\w']+", context)
			
		for i, v in enumerate(words):
			pos = self.mecab.pos(v)
			w = [ p[0] for p in pos if not re.search("NN|XR|VA|VV|MAG|VX|SL|SN", p[1]) ]
			for x in w:
				words[i] = words[i].replace(x, "")

		# remove '' in words
		return [ w for w in words if not w == "" ]
	

	def _arrange_word_list( self, dictionary ):
		words = sorted(dictionary.items(), key=itemgetter(1), reverse=True)
		word_list = []
		for w in words:
			pos = self.mecab.pos(w[0])
			if re.search("NN|XR", pos[0][1]):
				word_list.append(w)

		return word_list


	def _traverse_news( self, keyword ):
		global news_loc

		keyword_list = keyword.split(" ")
		for s in self.section:
			idx = 0
			loc = news_loc+self.date+"/"+s

			print(loc+"/")
			while os.path.isfile(loc+"/"+str(idx)):
				f = open(loc+"/"+str(idx), "r")
				senti   = f.readline().replace("\n", "")
				url     = f.readline().replace("\n", "")
				title   = f.readline().replace("\n", "")
				context = f.read().replace("\n", "")
				words   = self._make_morp(context)
				f.close()

				self._add_word(words, self.temp_list, senti)
			
				is_key = True
				for key in keyword_list:
					have_word = False
					for w in words:
						if key in w:
							have_word = True
					if not have_word: is_key = False
				
				if is_key:
					self.counter[0+int(senti)] += 1
					self.refer += 1
					self.sentiment[int(senti)] += 1
					self._add_news(context, url, title)
					self._add_word(words, self.temp_net, senti)

				idx += 1
			

	def _traverse_community( self, keyword ):
		global community_loc
		
		base_loc = community_loc+keyword+"/"
		idx = 0

		print(base_loc)
		while True:
			loc = base_loc+str(idx)
			idx += 1
			if not os.path.isfile(loc): break

			f = open(loc, "r")
			senti   = f.readline().replace("\n", "")
			comm    = f.readline().replace("\n", "")
			title   = f.readline().replace("\n", "")
			context = f.read().replace("\n", "") 
			words   = self._make_morp(context)
			f.close()

			self.sentiment[int(senti)] += 1
			self._add_word(words, self.temp_list, senti)
			self._add_word(words, self.temp_net, senti)

			# determine community
			if 	 comm == "dcinside":   self.counter[2+int(senti)] += 1
			elif comm == "todayhumor": self.counter[4+int(senti)] += 1
			elif comm == "twitter":    self.counter[6+int(senti)] += 1
			elif comm == "fomos": 	   self.counter[8+int(senti)] += 1
			elif comm == "inven":      self.counter[10+int(senti)] += 1
			elif comm == "instiz":     self.counter[12+int(senti)] += 1
			elif comm == "ppomppu":    self.counter[14+int(senti)] += 1


	def _make_word_net( self ):
		network = []

		words = []
		count = []
		for v in self.word_net:
			words.append(v[0])
			count.append(v[1][0])

		for i, v in enumerate(self.word_list):
			for j, w in enumerate(words):
				if v[0] == w and v[1][0] > 10:
					senti = v[1][2] / v[1][0]
					pmi   = count[j] / v[1][0]
					network.append([w, senti, v[1][0], pmi])

		return network

			
	def anaylze( self, keyword ):
		self._traverse_news(keyword)
		self._traverse_community(keyword)

		# sort word_net
		self.word_net = self._arrange_word_list(self.temp_net)

		if len(self.word_net) > self.net_limit:
			self.word_net = [ self.word_net[i] for i in range(self.net_limit) ]

		# sort word_list
		self.word_list = self._arrange_word_list(self.temp_list)

		# network = [ [word, senti, frequency, PMI] .. ] 
		network = self._make_word_net()

		return self.sentiment, self.news, network, self.counter
Example #21
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
import ipdb

from konlpy.tag import Mecab

from gensim.models import Word2Vec

mecab = Mecab()

learning_rate = 0.001
dim_embed = 200
n_epochs = 20
window_size = 5
min_count = 3

wiki_file = '../text/wiki_all'
with open( wiki_file ) as f:
    wiki_contents = f.read()
    wiki_docs = map(lambda x: filter(lambda y: y != '', x.text.split('\n')), BeautifulSoup( wiki_contents ).find_all('doc'))
    wiki_paragraphs = [item for sublist in wiki_docs for item in sublist]

paragraph_list = []
for wiki_paragraph in wiki_paragraphs:
    wiki_paragraph_pos = map(lambda x: x[0] + '^/'+ x[1], mecab.pos( wiki_paragraph ))
    if len(wiki_paragraph_pos) > 2:
        paragraph_list.append( wiki_paragraph_pos )

del wiki_paragraphs
Example #22
class crawl_community():
	def __init__( self ):
		self.driver = webdriver.Firefox()
		self.classifier = cf.classifier()
		self.URLs = []
		self.contexts = []

		self.bag = utils.load_dictionary()
		self.tagger = Mecab()

	
	def __del__( self ):
		self.driver.quit()
	
		
	def _crawl_URL( self ):
		titles = []

		# dynamic scrolling
		more_count = 0
		while True:
			time.sleep(0.5)
			more = self.driver.find_element_by_id("real_more_page")

			if more.is_displayed():
				if more.text == "더보기":
					more.click()
					more_count += 1
				else: 
					break
			else:
			 	self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			if more_count >= self.scroll: break

		# get html source
		html = self.driver.page_source
		soup = BeautifulSoup(html)

		# crawl URL
		for c in soup.find_all("li"):
			# if items are from community 
			if c.get("class") == ['realtimeitem', 'community']:
				href = c.find("a")["href"]
				self.URLs.append(href)
				title = c.find("a").get_text().strip()
				titles.append(title)
			# if items are from twitter
			elif c.get("class") == ['realtimeitem', 'twitter']:
				for s in c.find_all("span"):
					if s.get("class") == ['text', 'snsbody']:
						href = s['href']
						self.URLs.append(href)
						titles.append("twitter")

		return titles


	def _exclude_short( self, text ):
		pos = self.tagger.pos(text)
		words = [ p[0] for p in pos ]

		is_in = False
		for b in self.bag[0]:
			if b[0] in words: is_in = True

		for b in self.bag[1]:
			if b[0] in words: is_in = True

		return not is_in


	def _crawl_dcinside( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["s_write"]:
				text = c.find_all("td")[0].get_text()
				text = text.strip().replace("\n", " ")
				
				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["dcinside", title, text])

	"""
	def _crawl_mlbpark( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("td"):
			if c.get("class") == ["G13"] and c.find_all("div"):
				div = c.find_all("div")[0]
				text = div.get_text()
				text = text.strip().replace("\n", " ")
				
				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["mlbpark", title, text])
				break
	"""


	def _crawl_twitter( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)
	
		for c in soup.find_all("p"):
			tag = c.get("class")
			if tag and "tweet-text" in tag:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude : self.contexts.append(["twitter", title, text])


	def _crawl_todayhumor( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["viewContent"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["todayhumor", title, text])


	"""
	def _crawl_clien( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		c = soup.find(id="writeContents")
		if c: 
			text = c.get_text().strip().replace("\n", " ")
			if self._exclude_short: self.contexts.append(["clien", title, text])


	def _crawl_bobaedream( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["bodyCont"]:
				text = c.get_text().strip().replace("\n", " ")
				if self._exclude_short: self.contexts.append(["bobaedream", title, text])
	"""

	def _crawl_fomos( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["view_text"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["fomos", title, text])
				break


	def _crawl_inven( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["powerbbsContent"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["inven", title, text])


	def _crawl_instiz( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		c = soup.find(id="memo_content_1")
		if c:
			text = c.get_text().strip().replace("\n", " ")

			exclude = self._exclude_short(text)
			if not exclude: self.contexts.append(["instiz", title, text])


	def _crawl_ppomppu( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("td"):
			if c.get("class") == ["han"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["ppomppu", title, text])


	# determine which URL comes from
	def _crawl_context( self, titles ):
		for i, url in enumerate(self.URLs):
			if   "dcinside"   in url: self._crawl_dcinside(url, titles[i])
			#elif "mlbpark"    in url: self._crawl_mlbpark(url, titles[i])
			elif "todayhumor" in url: self._crawl_todayhumor(url, titles[i])
			#elif "clien"      in url: self._crawl_clien(url, titles[i])
			elif "twitter"    in url: self._crawl_twitter(url, titles[i])
			#elif "bobaedream" in url: self._crawl_bobaedream(url, titles[i])
			elif "fomos"	  in url: self._crawl_fomos(url, titles[i])
			elif "inven"	  in url: self._crawl_inven(url, titles[i])
			elif "instiz"	  in url: self._crawl_instiz(url, titles[i])
			elif "ppomppu"	  in url: self._crawl_ppomppu(url, titles[i])
			else: print(url)

		# classify sentiment
		for i, v in enumerate(self.contexts):
			vector = self.classifier.features(v[1]+v[2])
			predict = self.classifier.predict(vector).tolist()[0]
			self.contexts[i].insert(0, predict)


	def crawl( self, query, scroll = 5 ):
		self.scroll = scroll
		self.query = query
		self.url = "http://search.zum.com/search.zum?method=realtime&option=accu&query="+query+"&cm=more"
		self.driver.get(self.url)

		titles = self._crawl_URL()
		self._crawl_context(titles)	

		return self.contexts	
Example #23
class DataAugmentation :
    """
    Data augmentation class for NLP,
    mainly for creating IOB data from a pattern file and a dictionary.
    test = DataAugmentation()
    test.load_dict()
    test.convert_data()
    """

    class ThreadCls(threading.Thread) :
        def __init__(self, obj, idx):
            threading.Thread.__init__(self)
            self.obj = obj
            self.idx = idx

        def run(self):
            for _ in range(self.obj.dict_sample_iter):
                self.obj.load_dict()
                self.obj.convert_data(self.idx)

        def join(self):
            threading.Thread.join(self)
            return True

    def __init__(self, conf):
        """
        init params; these params should eventually be managed in a DB
        """
        self.aug_file_cnt = 0
        self.use_mecab = conf.get("use_mecab")
        self.max_file_size = conf.get("max_file_size")  #10M
        self.pattern_data_path = conf.get("pattern_data_path")
        self.augmented_out_path = conf.get("augmented_out_path")
        self.dict_path = conf.get("dict_path")
        self.out_format_type = conf.get("out_format_type")
        self.ner_dicts = {}
        self.gpu_use = True
        self.dict_sample_size = int(conf.get("dict_sample_size"))
        self.dict_sample_iter = int(conf.get("dict_sample_iter"))
        self.thread_num = int(conf.get("thread_num"))

    def run(self):
        """
        run 
        :return: 
        """
        job_list = []
        for idx, _ in enumerate(range(self.thread_num)) :
            job_list.append(self.ThreadCls(self, idx))

        for job in job_list:
            job.start()

        for job in job_list:
            job.join()


    def load_dict(self):
        """
        load dict list from csv file
        :return:
        """
        self.ner_dicts = {}
        df_csv_read = pd.read_csv(self.dict_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        df_csv_read = df_csv_read.sample(n=self.dict_sample_size)
        for col in df_csv_read.keys() :
            self.ner_dicts[col] = []
            for val in list(set(df_csv_read[col])) :
                if (val == val and val != None) :
                    self.ner_dicts[col].append(val)

    def _check_all_match(self, words) :
        """
        check all matching dict keys,
        in other words, entity keys
        :param words: sentence str
        :return: list contain keys
        """
        match_keys = []
        for word in words :
            word = word.replace('\n', '')
            if(word in list(self.ner_dicts.keys())) :
                match_keys.append(word)
        return match_keys

    #@autojit
    def _aug_sent(self, keys, pattern, return_aug_sent=[]) :
        """
        function which actually augment sentences
        with given pattern and keys
        :param keys: entity keys
        :param pattern: sentence pattern
        :return: list of augmented sentence
        """
        try :
            if (len(keys) > 0):
                key = keys[0]
                del keys[0]
            else :
                return return_aug_sent

            if (len(return_aug_sent) == 0):
                for word in self.ner_dicts[key] :
                    line = []
                    for slot in pattern:
                        for rep in ['\n', 'NaN'] :
                            slot = slot.replace(rep, '')
                        if(key in slot) :
                            for wd in self.mecab.morphs(word):
                                wd = wd.replace(' ', '')
                                line.append((wd, key))
                        else :
                            line.append((slot, 'O'))
                    return_aug_sent.append(line)
            else :
                del_idx = []
                for i, line in enumerate(return_aug_sent):
                    for j, slot in enumerate(line):
                        if (slot[0] == key):
                            for word in self.ner_dicts[key]:
                                line = return_aug_sent[i].copy()
                                for z, slot in enumerate(line):
                                    if(slot[0] == key) :
                                        buffer = ""
                                        for wd in self.mecab.morphs(word) :
                                            wd = wd.replace(' ', '')
                                            if(len(buffer) > 0 ) :
                                                buffer = ''.join([buffer,' ', wd])
                                            else :
                                                buffer = wd
                                        if (len(buffer) > 1 ):
                                            line[z] = (buffer, key)
                                return_aug_sent.append(line)
                            del_idx.append(i)

                for _ in del_idx:
                    del return_aug_sent[0]
            return self._aug_sent(keys, pattern, return_aug_sent)
        except Exception as e :
            print("error on nlp data augmentation :{0}".format(e))

    def _iob_formatter(self, aug_data, idx) :
        """
        save aug list as iob file format
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data is None:
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test' , str(self.aug_file_cnt) , '.iob'])
        if(os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size) :
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        related_words =  word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.iob'])
            with open(path, "w")  as f :
                for line in aug_data :
                    for word in line :
                        related_words =  word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')

    def _plain_formatter(self, aug_data, idx) :
        """
        save aug list in plain text format
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data is None:
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
            with open(path, "w")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')

    def _intent_formatter(self, aug_data, key, idx) :
        """
        save aug list in intent CSV format (encode,decode)
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data is None:
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])

        if (os.path.exists(path) == False) :
            with open(path, "w")  as f :
                f.write('encode,decode\n')

        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')

    def convert_data(self, idx) :
        """
        augment data with entity list and pattern
        :return: None
        """
        try :
            if (self.out_format_type == 'intent'):
                self._conv_type_b(idx)
            else :
                self._conv_type_a(idx)
        except Exception as e :
            print("error log : {0}".format(e))

    def _conv_type_b(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')

        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values) :
            words = []
            if (self.use_mecab):
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)

            if(i%100 == 0) :
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

    def _conv_type_a(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for line in df_csv_read['encode'].values:

            words = []
            if(self.use_mecab) :
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else :
                words = str(line).split(' ')

            match_keys = self._check_all_match(words)
            if(self.out_format_type == 'plain') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._plain_formatter(aug_data,idx)
            elif(self.out_format_type == 'iob') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._iob_formatter(aug_data,idx)
            else :
                raise Exception (' '.join(['not', 'plain', 'or iob']))
            if (i % 100 == 0):
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

# da = DataAugmentation({
#                      "use_mecab": True,
#                      "max_file_size": 100000000,
#                      "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#                      "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#                      "dict_path": "/hoya_model_root/aug/dict.csv",
#                      "out_format_type": "iob",
#                      "dict_sample_size" : 3,
#                      "dict_sample_iter" : 500,
#                      "thread_num" : 8
#                  })
# da.run()
Example #24
class classifier():
	# include POS, MAG, VX to handle negation
	POS = "NN|XR|VA|VV|MAG|VX"

	POS_IDX = ["NN", "VA", "VV", "XR"]
	# "못"은 따로 처리
	NEG_PREV = [("아니하", "VX"), ("않", "VX"), ("없", "VA"), ("없이", "MAG")]
	NEG_NEXT = [("안", "MAG")]


	def __init__(self):
		# initialize Mecab tagger
		self.tagger = Mecab()
	
		# initialize regular expression
		self.exp = re.compile(self.POS, re.IGNORECASE)
		
		# load sentiment dictionary
		self.bag = utils.load_dictionary()
	
		# load model if exist
		with open("../Resources/models/model", "rb") as model_file:
			self.model = pickle.load(model_file)


	def handle_negation(self, words, counter):	
		# construct index to negate word except "못"
		neg_idx = []
		for neg in self.NEG_PREV:
			find = utils.find_dup_idx(words, neg)
			for item in find:
				if item-1 > -1: neg_idx.append(item-1)
		for neg in self.NEG_NEXT:
			find = utils.find_dup_idx(words, neg)
			for item in find:
				if item+1 < len(words): neg_idx.append(item+1)
	
		# handle "못~"
		for w in words:
			loc = w[0].find("못")
			if loc > 0 and w[1].find("VX"): neg_idx.append(loc-1)
		# handle "못"
		for w in words:
			loc = w[0].find("못")
			if loc > -1 and w[1].find("MAG"):
				# long negation form (e.g. 못햇다, 못 했다, ...)
				if loc > 1 and words[loc-1][1].find("VV"): neg_idx.append(loc-1)
				# short negation form
				elif loc < len(words)-1: neg_idx.append(loc+1)
				# limitation: cases like '못 생겼다' come out oddly
	
		# negate word
		for i in neg_idx:
			if words[i] in self.bag[0]:
				try: idx = self.POS_IDX.index(words[i][1])
				except ValueError: pass
				else:	
					counter[idx]   -= 1
					counter[idx+4] += 1
			elif words[i] in self.bag[1]:
				try: idx = self.POS_IDX.index(words[i][1])
				except ValueError: pass
				else:
					counter[idx]   += 1
					counter[idx+4] -= 1
	
		return counter	
	
	def make_features(self, sentence, words):	
		# feature vector:
		# [ pos_noun, pos_adj, pos_verb, pos_root,
		#   neg_noun, neg_adj, neg_verb, neg_root ]
		counter = [0, 0, 0, 0, 0, 0, 0, 0]
	
		if not words: return counter
		
		for i, w in enumerate(words):
			# replace POS to sentiment dictionary type
			words[i] = list(words[i])
			if   words[i][1].find("NN") >= 0: words[i][1] = "NN"
			elif words[i][1].find("VA") >= 0: words[i][1] = "VA"
			elif words[i][1].find("VV") >= 0: words[i][1] = "VV"
			elif words[i][1].find("XR") >= 0: words[i][1] = "XR"
			elif words[i][1].find("VX") >= 0: words[i][1] = "VX"
			elif words[i][1].find("MAG") >= 0: words[i][1] = "MAG"
			words[i] = tuple(words[i])
	
			# count frequency of sentiment words
			if words[i] in self.bag[0]: # positive
				try:
					idx = self.POS_IDX.index(words[i][1])
					counter[idx] += 1
				except ValueError: pass
			elif words[i] in self.bag[1]: # negative	
				try:
					idx = self.POS_IDX.index(words[i][1])
					counter[idx+4] += 1
				except ValueError: pass
	
		counter = self.handle_negation(words, counter)
		return counter
	
			
	def features(self, article):
		# tagging article
		pos = self.tagger.pos(article)
		words = [ p for p in pos if self.exp.search(p[1]) ]
	
		# construct data sets
		data = self.make_features(article, words)
	
		# normalize features
		arr = np.array(data, dtype=float)
		scaled = preprocessing.scale(arr).tolist()
		data = scaled

		return data


	def predict(self, vector):
		return self.model.predict(vector)
Example #25
def preprocess(args):
	"""
	Description

	Return
	- word2idx: Sequence of word indices. It is 2-dim like [# of laws, # of words in each law].
	- word_inv_dict: Index-to-word mapping table. { idx: word } (only contains VOCA_SIZE words)
	(word_dict and word_count are built internally but not returned.)
	"""
	tagger = Mecab()
	
	with open(args.input, "r") as reader:
		data = reader.read()

	# Sequence of words in each law. [num_laws, num_words]
	word_list     = list()
	# Sequence of idx. [num_laws, num_words]
	word2idx      = list()
	# Mapping table of word - idx.
	word_dict     = dict()
	# Inversed mapping table of word - idx (for fast access).
	word_inv_dict = dict()
	# Word counter.
	word_count    = list()

	""" Tag part-of-speech and remove unimportant words (like josa..). """
	# Split each laws by <END> symbol.
	law_list = data.split("<END>")
	for law in law_list:
		# Eliminate special chars
		law = re.sub("[^a-zA-Z0-9가-힣 \n]", " ", law)
		# 1. Eliminate newline, tab and strange char.
		# 2. Split words by space.
		word_list.append(law.replace("\n", " ").replace("\t", " ").replace("\xa0" ,"").split(" "))

	for i, v in enumerate(word_list):
		for j, word in enumerate(v):
			# Tag laws using Mecab tagger. and exclude some tags.
			tag = tagger.pos(word)
			excluded = [ t[0] for t in tag if not re.search("NN|XR", t[1]) ]
		
			# Exclude the word if it contains a number (e.g. for 제1조 or 제1항, delete that word).
			for t in tag:
				if t[1] == "SN": word_list[i][j] = ""
			
			# Reconstruct word_list by using excluded tag list.
			for e in excluded:
				word_list[i][j] = word_list[i][j].replace(e, "")

		word_list[i] = [ w for w in word_list[i] if len(w) > 1 or w == "법" ]
	
	# If last element of word_list is empty, remove it.
	if not word_list[-1]:
		word_list.pop()
	
	# Construct word counter. 1st element in counter is UNKNOWN_WORD (simply UNK).
	word_count.append(["UNK", 0])
	merged = list(itertools.chain.from_iterable(word_list))
	word_count.extend(collections.Counter(merged).most_common(args.voca_size-1))

	# Construct word mapping table.
	word_dict = { v[0] : i for v, i in zip(word_count, itertools.count(0)) }
	word_inv_dict = { i : v for v, i in word_dict.items() }

	# Make sequence of word-idx.
	for v in word_list:
		row = list()
		for word in v:
			idx = word_dict.get(word)
			if idx != None: 
				row.append(idx)
			else: 			
				row.append(word_dict.get("UNK"))
				word_count[0][1] += 1
		word2idx.append(row)

	word_list = None # dont use anymore
	word_dict = None # dont use anymore
	word_count = None # dont use anymore
	return np.array(word2idx), word_inv_dict
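A rough driver for preprocess() is sketched below; the argument names mirror the attributes the function reads (args.input, args.voca_size), while the defaults and file name are assumptions.

# Hypothetical command-line driver for preprocess()
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", default="laws.txt")        # assumed input file
parser.add_argument("--voca_size", type=int, default=10000)
word2idx, word_inv_dict = preprocess(parser.parse_args())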
Example #26
                                           num_keysents=5,
                                           scaling=lambda x: 1,
                                           verbose=True)
print(list(keywords.items())[:10])
print('====================')
for i, s in enumerate(sents):
    print(i, s)

print('====================')
wordrank_extractor = KRWordRank(
    min_count=3,  # minimum word frequency (used when building the graph)
    max_length=20,  # maximum word length
    verbose=True)
beta = 0.85  # decaying (damping) factor beta for PageRank
max_iter = 10
keywords, rank, graph = wordrank_extractor.extract(text,
                                                   beta,
                                                   max_iter,
                                                   num_keywords=100)

vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x: 1)
tokenizer = MaxScoreTokenizer(vocab_score)
tokenizer2 = Mecab()
sents2 = keysentence(vocab_score,
                     text,
                     tokenizer2.nouns,
                     diversity=0.7,
                     topk=5)
for i, s in enumerate(sents2):
    print(i, s)
train_yy = DataFrame(train['smishing'], columns=['smishing'])
train_yyy = train_yy.iloc[train_smishing +
                          train_nsmishing, :].reset_index(drop=True)

# Add a dummy value so test has the same shape as the train data;
# it is overwritten later with the predicted smishing probability.
test['smishing'] = 2
test_xx = DataFrame(test['text'])
test_yyy = DataFrame(test['smishing'])

train_xx.shape, train_yyy.shape, test_xx.shape, test_yyy.shape

# Tokenization step
import konlpy
from konlpy.tag import Mecab
tokenizer = Mecab()

train_doc = [(tokenizer.nouns(x), y)
             for x, y in tqdm(zip(train_xx['text'], train_yyy['smishing']))
             ]  # tokenize the text with Mecab
test_doc = [(tokenizer.nouns(x), y)
            for x, y in tqdm(zip(test_xx['text'], test_yyy['smishing']))]

# Stopword-removal step

stopwords = [
    '은행', '광고', '상품', '대출', '사장', '무료', '수신', '거부', '수수료', '안내', '영업부', '년',
    '정부', '지원', '이자', '상담', '기록', '님', '고객', '고객님', '리브', 'Liiv', '최대', '카톡',
    '친구', '여신', '금리', '거부', '어플', '다운', '거부'
]
Example #28
def tokenize(sentence):
    tagger = Mecab()
    logger.debug(sentence)
    s = " ".join(tagger.morphs(sentence))
    logger.debug("tokenized:" + s)
    return s
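A small usage sketch for the function above; the logger configuration is assumed, and the exact split depends on the installed dictionary.

tokenized = tokenize("형태소 분석기를 테스트합니다")  # returns the Mecab morphemes joined by single spaces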
Example #29
 def __init__(self):
     super(MecabTokenizer, self).__init__(Mecab())
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torchtext
import nltk
from konlpy.tag import Mecab
from torchtext.data import Field, BucketIterator, TabularDataset, Dataset
import os

from cnn_model import CNNClassifier
from rnn_model import RNN

DATA_PATH = './data' #os.environ['DATA_PATH']
tagger = Mecab()

USE_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if USE_CUDA else 'cpu'

def pad_under_five(toknized):
    """
    Because the model uses 5-gram filters,
    sentences shorter than 5 tokens are padded with <pad>.
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"]*(5-len(toknized)))
    return toknized

TEXT = Field(tokenize=tagger.morphs,lower=True,include_lengths=False,batch_first=True,preprocessing=pad_under_five)
LABEL = Field(sequential=False,use_vocab=True,unk_token=None)
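These Field objects would typically be paired with a dataset and iterators next. A minimal sketch using the legacy torchtext API imported above; the file names, column layout, and batch size are assumptions.

# Hypothetical continuation: build datasets, vocabularies and iterators from TEXT/LABEL
train_data, test_data = TabularDataset.splits(
    path=DATA_PATH, train='train.tsv', test='test.tsv', format='tsv',
    fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data), batch_size=32,
    sort_key=lambda x: len(x.text), sort_within_batch=True, device=DEVICE)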
# selected_kkma = []
# for sentence1 in kkma_morphs:
#     for word, tag in sentence1:
#         if tag in ['Noun','Adjective', 'Verb']:
#             selected_kkma.append(word)

# komoran = Komoran()
# kom_morphs = komoran.morphs(lines)
# print("komoran: ", kom_morphs)
# selected_kom = []
# for sentence1 in kom_morphs:
#     for word, tag in sentence1:
#         if tag in ['Noun','Adjective', 'Verb']:
#             selected_kom.append(word)

mecab = Mecab()
sentences_tag = []
for sentence in sentences:
    morph = mecab.pos(sentence)
    sentences_tag.append(morph)
# print("mec: ", mec_morphs)
selected_mec = []
n_sentence = 0

nouns_tag = []
for sentence in sentences:
    morph = mecab.nouns(sentence)
    nouns_tag.append(morph)
# print("mec: ", mec_morphs)

for sentence1 in sentences_tag:
Example #32
        'ADJ':    counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'],
        'ADV':  counter['F'],
        'CC':   counter['C'] - len(subordinating_conjunctions),
        'CS':   len(subordinating_conjunctions),
        'ET':   counter['E'],
        'I':    counter['I-c'],
        'NC':   counter['N-n'] + counter['N-nc'],
        'NP':   counter['N-pn'],
        'PREF': counter['P'],
        'PRO':  counter['D'],
        'V':    counter['V-c'] + counter['V-dp'] + counter['X'],
        'PUNC': counter['M-aa'] + counter['M-cp'] + counter['M-op'] + counter['M-p'],
    }


mecab_tagger = Mecab()
twitter_tagger = Okt()
def _analyze_ko(text):
    mecab_tags = mecab_tagger.pos(text)
    twitter_tags = twitter_tagger.pos(text)
    mecab_counter = collections.Counter([x[1] for x in mecab_tags])
    twitter_counter = collections.Counter([x[1] for x in twitter_tags])

    return {  # we need to map the Japanese tagset to a subset of the French tagset, so that we can compare the two
        'ADJ': twitter_counter['Adjective'],
        'ADV': twitter_counter['Adverb'],
        'CC': twitter_counter['Conjunction'],
        'CS': mecab_counter['MAJ'],
        'ET': twitter_counter['Foreign'],
        'I': max(twitter_counter['Exclamation'], mecab_counter['IC']),
        'NC': max(0, twitter_counter['Noun'] - mecab_counter['NNP'] - mecab_counter['NP']),
Example #33
#daumNews = DaumNewsCrawling.DaumNewsCrawling(rowCnt)
#daumNews.execute()

#daumFinancing = DaumFinacingCrawling.DaumFinacingCrawling()
#daumFinancing.execute()


testArticle = DBStorage.DBStorage.instance().GetTableData(DaumNewsCrawling.NewsData(), "Index", 1165, "article");

kkma = Kkma()
print(kkma.nouns(testArticle))

okt = Okt()
print(okt.nouns((testArticle)))

mecab = Mecab()
print(mecab.nouns(testArticle))

hannanum = Hannanum()
print(hannanum.nouns(testArticle))

komoran = Komoran()
print(komoran.nouns(testArticle))

# class TestMPProcess:
#
#     _max = 0
#     _onSucess = None
#     def __init__(self, n, onsucess):
#         self._max = n
#         self._onSucess = onsucess
Example #34
class embd_answer:

    def __init__(self):
        self.mecab = Mecab()
        self.load_data()

    def pre_phrase(self, phrase):
        for how in HOW:
            phrase = phrase.replace(how, HOW_TOKEN)
        for why in WHY:
            phrase = phrase.replace(why, WHY_TOKEN)
        for d in DEL:
            phrase = phrase.replace(d, '')
        return phrase


    def load_data(self):
        self.sentence = []
        with open('./data/training_data.txt', encoding='cp949') as f:
            lines = f.readlines()
            for line in lines:
                line = line.split('\t')
                line[0] = self.pre_phrase(line[0])
                self.sentence.append((int(line[1].replace('\n','')), self.mecab.morphs(line[0].replace('\n',''))))
        # training_set = [x[1] for x in self.sentence]
        # self.model = FastText(training_set, size=32, window=5, min_count=1, iter=10000, workers = 8)
        # self.model.save('./data/model')
        # print('training finish')
        self.model = FastText.load('./data/model')

        self.l = []
        for index, word in self.sentence:
            avg = 0
            for j in word:
                avg += self.model[j]
            avg = avg / len(word)
            self.l.append((index, avg))

    def infer(self, phrase):
        phrase = self.pre_phrase(phrase)
        phrase = self.mecab.morphs(phrase)
        qv=0
        for i in phrase:
            try:
                qv += self.model[i]
            except:
                qv += np.zeros((32))
                pass

        qv = qv / len(phrase)

        max_ = 0
        index = 0
        for i, refer in self.l:
            tmp = cosine_similarity(refer.reshape(1,-1), qv.reshape(1,-1))
            if tmp > max_:
                max_ = tmp
                index = i

        # print(index + 1)
        return index

    def infer_file(self, path = './data/training_data.txt'):
        test = []
        with open(path, encoding='cp949') as f:
            lines = f.readlines()
            for line in lines:
                test.append(line.replace('\n',''))
        for q in test:
            print(self.infer(q))
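A minimal usage sketch, assuming './data/training_data.txt' and the saved FastText model './data/model' exist exactly as load_data expects (the question string is illustrative):

answerer = embd_answer()                            # loads the training sentences and the FastText model
idx = answerer.infer("이 제품은 어떻게 사용하나요?")  # label of the most similar training sentence
print(idx)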
Beispiel #35
0
import re

import json
import math, struct, sys
import os.path

#from konlpy.tag import Kkma
#_analyzer = Kkma()

from konlpy.tag import Mecab
_analyzer = Mecab()

def xplit(value):
    return re.split('\r\n|\n', value)


def parse_nouns(did, text, dic_terms, f):
    candidates = xplit(text.strip())

    for candidate in candidates:
        if len(candidate):
            nouns = _analyzer.nouns(candidate)
            for noun in nouns:
                value = dic_terms.get(noun, 0)
                dic_terms[noun] = value + 1

    #terms_list = list(dic_terms.keys()).sort()



def forward_indexing():
Beispiel #36
0
 def __init__(self):
     self.mecab = Mecab()
     self.load_data()
Beispiel #37
0
import json
from konlpy.tag import Mecab
from konlpy.tag import Hannanum
from konlpy.tag import Kkma
from konlpy.tag import Komoran
from konlpy.tag import Twitter

import time

useclass = Mecab()

FILEPATH = "./data.json"
DATA = {}


def readjson(fn):
    f = open(fn, 'r')
    js = json.loads(f.read())
    f.close()
    return js


def main():
    start_time = time.time()
    global FILEPATH
    global DATA
    DATA = readjson(FILEPATH)
    i = 0
    for data in DATA:
        i += 1
        no = data['no']
Beispiel #38
0
from tokenizers import BertWordPieceTokenizer

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train(["./sample.csv"])

# Now, let's use it:
encoded = tokenizer.encode(
    "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
)
print("WPM --------------")
print(encoded.tokens)

from konlpy.tag import Mecab
print("Mecab --------------")
mecab = Mecab()
print(
    mecab.morphs(
        "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕 괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
    ))

# And finally save it somewhere
tokenizer.save(".", name="WPM")
Beispiel #39
0
 def __init__(self, myDB):
     self.stopword = set()
     self.myDB = myDB
     self.tagger = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
Beispiel #40
0
# assumed imports for this snippet
import pickle

import networkx as nx
from konlpy.tag import Mecab
from nltk.tokenize import sent_tokenize


class TextRank:
    def __init__(self, tokenizer=None, exceptional_stop_pos=[]):
        self.stop_pos = [
            'IC', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JC', 'JX',
            'XR', 'SF', 'SE', 'SSO', 'SSC', 'SC', 'SY', 'EC', 'EF', 'ETN',
            'ETM', 'XSV', 'XSA', 'XSN', 'XPN'
        ]

        if not tokenizer:
            self.tokenizer = Mecab()
        else:
            self.tokenizer = tokenizer

        if exceptional_stop_pos:
            # drop the requested exceptions from the default stop-POS list
            self.stop_pos = [
                x for x in self.stop_pos if x not in exceptional_stop_pos
            ]

    def pos_tagging(self, content, category="정치"):
        def subtokenize(pos_list, dct):
            pos_str = "[" + ", ".join([str(t) for t in pos_list]) + "]"
            for pattern in dct:
                src = ", ".join([str(i) for i in pattern[1]])
                tgt = str(pattern[0])
                pos_str = pos_str.replace(src, tgt)

            tokenized_text = eval(pos_str)
            return tokenized_text

        def group_by_pos(tokens, join_char=''):
            return (join_char.join([
                t[0] for t in tokens
            ]), 'NNP' if len(set([t[1]
                                  for t in tokens])) > 2 else tokens[-1][1])

        def gen_tokens(
                tokens,
                join_char='',
                group_by_pos_li=['NNP', 'NNG', 'SN', 'SH', 'SL', 'NNBC'],
                stop_pos=[]):
            #last_token = tokens[0]

            ret = []
            li = []
            for t in tokens:
                if t[1] in group_by_pos_li: li.append(t)
                else:
                    if len(li) > 0: ret.append(group_by_pos(li, join_char))
                    if t[1] not in stop_pos: ret.append(t)
                    li = []
            if len(li) > 0: ret.append(group_by_pos(li, join_char))

            return ret

        morphs_dict = pickle.load(open('morphs_dict.pickle', "rb"))
        comp_dict = pickle.load(open('comps_dict.pickle', "rb"))
        ret = []
        for s in sent_tokenize(content):
            #ret.append((s, sent_li))
            sent_li = []
            sent_li.append([(w,
                             subtokenize(
                                 subtokenize(
                                     gen_tokens(self.tokenizer.pos(w),
                                                stop_pos=self.stop_pos),
                                     morphs_dict[category]),
                                 comp_dict[category])) for w in s.split()])

            #for w in s.split() : sent_li.append((w, gen_tokens(tokenizer.pos(w))))
            ret.append((s, sent_li))
            #ret.append((s, [t for t in gen_tokens(sent_li, join_char= ' ') if t[1] not in stop_pos]))

        return ret

    def keywords(self, text, n=10):
        tokens = self.pos_tagging(text)

        tokens = [t for s in tokens for w in s[1] for t in w]
        nodes = [
            k for t in tokens for k in t[1]
            if (k[1][0] in ['N', 'V']) & (len(k[0]) > 1)
        ]
        tokens = [k for t in tokens for k in t[1]]

        def connect(nodes, tokens):
            window_size = 5  # window size used to decide co-occurrence

            edges = []
            for window_start in range(0, (len(tokens) - window_size + 1)):
                window = tokens[window_start:window_start + window_size]
                #edges.append([(window[i], window[j]) for i in range(window_size) for j in range(window_size) if ( (i > j) & (window[i] in nodes) & (window[j] in nodes))])

                for i in range(window_size):
                    for j in range(window_size):
                        if (i > j) & (window[i] in nodes) & (window[j]
                                                             in nodes):
                            edges.append((window[i], window[j]))
            return edges

        graph = nx.diamond_graph()
        graph.clear()  # drop the garbage nodes left over from graph creation
        graph.add_nodes_from(list(set(nodes)))  # register nodes
        graph.add_edges_from(connect(nodes, tokens))  # connect edges
        scores = nx.pagerank(graph)  # compute PageRank
        rank = sorted(scores.items(), key=lambda x: x[1],
                      reverse=True)  # sort by score in descending order
        return rank[:n]

    def print_keywords(self, text, n=10):
        print("Keyword : ")
        for k in self.keywords(text, n):
            print("{} - {}".format(k[0][0], k[1]))

    def summarize(self, text, n=3):
        tokens = self.pos_tagging(text)

        # Jaccard similarity
        def jaccard_similarity(query, document):
            intersection = set(query).intersection(set(document))
            union = set(query).union(set(document))
            return len(intersection) / len(union)

        # measure similarity between sentences (Jaccard similarity over the tokenized morphemes)
        def sentence_similarity(sentence1, sentence2):

            sentence1 = self.tokenizer.morphs(
                sentence1[0]
            )  #[t[0] for s in sentence1[1][0] for t in s[1] if t[1][0] in ['N','V'] ]
            sentence2 = self.tokenizer.morphs(
                sentence2[0]
            )  #.split()#[t[0] for s in sentence2[1][0] for t in s[1] if t[1][0] in ['N','V'] ]
            #print(sentence1)
            return jaccard_similarity(sentence1, sentence2)

        def sentences(doc):
            return [s[0].strip() for s in doc]

        def connect(doc):
            return [(start[0].strip(), end[0].strip(),
                     sentence_similarity(start, end)) for start in doc
                    for end in doc if start is not end]

        graph = nx.diamond_graph()
        graph.clear()  # drop the garbage nodes left over from graph creation
        graph.add_nodes_from(sentences(tokens))  # register nodes
        graph.add_weighted_edges_from(connect(tokens))  # connect weighted edges
        scores = nx.pagerank(graph)  # compute PageRank
        #print(scores)
        rank = sorted(scores.items(), key=lambda x: x[1],
                      reverse=True)  # sort by score in descending order
        ssum = rank[:n]
        ranks = []
        for s in ssum:
            ranks.append(s[0])
        return ' '.join(ranks)
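A usage sketch for the class above, assuming morphs_dict.pickle and comps_dict.pickle exist in the working directory (pos_tagging loads them) and that article holds a Korean news text:

tr = TextRank()                     # falls back to a Mecab tokenizer
tr.print_keywords(article, n=10)    # top-10 keywords ranked by PageRank
print(tr.summarize(article, n=3))   # 3-sentence extractive summary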
Beispiel #41
0
# assumed imports for this snippet
import nltk
from konlpy.tag import Mecab


def calc_cfd(doc):
    # Calculate the conditional frequency distribution of word bigrams
    words = [w for w, t in Mecab().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
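A brief usage sketch (the sample text is illustrative):

cfd = calc_cfd("나는 학교에 간다. 나는 집에 간다.")
print(cfd["나"].most_common())   # words that most often follow "나" in the text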
Beispiel #42
0
def mecab_instance():
    from konlpy.tag import Mecab
    m = Mecab()
    return m
Beispiel #43
0
from konlpy.tag import Mecab
tokenizer = Mecab()

txt_file = open(
    "/Users/angeonhui/Bert-abstractive-text-summarization/data/dataset/for_vocab/all_text_0216.txt",
    'r')
text_data = txt_file.read()
txt_file.close()


def whitespace_tokenize(data):
    data = data.strip()  # drop leading/trailing whitespace
    if not data:
        return []
    tokens = data.split()  # split the string on spaces/tabs/newlines into a list
    return tokens


output_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']

for wst in whitespace_tokenize(text_data):  # wst: one whitespace-delimited chunk
    count = 0
    for token in tokenizer.morphs(wst):  # token: one morpheme of wst
        tk = token

        if count > 0:
            tk = "##" + tk
            if tk in output_tokens:  # skip tokens that are already in the vocabulary
                continue
            output_tokens.append(tk)
        else:  # count==0
Beispiel #44
0
 def __init__(self, vocab_file, do_lower_case=True):
     self.vocab = load_vocab(vocab_file)
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
     self.mecab_tokenizer = Mecab('../mecab-ko-dic-2.1.1-20180720')
Beispiel #45
0
from gensim.models import Word2Vec
import pandas as pd
from konlpy.tag import Mecab
from tqdm import tqdm
import pdb
data = pd.read_csv("../data/train.txt", header=None, sep='\t')
tokenizer = Mecab().morphs
sentences = data[1]
print("tokenize start")
tokenized_texts = []
for sent in tqdm(sentences):
    try:
        tokenized_texts.append(tokenizer(sent))
    except:
        pass
print("tokenize end")
pdb.set_trace()

model = Word2Vec(tokenized_texts, size=200, window=3, min_count=1, workers=4)
print("train model")
model.save("mid.model")
model.intersect_word2vec_format('./ko.bin')
model.save("w2v.model")
# model = Word2Vec.load('word2vec.model')
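Once w2v.model has been saved, it can be reloaded and queried for nearest neighbours; a short sketch (the query word is illustrative):

model = Word2Vec.load("w2v.model")
print(model.wv.most_similar("영화", topn=5))   # 5 tokens most similar to the query word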
Beispiel #46
0
## stop-word removal
import pandas as pd
from tqdm import tqdm
from konlpy.tag import Mecab
mecab = Mecab(dicpath=r"C:/mecab/mecab-ko-dic")

df = pd.read_excel('./data/작업표준 목록_조선외.xlsx')
target = df["표준서명"]

targets = []

results = []

stop_word = "전 난 일 걸 뭐 줄 만 건 작업 분 위 개 끝 송 잼 이거 부 동 번 중 듯 차 때 게 내 말 나 수 거 점 것 등 측 의 급 후 간 단 시 곳"
stop_word = stop_word.split(' ')
# print(stop_word)

##########
for sentence in tqdm(target):
    result = []
    for noun in mecab.nouns(sentence):
        if noun not in stop_word:
            result.append(noun)
    targets.append(sentence)
    results.append(result)

summary = [targets, results]

result_df = pd.DataFrame(summary)
result_df = result_df.T
print(result_df.head(20))
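The resulting frame can be written back out for inspection; a minimal sketch (the output path and column names are illustrative):

result_df.columns = ["sentence", "nouns_without_stopwords"]
result_df.to_csv("./data/nouns_cleaned.csv", index=False, encoding="utf-8-sig")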
Beispiel #47
0
import os
import json
from konlpy.tag import Mecab
from konlpy.tag import Hannanum
from konlpy.tag import Kkma
from konlpy.tag import Komoran
from konlpy.tag import Twitter
import pymysql
import math
import time
import operator
import sys
FILEPATH="./result.json"
DATA={}
cls=list()
cls.append(Mecab())
cls.append(Komoran())
cls.append(Twitter())
maxfreq=dict()
site=sys.argv[1]
site=site.strip()
def TF(nouns):
	allsize=len(nouns)
	ret=dict()
	for noun in nouns:
		if(len(noun)<2):
			continue
		ret[noun]=1
	return ret

def TFIDF(allword, tf):
Beispiel #48
0
                "test_han_20181106-20181113",
                "test_oh_20181106-20181113"]'''
    name_set = [
        "동아일보", "경향신문", "[조중동]", "[한경오]", "조선일보", "중앙일보", "한겨례", "오마이뉴스"
    ]
    porgress_set = [False, True, False, True, False, False, True, True]
    for k in range(0, len(test_set)):
        news = name_set[k]
        test = test_set[k]
        date = test.split("_")[2]
        db_date = date.split("-")[1]
        df = pd.read_table(
            "train_20180503-20181119")  # training set (older articles, multiple papers)
        pp = pd.read_table(test)  # test set (recent articles, a single paper)

        mecab = Mecab()

        # reviews : each element is a tokenized title (a list of tokens)
        # labels : label of the title at the same index as reviews
        # _p : test set
        # all_tokens : test-set tokens must be indexed together with the training set
        #              so the test set can be classified against it
        # unique_tokens : tracks the number of distinct tokens
        reviews = []
        reviews_p = []
        labels = []
        labels_p = []
        all_tokens = []
        unique_tokens = dict()

        # training set tokenize
        for i in range(len(df)):
Beispiel #49
0
def language_processing(input_data):
    mecab = Mecab()

    # store a yes/no value for each noun
    # e.g. if wings are present, check_data['날개'] == 1
    check_data = dict()
    for name in [input_neuron.name for input_neuron in InputLayer.all_neuron]:
        # start by marking every check_data entry as unknown
        check_data[name] = 0

    # [*range(3)] is the same as [0, 1, 2]
    word_list, pos_list = zip(*[(word, pos)
                                for word, pos in mecab.pos(input_data)
                                if pos in ['VV', 'VA', 'NNG', 'JC', 'SC', 'MAG', 'VX']])

    # convert to a list (it was a tuple) so that already-processed
    # words can later be overwritten with False
    word_list = list(word_list)

    # same reason
    pos_list = list(pos_list)

    # rewrite adjectives that carry a negative constituent adverb
    # e.g. 날개가 안 보인다 --> 날개가 없다

    yn_dict = {
        '있': 1,
        '들리': 1,
        '보이': 1,
        '없': -1,
        '모르': 0
    }

    """
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':  # 성분 부사 이면서 부정 부사 일 경우
            word_list[index] = '없'  # 부정으로 치환


        for i in range(len(pos_list[index:])):  # 부정 부사 뒷 부분 탐색
            if pos_list[i] in ['VV', 'VA']:  # '있', '없' 등의 데이터가 나올 경우
                try:
                    word_list[i] = yn_change[word_list[i]]  # yn_change 를 이용해 반전시킨다
                except KeyError:
                    word_list
                    pass
    """

    # process by finding the adjectives/verbs first, then grouping the surrounding nouns

    # pick out modifiers such as '있'/'없' from the POS data
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':  # constituent adverb that is also a negation adverb
            word_list[index] = '없'  # replace with the negative form
            pos_list[index] = 'VA'  # update the POS tag to match

        if pos_list[index] in ['VA', 'VV']:  # if pos is yn data

            # feed the InputLayer neurons according to what is predicated about the noun
            try:
                yn = yn_dict[word_list[index]]
            except KeyError:
                yn = 0
            finally:
                # handle a negative auxiliary verb that follows
                # e.g. ~하지 '않'는다

                # scan from the next index on
                tmp_index = index + 1
                while tmp_index < len(pos_list):
                    if pos_list[tmp_index] == 'VX':
                        if word_list[tmp_index] == '않':
                            yn *= -1
                            break
                    elif pos_list[tmp_index] == 'NNG':
                        break  # stop once the next noun appears
                    tmp_index += 1

            # record every preceding noun with the yn value computed above
            for nng in [word_list[i] for i in range(index) if pos_list[i] == 'NNG']:
                # skip words that were already processed
                if nng is False:
                    continue
                else:
                    try:
                        check_data[nng]
                    except KeyError:
                        pass
                    else:
                        check_data[nng] = yn

            # mark the processed words as False
            word_list[:index] = ([False] * index)

    return check_data
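A usage sketch for language_processing, assuming InputLayer.all_neuron has already been populated with neurons whose names are nouns such as '날개' (the input sentence is illustrative):

check = language_processing("날개가 있고 부리는 없다")
print(check)   # e.g. {'날개': 1, '부리': -1, ...} for matching neuron names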
Beispiel #50
0
import sys
import codecs
import re
from sys import stdin
from konlpy.tag import Mecab
from konlpy.tag import Kkma

# MeCab installation needed
mecab = Mecab()

UTF8Reader = codecs.getreader('utf8')
sys.stdin = UTF8Reader(sys.stdin)

jpatt = re.compile('J.*')
spatt = re.compile('XSN')
fpatt = re.compile('SF')
upatt = re.compile('UNKNOWN')

vpatt = re.compile('V.*')
xpatt = re.compile('XSV')
npatt = re.compile('N.*')
x2patt = re.compile('XSA')

ms_reg = r'\/{1,}'
ms_reg2 = r'\s{2,}'

log_epch = 10000
f_size = 448453

dbg_line=None
testmod=None
Beispiel #51
0
 def __init__(self, app):
     self.app = app
     self.mecab = Mecab()
     self.load_models()
Beispiel #52
0
#!/usr/bin/python3

from konlpy.tag import Mecab
import sys
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="*", default=sys.stdin, type=argparse.FileType('r', encoding='utf-8'))
    args = parser.parse_args()

    mecab = Mecab()

    files = args.file if type(args.file) is list else [args.file]
    for f in files:
        for line in f.readlines():
            print(' '.join(mecab.morphs(line.strip())))
Beispiel #53
0
class WordTokenizer(Tokenizer):
    """
    Word Tokenizer

    * Args:
        name: tokenizer name [treebank_en|spacy_en|mecab_ko|bert_basic]

    * Kwargs:
        flatten: return type as flatten list
        split_with_regex: post split action. Split tokens that the tokenizer cannot split.
    """
    def __init__(self, name, sent_tokenizer, config={}, split_with_regex=True):
        super(WordTokenizer,
              self).__init__(name, f"word-{name}+{sent_tokenizer.cache_name}")
        self.config = config
        self.sent_tokenizer = sent_tokenizer
        self.word_tokenizer = None

        self.split_with_regex = split_with_regex
        if split_with_regex:
            self.extra_split_chars_re = self.make_split_regex_expression()

    def make_split_regex_expression(self):
        """
        Apply a small amount of extra splitting to the given tokens, in particular to avoid UNK tokens
        due to contraction, quotation, or other forms of punctuation. I haven't really done tests to see
        if/how much difference this makes, but it does avoid some common UNKs I noticed in SQuAD/TriviaQA
        """
        extra_split_chars = (
            "-",
            "£",
            "€",
            "¥",
            "¢",
            "₹",
            "*",
            "\u2212",
            "\u2014",
            "\u2013",
            "/",
            "~",
            '"',
            "'",
            "\ud01C",
            "\u2019",
            "\u201D",
            "\u2018",
            "\u00B0",
            ".",
            ":",
        )
        extra_split_tokens = (
            "``",
            "(?<=[^_])_(?=[^_])",  # dashes w/o a preceeding or following dash, so __wow___ -> ___ wow ___
            "''",
            "[" + "".join(extra_split_chars) + "]",
        )
        return re.compile("(" + "|".join(extra_split_tokens) + ")")

    @overrides
    def _tokenize(self, text, unit="text"):
        """ Text -> word tokens """
        if type(text) != str:
            raise ValueError(f"text type is must be str. not {type(text)}")

        if unit == "sentence":
            tokens = getattr(self, f"_{self.name}")(text)
        else:
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [
                getattr(self, f"_{self.name}")(sentence)
                for sentence in sentences
            ]

        if self.split_with_regex and self.name != "spacy_en":
            tokens = self._split_with_regex(tokens)

        return list(common_utils.flatten(tokens))

    def _split_with_regex(self, sentences):
        for i, sentence in enumerate(sentences):
            sentences[i] = [
                token for token in self._post_split_tokens(sentence)
            ]
        return sentences

    def _post_split_tokens(self, tokens):
        return [[x for x in self.extra_split_chars_re.split(token) if x != ""]
                for token in tokens]

    """ Tokenizers """

    def _space_all(self, text):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        prev_is_whitespace = True
        tokens = []
        for char in text:
            if is_whitespace(char):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    tokens.append(char)
                else:
                    tokens[-1] += char
                prev_is_whitespace = False
        return tokens

    def _treebank_en(self, text):
        if self.word_tokenizer is None:
            import nltk

            self.word_tokenizer = nltk.TreebankWordTokenizer()

        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]

    def _spacy_en(self, text):
        if self.word_tokenizer is None:
            from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer

            self.word_tokenizer = load_spacy_model_for_tokenizer(
                self.extra_split_chars_re)

        def _remove_spaces(tokens):
            return [token.text for token in tokens if not token.is_space]

        return _remove_spaces(self.word_tokenizer(text))

    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from transformers import BasicTokenizer

            self.word_tokenizer = BasicTokenizer(**self.config)

        return self.word_tokenizer.tokenize(text)

    def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab

            self.word_tokenizer = Mecab()

        return self.word_tokenizer.morphs(text)
Beispiel #54
0
def analyzing_morphem(content_list):
    mecab = Mecab()
    for idx, doc in enumerate(content_list):
        if idx % 5000 == 0 :
            print 'Morphem Analysis on %d' % idx
        yield ' '.join([part for part, pos in mecab.pos(doc.decode('utf-8'))]).encode('utf-8')
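analyzing_morphem is a Python 2 generator over UTF-8 byte strings; a minimal Python 2 sketch of consuming it (the documents are illustrative):

docs = ['한국어 형태소 분석 예시입니다.', '두 번째 문서입니다.']
for morphs in analyzing_morphem(docs):
    print morphs   # space-joined morphemes, one line per document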