def korean_morph(text):
    twitter = Twitter()
    s = twitter.morphs(str(unicode(text)))
    s = ' '.join(s)
    return s
def pos_tagging(text):
    available_terms_list = []
    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)
    for item in pos_list:
        if item[1] == 'Verb' or item[1] == 'Adjective':
            available_terms_list.append(item)
    return available_terms_list
def _twitter_parse(self, str_arr, tag_combine=True):
    """Run Twitter POS tagging on each string and flatten the results.

    :param str_arr: iterable of strings to tag
    :return: flat list of (optionally tag-combined) tokens
    """
    twitter = Twitter(jvmpath=None)
    return_arr = []
    for data in str_arr:
        return_arr = return_arr + self._flat(twitter.pos(str(data)), tag_combine=tag_combine)
    return return_arr
def pos_tagging_noun(text):
    noun_terms_list = []
    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)
    for item in pos_list:
        if item[1] == 'Noun':
            noun_terms_list.append(item)
    return noun_terms_list
def create_wordbag(x):
    wordbag = []
    if x['eval_content'] is None:
        return wordbag
    twitter = Twitter()
    for text in twitter.pos(x['eval_content'], stem=True):
        tag = text[1]
        if tag in unneeded:
            continue
        word = text[0]
        wordbag.append(word)
    return wordbag
class AnalysisDiction:
    """
    This class is for analysis of Korean texts using the Kkma and Twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a Kkma or Twitter dictionary instance.

        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method wraps Kkma. It behaves differently depending on its mode.

        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method wraps Twitter. It behaves differently depending on its mode.

        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
def main():
    """
    Notes on using konlpy:
    Java must be installed and configured, and JAVA_HOME must be set, e.g.:
    export JAVA_HOME=$(/usr/libexec/java_home)
    """
    konl = Twitter()
    file_path = '/Users/bongster/Downloads/20160528_jiana.csv'
    with open(file_path, 'rb') as csv_file:
        inforeader = csv.reader(csv_file)
        for row in inforeader:
            r = konl.pos(unicode(row[4], 'utf-8'), norm=True, stem=True)
            print '=' * 20
            for txt, post in r:
                print txt, post
            print '=' * 20
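The docstring above points out that konlpy needs a working JVM. A minimal sketch (assumptions: a POSIX system and that failing fast with a message is acceptable) of checking JAVA_HOME before importing konlpy:

# Minimal sketch: verify the JVM environment before importing konlpy.
# The check and message are illustrative, not part of the original script.
import os
import sys

if not os.environ.get('JAVA_HOME'):
    sys.exit('JAVA_HOME is not set; e.g. run: export JAVA_HOME=$(/usr/libexec/java_home)')

from konlpy.tag import Twitter  # the import itself fails if the JVM cannot start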
def get_noun(self):
    print("[*] 명사 추출 시작")  # start noun extraction
    start_time = time.time()
    twitter = Twitter()
    for s in self.word_list:
        temp = twitter.nouns(s)
        for t in temp:
            self.noun_list.append(str(t))
    end_time = time.time()
    print("[*] 명사 추출 완료(소요시간 : {0})".format(str((end_time - start_time))))  # extraction finished (elapsed time)
    print("[*] 추출된 명사 길이 : {0}".format(str(len(self.noun_list))))  # number of extracted nouns

    # frequency analysis
    count = Counter(self.noun_list)
    #tag = count.most_common( int(len(count)*(15/100)) )
    tag = count.most_common(50)
    taglist = pytagcloud.make_tags(tag, maxsize=100)
    pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(800, 600),
                                fontname='Nanum Gothic Coding', rectangular=False)
def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
    """
    Allocate a Kkma or Twitter dictionary instance.

    :param on_kkma: if True, create a Kkma instance
    :param on_twitter: if True, create a Twitter instance
    """
    if on_kkma is True:
        self.kkma = Kkma()
    if on_twitter is True:
        self.twitter = Twitter()
class KorDisintegrator:
    def __init__(self):
        self.ko_twitter = Twitter()

    def convert2simple(self, sentence="", norm=True, stem=True):
        disintegrated_sentence = self.ko_twitter.pos(sentence, norm=norm, stem=stem)
        convert_sentence = []
        for w, t in disintegrated_sentence:
            if t not in ["Eomi", "Josa", "KoreanParticle", "Punctuation"]:
                convert_sentence.append(w)
        return " ".join(convert_sentence)
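A minimal usage sketch of the class above; the example sentence is illustrative only:

# Usage sketch (not part of the original snippet).
disintegrator = KorDisintegrator()
print(disintegrator.convert2simple("자연어 처리는 재미있습니다."))  # endings/particles are dropped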
def __init__(self, on_han=False, on_twitter=False, on_mecab=False):  # maybe move to init of analysis_app
    """
    Allocate a Hannanum or Twitter dictionary instance.

    :param on_han: if True, create a Hannanum instance
    :param on_twitter: if True, create a Twitter instance
    :param on_mecab: if True, create a Mecab instance
    """
    if on_han is True:
        self.han = Hannanum()
    if on_twitter is True:
        self.twitter = Twitter()
def main(_):
    is_train = True  # if False then test

    if is_train:
        train()
    else:
        checklist = ['Exclamation', 'Alpha', 'URL']
        twitter = Twitter()

        dic_file_x = 'data/xproject_class.dict.pkl'
        worddict_x = dict()
        worddict_x = load_dict(dic_file_x)

        x, x_mask, prediction = build_test('/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')

        while 1:
            choice = raw_input("Me: ")
            if choice in ["Q", "q"]:
                break
            #print choice
            choice = choice.decode('utf-8')
            sen = ' '.join([s[0] + '/' + s[1] for s in twitter.pos(choice, norm=True) if s[1] not in checklist])
            words = word_tokenize(sen.strip().lower())
            #print ' '.join(words)
            seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
            seqs = [s if s < 600 else 1 for s in seqs]
            seqs = [seqs]

            res = test_data(seqs, x, x_mask, prediction, '/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')
            #print res
            print "class: " + str(res)
class Parser(object):
    def __init__(self, filename=None, nth=-1):
        self.filename = filename
        self.nth = nth
        self.twitter = Twitter()
        self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log')

    def parse_sentence(self, sentence):
        return self.twitter.pos(sentence)

    def parse_all_generator(self, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        if nth is None:
            nth = self.nth
        for row in ParseUtils.iter_csv(filename):
            try:
                parsed = self.parse_sentence(row[nth])
                concated = ' '.join([ParseUtils.concat_tuple(x) for x in parsed])
                row[nth] = concated
            except BaseException as e:
                msg = '{error:<80} | {data}'.format(error=str(e), data=ParseUtils.list_to_csv(row))
                self.logger.error(msg)
                continue
            yield row

    def extract_parsed(self, out_filename, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        filelength = ParseUtils.file_length(filename)
        if nth is None:
            nth = self.nth
        with open(out_filename, 'w') as f:
            csv_writer = csv.writer(f)
            for row in Progress(self.parse_all_generator(), filelength, 10):
                csv_writer.writerow(row)
import json, pymongo, requests, sys
import time, dateutil.parser
import gensim, logging, os
from konlpy.tag import Twitter; t = Twitter()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO, filename='review' + '.log')

pos = lambda d: ['/'.join(p) for p in t.pos(d)]

# class LSGenerator(object):
#     def __init__(self, collname):
#         self.conn = pymongo.MongoClient("mongodb://localhost")
#         self.db = self.conn.watcha
#         self.cnames = self.db.collection_names()
#         self.collections = dict()
#         self.collname = collname
#         for cname in self.cnames:
#             self.collections[cname] = eval('self.db.' + cname)
#         del self.collections['reviews']
#         del self.collections['system.indexes']
#
#     def __iter__(self):
#         for row in self.collections[self.collname].find():
#             rating = row['rating']
#             cid = row['comment_id']
#             text = row['text']
#             pos_text = pos(text)
#             tags = [str(rating) + '_' + str(cid) + '_' + self.collname]
#             yield gensim.models.doc2vec.TaggedDocument(words=pos_text, tags=tags)

class LSGenerator(object):
# -*- coding: utf-8 -*-
import numpy as np
import sys
import codecs
from konlpy.tag import Twitter
from konlpy.tag import Kkma

konlpy_twitter = Twitter()
kkma = Kkma()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


def select_test_data(sample_labels, sample_text, i):
    chunksize = len(sample_text) / 5
    start = chunksize * i
    if i == 4:
        end = len(sample_text)
    else:
        end = start + chunksize
    test_labels = sample_labels[start:end]
    test_text = sample_text[start:end]
    train_labels = sample_labels[:start] + sample_labels[end:]
    train_text = sample_text[:start] + sample_text[end:]
    return (test_labels, test_text, train_labels, train_text)
#! /usr/bin/python
# -*- coding: utf-8 -*-

from konlpy.corpus import kobill
from konlpy.tag import Twitter; t = Twitter()
from matplotlib import pyplot as plt

pos = lambda x: ['/'.join(p) for p in t.pos(x)]
docs = [kobill.open(i).read() for i in kobill.fileids()]

# get global unique token counts
global_unique = []
global_unique_cnt = []
for doc in docs:
    tokens = pos(doc)
    unique = set(tokens)
    global_unique += list(unique)
    global_unique = list(set(global_unique))
    global_unique_cnt.append(len(global_unique))
    print(len(unique), len(global_unique))

# draw heap
plt.plot(global_unique_cnt)
plt.savefig('heap.png')
keyword_before = 'C:/doc2vec/result/keyword_before/'
full_dir = 'C:/doc2vec/word_result/total_test/'
keyword_after = 'C:/doc2vec/result/keyword_after/'

txt_list = os.listdir(path_dir)  # store the files in the folder as a list
txt_list2 = os.listdir(path_dir2)
txt_list3 = os.listdir(path_dir3)
txt_list4 = os.listdir(path_dir4)
txt_list5 = os.listdir(path_dir5)
txt_list6 = os.listdir(path_dir6)
txt_pre = os.listdir(path_pre)

str1 = ''
total_word = []
total_word2 = []
t = Twitter()
# dir=path_dir
sys.stdout.flush()

for change in txt_pre:
    total = []  # merged document
    remover = change.replace(".txt", "-")
    original = path_pre + change
    etc_pdf = pdf_dir + remover + '0.pdf'
    etc_hwp = hwp_dir + remover + '0.hwp'
    jpg_name = change.replace("txt", "jpg")
    jpg_file = jpg_dir + jpg_name
    png_name = change.replace("txt", "png")
    png_file = jpg_dir + png_name
    pdf_name = change.replace("txt", "pdf")
    plt.close('all')
    fig = plt.figure()
    ax3 = plt.subplot(111)
    plt.plot(model.errors)
    plt.grid()
    ax3.set_title('Training error')
    plt.savefig('error.png')

elif mode == 'te':
    if os.path.isfile(filepath):
        model.load(filepath)
    else:
        raise IOError('loading error...')

    checklist = ['Exclamation', 'Alpha', 'URL']
    twitter = Twitter()

    while 1:
        choice = raw_input("Me: ")
        if choice in ["Q", "q"]:
            break
        #print choice
        choice = choice.decode('utf-8')
        sen = ' '.join([s[0] + '/' + s[1] for s in twitter.pos(choice, norm=True) if s[1] not in checklist])
        words = word_tokenize(sen.strip().lower())
        #print ' '.join(words)
        seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
import collections
from konlpy.tag import Twitter
from konlpy.utils import pprint

a1 = collections.Counter(['1', '2', '3', '4', '3', '3', '1', 3])
a2 = collections.Counter(['1', '1', '3', '2', '3'])
a3 = a1 | a2
print(a3)
print(a3['5'])

twt = Twitter()
word = ('알리바바와 40인의 도굴꾼 지하에서 웃는다 독도는 우리땅 우리땅 땅요 땅요 내 땅이요 조선 땅도 내땅이요')
tes = twt.pos(word)
pprint(tes)

offsets = set(i for i in range(len(word)) if word.startswith('땅', i))
print(offsets)
print(word[3])

"""
Things to note:
1. Morphological analyzers are not perfect.
2. Unlike ordinary writing, news articles follow the rules: spacing errors and typos
   are rare, so grammatical cues are rarely ambiguous.
3. When news articles do deviate from ordinary grammar, it is usually through long
   runs of nouns.
4. The Twitter analyzer has no pronoun detector, so pronouns must be resolved
   manually from context when needed.
5. Besides pronouns, news articles also make heavy use of abbreviations
   (e.g. 문재인 대통령 >> 문 대통령).
6. Morphological analyzers tend not to trust the input spacing; supply the spacing
   information explicitly.
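Point 6 above suggests fixing spacing before tagging. A minimal sketch, assuming the pykospacing package that another snippet in this collection already uses, of re-spacing a sentence and then tagging it; the input string is illustrative:

# Minimal sketch: normalize spacing before POS tagging (assumes pykospacing is installed).
from pykospacing import spacing
from konlpy.tag import Twitter

twt = Twitter()
respaced = spacing("독도는우리땅우리땅")  # illustrative input with its spaces removed
print(twt.pos(respaced, norm=True))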
import pickle
from os import listdir

import nltk
import gensim
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from konlpy.tag import Twitter

lm = WordNetLemmatizer()
twit = Twitter()

import logging
import urllib, json
from urllib.request import urlopen
from elasticsearch import Elasticsearch

default = "http://192.168.101.183:9200/wrapsody/document/"


def createJson(url):
    with urllib.request.urlopen(url) as url2:
        data = json.loads(url2.read().decode('utf-8'))
        return data


def saveIdList(fname):
    dic = createJson(default + "_search?size=10000&_source=false&sort=_id:desc")
    idList = []
    for e in dic["hits"]["hits"]:
def tokenize(self, doc):
    pos_tagger = Twitter()
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
import tweepy
from func_lib import *
from konlpy.tag import Twitter; t = Twitter()
from function import *
import time


# crawling from twitter
def twitterC(ID, flist, site_type):
    # authorization level
    API_KEY = 'NjHC0Y2Iql94ivUB78lC60Bpm'
    API_SECRET = 'mR4R132jKjuUga5GN0RyVngVk80I23daJhR21n1RstQDQvZNG6'
    ACCESS_KEY = '794856483007533057-n4iG19CHb8KNvxIkmSp1ahg7mPhe0Bq'
    ACCESS_SECRET = '2Pp0aZBou8DQ2h6y0ptim6Zo1F3HlLmOAxhG5sVuN2EY2'
    oAuth = tweepy.OAuthHandler(API_KEY, API_SECRET)
    oAuth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
    api = tweepy.API(auth_handler=oAuth, api_root='/1.1')

    # search timeline through ID keyword
    userID = ID
    user = api.get_user(userID)
    timeline = api.user_timeline(userID)
    total_update(site_type)

    # GET USER'S TIMELINE TWEETS
    for tweet in timeline:
        try:
loc = os.listdir(dir)
content = []
nouns = []

# Listing Texts
for maindir in loc:
    subdir = os.listdir(dir + '/' + maindir)
    file_list = []
    for file in subdir:
        file_list.append(open(dir + '/' + maindir + '/' + file, "r").read())
    content.append(file_list)

nlp = Twitter()

# Separating Words
for i in content:
    list_wrap = []
    for j in i:
        list_wrap.append(nlp.nouns(j))
    nouns.append(list_wrap)

words = ''
for c in content[0]:
    words = words + ' ' + c

nouns2 = nlp.nouns(words)
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, constraints
from keras import backend as K

# ---------------Data Load Starts------------------
corpus_fname = './dataset/train_data.txt'

from konlpy.tag import Twitter
mecab = Twitter()


def get_normalized_data(sentence):
    # original_sentence = mecab.pos(sentence, norm=True, stem=True)
    original_sentence = mecab.pos(sentence)
    inputData = []
    for w, t in original_sentence:
        # if t in ['Number']:
        #     w = '0'
        if t not in ['Number', 'Punctuation', 'KoreanParticle']:
            inputData.append(w)
    return (' '.join(inputData)).strip()


def get_text(fname):
def __init__(self):
    self.t = Twitter()
def upload(request): if request.method == 'POST': if 'file' in request.FILES: myUid = str(uuid.uuid4()) dataChatroom = Chatroom( uid = myUid ) dataChatroom.save() data = Chatroom.objects.get(uid=myUid) chatroom_id = data.id file = request.FILES['file'] filename = myUid fp = open('%s/%s' % ("data", filename) , 'wb') for chunk in file.chunks(): fp.write(chunk) fp.close() log_file = open('%s/%s' % ("data", filename) , 'r') messages = normalize( log_file ) log_file.close() #파일 삭제 os.remove('%s/%s' % ("data", filename)) sender_list = set() send_ratio = {} msg_bytes = {} sent_time = {} sent_time = {} for i in range (0, 7) : sent_time[ i ] = {} for j in range(0,24) : sent_time[ i ][ j ] = 0 kcount = {} hcount = {} ucount = {} keywords = {} keywords_all = {} sent_month = "" temp_keywords = "" emoticons = 0 total = 0 last_sender = "" intimacy = {} is_one_to_one = 0 twitter = Twitter() for msg in messages : sender_list.add(msg.sender) # to calculate intimacy between member if len(last_sender) == 0 : last_sender = msg.sender if last_sender != msg.sender : td_increment( intimacy, last_sender, msg.sender, 1) td_increment( intimacy, msg.sender, last_sender, 1) last_sender = msg.sender # check send ratio. td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1) # calculate msg bytes by sender td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents)) # count k in msg. increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8'))) increment(hcount, msg.sender, msg.contents.count(unicode('ㅎ','utf-8'))) increment(ucount, msg.sender, msg.contents.count(unicode('ㅠ','utf-8'))) # count emoticons if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents: emoticons = emoticons + 1 # calculate active time td_increment(sent_time, msg.datetime.weekday(), msg.datetime.time().hour, 1) # analyze keyword """ keywords_list = twitter.nouns(msg.contents) for keyword in keywords_list : if len(keyword) > 1: if ( is_msg_content(keyword) ): td_increment(keywords_all, str(msg.datetime)[:7], keyword, 1) increment(keywords, keyword, 1) """ if len(sent_month) == 0 : sent_month = str(msg.datetime)[:7] if sent_month == str(msg.datetime)[:7] : temp_keywords = temp_keywords + " " + msg.contents elif sent_month != str(msg.datetime)[:7] : keywords_list = twitter.nouns(temp_keywords) for keyword in keywords_list : if len(keyword) > 1: if ( is_msg_content(keyword) ) : td_increment(keywords_all, sent_month, keyword, 1) increment(keywords, keyword, 1) sent_month = str(msg.datetime)[:7] temp_keywords = msg.contents #마지막달은 위 for문에서 못 하니까 여기서 한번 더 함. 
keywords_list = twitter.nouns(temp_keywords) for keyword in keywords_list : if len(keyword) > 1: if ( is_msg_content(keyword) ) : td_increment(keywords_all, sent_month, keyword, 1) increment(keywords, keyword, 1) if len(sender_list) == 2 : response_time = {} last_sender = "" last_response_time = timedelta(0) for sender in sender_list : response_time[sender] = [] for msg in messages : if len(last_sender) == 0 : last_sender = msg.sender if last_sender != msg.sender : last_sender = msg.sender response_time[msg.sender].append(msg.datetime - last_response_time) last_response_time = msg.datetime #insert frequency message & byte for date in send_ratio : for sender in send_ratio[date] : dataMessage = FrequencyMessage( chatroom_id = chatroom_id, name = unicode(str(sender), 'utf-8').encode('utf-8'), date = date, count = int(send_ratio[date][sender]), bytes = int(msg_bytes[date][sender]) ) dataMessage.save() #insert all keywords cnt = 0 for date in keywords_all : for keyword in keywords_all[date] : tasks.insert_keywords.delay(keyword, date, keywords_all[date][keyword]) """ word = smart_str(keyword) cnt = cnt + 1 getWordData = FrequencyWordAll.objects.filter(word=keyword, date=date) if getWordData.exists() : FrequencyWordAll.objects.filter(id=getWordData[0].id).update(count=F('count') + keywords_all[date][keyword]) else : dataWordAll = FrequencyWordAll( date = date, word = word, count = int(keywords_all[date][keyword]) ) dataWordAll.save() """ #insert most keywords 20 sorted_keywords = sorted(keywords.items(), key=lambda x:x[1], reverse = True) for i in range(0,20) : try : word = smart_str(sorted_keywords[i][0]) dataWord = FrequencyWord( chatroom_id = chatroom_id, word = word, count = int(sorted_keywords[i][1]) ) dataWord.save() except : pass #insert moment for week in sent_time : for hour in sent_time[week] : dateTime = FrequencyTime( chatroom_id = chatroom_id, week = int(week), hour = int(hour), count = int(sent_time[week][hour]) ) dateTime.save() if len(sender_list) == 2 : is_one_to_one = 1 intimacy = {} for sender in response_time : rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender]) td_increment( intimacy, sender, " ", rt_average.total_seconds()) #insert intimacy for member in intimacy : for friends in intimacy[member] : dataIntimacy = Intimacy( chatroom_id = chatroom_id, name = unicode(str(member), 'utf-8').encode('utf-8'), target = unicode(str(friends), 'utf-8').encode('utf-8'), count = int(intimacy[member][friends]) ) dataIntimacy.save() #insert each char count for sender in kcount : dataChar = FrequencyChars( chatroom_id = chatroom_id, name = unicode(str(sender), 'utf-8').encode('utf-8') ) try : dataChar.count_char_1 = int(kcount[sender]) except : pass try : dataChar.count_char_2 = int(hcount[sender]) except : pass try : dataChar.count_char_3 = int(ucount[sender]) except : pass dataChar.save() Chatroom.objects.filter(id=chatroom_id).update(complete_datetime=datetime.datetime.now(), is_one_to_one=is_one_to_one) return HttpResponse(myUid) return HttpResponse('Failed to Upload File')
def main(): engine = create_engine( 'mysql://*****:*****@13.125.100.34:3306/release?charset=utf8mb4') conn = engine.connect() review_df = pd.read_sql_table('api_review', conn) s_review = review_df.store_id.unique() tot_rev_df = pd.DataFrame(columns=["store", "contents"]) revlist = [] for i in s_review: str = "" for content in review_df[review_df.store_id == i].content: str = str + " " + content revlist.append(str) tot_rev_df["store"] = s_review tot_rev_df["contents"] = revlist twitter = Twitter() all = [] for i in range(0, len(tot_rev_df)): if (len(tot_rev_df.loc[i].contents) == 1): temp = [] else: temp = twitter.nouns(tot_rev_df.loc[i].contents) all.append(temp) tfMapList = [] wordMap = {} wordCount = 0 for data in all: tfMap = {} for word in data: if word in tfMap.keys(): tfMap[word] += 1 else: tfMap[word] = 1 if word not in wordMap.keys(): wordMap[word] = wordCount wordCount += 1 tfMapList.append(tfMap) table = [[0] * len(wordMap) for _ in range(len(tfMapList))] row = 0 for tfMap in tfMapList: for word, tf in tfMap.items(): word_count = 0 for map1 in tfMapList: if word in map1.keys(): word_count += 1 idf = math.log10(len(tfMapList) / word_count) tf_idf = tf * idf column = wordMap[word] table[row][column] = tf_idf table2 = pd.DataFrame.from_records(table) svd = TruncatedSVD(n_components=15) pos = svd.fit_transform(table2) norm = Normalizer(copy=False) pos2 = norm.fit_transform(pos) km = KMeans(n_clusters=40, random_state=0) labels_km = km.fit_predict(pos2) result = [] for cur in range(0, 40): temp = [i for i, e in enumerate(labels_km) if e == cur] result.append(temp) with open('../../../../data/tf-idf_Result.pkl', 'wb') as f: pickle.dump(result, f)
from konlpy.tag import Twitter
import sys
import json

twitter = Twitter()

with open(sys.argv[1]) as f:
    datas = json.loads(f.read())

result = []
for data in datas:
    result.append(twitter.nouns(data))

print(str(result).replace("'", '"'))
# set default coding euc-kr 2 utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

print("load")

# load from kobill
from konlpy.corpus import kobill
#docs_ko = kobill.open('kobill/news.txt').read()
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]

print("tokenize")

# tokenize
from konlpy.tag import Twitter; t = Twitter()
print("tokenize1")
pos = lambda d: ['/'.join(p) for p in t.pos(d, stem=True, norm=True)]
print('tokenize2')
texts_ko = [pos(doc) for doc in docs_ko]
#texts_ko = pos(docs_ko)

print("train")
import time
now_time = time.time()

# train
from gensim.models import word2vec
wv_model_ko = word2vec.Word2Vec(texts_ko, workers=16, negative=10, window=7, size=300)
wv_model_ko.init_sims(replace=True)
wv_model_ko.save('ko_word2vec_e.model')
def index(request): message = '' tagger = '' list_of_tagger = '' if request.method == 'POST': input_sentence = request.POST.get("name", "") list_of_tagger = request.POST.getlist("tagger") if 'selectall' in list_of_tagger: list_of_tagger = ['moara', 'twitter', 'kkma', 'komoran'] if input_sentence != '': if 'moara' in list_of_tagger: # ======== 모아라 형태소 분석기 ======== from subprocess import Popen, PIPE, STDOUT import os import re p = Popen([ 'java', '-jar', 'C:/Users/bessh4/Desktop/mysite/postagging/moara_pos.jar', input_sentence ], stdout=PIPE, stderr=STDOUT, shell=True) output = [line.decode('cp949') for line in p.stdout] print_moara = [] print_moara2 = [] for i in output[2:]: if i != "\n": if i != '\r\n': print_moara2.append( i.split(", ")[0][1:] + " / " + i.split(", ")[1]) string_i = '' for i in print_moara2: word_i = i.split('단어음절: ')[1].split(" / ")[0] pos_i = i.split('단어품사: ')[1] if len(re.findall(r'\bRECOMMEND\S+\b', pos_i)) == 0: string_i += word_i + "/" + pos_i + " " print_moara = ("모아라", string_i) else: print_moara = ("모아라", "") # ======== konlpy 형태소 분석기 ======== if 'twitter' in list_of_tagger: twitter_str = '' twitter = Twitter() for i in twitter.pos(input_sentence): twitter_str += str(i[0] + "/" + i[1] + " ") twitter_message = ("트위터", twitter_str) else: twitter_message = ("트위터", "") if 'kkma' in list_of_tagger: kkma_str = '' kkma = Kkma() for i in kkma.pos(input_sentence): kkma_str += str(i[0] + "/" + i[1] + " ") kkma_message = ("꼬꼬마", kkma_str) else: kkma_message = ("꼬꼬마", "") if 'komoran' in list_of_tagger: komoran_str = '' komoran = Komoran() for i in komoran.pos(input_sentence): komoran_str += str(i[0] + "/" + i[1] + " ") komoran_message = ("코모란", komoran_str) else: komoran_message = ("코모란", "") message = [ print_moara, twitter_message, kkma_message, komoran_message ] else: message = "형태소 분석할 문장을 입력하세요" context = {'message': message} return render(request, 'postagging/index.html', context)
import pandas as pd
from konlpy.tag import Twitter

twitter = Twitter()
#ex) print(twitter.pos(u'이것도 되나욬ㅋㅋ', norm=True, stem=True))

path = '/Users/kims/'

# file1
file1 = pd.read_csv(path + 'comments_17_df.csv')
file1.head()

# konlpy file1
text = []
len(file1)
for i in range(0, len(file1)):
    text_spider = twitter.pos(file1.loc[i, 'value'], norm=True, stem=True)
    text.append(text_spider)
text

text_df = pd.DataFrame.from_records(text)
text_df = text_df.stack()
text_df.to_csv('text_17.csv', encoding='utf-8')

# file2
file2 = pd.read_csv(path + 'comments_12_df.csv')
file2.head()

# konlpy file2
text = []
from konlpy.tag import Twitter
from gensim.models import Word2Vec
import csv

twitter = Twitter()
file = open("Article_shuffled.csv", 'r', encoding='euc-kr')
line = csv.reader(file)
token = []
embeddingmodel = []
for i in line:
    sentence = twitter.pos(i[0], norm=True, stem=True)
    temp = []
    temp_embedding = []
    all_temp = []
    for k in range(len(sentence)):
        temp_embedding.append(sentence[k][0])
        temp.append(sentence[k][0] + '/' + sentence[k][1])
    all_temp.append(temp)
    embeddingmodel.append(temp_embedding)
    if i[3] == "IT과학":
        all_temp.append(0)
    elif i[3] == "경제":
        all_temp.append(1)
    elif i[3] == "정치":
        all_temp.append(2)
    elif i[3] == "e스포츠":
        all_temp.append(3)
    elif i[3] == "골프":
        all_temp.append(4)
class FuzzyWuzzy(): def __init__(self, clusterings=[], texts=[], confidence=0.6, batch_size=0, merge=False): st = datetime.datetime.now() print('0. [start] init_setting ------------------------------') self.before_texts = texts self.texts = [] self.batch_size = batch_size self.merge = merge self.t = Twitter() self.confidence = confidence * 100 self.clusterings = [] self.id = 0 self.texts_len = 13 if len(clusterings) > 0: self.convert_clustering(clusterings=clusterings) et = datetime.datetime.now() print('0. [end] init_setting => ', et - st) # stopwords를 제거하는 부분 def filtering(self, str_list=None, noun=False): str_list = list(map(lambda x: re.sub('[\?\.]', '', x), str_list)) str_list = list(map(lambda x: re.sub('어떻게 해야 하나요', '', x), str_list)) str_list = list(map(lambda x: re.sub('어떻게 되는 것인가요', '', x), str_list)) str_list = list(map(lambda x: re.sub('어떻게 되나요', '', x), str_list)) str_list = list(map(lambda x: re.sub('왜 그런가요', '', x), str_list)) str_list = list(map(lambda x: re.sub('라고 나와요', '', x), str_list)) str_list = list(map(lambda x: re.sub('되나요', '', x), str_list)) str_pos = self.t.pos(str_list[0], stem=True) stop_pos = ['Noun', 'Alpha', 'Foreign', 'Number'] if not (noun): stop_pos.append('Verb') stop_pos.append('Adjective') str_filt = np.array(str_pos)[np.where( np.in1d(list(map(itemgetter(1), str_pos)), stop_pos))[0]] if noun and len(str_filt) > 1 and str_filt[-1][1] != 'Noun': str_filt = str_filt[0:-1] str_final = [' '.join(list(map(itemgetter(0), str_filt)))] stop_words = [ '방법', '하는법', '어떤', '무슨', '알다', '말', '하다', '되다', '궁금하다', '가능하다', '시', '수', '인가요', '있다', '하나요', '해야하나요', '좋다', '해', '요', '한', '가요', '대해' ] split_str_list = list(map(lambda x: re.split(' ', x), str_final)) filtered_word = list( map( lambda x: ' '.join( list(np.array(x)[np.logical_not(np.in1d(x, stop_words))])), split_str_list)) return filtered_word[0] #return jaso_split(filtered_word[0]) def run(self): st = datetime.datetime.now() print('1. [start] init_run ------------------------------') init_confidence = self.confidence self.init_run() et = datetime.datetime.now() print('1. [end] init_run => ', et - st) st = datetime.datetime.now() print('2. [start] run_batch ------------------------------') self.confidence = init_confidence for i in range(2): if self.confidence >= 70: st_1 = datetime.datetime.now() self.run_batch(noun=True) self.confidence = self.confidence - 5 et_1 = datetime.datetime.now() print('2-1. [end] run_batch noun-------', i + 1, '번째 run_batch => ', et_1 - st_1) elif self.confidence < 70: break self.confidence = init_confidence for i in range(self.batch_size): if self.confidence >= 70: st_1 = datetime.datetime.now() self.run_batch(noun=False) self.confidence = self.confidence - 2.5 et_1 = datetime.datetime.now() print('2-2. [end] run_batch verb-------', i + 1, '번째 run_batch => ', et_1 - st_1) elif self.confidence < 70: break et = datetime.datetime.now() print('2. [end] run_batch => ', et - st) if self.merge: st = datetime.datetime.now() print('3. [start] merge_run ------------------------------') self.merge_run() et = datetime.datetime.now() print('3. [end] merge_run => ', et - st) st = datetime.datetime.now() print('4. [start] reform_run ------------------------------') self.reform_run() et = datetime.datetime.now() print('4. 
[end] reform_run => ', et - st) return self.clusterings def init_run(self): for i, text in enumerate(self.before_texts): convert_text = self.filtering(str_list=[text], noun=True) if i == 0 and len(self.clusterings) == 0: self.create_clustering(text, convert_text) else: self.ratio(text, convert_text) def find_zero_texts(self, noun=False): self.texts = [] self.new_clusterings = [] for clustering in self.clusterings: if len(clustering['texts']) < 2: text = clustering['texts'][0] #convert_text = self.filtering(str_list=[text], noun=noun) self.texts.append(text) else: self.new_clusterings.append(clustering) self.clusterings = self.new_clusterings def run_batch(self, noun=True): # 독립적으로 묶여있는 클러스터링은 한번 더 돌려서 확인 self.find_zero_texts(noun=noun) if len(self.texts) > 0: for text in self.texts: self.ratio(original=text, text=self.filtering(str_list=[text], noun=noun)) def merge_run(self): ##그룹간 매칭 new_small_c = [] new_big_c = [] for cluster in self.clusterings: if len(cluster['texts']) > 4: new_big_c.append(cluster) else: new_small_c.append(cluster) for sc in new_small_c: max_ratio = 0 max_bc_category = 0 for bc in new_big_c: this_ratio = fuzz.token_set_ratio(sc['totalText'], bc['totalText']) if max_ratio < this_ratio: max_bc_category = bc['category'] max_ratio = this_ratio if max_ratio > 77: for item in new_big_c: if item.get('category') == max_bc_category: item['texts'].extend(sc['texts']) temp_totalText = item['totalText'] + ' ' + sc[ 'totalText'] count = Counter( list(set(re.split(' ', temp_totalText)))) item['totalText'] = '' for n, c in count.most_common(self.texts_len): item['totalText'] = item['totalText'] + ' ' + n #[item['texts'].extend(sc['texts']) for item in new_big_c if item.get('category')==max_bc_category] else: new_big_c.append(sc) self.clusterings = new_big_c def reform_run(self): reform_ts = [] reform_cs = [] for cluster in self.clusterings: text_size = len(cluster['texts']) total_size = len(re.split(' ', cluster['totalText'])) if text_size == 2 and total_size >= 7: reform_ts.extend(cluster['texts']) else: reform_cs.append(cluster) self.clusterings = reform_cs for text in reform_ts: convert_text = self.filtering(str_list=[text], noun=False) self.ratio(text, convert_text) def ratio(self, original, text): max_category = 0 max_ratio = 0 random.shuffle(self.clusterings) for i, ob in enumerate(self.clusterings): this_ratio = fuzz.token_set_ratio(text, ob['totalText']) if max_ratio < this_ratio: max_ratio = this_ratio max_index = i if max_ratio > self.confidence: self.add_clustering(max_index, original, text) else: self.create_clustering(original, text) def create_clustering(self, original, text): tmp_totalTexts = list(set(re.split(' ', text))) text = ' '.join(list(set(tmp_totalTexts))) cluster = { "category": self.id, "texts": [original], "totalText": text, } self.clusterings.append(cluster) self.id = self.id + 1 def add_clustering(self, max_index, original, text): cluster = self.clusterings[max_index] cluster['texts'].append(original) cluster['totalText'] = cluster['totalText'] + ' ' + text tmp_totalTexts = list(set(re.split(' ', cluster['totalText']))) if len(tmp_totalTexts) < self.texts_len: cluster['totalText'] = ' '.join(tmp_totalTexts) else: count = Counter(tmp_totalTexts) cluster['totalText'] = "" for n, c in count.most_common(self.texts_len): cluster['totalText'] = cluster['totalText'] + ' ' + n def convert_clustering(self, clusterings=[]): self.clusterings = [] for c in clusterings: #print(c) tmp_total_str = "" totalText = "" tmp_total_original_list = [] for t in c['texts']: 
tmp_total_str = tmp_total_str + " " + self.filtering( str_list=[t], noun=False) tmp_total_original_list.append(t) tmp_total_list = re.split(' ', tmp_total_str) #print(tmp_total_list) if len(list(set(tmp_total_list))) > self.texts_len: count = Counter(tmp_total_list) for n, c in count.most_common(self.texts_len): totalText = totalText + " " + n else: totalText = " ".join(list(set(tmp_total_list))) self.id = self.id + 1 self.clusterings.append({ 'category': self.id, 'texts': tmp_total_original_list, 'totalText': totalText })
import numpy as np
import os
from konlpy.tag import Twitter
from keras.preprocessing.text import Tokenizer

Model_File = './model/model_news_v2.hdf5'
FILE_LIST = ['news0.txt', 'news1.txt', 'news2.txt', 'news3.txt']

text = []
result = []

for x in FILE_LIST:
    fp = open(x, 'r', encoding='utf-8')
    sentences = fp.readlines()
    for x in sentences:
        for y in x.split('. '):
            text.append(y.strip() + '.')
    fp.close()

twitter = Twitter()
for x in text:
    ret = twitter.pos(x, stem=True, norm=True)
    tmp = []
    for y, _ in ret:
        tmp.append(y)
    result.append(tmp)

result = np.array(result)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(result)
x_tmp = tokenizer.texts_to_sequences(result)

sen_len = 5
x_train = []
y_train = []
def __init__(self):
    self.twitter = Twitter()
    self.kkma = Kkma()
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np
import numpy.linalg as la
import argparse
import matplotlib.pyplot as plt
import sklearn.decomposition as decomposition
import sys

twitter = Twitter()


def normalize(array):
    norm = la.norm(array)
    return array / norm


def create_word_vector(word, pos_embeddings):
    pos_list = twitter.pos(word, norm=True)
    word_vector = np.sum(
        [pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list],
        axis=0)
    return normalize(word_vector)


def plot_with_labels(embeds, labels, filename="output.png"):
    plt.figure(figsize=(18, 18))
    pca = decomposition.PCA(n_components=2)
    pca.fit(embeds)
    Y = pca.transform(embeds)
    for i, label in enumerate(labels):
def analyzer( messages ) : # store senders in chat room sender_list = set() send_ratio = {} msg_bytes = {} sent_time = {} for i in range (0, 7) : sent_time[ i ] = {} for j in range(0,24) : sent_time[ i ][ j ] = 0 kcount = {} keywords = {} sent_month = "" temp_keywords = [] emoticons = 0 total = 0 last_sender = "" intimacy = {} twitter = Twitter() for msg in messages : sender_list.add(msg.sender) # to calculate intimacy between member if len(last_sender) == 0 : last_sender = msg.sender if last_sender != msg.sender : td_increment( intimacy, last_sender, msg.sender, 1) td_increment( intimacy, msg.sender, last_sender, 1) last_sender = msg.sender # check send ratio. td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1) # calculate msg bytes by sender td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents)) # count k in msg. increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8'))) # count emoticons if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents: emoticons = emoticons + 1 # calculate active time td_increment(sent_time, msg.datetime.weekday() , msg.datetime.time().hour, 1) # analyze keyword if ( is_msg_content(msg.contents) ) : if len(sent_month) == 0 : sent_month = str(msg.datetime)[:7] elif sent_month == str(msg.datetime)[:7] : temp_keywords.append(msg.contents) elif sent_month != str(msg.datetime)[:7] : keywords_list = twitter.nouns(msg.contents) for keyword in keywords_list : if len(keyword) > 1: td_increment(keywords, sent_month, keyword, 1) sent_month = str(msg.datetime)[:7] del temp_keywords[:] temp_keywords.append(msg.contents) # in case of 1:1 chat room if len(sender_list) == 2 : response_time = {} last_sender = "" last_response_time = timedelta(0) for sender in sender_list : response_time[sender] = [] for msg in messages : if len(last_sender) == 0 : last_sender = msg.sender if last_sender != msg.sender : last_sender = msg.sender response_time[msg.sender].append(msg.datetime - last_response_time) last_response_time = msg.datetime print "Who sent how much messages? " for date in send_ratio : print "in " + str(date) for sender in send_ratio[date] : print str(sender) + " sent " + str(send_ratio[date][sender]) + " messages" total = total + int(send_ratio[date][sender]) print "" print "Msg bytes : " for date in msg_bytes : print "in " + str(date) for sender in msg_bytes[date] : print str(sender) + " sent " + str(msg_bytes[date][sender]) + " bytes" print "" for sender in kcount : print sender + " wrote " + unicode('ㅋ','utf-8').encode('utf-8') + " " + str(kcount[sender]) + " byte times" print "" print "" # sorted keywords has 'list' type. not dict. print "Top 20 most frequently used keywords in your chatroom." for date in keywords : print "in " + date sorted_keywords = sorted(keywords[date].items(), key=lambda x:x[1], reverse = True) for i in range(0,20) : try : print sorted_keywords[i][0] + " : " + str(sorted_keywords[i][1]) except : pass print "" print "When is the most active moment in this chat room?" 
for week in sent_time : print week for hour in sorted(sent_time[week]): print str(sent_time[week][hour]) + " messages were sent at " + str(hour) + " o'clock" print "" print "you guys used emoticons " + str(emoticons) + " times" print "" print "intimacy between members" if len(sender_list) == 2 : for sender in response_time : print sender rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender]) print "responded in " + str(rt_average) + "in average" else : for member in intimacy : print member + " : " for friends in intimacy[member] : print " - " + friends + " " + str(intimacy[member][friends]) print "" print "totally, " + str(total) + " messages were sent"
from konlpy.tag import Twitter
t = Twitter()
from konlpy.corpus import kolaw
import nltk
import gensim
from gensim.models import LdaModel
from gensim import corpora, models
import MySQLdb
import xlwt

db = MySQLdb.connect(host="localhost", user="******", passwd="kkms1234",
                     db="scraping", charset='utf8')
cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("set names utf8")
db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")
cursor.execute("set names utf8")

sql = "select * from Text3 where ArticleNumber<=10000"
cursor.execute(sql.encode('utf8'))
rows = cursor.fetchall()

document = ''
import csv
from konlpy.tag import Twitter

reader = csv.reader(open("../sample/top_song.csv", 'r'))
writer = csv.writer(open("../sample/top_song_lemma.csv", 'w'))
twitter = Twitter()
lema = str()

for i in reader:
    s = twitter.pos(i[4], norm=True)
    x = [i[0] for i in s if i[1] in ['Noun', 'Verb', 'Adjective', 'Alpha'] and len(i[0]) > 1]
    print(i[4], "\n", x, "\n", " ".join(x), "\n")
    result = [seg for seg in i]
    result.append(" ".join(x))
    writer.writerow(result)
def __init__(self, logger):
    self.logger = logger
    self.twitter = Twitter()
""" Created on Wed Mar 9 00:35:54 2016 @author: chuckgu """ import json,os from nltk.tokenize import sent_tokenize,word_tokenize from konlpy.tag import Twitter import numpy as np import sys reload(sys) sys.setdefaultencoding('utf8') twitter=Twitter() txt=[] checklist=['Exclamation','Alpha','URL','Punctuation','Foreign','Unknown','Hashtag','ScreenName','Josa'] ''' currdir = os.getcwd() os.chdir('%s/' % currdir) print currdir with open("text8", 'r') as f: for line in f: sentences.append(line[:100]) print sentences
class NewsParser(object): news_link = "http://news.naver.com/main/read.nhn?oid=%s&aid=%s" jsonp_regex = re.compile(r'^\s*cb\s*\((.*)\)\s*;?\s*$', re.DOTALL) def __init__(self, logger): self.logger = logger self.twitter = Twitter() def parse(self, news_id_token): split = news_id_token.split(',') href = NewsParser.news_link % tuple(split) req = requests.get(href) soup = BeautifulSoup(req.text, 'html.parser') title_elem = soup.select_one('#articleTitle') content_elem = soup.select_one('#articleBodyContents') news_type = 'NEWS' if not title_elem: title_elem = soup.select_one("h2.end_tit") content_elem = soup.select_one( "#articeBody") # Not typo, it is really "artice" news_type = 'ENTERTAIN' if not title_elem or not content_elem: self.logger.info('[Crawl::News Info] %s has no title!' % news_id_token) return None for script in content_elem.findAll("script"): script.decompose() title = self.twitter.pos(title_elem.get_text(), norm=True, stem=True) content = self.twitter.pos(content_elem.get_text(), norm=True, stem=True) api_req = requests.get( "http://news.like.naver.com/v1/search/contents", params={"q": "%s[ne_%s_%s]" % (news_type, split[0], split[1])}, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" + "Chrome/64.0.3282.167 Safari/537.36", "Referer": href # Needed header }) api_resp = json.loads(api_req.text) if ('contents' not in api_resp) or \ (len(api_resp['contents']) < 1) or \ ('reactions' not in api_resp['contents'][0]): self.logger.info('[CrawlAppend::News Info] %s has no reactions!' % news_id_token) return None reactions = api_resp['contents'][0]['reactions'] reactions_parsed = {} for reaction in reactions: reactions_parsed[reaction['reactionType']] = reaction['count'] api_req = requests.get( "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json", params={ "_callback": "cb", "objectId": "news" + news_id_token, "pool": "cbox5", "ticket": "news", "lang": "ko", "initialize": "true", "pageSize": "1" # Reduce packet size, pageSize will be ignored if it is less than one. }, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" + "Chrome/64.0.3282.167 Safari/537.36", "Referer": href # Needed header }) api_resp = json.loads( NewsParser.jsonp_regex.match(api_req.text).group(1)) if ('result' not in api_resp) or\ ('graph' not in api_resp['result']) or\ ('count' not in api_resp['result']) or\ ('gender' not in api_resp['result']['graph']) or\ ('old' not in api_resp['result']['graph']): self.logger.info('[Crawl::News Info] %s has no graphs!' % news_id_token) return None gender_graph = api_resp['result']['graph']['gender'] age_graph = api_resp['result']['graph']['old'] age_parsed = {} comments = api_resp['result']['count']['total'] for age in age_graph: age_parsed[age['age']] = age['value'] return { 'title': title, 'content': content, 'age': age_parsed, 'gender': gender_graph, 'comment': comments, 'reaction': reactions_parsed }
import csv
import operator
from pykospacing import spacing
from konlpy.tag import Twitter


def findfeq(worddict, data):
    pass


twitter = Twitter()
f = open('practice/train.csv', 'r', encoding='utf-8')
rdr = csv.reader(f)
i = 0
worddict = {}
wordlist = []
wordfreq = []

# str = "히라마블로그에온걸^^환영해1993!"
# print(spacing(str))

for line in rdr:
    if i != 0:
        message = list(line)
        sms = message[2]
        sentencelist = list(sms.split('.'))
        for sentence in sentencelist:
            sentence = sentence.replace(' ', '')
            sentence = spacing(sentence)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from konlpy.corpus import kobill
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]

from konlpy.tag import Twitter; t = Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d, stem=True, norm=True)]
texts_ko = [pos(doc) for doc in docs_ko]

# encode tokens to integers
from gensim import corpora
dictionary_ko = corpora.Dictionary(texts_ko)
dictionary_ko.save('ko.dict')  # save dictionary to file for future use

# calculate TF-IDF
from gensim import models
tf_ko = [dictionary_ko.doc2bow(text) for text in texts_ko]
tfidf_model_ko = models.TfidfModel(tf_ko)
tfidf_ko = tfidf_model_ko[tf_ko]
corpora.MmCorpus.serialize('ko.mm', tfidf_ko)  # save corpus to file for future use

# train topic model
# LSI
ntopics, nwords = 3, 5
lsi_ko = models.lsimodel.LsiModel(tfidf_ko, id2word=dictionary_ko, num_topics=ntopics)
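The ntopics/nwords pair above is presumably meant for inspecting the trained topics. A minimal sketch, assuming gensim's standard LsiModel API, that prints them:

# Minimal sketch: inspect the trained LSI topics.
for topic in lsi_ko.print_topics(num_topics=ntopics, num_words=nwords):
    print(topic)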
query = {"$and": [ {"lyrics": {"$exists": True}}, {"count": {"$gte": 20}} ]} proj = {"lyrics": True, "_id": False} cursor = target.find(query, proj) lyrics_set = [] for doc in cursor: # print(doc["lyrics"]) lyrics_set.append(str(doc["lyrics"])) print("===== complete data import =====") ## tokenization and tagging using konlpy from konlpy.tag import Twitter tw = Twitter() ## stopword elimination(보류) """ 패키지 수정해야될지, 진짜 필요한 부분인지, 대체할 수 있는 방법 없는지 논의 from many_stop_words import get_stop_words stopwords = list(get_stop_words('kr')) for sw in stopwords: print(sw) """ ## create English stop words list from stop_words import get_stop_words en_stop = get_stop_words('en') ## Create p_stemmer of class PorterStemmer from nltk.stem.porter import PorterStemmer
def __init__(self):
    self.ko_twitter = Twitter()
This program was written for research on document keyword extraction.
"""
from matplotlib import font_manager, rc
font_fname = '/Library/Fonts/AppleGothic.ttf'  # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)

import nltk
from konlpy.tag import Twitter
import numpy

tw = Twitter()  # use the Twitter morphological analyzer

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# top n words (nouns) to find
_toFind_ = 30

# read the document
doc_ko = open('./k_tex2.txt').read()
# print(doc_ko)

# extract nouns only
token_ko = tw.nouns(doc_ko)
import json, pymongo, requests, os, sys
import time, dateutil.parser, codecs
from konlpy.tag import Twitter; t = Twitter()

pos = lambda d: ['/'.join(p) for p in t.pos(d)
                 if p[1] in ['Noun', 'Adjective', 'Determiner', 'Adverb', 'KoreanParticle']]

conn = pymongo.MongoClient("mongodb://localhost")
db = conn.watcha
cnames = db.collection_names()
collections = dict()
for cname in cnames:
    collections[cname] = eval('db.' + cname)
del collections['reviews']
del collections['system.indexes']

cursor = collections['comedy'].find()
length = collections['comedy'].count()
cnt = 0
with codecs.open('D:\watcha_reviews\comedy.txt', 'w', encoding='utf-8') as fp:
    for row in cursor:
        cnt += 1
        if cnt % 1000 == 0:
            print str(cnt) + ' / ' + str(length)
        rating = row['rating']
        cid = row['comment_id']
        text = row['text']
        fp.write(' '.join([str(rating), str(cid)] + pos(text)) + '\n')
# LDA modeling packages
import gensim
from gensim import corpora, models

# Korean language processing
from konlpy.tag import Twitter
twitter = Twitter()

import operator

documents = []

# filter for duplicate tokens
filterDocuments = []
for i in documents:
    # tokenize the sentence
    tokens = twitter.pos(i, norm=True, stem=True)
    # without restricting to nouns, far-off categories show up in the topics
    stem_tokens = [split[0] for split in tokens if split[1] == "Noun"]
    filterDocuments.append(stem_tokens)

# build the Dictionary
dictinory = corpora.Dictionary(filterDocuments)

# convert the token lists built from the dictionary into bag-of-words vectors
corpus = [dictinory.doc2bow(text) for text in filterDocuments]
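The snippet above stops after building the dictionary and corpus. A minimal sketch of the LDA training step it is presumably leading up to, using gensim's LdaModel; num_topics and passes are assumed values, not from the original:

# Minimal sketch: train and inspect an LDA model on the corpus built above.
lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictinory, passes=10)
for topic in lda.print_topics(num_topics=10, num_words=5):
    print(topic)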
from collections import Counter import os from konlpy.tag import Twitter tw = Twitter() region = [ "서울", "부산", "기장", "대구", "달성", "인천", "강화", "옹진", "광주", "대전", "울산", "울주", "세종", "수원", "성남", "안양", "안산", "용인", "광명", "평택", "과천", "오산", "시흥", "군포", "의왕", "하남", "이천", "안성", "김포", "화성", "경기도_광주", "여주", "부천", "양평", "고양", "의정부", "동두천", "구리", "남양주", "파주", "양주", "포천", "연천", "가평", "춘천", "원주", "강릉", "동해", "태백", "속초", "삼척", "홍천", "횡성", "영월", "평창", "정선", "철원", "화천", "양구", "인제", "고성", "양양", "청주", "충주", "제천", "보은", "옥천", "영동", "진천", "괴산", "음성", "단양", "증평", "천안", "공주", "보령", "아산", "서산", "논산", "계룡", "당진", "금산", "부여", "서천", "청양", "홍성", "예산", "태안", '전주', '군산', '익산', '정읍', '남원', '김제', '완주', '진안', '무주', '장수', '임실', '순창', '고창', '부안', '목포', '여수', '순천', '나주', '광양', '담양', '곡성', '구례', '고흥', '보성', '화순', '장흥', '강진', '해남', '영암', '무안', '함평', '영광', '장성', '완도', '진도', '신안', '창원', '진주', '통영', '사천', '김해', '밀양', '거제', '양산', '의령', '함안', '창녕', '고성', '남해', '하동', '산청', '함양', '거창', '합천', '포항', '경주', '김천', '안동', '구미', '영주', '영천', '상주', '문경', '경산', '군위', '의성', '청송', '영양', '영덕', '청도', '고령', '성주', '칠곡', '예천', '봉화', '울진', '울릉', '제주' ] water_issues = [ "수력", "하수", "용수", "하천", "댐", "강우", "저수", "호우", "빗물", "상수", "조류", "녹조", "수질", "풍수", "누수", "유수", "강수", "정수", "취수", "수돗물", "배수", "오염", "홍수", "가뭄" ] def get_tags(text, issues): spliter = Twitter() nouns = spliter.nouns(text) count = Counter(nouns)
def __init__(self, filename=None, nth=-1):
    self.filename = filename
    self.nth = nth
    self.twitter = Twitter()
    self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log')
OOV = "<OOV>" # 없는 단어(Out of Vocabulary) # 태그 인덱스 PAD_INDEX = 0 STA_INDEX = 1 END_INDEX = 2 OOV_INDEX = 3 # 데이터 타입 ENCODER_INPUT = 0 DECODER_INPUT = 1 DECODER_TARGET = 2 max_sequences = 30 RE_FILTER = re.compile("[.,!?\"':;~()]") tagger = Twitter() app = Flask(__name__) # 모델 / 단어 가져오기 project_path = os.path.dirname(os.path.abspath(__file__)) encoder_model = load_model(project_path + "/data/encoder_model.h5") decoder_model = load_model(project_path + "/data/decoder_model.h5") with open(project_path + "/data/words.pickle", 'rb') as f: words = pickle.load(f) words[:0] = [PAD, STA, END, OOV] # 단어와 인덱스의 딕셔너리 생성 word_to_index = {word: index for index, word in enumerate(words)} index_to_word = {index: word for index, word in enumerate(words)}
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
import pymysql
import instagram
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

twitter = Twitter()

# connection to mysql(local)
conn = pymysql.connect(host='aws-server-name', port=3306, user='******', passwd='password',
                       db='ematewha', charset='utf8', use_unicode=True)
cur = conn.cursor()

# connection to instagram_api
client_id = 'id'
client_secret = 'd790b3e4268c4b4bb91492af520647fd'
access_token = 'given-token'
client_ip = 'ip'
api = instagram.InstagramAPI(client_id=client_id, client_secret=client_secret,
                             client_ips=client_ip, access_token=access_token)

# bring upso_refine.name from mysql
cur.execute('SELECT upso_id from upso_refined')
upso_id = cur.fetchall()
cur.execute('SELECT name_refined from upso_refined')
upso_refine = cur.fetchall()

# search with upso_refine from instagram
def count_wordfreq(data):
    twitter = Twitter()
    nouns = twitter.nouns(data)
    count = Counter(nouns)

    return count
def main(): # def job(): conn = pymysql.connect(host='192.168.0.61', user='******', password='******', db='one_db', charset='utf8mb4') cursor = conn.cursor() sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s' cursor.execute(sql, 'N') original = cursor.fetchall() print('original data') print(original) # 신조어 필터링 sql = 'SELECT word FROM tb_newdic' cursor.execute(sql) newdic = cursor.fetchall() # print('신조어 사전') # print(newdic) # 예외사전 데이터 가져오기 sql = 'SELECT word FROM tb_excdic' cursor.execute(sql) excdic = cursor.fetchall() print('예외 사전') print(excdic) originalList = [] for data in original: dataList = list(data) for word in newdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (dataList[1], word[0])) count = cursor.fetchone() if count[0] != 0: print(dataList[1], '에서', word[0], '은(는) 신조어 사전에 존재하는 단어입니다.') dataList[1] = dataList[1].replace(word[0], '') sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)' cursor.execute(sql, (dataList[0], word[0], dataList[2])) conn.commit() for word in excdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (dataList[1], word[0])) count = cursor.fetchone() if count[0] != 0: print(dataList[1], '에서', word[0], '은(는) 예외 사전에 존재하는 단어입니다.') dataList[1] = dataList[1].replace(word[0], '') originalList.append(dataList) original = originalList # 트위터로 분석 from konlpy.tag import Twitter twitter = Twitter() tresult = [] for data in original: tresult.append([data[0], twitter.nouns(data[1]), data[2]]) print(twitter.pos(data[1])) # 트위터 분석 결과 확인 print('twitter result') print(tresult) # 코모란으로 분석 from konlpy.tag import Komoran komoran = Komoran() kresult = [] for data in tresult: words = data[1] # 문제 없이 분석과 처리과 완료되었는지 체크용, 완료 성공 시 True, 실패 시 False state = True for word in words: try: type = komoran.pos(word)[0][1] if type == 'NNG' or type == 'NNP': kresult.append([data[0], komoran.morphs(word)[0]]) # 예외 사전에 존재 유무 체크용, True가 있는경우, False가 없는경우 exist = False # 예외 사전에 있는 단어는 INSERT 전에 필터링 for exc in excdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (word, exc[0])) count = cursor.fetchone() if count[0] != 0: print(word + '은(는) 사전의 ' + exc[0] + '와(과) 일치') exist = True break if exist: continue # NNG, NNP 타입만 DB에 INSERT # 예외 발생 시 rollback, 아닌 경우 commit으로 처리 sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)' try: if len(komoran.morphs(word)[0]) != 1: cursor.execute( sql, (data[0], komoran.morphs(word)[0], data[2])) except Exception as err: state = False print('ERROR : komoran result의 ' + str(data[0]) + '번 글의 에서 insert 처리 중 오류 발생') print(str(err)) conn.rollback() else: conn.commit() except Exception as err: state = False print('ERROR : komoran 키워드 분석 중 오류 발생') continue ssql = 'UPDATE test_original SET state = %s WHERE ono = %s' state = 'Y' if state == True else 'E' cursor.execute(ssql, (state, data[0])) conn.commit() # 코모란 분석 결과 확인 print('komoran result') print(kresult) print('-----') print('끝') # schedule.every().day.at("").do(job) # # while 1: # schedule.run_pending() # time.sleep(1)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.corpus import kolaw
from konlpy.tag import Twitter

# 1. open the comment file crawled in the previous post (read-only)
file = open('./test4.txt', 'r', encoding='utf-8')
lines = file.readlines()

# 2. store all the comments again in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

twitter = Twitter()

# 4. tag the morphemes of each sentence
sentences_tag = []
for sentence in okja:
    morph = twitter.pos(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. keep only the nouns and adjectives in a list
docs = []
# -*- coding:utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import json
import operator
import urllib2
from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Twitter

twitter = Twitter()

#f = open("seoul_data2.txt", "r")
#f = open("polatics.txt", "r")

# current 500 articles
f = urllib2.urlopen("http://polatics.news/all").read().split('\n')
f.reverse()
f = f[0:400]

for i in f:
    print i

print "line : %d" % (len(f))

f2 = open("polatics_out.txt", "w")
voca = {}
# -*- coding:utf-8 -*-
"""
Title:   Parts-of-speech tagging using KoNLPy
Purpose: To compare the POS tagging results of KoNLPy's taggers
"""
from konlpy.tag import Kkma
from konlpy.tag import Okt
from konlpy.tag import Hannanum
from konlpy.tag import Komoran
from konlpy.tag import Twitter

if __name__ == '__main__':
    kkma = Kkma()
    okt = Okt()
    komoran = Komoran()
    hannanum = Hannanum()
    twitter = Twitter()

    # Only Kkma can split sentences
    print("kkma 문장 분리 : ", kkma.sentences("네 안녕하세요 반갑습니다."))

    # Comparison of the KoNLPy taggers' part-of-speech output
    print("okt 형태소 분석 : ", okt.pos(u"집에 가면 감자 좀 쪄줄래?"))  # --> OK
    print("kkma 형태소 분석 : ", kkma.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("hannanum 형태소 분석 : ", hannanum.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("komoran 형태소 분석 : ", komoran.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("twitter 형태소 분석 : ", twitter.pos(u"집에 가면 감자 좀 쪄줄래?"))  # --> OK
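In recent KoNLPy releases the Twitter class is kept only as a deprecated alias of Okt, so the last two calls above should return the same tags. A minimal sketch that sticks to Okt (already imported above) and also shows its normalization and stemming options:

# Minimal sketch: prefer Okt over the deprecated Twitter alias in newer KoNLPy.
from konlpy.tag import Okt

okt = Okt()
print(okt.pos(u"집에 가면 감자 좀 쪄줄래?", norm=True, stem=True))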
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter

# open the file with utf-16 encoding and extract the text --- (※1)
fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
soup = BeautifulSoup(fp, "html.parser")
body = soup.select_one("body > text")
text = body.getText()

# process the text one line at a time --- (※2)
twitter = Twitter()
word_dic = {}
lines = text.split("\n")
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == "Noun":  # check for nouns --- (※3)
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1  # count the occurrence

# print the most frequently used nouns --- (※4)
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:
    print("{0}({1}) ".format(word, count), end="")
print()
import pickle
from konlpy.tag import Kkma
from konlpy.tag import Twitter
import jpype
import os

twitter = Twitter()
f = open('pickle.pickle', 'rb')
data = pickle.load(f)

#wordbag = []
doc_list = []
termdoc = {}

for datum in data:
    doc_list.append(datum['no'])

#data = None
#gc.collect()

for datum in data:
    doc_id = datum['no']
    lec_no = datum['lec_no']
    pos = twitter.pos(datum['eval_content'], stem=True)
    for p in pos:
        tag = p[1]
        if tag in ('Exclamation', 'Josa', 'Eomi', 'Suffix', 'Punctuation', 'Foreign',
                   'Alpha', 'Unknown', 'KoreanParticle', 'Hashtag', 'ScreenName'):
            continue
        if p[0] not in termdoc:
            termdoc[p[0]] = dict.fromkeys(doc_list, 0)
        termdoc[p[0]][doc_id] += 1