def korean_morph(text):
    twitter = Twitter()
    # morphs() accepts the string directly; the Python 2 str(unicode(...)) round-trip is unnecessary
    s = twitter.morphs(str(text))
    s = ' '.join(s)
    return s
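A quick usage sketch for the helper above (hypothetical call, assuming konlpy is installed and `Twitter` has been imported from `konlpy.tag`):

# Hypothetical usage; the exact morpheme split depends on the konlpy/Twitter dictionary version.
print(korean_morph(u'아버지가 방에 들어가신다'))
# e.g. '아버지 가 방 에 들어가신다'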
Example #2
def pos_tagging(text):
    available_terms_list = []

    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)

    for item in pos_list:
        if item[1] in ('Verb', 'Adjective'):
            available_terms_list.append(item)

    return available_terms_list
Example #3
    def _twitter_parse(self, str_arr, tag_combine=True):
        """

        :param h5file:
        :return:
        """
        twitter = Twitter(jvmpath=None)
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(twitter.pos(str(data)), tag_combine=tag_combine)
        return return_arr
Example #4
def pos_tagging_noun(text):
    noun_terms_list = []

    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)

    for item in pos_list:
        if item[1] == 'Noun':
            noun_terms_list.append(item)

    return noun_terms_list
Example #5
def create_wordbag(x):
	wordbag = []
	if x['eval_content'] is None:
		return wordbag	
	twitter = Twitter()
	for text in twitter.pos(x['eval_content'], stem = True):
		tag = text[1]
		if tag in unneeded:
			continue

		word = text[0]
		wordbag.append(word)
	return wordbag
Example #6
class AnalysisDiction:
    """
    This class is for analysis of korean texts using kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_kkma: kkma instance
        :param on_twitter: twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It acts differently depends on its mode.
        :param string_data: String data for analysis
        :param mode: Analyze string data depending on its mode
        :return: Return its results. If have no mode in param , return false
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode is 'morphs':
            return self.kkma.morphs(string_data)
        elif mode is 'nouns':
            return self.kkma.nouns(string_data)
        elif mode is 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depends on its mode.
        :param string_data: String data for analysis
        :param mode: Analyze string data depending on its mode
        :return: Return its results. If have no mode in param , return false
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode is 'morphs':
            return self.twitter.morphs(string_data)
        elif mode is 'nouns':
            return self.twitter.nouns(string_data)
        elif mode is 'pos':
            return self.twitter.pos(string_data)
        elif mode is 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
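A short usage sketch for AnalysisDiction (hypothetical; assumes Kkma and Twitter are importable from konlpy.tag and the class above is in scope):

# Hypothetical usage; results depend on the dictionaries bundled with konlpy.
analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
print(analyzer.analyzer_twitter(u'한국어 형태소 분석은 재미있다', 'pos'))
print(analyzer.analyzer_kkma(u'한국어 형태소 분석은 재미있다', 'nouns'))
print(analyzer.analyzer_twitter(u'한국어 형태소 분석은 재미있다', 'unknown'))  # returns False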
Example #7
def main():
    """
        Notes on using konlpy:
        Java must be installed and configured.
        JAVA_HOME must be set, e.g.:
        export JAVA_HOME=$(/usr/libexec/java_home)
    """
    konl = Twitter()
    file_path = '/Users/bongster/Downloads/20160528_jiana.csv'
    with open(file_path, 'rb') as csv_file:
        inforeader = csv.reader(csv_file)
        for row in inforeader:
            r = konl.pos(unicode(row[4], 'utf-8'), norm=True, stem=True)
            print '=' * 20
            for txt, post in r:
                print txt, post
            print '=' * 20
Example #8
	def get_noun(self):
		print("[*] 명사 추출 시작")
		start_time = time.time()
		twitter = Twitter()
		for s in self.word_list:
			temp = twitter.nouns(s)
			for t in temp:
				self.noun_list.append(str(t))

		end_time = time.time()
		print("[*] 명사 추출 완료(소요시간 : {0})".format(str((end_time-start_time))))
		print("[*] 추출된 명사 길이 : {0}".format(str(len(self.noun_list))))

		# frequency analysis
		count = Counter(self.noun_list)
		#tag = count.most_common( int(len(count)*(15/100)) )
		tag = count.most_common(50)
		taglist = pytagcloud.make_tags(tag, maxsize=100)
		pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(800, 600), fontname='Nanum Gothic Coding', rectangular=False)
Example #9
 def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
     """
     Allocate Kkma and/or Twitter dictionary instances
     :param on_kkma: if True, create a Kkma instance
     :param on_twitter: if True, create a Twitter instance
     """
     if on_kkma is True:
         self.kkma = Kkma()
     if on_twitter is True:
         self.twitter = Twitter()
Example #10
class KorDisintegrator:
    def __init__(self):
        self.ko_twitter = Twitter()

    def convert2simple(self, sentence="", norm=True, stem=True):
        disintegrated_sentence = self.ko_twitter.pos(sentence, norm=norm, stem=stem)
        convert_sentence = []

        for w, t in disintegrated_sentence:
            if t not in ["Eomi", "Josa", "KoreanParticle", "Punctuation"]:
                convert_sentence.append(w)
        return " ".join(convert_sentence)
Example #11
    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):    # maybe move to init of analysis_app

        """
        Allocate Hannanum, Twitter and/or Mecab dictionary instances
        :param on_han: if True, create a Hannanum instance
        :param on_twitter: if True, create a Twitter instance
        :param on_mecab: if True, create a Mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
Example #12
def main(_):


    is_train = True  # if False then test
    
    if is_train :
        train()
              
    else:
        checklist=['Exclamation','Alpha','URL']
        twitter=Twitter()
        
        dic_file_x='data/xproject_class.dict.pkl'
        worddict_x = dict()

        worddict_x = load_dict(dic_file_x)   
        
        x, x_mask,prediction=build_test('/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')
        
        while 1:
            choice=raw_input("Me: ")
            if choice in ["Q","q"]: break
            #print choice
            
            choice=choice.decode('utf-8')
            
            sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True)  if s[1] not in checklist])
            
    
            words=(word_tokenize(sen.strip().lower()))
            #print ' '.join(words)
            seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
            seqs = [s if s<600 else 1 for s in seqs]
            seqs=[seqs]
            res=test_data(seqs,x, x_mask,prediction,'/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')
            
            #print res
            print "class: "+str(res)
Example #13
class Parser(object):
    def __init__(self, filename=None, nth=-1):
        self.filename = filename
        self.nth = nth
        self.twitter = Twitter()
        self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log')

    def parse_sentence(self, sentence):
        return self.twitter.pos(sentence)

    def parse_all_generator(self, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        if nth is None:
            nth = self.nth
        for row in ParseUtils.iter_csv(filename):
            try:
                parsed = self.parse_sentence(row[nth])
                concated = ' '.join([ParseUtils.concat_tuple(x) for x in parsed])
                row[nth] = concated
            except BaseException as e:
                msg = '{error:<80}  |  {data}'.format(error=str(e), data=ParseUtils.list_to_csv(row))
                self.logger.error(msg)
                continue
            yield row

    def extract_parsed(self, out_filename, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        filelength = ParseUtils.file_length(filename)
        if nth is None:
            nth = self.nth
        with open(out_filename, 'w') as f:
            csv_writer = csv.writer(f)
            for row in Progress(self.parse_all_generator(filename, nth), filelength, 10):
                csv_writer.writerow(row)
Example #14
import json, pymongo, requests, sys
import time, dateutil.parser
import gensim, logging, os
from konlpy.tag import Twitter; t = Twitter()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename='review' + '.log')
pos = lambda d: ['/'.join(p) for p in t.pos(d)]

# class LSGenerator(object):
# 	def __init__(self, collname):
# 		self.conn = pymongo.MongoClient("mongodb://localhost")
# 		self.db = self.conn.watcha
# 		self.cnames = self.db.collection_names()
# 		self.collections = dict()
# 		self.collname = collname

# 		for cname in self.cnames:
# 			self.collections[cname] = eval('self.db.' + cname)

# 		del self.collections['reviews']
# 		del self.collections['system.indexes']

# 	def __iter__(self):
# 		for row in self.collections[self.collname].find():
# 			rating = row['rating']
# 			cid = row['comment_id']
# 			text = row['text']
# 			pos_text = pos(text)
# 			tags = [str(rating) + '_' + str(cid) + '_' + self.collname]
# 			yield gensim.models.doc2vec.TaggedDocument(words = pos_text, tags = tags)

class LSGenerator(object):
Example #15
# -*- coding: utf-8 -*-
import numpy as np
import sys
import codecs

from konlpy.tag import Twitter
from konlpy.tag import Kkma
konlpy_twitter = Twitter()
kkma = Kkma()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


def select_test_data(sample_labels, sample_text, i):
	chunksize = len(sample_text) // 5
	start = chunksize * i
	if i == 4:
		end = len(sample_text)
	else:
		end = start + chunksize

	test_labels = sample_labels[start:end]
	test_text = sample_text[start:end]
	train_labels = sample_labels[:start] + sample_labels[end:]
	train_text = sample_text[:start] + sample_text[end:]

	return (test_labels, test_text, train_labels, train_text)
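The training loop that would normally follow select_test_data() is cut off here; below is a minimal sketch (assumed, not part of the original) of a 5-fold loop using the CountVectorizer, MultinomialNB and metrics imports above. sample_labels/sample_text stand for whatever labelled Korean sentences the script loads.

# Assumed 5-fold cross-validation loop built on select_test_data().
def cross_validate(sample_labels, sample_text):
	scores = []
	for i in range(5):
		test_labels, test_text, train_labels, train_text = \
			select_test_data(sample_labels, sample_text, i)
		# tokenize Korean text with the Twitter tagger created above
		vectorizer = CountVectorizer(tokenizer=konlpy_twitter.morphs)
		x_train = vectorizer.fit_transform(train_text)
		x_test = vectorizer.transform(test_text)
		clf = MultinomialNB().fit(x_train, train_labels)
		scores.append(metrics.accuracy_score(test_labels, clf.predict(x_test)))
	return sum(scores) / len(scores)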

Example #16
#! /usr/bin/python
# -*- coding: utf-8 -*-

from konlpy.corpus import kobill
from konlpy.tag import Twitter; t = Twitter()
from matplotlib import pyplot as plt

pos = lambda x: ['/'.join(p) for p in t.pos(x)]
docs = [kobill.open(i).read() for i in kobill.fileids()]

# get global unique token counts
global_unique = []
global_unique_cnt = []
for doc in docs:
    tokens = pos(doc)
    unique = set(tokens)
    global_unique += list(unique)
    global_unique = list(set(global_unique))
    global_unique_cnt.append(len(global_unique))
    print(len(unique), len(global_unique))

# draw heap
plt.plot(global_unique_cnt)
plt.savefig('heap.png')
Example #17
keyword_before = 'C:/doc2vec/result/keyword_before/'
full_dir = 'C:/doc2vec/word_result/total_test/'
keyword_after = 'C:/doc2vec/result/keyword_after/'

txt_list = os.listdir(path_dir)  # store the files in the folder as a list
txt_list2 = os.listdir(path_dir2)
txt_list3 = os.listdir(path_dir3)
txt_list4 = os.listdir(path_dir4)
txt_list5 = os.listdir(path_dir5)
txt_list6 = os.listdir(path_dir6)

txt_pre = os.listdir(path_pre)
str1 = ''
total_word = []
total_word2 = []
t = Twitter()
# dir=path_dir
sys.stdout.flush()

for change in txt_pre:
    total = []  # combined document

    remover = change.replace(".txt", "-")
    original = path_pre + change
    etc_pdf = pdf_dir + remover + '0.pdf'
    etc_hwp = hwp_dir + remover + '0.hwp'
    jpg_name = change.replace("txt", "jpg")
    jpg_file = jpg_dir + jpg_name
    png_name = change.replace("txt", "png")
    png_file = jpg_dir + png_name
    pdf_name = change.replace("txt", "pdf")
Example #18
    plt.close('all')
    fig = plt.figure()
    ax3 = plt.subplot(111)   
    plt.plot(model.errors)
    plt.grid()
    ax3.set_title('Training error')    
    plt.savefig('error.png')
    
    
elif mode=='te':
    if os.path.isfile(filepath): model.load(filepath)
    else: 
        raise IOError('loading error...')
    
    checklist=['Exclamation','Alpha','URL']
    twitter=Twitter()

    
    while 1:
        choice=raw_input("Me: ")
        if choice in ["Q","q"]: break
        #print choice
        
        choice=choice.decode('utf-8')
        
        sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True)  if s[1] not in checklist])
        

        words=(word_tokenize(sen.strip().lower()))
        #print ' '.join(words)
        seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
Example #19
import collections
from konlpy.tag import Twitter
from konlpy.utils import pprint

a1 = collections.Counter(['1', '2', '3', '4', '3', '3', '1', 3])
a2 = collections.Counter(['1', '1', '3', '2', '3'])

a3 = a1 | a2

print(a3)
print(a3['5'])
twt = Twitter()

word = ('알리바바와 40인의 도굴꾼 지하에서 웃는다 독도는 우리땅 우리땅 땅요 땅요 내 땅이요 조선 땅도 내땅이요')

tes = twt.pos(word)
pprint(tes)

offsets = set(i for i in range(len(word)) if word.startswith('땅', i))

print(offsets)
print(word[3])
"""
Points to note

1. Morphological analyzers are not perfect.
2. Unlike ordinary writing, news articles largely follow the rules: spacing errors and typos are rare, so the grammatical cues are almost never ambiguous.
3. When news articles do depart from ordinary grammar, it is usually through an excessive run of nouns.
4. The Twitter analyzer has no pronoun resolver, so when necessary you have to resolve pronouns yourself from context.
5. Besides pronouns, news articles also make heavy use of abbreviations (e.g. 문재인 대통령 >> 문 대통령).
6. Morphological analyzers tend not to trust the given spacing. Supply the spacing information yourself.
Example #20
import pickle
from os import listdir
import nltk
import gensim

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from konlpy.tag import Twitter

lm = WordNetLemmatizer()
twit = Twitter()
import logging
import urllib, json
from urllib.request import urlopen
from elasticsearch import Elasticsearch

default = "http://192.168.101.183:9200/wrapsody/document/"


def createJson(url):
    with urllib.request.urlopen(url) as url2:
        data = json.loads(url2.read().decode('utf-8'))
        return data


def saveIdList(fname):
    dic = createJson(default +
                     "_search?size=10000&_source=false&sort=_id:desc")
    idList = []
    for e in dic["hits"]["hits"]:
Example #21
 def tokenize(self, doc):
     pos_tagger = Twitter()
     return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
Example #22
import tweepy
from func_lib import *
from konlpy.tag import Twitter; t = Twitter()
from function import *
import time

# crawling from twitter

def twitterC(ID,flist,site_type):


    # authorization level
    API_KEY = 'NjHC0Y2Iql94ivUB78lC60Bpm'
    API_SECRET = 'mR4R132jKjuUga5GN0RyVngVk80I23daJhR21n1RstQDQvZNG6'
    ACCESS_KEY = '794856483007533057-n4iG19CHb8KNvxIkmSp1ahg7mPhe0Bq'
    ACCESS_SECRET = '2Pp0aZBou8DQ2h6y0ptim6Zo1F3HlLmOAxhG5sVuN2EY2'

    oAuth = tweepy.OAuthHandler(API_KEY, API_SECRET)
    oAuth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
    api = tweepy.API(auth_handler=oAuth, api_root='/1.1')

    # search timeline through ID keyword
    userID = ID
    user = api.get_user(userID)
    timeline = api.user_timeline(userID)

    total_update(site_type)

    # GET USER'S TIMELINE TWEETS
    for tweet in timeline:
        try:
Example #23
loc = os.listdir(dir)

content = []
nouns = []

# Listing Texts
for maindir in loc:
    subdir = os.listdir(dir + '/' + maindir)
    file_list = []

    for file in subdir:
        file_list.append(open(dir + '/' + maindir + '/' + file, "r").read())

    content.append(file_list)

nlp = Twitter()

# Separating Words
for i in content:
    list_wrap = []
    for j in i:
        list_wrap.append(nlp.nouns(j))

    nouns.append(list_wrap)

words = ''

for c in content[0]:
    words = words + ' ' + c

nouns2 = nlp.nouns(words)
Example #24
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, constraints
from keras import backend as K

# ---------------Data Load Starts------------------

corpus_fname = './dataset/train_data.txt'

from konlpy.tag import Twitter
mecab = Twitter()


def get_normalized_data(sentence):
    # original_sentence = mecab.pos(sentence, norm=True, stem=True)
    original_sentence = mecab.pos(sentence)
    inputData = []
    for w, t in original_sentence:
        # if t in ['Number']:
        #     w = '0'
        if t not in ['Number', 'Punctuation', 'KoreanParticle']:
            inputData.append(w)
    return (' '.join(inputData)).strip()
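A quick hedged check of get_normalized_data (hypothetical call, kept as a comment; Number, Punctuation and KoreanParticle tokens are the ones filtered out):

# Hypothetical usage; the surviving tokens depend on the Twitter tag set.
# print(get_normalized_data('3개의 문장을 테스트합니다!! ㅋㅋ'))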


def get_text(fname):
Example #25
 def __init__(self):
     self.t = Twitter()
Example #26
def upload(request):
	if request.method == 'POST':
		if 'file' in request.FILES:
			myUid = str(uuid.uuid4())

			dataChatroom = Chatroom(
				uid = myUid
			)
			dataChatroom.save()

			data = Chatroom.objects.get(uid=myUid) 
			chatroom_id = data.id

			file = request.FILES['file']
			filename = myUid		
			
			fp = open('%s/%s' % ("data", filename) , 'wb')
			for chunk in file.chunks():
				fp.write(chunk)
			fp.close()
			log_file = open('%s/%s' % ("data", filename) , 'r')
						
			messages = normalize( log_file )
			log_file.close()
			
			# delete the uploaded file
			os.remove('%s/%s' % ("data", filename))

			sender_list = set()
			send_ratio = {}
			msg_bytes = {}
			sent_time = {}
			for i in range (0, 7) :
				sent_time[ i ] = {}
				for j in range(0,24) :
					sent_time[ i ][ j ] = 0	
			kcount = {}
			hcount = {}
			ucount = {}
			keywords = {}
			keywords_all = {}
			sent_month = ""
			temp_keywords = ""
			emoticons = 0
			total = 0
			last_sender = ""			
			intimacy = {}
			is_one_to_one = 0
			twitter = Twitter()
		
			for msg in messages :
				sender_list.add(msg.sender)

				# to calculate intimacy between member
				if len(last_sender) == 0 :
					last_sender = msg.sender
				if last_sender != msg.sender :
					td_increment( intimacy, last_sender, msg.sender, 1)
					td_increment( intimacy, msg.sender, last_sender, 1)
					last_sender = msg.sender
			
				# check send ratio.
				td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)
			
				# calculate msg bytes by sender
				td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))
				
				# count k in msg.
				increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8')))
				increment(hcount, msg.sender, msg.contents.count(unicode('ㅎ','utf-8')))
				increment(ucount, msg.sender, msg.contents.count(unicode('ㅠ','utf-8')))
			
				# count emoticons
				if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
					emoticons = emoticons + 1
			
				# calculate active time
				td_increment(sent_time, msg.datetime.weekday(), msg.datetime.time().hour, 1)
			
				# analyze keyword
				"""
				keywords_list = twitter.nouns(msg.contents)
				for keyword in keywords_list :
					if len(keyword) > 1:
						if ( is_msg_content(keyword) ):	
							td_increment(keywords_all, str(msg.datetime)[:7], keyword, 1)
							increment(keywords, keyword, 1)
				"""
				if len(sent_month) == 0 :
					sent_month = str(msg.datetime)[:7]
				
				if sent_month == str(msg.datetime)[:7] :
					temp_keywords = temp_keywords + " " + msg.contents 
				elif sent_month != str(msg.datetime)[:7] :
					keywords_list = twitter.nouns(temp_keywords)
					for keyword in keywords_list :
						if len(keyword) > 1:
							if ( is_msg_content(keyword) ) :
								td_increment(keywords_all, sent_month, keyword, 1)
								increment(keywords, keyword, 1)
					sent_month = str(msg.datetime)[:7]
					temp_keywords = msg.contents

			# the last month is not covered by the loop above, so handle it once more here
			keywords_list = twitter.nouns(temp_keywords)
			for keyword in keywords_list :
				if len(keyword) > 1:
					if ( is_msg_content(keyword) ) :
						td_increment(keywords_all, sent_month, keyword, 1)
						increment(keywords, keyword, 1)

			if len(sender_list) == 2 :
				response_time = {}
				last_sender = ""
				last_response_time = timedelta(0)

				for sender in sender_list :
					response_time[sender] = []
				for msg in messages :
					if len(last_sender) == 0 :
						last_sender = msg.sender
					if last_sender != msg.sender :
						last_sender = msg.sender
						response_time[msg.sender].append(msg.datetime - last_response_time)

					last_response_time = msg.datetime

			#insert frequency message & byte	
			for date in send_ratio :
				for sender in send_ratio[date] :
					dataMessage = FrequencyMessage(
						chatroom_id = chatroom_id,
						name = unicode(str(sender), 'utf-8').encode('utf-8'),
						date = date,
						count = int(send_ratio[date][sender]),
						bytes = int(msg_bytes[date][sender])
					)
					dataMessage.save()
			
			#insert all keywords
			cnt = 0
			for date in keywords_all :
				for keyword in keywords_all[date] :
					tasks.insert_keywords.delay(keyword, date, keywords_all[date][keyword])
					"""
					word = smart_str(keyword)
					cnt = cnt + 1
					getWordData = FrequencyWordAll.objects.filter(word=keyword, date=date)
					if getWordData.exists() :
						FrequencyWordAll.objects.filter(id=getWordData[0].id).update(count=F('count') + keywords_all[date][keyword])
					else :
						dataWordAll = FrequencyWordAll(
							date = date,
							word = word,
							count = int(keywords_all[date][keyword])
						)
						dataWordAll.save()
					"""
			#insert most keywords 20				
			sorted_keywords = sorted(keywords.items(), key=lambda x:x[1], reverse = True)
			for i in range(0,20) :
				try :
					word = smart_str(sorted_keywords[i][0])
					dataWord = FrequencyWord(
						chatroom_id = chatroom_id,
						word = word,
						count = int(sorted_keywords[i][1])
					)
					dataWord.save()
				except :
					pass
			
			#insert moment
			for week in sent_time :
				for hour in sent_time[week] :
					dateTime = FrequencyTime(
						chatroom_id = chatroom_id,
						week = int(week),
						hour = int(hour),
						count = int(sent_time[week][hour])
					)
					dateTime.save()
			if len(sender_list) == 2 :
				is_one_to_one = 1
				intimacy = {}
				for sender in response_time : 
					rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
					td_increment( intimacy, sender, " ", rt_average.total_seconds())

			#insert intimacy
			for member in intimacy :
				for friends in intimacy[member] :
					dataIntimacy = Intimacy(
						chatroom_id = chatroom_id,
						name = unicode(str(member), 'utf-8').encode('utf-8'),
						target = unicode(str(friends), 'utf-8').encode('utf-8'),
						count = int(intimacy[member][friends])
					)
					dataIntimacy.save()


			#insert each char count
			for sender in kcount :
				dataChar = FrequencyChars(
					chatroom_id = chatroom_id,
					name = unicode(str(sender), 'utf-8').encode('utf-8')
				)
				try :
					dataChar.count_char_1 = int(kcount[sender])
				except :
					pass
				try :
					dataChar.count_char_2 = int(hcount[sender])
				except :
					pass
				try :
					dataChar.count_char_3 = int(ucount[sender])
				except :
					pass

				dataChar.save()

			Chatroom.objects.filter(id=chatroom_id).update(complete_datetime=datetime.datetime.now(), is_one_to_one=is_one_to_one)
			return HttpResponse(myUid)
	return HttpResponse('Failed to Upload File')
Example #27
def main():
    engine = create_engine(
        'mysql://*****:*****@13.125.100.34:3306/release?charset=utf8mb4')
    conn = engine.connect()
    review_df = pd.read_sql_table('api_review', conn)
    s_review = review_df.store_id.unique()
    tot_rev_df = pd.DataFrame(columns=["store", "contents"])

    revlist = []
    for i in s_review:
        str = ""
        for content in review_df[review_df.store_id == i].content:
            str = str + " " + content
        revlist.append(str)

    tot_rev_df["store"] = s_review
    tot_rev_df["contents"] = revlist

    twitter = Twitter()

    all_nouns = []  # renamed from "all" to avoid shadowing the builtin

    for i in range(0, len(tot_rev_df)):
        if len(tot_rev_df.loc[i].contents) == 1:
            temp = []
        else:
            temp = twitter.nouns(tot_rev_df.loc[i].contents)
        all_nouns.append(temp)
    tfMapList = []
    wordMap = {}
    wordCount = 0

    for data in all_nouns:
        tfMap = {}
        for word in data:
            if word in tfMap.keys():
                tfMap[word] += 1
            else:
                tfMap[word] = 1

            if word not in wordMap.keys():
                wordMap[word] = wordCount
                wordCount += 1

        tfMapList.append(tfMap)

    table = [[0] * len(wordMap) for _ in range(len(tfMapList))]
    row = 0
    for tfMap in tfMapList:
        for word, tf in tfMap.items():
            word_count = 0
            for map1 in tfMapList:
                if word in map1.keys():
                    word_count += 1

            idf = math.log10(len(tfMapList) / word_count)
            tf_idf = tf * idf
            column = wordMap[word]
            table[row][column] = tf_idf
        row += 1  # move on to the next document's row

    table2 = pd.DataFrame.from_records(table)

    svd = TruncatedSVD(n_components=15)
    pos = svd.fit_transform(table2)

    norm = Normalizer(copy=False)
    pos2 = norm.fit_transform(pos)

    km = KMeans(n_clusters=40, random_state=0)
    labels_km = km.fit_predict(pos2)

    result = []
    for cur in range(0, 40):
        temp = [i for i, e in enumerate(labels_km) if e == cur]
        result.append(temp)

    with open('../../../../data/tf-idf_Result.pkl', 'wb') as f:
        pickle.dump(result, f)
Example #28
from konlpy.tag import Twitter
import sys
import json

twitter = Twitter()
with open(sys.argv[1]) as f:
    datas = json.loads(f.read())
result = []
for data in datas:
    result.append(twitter.nouns(data))
print(str(result).replace("'",'"'))
Example #29
# change the default encoding from euc-kr to utf-8
import sys
reload(sys)

sys.setdefaultencoding('utf-8')

print ("load")
#load from kobill 
from konlpy.corpus import kobill
#docs_ko =kobill.open('kobill/news.txt').read()
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]
print ("tokenize")

#tokenize
from konlpy.tag import Twitter; t = Twitter()
print ("tokenize1")
pos = lambda d:['/'.join(p) for p in t.pos(d,stem=True,norm=True)]
print ('tokenize2')
texts_ko = [pos(doc) for doc in docs_ko]
#texts_ko = pos(docs_ko)
print ("train")
import time
now_time = time.time()
#train
from gensim.models import word2vec
wv_model_ko = word2vec.Word2Vec(texts_ko,workers=16,negative=10,window=7,size=300)
wv_model_ko.init_sims(replace=True)


wv_model_ko.save('ko_word2vec_e.model')
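A short follow-up sketch (assumed, not in the original) for querying the saved model; the exact attribute depends on the gensim version (older releases expose most_similar() on the model itself, newer ones under .wv):

# Assumed usage sketch for the trained Korean word2vec model.
for token, score in wv_model_ko.most_similar(pos(u'나라')[0], topn=5):
    print('%s %.3f' % (token, score))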
Example #30
def index(request):
    message = ''
    tagger = ''
    list_of_tagger = ''
    if request.method == 'POST':
        input_sentence = request.POST.get("name", "")
        list_of_tagger = request.POST.getlist("tagger")
        if 'selectall' in list_of_tagger:
            list_of_tagger = ['moara', 'twitter', 'kkma', 'komoran']

        if input_sentence != '':
            if 'moara' in list_of_tagger:
                # ======== Moara morphological analyzer ========
                from subprocess import Popen, PIPE, STDOUT
                import os
                import re
                p = Popen([
                    'java', '-jar',
                    'C:/Users/bessh4/Desktop/mysite/postagging/moara_pos.jar',
                    input_sentence
                ],
                          stdout=PIPE,
                          stderr=STDOUT,
                          shell=True)
                output = [line.decode('cp949') for line in p.stdout]

                print_moara = []
                print_moara2 = []
                for i in output[2:]:
                    if i != "\n":
                        if i != '\r\n':
                            print_moara2.append(
                                i.split(", ")[0][1:] + " / " +
                                i.split(", ")[1])

                string_i = ''
                for i in print_moara2:
                    word_i = i.split('단어음절: ')[1].split(" / ")[0]
                    pos_i = i.split('단어품사: ')[1]
                    if len(re.findall(r'\bRECOMMEND\S+\b', pos_i)) == 0:
                        string_i += word_i + "/" + pos_i + " "

                print_moara = ("모아라", string_i)
            else:
                print_moara = ("모아라", "")

            # ======== konlpy morphological analyzers ========
            if 'twitter' in list_of_tagger:
                twitter_str = ''
                twitter = Twitter()
                for i in twitter.pos(input_sentence):
                    twitter_str += str(i[0] + "/" + i[1] + " ")
                twitter_message = ("트위터", twitter_str)
            else:
                twitter_message = ("트위터", "")

            if 'kkma' in list_of_tagger:
                kkma_str = ''
                kkma = Kkma()
                for i in kkma.pos(input_sentence):
                    kkma_str += str(i[0] + "/" + i[1] + " ")
                kkma_message = ("꼬꼬마", kkma_str)
            else:
                kkma_message = ("꼬꼬마", "")

            if 'komoran' in list_of_tagger:
                komoran_str = ''
                komoran = Komoran()
                for i in komoran.pos(input_sentence):
                    komoran_str += str(i[0] + "/" + i[1] + " ")
                komoran_message = ("코모란", komoran_str)
            else:
                komoran_message = ("코모란", "")

            message = [
                print_moara, twitter_message, kkma_message, komoran_message
            ]

        else:
            message = "형태소 분석할 문장을 입력하세요"

    context = {'message': message}
    return render(request, 'postagging/index.html', context)
Example #31
import pandas as pd
from konlpy.tag import Twitter
twitter = Twitter()
#ex) print(twitter.pos(u'이것도 되나욬ㅋㅋ',norm=True, stem=True))

path='/Users/kims/'

# file1
file1 = pd.read_csv(path+'comments_17_df.csv')
file1.head()

# konlpy file1
text = []
len(file1)

for i in range(0,len(file1)):
    text_spider = twitter.pos(file1.loc[i,'value'],norm=True, stem=True) 
    text.append(text_spider)

text
text_df=pd.DataFrame.from_records(text)
text_df=text_df.stack()

text_df.to_csv('text_17.csv', encoding='utf-8')

# file2
file2 = pd.read_csv(path+'comments_12_df.csv')
file2.head()

# konlpy file2
text = []
Example #32
from konlpy.tag import Twitter
from gensim.models import Word2Vec
import csv

twitter = Twitter()

file = open("Article_shuffled.csv", 'r', encoding='euc-kr')
line = csv.reader(file)
token = []
embeddingmodel = []

for i in line:
    sentence = twitter.pos(i[0], norm=True, stem=True)
    temp = []
    temp_embedding = []
    all_temp = []
    for k in range(len(sentence)):
        temp_embedding.append(sentence[k][0])
        temp.append(sentence[k][0] + '/' + sentence[k][1])
    all_temp.append(temp)
    embeddingmodel.append(temp_embedding)
    if i[3] == "IT과학":
        all_temp.append(0)
    elif i[3] == "경제":
        all_temp.append(1)
    elif i[3] == "정치":
        all_temp.append(2)
    elif i[3] == "e스포츠":
        all_temp.append(3)
    elif i[3] == "골프":
        all_temp.append(4)
Example #33
class FuzzyWuzzy():
    def __init__(self,
                 clusterings=[],
                 texts=[],
                 confidence=0.6,
                 batch_size=0,
                 merge=False):

        st = datetime.datetime.now()
        print('0. [start] init_setting ------------------------------')

        self.before_texts = texts
        self.texts = []
        self.batch_size = batch_size
        self.merge = merge
        self.t = Twitter()
        self.confidence = confidence * 100
        self.clusterings = []
        self.id = 0
        self.texts_len = 13
        if len(clusterings) > 0:
            self.convert_clustering(clusterings=clusterings)
        et = datetime.datetime.now()
        print('0. [end] init_setting => ', et - st)

    # the part that removes stopwords
    def filtering(self, str_list=None, noun=False):
        str_list = list(map(lambda x: re.sub('[\?\.]', '', x), str_list))
        str_list = list(map(lambda x: re.sub('어떻게 해야 하나요', '', x), str_list))
        str_list = list(map(lambda x: re.sub('어떻게 되는 것인가요', '', x), str_list))
        str_list = list(map(lambda x: re.sub('어떻게 되나요', '', x), str_list))
        str_list = list(map(lambda x: re.sub('왜 그런가요', '', x), str_list))
        str_list = list(map(lambda x: re.sub('라고 나와요', '', x), str_list))
        str_list = list(map(lambda x: re.sub('되나요', '', x), str_list))

        str_pos = self.t.pos(str_list[0], stem=True)
        stop_pos = ['Noun', 'Alpha', 'Foreign', 'Number']
        if not (noun):
            stop_pos.append('Verb')
            stop_pos.append('Adjective')

        str_filt = np.array(str_pos)[np.where(
            np.in1d(list(map(itemgetter(1), str_pos)), stop_pos))[0]]

        if noun and len(str_filt) > 1 and str_filt[-1][1] != 'Noun':
            str_filt = str_filt[0:-1]

        str_final = [' '.join(list(map(itemgetter(0), str_filt)))]
        stop_words = [
            '방법', '하는법', '어떤', '무슨', '알다', '말', '하다', '되다', '궁금하다', '가능하다',
            '시', '수', '인가요', '있다', '하나요', '해야하나요', '좋다', '해', '요', '한', '가요',
            '대해'
        ]
        split_str_list = list(map(lambda x: re.split(' ', x), str_final))
        filtered_word = list(
            map(
                lambda x: ' '.join(
                    list(np.array(x)[np.logical_not(np.in1d(x, stop_words))])),
                split_str_list))

        return filtered_word[0]
        #return jaso_split(filtered_word[0])

    def run(self):
        st = datetime.datetime.now()
        print('1. [start] init_run ------------------------------')

        init_confidence = self.confidence
        self.init_run()
        et = datetime.datetime.now()
        print('1. [end] init_run => ', et - st)

        st = datetime.datetime.now()
        print('2. [start] run_batch ------------------------------')

        self.confidence = init_confidence
        for i in range(2):

            if self.confidence >= 70:

                st_1 = datetime.datetime.now()

                self.run_batch(noun=True)
                self.confidence = self.confidence - 5

                et_1 = datetime.datetime.now()
                print('2-1. [end] run_batch noun-------', i + 1,
                      '번째 run_batch => ', et_1 - st_1)

            elif self.confidence < 70:
                break

        self.confidence = init_confidence
        for i in range(self.batch_size):

            if self.confidence >= 70:

                st_1 = datetime.datetime.now()
                self.run_batch(noun=False)
                self.confidence = self.confidence - 2.5

                et_1 = datetime.datetime.now()
                print('2-2. [end] run_batch verb-------', i + 1,
                      '번째 run_batch => ', et_1 - st_1)

            elif self.confidence < 70:
                break

        et = datetime.datetime.now()
        print('2. [end] run_batch => ', et - st)

        if self.merge:

            st = datetime.datetime.now()
            print('3. [start] merge_run ------------------------------')
            self.merge_run()
            et = datetime.datetime.now()
            print('3. [end] merge_run => ', et - st)

            st = datetime.datetime.now()
            print('4. [start] reform_run ------------------------------')
            self.reform_run()
            et = datetime.datetime.now()
            print('4. [end] reform_run => ', et - st)

        return self.clusterings

    def init_run(self):
        for i, text in enumerate(self.before_texts):
            convert_text = self.filtering(str_list=[text], noun=True)
            if i == 0 and len(self.clusterings) == 0:
                self.create_clustering(text, convert_text)
            else:
                self.ratio(text, convert_text)

    def find_zero_texts(self, noun=False):
        self.texts = []
        self.new_clusterings = []
        for clustering in self.clusterings:
            if len(clustering['texts']) < 2:
                text = clustering['texts'][0]
                #convert_text = self.filtering(str_list=[text], noun=noun)
                self.texts.append(text)
            else:
                self.new_clusterings.append(clustering)
        self.clusterings = self.new_clusterings

    def run_batch(self, noun=True):

        # run clusters that ended up on their own through one more pass to double-check
        self.find_zero_texts(noun=noun)
        if len(self.texts) > 0:
            for text in self.texts:
                self.ratio(original=text,
                           text=self.filtering(str_list=[text], noun=noun))

    def merge_run(self):
        ## matching between groups
        new_small_c = []
        new_big_c = []
        for cluster in self.clusterings:
            if len(cluster['texts']) > 4:
                new_big_c.append(cluster)
            else:
                new_small_c.append(cluster)

        for sc in new_small_c:

            max_ratio = 0
            max_bc_category = 0
            for bc in new_big_c:
                this_ratio = fuzz.token_set_ratio(sc['totalText'],
                                                  bc['totalText'])
                if max_ratio < this_ratio:
                    max_bc_category = bc['category']
                    max_ratio = this_ratio

            if max_ratio > 77:
                for item in new_big_c:
                    if item.get('category') == max_bc_category:
                        item['texts'].extend(sc['texts'])
                        temp_totalText = item['totalText'] + ' ' + sc[
                            'totalText']
                        count = Counter(
                            list(set(re.split(' ', temp_totalText))))
                        item['totalText'] = ''
                        for n, c in count.most_common(self.texts_len):
                            item['totalText'] = item['totalText'] + ' ' + n
                            #[item['texts'].extend(sc['texts']) for item in new_big_c if item.get('category')==max_bc_category]
            else:
                new_big_c.append(sc)

        self.clusterings = new_big_c

    def reform_run(self):

        reform_ts = []
        reform_cs = []
        for cluster in self.clusterings:
            text_size = len(cluster['texts'])
            total_size = len(re.split(' ', cluster['totalText']))
            if text_size == 2 and total_size >= 7:
                reform_ts.extend(cluster['texts'])
            else:
                reform_cs.append(cluster)
        self.clusterings = reform_cs

        for text in reform_ts:
            convert_text = self.filtering(str_list=[text], noun=False)
            self.ratio(text, convert_text)

    def ratio(self, original, text):
        max_category = 0
        max_ratio = 0
        random.shuffle(self.clusterings)
        for i, ob in enumerate(self.clusterings):
            this_ratio = fuzz.token_set_ratio(text, ob['totalText'])
            if max_ratio < this_ratio:
                max_ratio = this_ratio
                max_index = i

        if max_ratio > self.confidence:
            self.add_clustering(max_index, original, text)
        else:
            self.create_clustering(original, text)

    def create_clustering(self, original, text):
        tmp_totalTexts = list(set(re.split(' ', text)))
        text = ' '.join(list(set(tmp_totalTexts)))
        cluster = {
            "category": self.id,
            "texts": [original],
            "totalText": text,
        }
        self.clusterings.append(cluster)
        self.id = self.id + 1

    def add_clustering(self, max_index, original, text):
        cluster = self.clusterings[max_index]
        cluster['texts'].append(original)
        cluster['totalText'] = cluster['totalText'] + ' ' + text
        tmp_totalTexts = list(set(re.split(' ', cluster['totalText'])))
        if len(tmp_totalTexts) < self.texts_len:
            cluster['totalText'] = ' '.join(tmp_totalTexts)
        else:
            count = Counter(tmp_totalTexts)
            cluster['totalText'] = ""
            for n, c in count.most_common(self.texts_len):
                cluster['totalText'] = cluster['totalText'] + ' ' + n

    def convert_clustering(self, clusterings=[]):
        self.clusterings = []
        for c in clusterings:
            #print(c)
            tmp_total_str = ""
            totalText = ""
            tmp_total_original_list = []

            for t in c['texts']:
                tmp_total_str = tmp_total_str + " " + self.filtering(
                    str_list=[t], noun=False)
                tmp_total_original_list.append(t)

            tmp_total_list = re.split(' ', tmp_total_str)
            #print(tmp_total_list)
            if len(list(set(tmp_total_list))) > self.texts_len:
                count = Counter(tmp_total_list)
                for n, c in count.most_common(self.texts_len):
                    totalText = totalText + " " + n
            else:
                totalText = " ".join(list(set(tmp_total_list)))

            self.id = self.id + 1
            self.clusterings.append({
                'category': self.id,
                'texts': tmp_total_original_list,
                'totalText': totalText
            })
Example #34
import numpy as np
import os
from konlpy.tag import Twitter               # used below for POS tagging
from keras.preprocessing.text import Tokenizer  # used below to index the tokens

Model_File = './model/model_news_v2.hdf5'
FILE_LIST = ['news0.txt', 'news1.txt', 'news2.txt', 'news3.txt']
text = []
result = []
for x in FILE_LIST:
    fp = open(x, 'r', encoding='utf-8')
    sentences = fp.readlines()
    for x in sentences:
        for y in x.split('. '):
            text.append(y.strip() + '.')
    fp.close()

twitter = Twitter()
for x in text:
    ret = twitter.pos(x, stem=True, norm=True)
    tmp = []
    for y, _ in ret:
        tmp.append(y)
    result.append(tmp)
result = np.array(result)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(result)
x_tmp = tokenizer.texts_to_sequences(result)

sen_len = 5
x_train = []
y_train = []
Example #35
    def __init__(self):

        self.twitter = Twitter()
        self.kkma = Kkma()
Example #36
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np
import numpy.linalg as la
import argparse
import matplotlib.pyplot as plt
import sklearn.decomposition as decomposition
import sys

twitter = Twitter()


def normalize(array):
    norm = la.norm(array)
    return array / norm


def create_word_vector(word, pos_embeddings):
    pos_list = twitter.pos(word, norm=True)
    word_vector = np.sum(
        [pos_embeddings.word_vec(str(pos).replace(" ", "")) for pos in pos_list],
        axis=0)
    return normalize(word_vector)


def plot_with_labels(embeds, labels, filename="output.png"):
    plt.figure(figsize=(18, 18))
    pca = decomposition.PCA(n_components=2)
    pca.fit(embeds)
    Y = pca.transform(embeds)
    for i, label in enumerate(labels):
Example #37
def analyzer( messages ) :

	# store senders in chat room
	sender_list = set()

	send_ratio = {}
	msg_bytes = {}
	sent_time = {}
	for i in range (0, 7) :
		sent_time[ i ] = {}
		for j in range(0,24) :
			sent_time[ i ][ j ] = 0	

	kcount = {}
	keywords = {}
	sent_month = ""
	temp_keywords = []

	emoticons = 0
	total = 0
	last_sender = ""

	

	intimacy = {}

	twitter = Twitter()

	for msg in messages :
		
		sender_list.add(msg.sender)

		# to calculate intimacy between member
		if len(last_sender) == 0 :
			last_sender = msg.sender
		if last_sender != msg.sender :
			td_increment( intimacy, last_sender, msg.sender, 1)
			td_increment( intimacy, msg.sender, last_sender, 1)
			last_sender = msg.sender

		# check send ratio.
		td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)

		# calculate msg bytes by sender
		td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))
		
		# count k in msg.
		increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8')))

		# count emoticons
		if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
			emoticons = emoticons + 1

		# calculate active time
		td_increment(sent_time, msg.datetime.weekday() , msg.datetime.time().hour, 1)

		# analyze keyword
		if ( is_msg_content(msg.contents) ) :
			if len(sent_month) == 0 :
				sent_month = str(msg.datetime)[:7]
			elif sent_month == str(msg.datetime)[:7] :
				temp_keywords.append(msg.contents)
			elif sent_month != str(msg.datetime)[:7] :
				keywords_list = twitter.nouns(msg.contents)
				for keyword in keywords_list :
					if len(keyword) > 1:
						td_increment(keywords, sent_month, keyword, 1)
				sent_month = str(msg.datetime)[:7]
				del temp_keywords[:]
				temp_keywords.append(msg.contents)

	# in case of 1:1 chat room
	if len(sender_list) == 2 :
		response_time = {}
		last_sender = ""
		last_response_time = timedelta(0)

		for sender in sender_list :
			response_time[sender] = []
		for msg in messages :
			if len(last_sender) == 0 :
				last_sender = msg.sender
			if last_sender != msg.sender :
				last_sender = msg.sender
				response_time[msg.sender].append(msg.datetime - last_response_time)

			last_response_time = msg.datetime


	print "Who sent how much messages? "

	for date in send_ratio :
		print "in " + str(date)
		for sender in send_ratio[date] :
			print str(sender) + " sent " + str(send_ratio[date][sender]) + " messages"
			total = total + int(send_ratio[date][sender])

	print ""

	print "Msg bytes : "

	for date in msg_bytes :
		print "in " + str(date)
		for sender in msg_bytes[date] :
			print str(sender) + " sent " + str(msg_bytes[date][sender]) + " bytes"

	print ""

	for sender in kcount :
		print sender + " wrote " + unicode('ㅋ','utf-8').encode('utf-8') + " " + str(kcount[sender]) + " byte times"

	print ""

	print ""


	# sorted keywords has 'list' type. not dict.
	print "Top 20 most frequently used keywords in your chatroom."
	for date in keywords :
		print "in " + date
		sorted_keywords = sorted(keywords[date].items(), key=lambda x:x[1], reverse = True)
		for i in range(0,20) :
			try :
				print sorted_keywords[i][0] + " : " + str(sorted_keywords[i][1])
			except :
				pass

	print ""


	print "When is the most active moment in this chat room?"
	for week in sent_time :
		print week
		for hour in sorted(sent_time[week]):
			print str(sent_time[week][hour]) + " messages were sent at " + str(hour) + " o'clock"
		
	print ""

	print "you guys used emoticons " + str(emoticons) + " times"

	print ""

	print "intimacy between members"

	if len(sender_list) == 2 : 
		for sender in response_time : 
			print sender
			rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
			print "responded in " + str(rt_average) + "in average"

	else : 
		for member in intimacy :
			print member + " : "
			for friends in intimacy[member] :
				print " - " + friends + " " + str(intimacy[member][friends])

	print ""

	print "totally, " + str(total) + " messages were sent"
Example #38
from konlpy.tag import Twitter
t = Twitter()
from konlpy.corpus import kolaw
import nltk
import gensim
from gensim.models import LdaModel
from gensim import corpora, models
import MySQLdb
import xlwt
db = MySQLdb.connect(host="localhost",
                     user="******",
                     passwd="kkms1234",
                     db="scraping",
                     charset='utf8')

cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("set names utf8")

db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")

cursor.execute("set names utf8")
sql = "select * from Text3 where ArticleNumber<=10000"
cursor.execute(sql.encode('utf8'))

rows = cursor.fetchall()
document = ''
Example #39
import csv

from konlpy.tag import Twitter


reader = csv.reader(open("../sample/top_song.csv",'r'))
writer = csv.writer(open("../sample/top_song_lemma.csv",'w'))
twitter = Twitter()

lema = str()

for i in reader:
    s = twitter.pos(i[4],norm=True)
    x = [w for w, tag in s if tag in ['Noun','Verb','Adjective','Alpha'] and len(w) > 1]
    print(i[4],"\n",x,"\n"," ".join(x),"\n")

    result = [seg for seg in i]
    result.append(" ".join(x))
    writer.writerow(result)
Example #40
 def __init__(self, logger):
     self.logger = logger
     self.twitter = Twitter()
Example #41
"""
Created on Wed Mar  9 00:35:54 2016

@author: chuckgu
"""

import json,os
from nltk.tokenize import sent_tokenize,word_tokenize
from konlpy.tag import Twitter
import numpy as np
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

twitter=Twitter()

txt=[]

checklist=['Exclamation','Alpha','URL','Punctuation','Foreign','Unknown','Hashtag','ScreenName','Josa']

'''
currdir = os.getcwd()
os.chdir('%s/' % currdir)
print currdir

with open("text8", 'r') as f:
    for line in f:
        sentences.append(line[:100])
        
print sentences 
Example #42
class NewsParser(object):
    news_link = "http://news.naver.com/main/read.nhn?oid=%s&aid=%s"
    jsonp_regex = re.compile(r'^\s*cb\s*\((.*)\)\s*;?\s*$', re.DOTALL)

    def __init__(self, logger):
        self.logger = logger
        self.twitter = Twitter()

    def parse(self, news_id_token):
        split = news_id_token.split(',')
        href = NewsParser.news_link % tuple(split)
        req = requests.get(href)

        soup = BeautifulSoup(req.text, 'html.parser')
        title_elem = soup.select_one('#articleTitle')
        content_elem = soup.select_one('#articleBodyContents')
        news_type = 'NEWS'

        if not title_elem:
            title_elem = soup.select_one("h2.end_tit")
            content_elem = soup.select_one(
                "#articeBody")  # Not typo, it is really "artice"
            news_type = 'ENTERTAIN'

            if not title_elem or not content_elem:
                self.logger.info('[Crawl::News Info] %s has no title!' %
                                 news_id_token)
                return None

        for script in content_elem.findAll("script"):
            script.decompose()

        title = self.twitter.pos(title_elem.get_text(), norm=True, stem=True)
        content = self.twitter.pos(content_elem.get_text(),
                                   norm=True,
                                   stem=True)

        api_req = requests.get(
            "http://news.like.naver.com/v1/search/contents",
            params={"q": "%s[ne_%s_%s]" % (news_type, split[0], split[1])},
            headers={
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                + "Chrome/64.0.3282.167 Safari/537.36",
                "Referer":
                href  # Needed header
            })

        api_resp = json.loads(api_req.text)

        if ('contents' not in api_resp) or \
                (len(api_resp['contents']) < 1) or \
                ('reactions' not in api_resp['contents'][0]):
            self.logger.info('[CrawlAppend::News Info] %s has no reactions!' %
                             news_id_token)
            return None

        reactions = api_resp['contents'][0]['reactions']
        reactions_parsed = {}

        for reaction in reactions:
            reactions_parsed[reaction['reactionType']] = reaction['count']

        api_req = requests.get(
            "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json",
            params={
                "_callback": "cb",
                "objectId": "news" + news_id_token,
                "pool": "cbox5",
                "ticket": "news",
                "lang": "ko",
                "initialize": "true",
                "pageSize":
                "1"  # Reduce packet size, pageSize will be ignored if it is less than one.
            },
            headers={
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                + "Chrome/64.0.3282.167 Safari/537.36",
                "Referer":
                href  # Needed header
            })

        api_resp = json.loads(
            NewsParser.jsonp_regex.match(api_req.text).group(1))

        if ('result' not in api_resp) or\
           ('graph' not in api_resp['result']) or\
           ('count' not in api_resp['result']) or\
           ('gender' not in api_resp['result']['graph']) or\
           ('old' not in api_resp['result']['graph']):

            self.logger.info('[Crawl::News Info] %s has no graphs!' %
                             news_id_token)
            return None

        gender_graph = api_resp['result']['graph']['gender']
        age_graph = api_resp['result']['graph']['old']
        age_parsed = {}
        comments = api_resp['result']['count']['total']

        for age in age_graph:
            age_parsed[age['age']] = age['value']

        return {
            'title': title,
            'content': content,
            'age': age_parsed,
            'gender': gender_graph,
            'comment': comments,
            'reaction': reactions_parsed
        }
Example #43
import csv
import operator
from pykospacing import spacing
from konlpy.tag import Twitter


def findfeq(worddict, data):
    pass


twitter = Twitter()
f = open('practice/train.csv', 'r', encoding='utf-8')
rdr = csv.reader(f)
i = 0
worddict = {}
wordlist = []
wordfreq = []

# str = "히라마블로그에온걸^^환영해1993!"
# print(spacing(str))
for line in rdr:

    if (i != 0):

        message = list(line)
        sms = message[2]
        sentencelist = list(sms.split('.'))

        for sentence in sentencelist:
            sentence = sentence.replace(' ', '')
            sentence = spacing(sentence)
Example #44
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from konlpy.corpus import kobill
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]


from konlpy.tag import Twitter; t=Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d, stem=True, norm=True)]
texts_ko = [pos(doc) for doc in docs_ko]


#encode tokens to integers
from gensim import corpora
dictionary_ko = corpora.Dictionary(texts_ko)
dictionary_ko.save('ko.dict')  # save dictionary to file for future use

#calculate TF-IDF
from gensim import models
tf_ko = [dictionary_ko.doc2bow(text) for text in texts_ko]
tfidf_model_ko = models.TfidfModel(tf_ko)
tfidf_ko = tfidf_model_ko[tf_ko]
corpora.MmCorpus.serialize('ko.mm', tfidf_ko) # save corpus to file for future use

#train topic model
#LSI
ntopics, nwords = 3, 5
lsi_ko = models.lsimodel.LsiModel(tfidf_ko, id2word=dictionary_ko, num_topics=ntopics)
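The LSI example ends right after fitting the model; a minimal hedged follow-up (assumed, not part of the original) to look at the extracted topics:

# Assumed follow-up: print the fitted LSI topics.
# The return format of print_topics() varies slightly across gensim versions,
# so each entry is printed as-is.
for topic in lsi_ko.print_topics(ntopics):
    print(topic)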
Example #45
query = {"$and": [
    {"lyrics": {"$exists": True}},
    {"count": {"$gte": 20}}
]}
proj = {"lyrics": True, "_id": False}
cursor = target.find(query, proj)
lyrics_set = []
for doc in cursor:
    # print(doc["lyrics"])
    lyrics_set.append(str(doc["lyrics"]))
print("===== complete data import =====")

## tokenization and tagging using konlpy
from konlpy.tag import Twitter
tw = Twitter()

## stopword elimination (on hold)
""" To discuss: whether the package needs fixing, whether this step is really necessary, and whether there is a way to replace it
from many_stop_words import get_stop_words
stopwords = list(get_stop_words('kr'))
for sw in stopwords:
    print(sw)
"""

## create English stop words list
from stop_words import get_stop_words
en_stop = get_stop_words('en')

## Create p_stemmer of class PorterStemmer
from nltk.stem.porter import PorterStemmer
Example #46
 def __init__(self):
     self.ko_twitter = Twitter()
Example #47
This program was written for research related to document keyword extraction.


"""




from matplotlib import font_manager, rc
font_fname = '/Library/Fonts/AppleGothic.ttf'     # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)
import nltk
from konlpy.tag import Twitter
import numpy
tw = Twitter()
# use the Twitter morphological analyzer
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# top n words (nouns) to look for
_toFind_ = 30

# read the document
doc_ko = open('./k_tex2.txt').read()

# print(doc_ko)

# extract nouns only
token_ko = tw.nouns(doc_ko)
Example #48
import json, pymongo, requests, os, sys
import time, dateutil.parser, codecs
from konlpy.tag import Twitter; t = Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d) if p[1] in ['Noun', 'Adjective', 'Determiner', 'Adverb', 'KoreanParticle']]

conn = pymongo.MongoClient("mongodb://localhost")
db = conn.watcha
cnames = db.collection_names()
collections = dict()

for cname in cnames:
	collections[cname] = db[cname]  # dict-style access instead of eval('db.' + cname)

del collections['reviews']
del collections['system.indexes']

cursor = collections['comedy'].find()
length = collections['comedy'].count()
cnt = 0

with codecs.open('D:\watcha_reviews\comedy.txt', 'w', encoding='utf-8') as fp:
	for row in cursor:
		cnt += 1
		if cnt % 1000 == 0:
			print str(cnt) + ' / ' + str(length)
		rating = row['rating']
		cid = row['comment_id']
		text = row['text']
		fp.write(' '.join([str(rating), str(cid)] + pos(text)) + '\n')
Example #49
0
# LDA modelling packages
import gensim
from gensim import corpora, models

# Korean morphological analysis
from konlpy.tag import Twitter

twitter = Twitter()

import operator

documents = []

# duplicate-token filter
filterDocuments = []

for i in documents:

    # tokenize the sentence
    tokens = twitter.pos(i, norm=True, stem=True)

    # if every POS is kept instead of nouns only, the resulting topics drift far off
    stem_tokens = [split[0] for split in tokens if split[1] == "Noun"]

    filterDocuments.append(stem_tokens)

# build the gensim Dictionary
dictionary = corpora.Dictionary(filterDocuments)

# convert each tokenized document into a bag-of-words using the dictionary
corpus = [dictionary.doc2bow(text) for text in filterDocuments]
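
# A minimal sketch (assumption, not the original continuation): train the LDA model on
# the bag-of-words corpus; the topic count and pass count here are illustrative only.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, passes=5)
for topic in lda.print_topics(num_topics=10, num_words=5):
    print(topic)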
Example #50
0
from collections import Counter
import os
from konlpy.tag import Twitter
tw = Twitter()
region = [
    "서울", "부산", "기장", "대구", "달성", "인천", "강화", "옹진", "광주", "대전", "울산", "울주",
    "세종", "수원", "성남", "안양", "안산", "용인", "광명", "평택", "과천", "오산", "시흥", "군포",
    "의왕", "하남", "이천", "안성", "김포", "화성", "경기도_광주", "여주", "부천", "양평", "고양",
    "의정부", "동두천", "구리", "남양주", "파주", "양주", "포천", "연천", "가평", "춘천", "원주", "강릉",
    "동해", "태백", "속초", "삼척", "홍천", "횡성", "영월", "평창", "정선", "철원", "화천", "양구",
    "인제", "고성", "양양", "청주", "충주", "제천", "보은", "옥천", "영동", "진천", "괴산", "음성",
    "단양", "증평", "천안", "공주", "보령", "아산", "서산", "논산", "계룡", "당진", "금산", "부여",
    "서천", "청양", "홍성", "예산", "태안", '전주', '군산', '익산', '정읍', '남원', '김제', '완주',
    '진안', '무주', '장수', '임실', '순창', '고창', '부안', '목포', '여수', '순천', '나주', '광양',
    '담양', '곡성', '구례', '고흥', '보성', '화순', '장흥', '강진', '해남', '영암', '무안', '함평',
    '영광', '장성', '완도', '진도', '신안', '창원', '진주', '통영', '사천', '김해', '밀양', '거제',
    '양산', '의령', '함안', '창녕', '고성', '남해', '하동', '산청', '함양', '거창', '합천', '포항',
    '경주', '김천', '안동', '구미', '영주', '영천', '상주', '문경', '경산', '군위', '의성', '청송',
    '영양', '영덕', '청도', '고령', '성주', '칠곡', '예천', '봉화', '울진', '울릉', '제주'
]

water_issues = [
    "수력", "하수", "용수", "하천", "댐", "강우", "저수", "호우", "빗물", "상수", "조류", "녹조",
    "수질", "풍수", "누수", "유수", "강수", "정수", "취수", "수돗물", "배수", "오염", "홍수", "가뭄"
]


def get_tags(text, issues):
    spliter = Twitter()
    nouns = spliter.nouns(text)
    count = Counter(nouns)
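    # A minimal sketch (assumption, not the original continuation): keep only the counted
    # nouns that appear in the `issues` list and return their frequencies.
    return {noun: freq for noun, freq in count.items() if noun in issues}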
Example #51
0
 def __init__(self, filename=None, nth=-1):
     self.filename = filename
     self.nth = nth
     self.twitter = Twitter()
     self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log')
Example #52
0
# Imports this excerpt needs (the original file presumably declares them further up).
import os
import re
import pickle
from flask import Flask
from keras.models import load_model  # or tensorflow.keras.models, depending on the setup
from konlpy.tag import Twitter

# Special tokens. Only OOV appears in this excerpt; the PAD/STA/END strings below are
# assumed placeholder values so that the excerpt runs on its own.
PAD = "<PADDING>"
STA = "<START>"
END = "<END>"
OOV = "<OOV>"  # out-of-vocabulary token

# indexes of the special tag tokens
PAD_INDEX = 0
STA_INDEX = 1
END_INDEX = 2
OOV_INDEX = 3

# data types
ENCODER_INPUT = 0
DECODER_INPUT = 1
DECODER_TARGET = 2

max_sequences = 30
RE_FILTER = re.compile("[.,!?\"':;~()]")
tagger = Twitter()

app = Flask(__name__)

# load the models and the vocabulary
project_path = os.path.dirname(os.path.abspath(__file__))
encoder_model = load_model(project_path + "/data/encoder_model.h5")
decoder_model = load_model(project_path + "/data/decoder_model.h5")
with open(project_path + "/data/words.pickle", 'rb') as f:
    words = pickle.load(f)
words[:0] = [PAD, STA, END, OOV]

# build the word-to-index and index-to-word dictionaries
word_to_index = {word: index for index, word in enumerate(words)}
index_to_word = {index: word for index, word in enumerate(words)}
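
# A minimal sketch (assumption; `sentence_to_indexes` is a hypothetical helper, not part
# of the original): convert an input sentence into the fixed-length index sequence the
# encoder expects, reusing the tagger, punctuation filter and OOV/PAD indexes defined above.
def sentence_to_indexes(sentence):
    sentence = re.sub(RE_FILTER, "", sentence)
    tokens = tagger.morphs(sentence)
    indexes = [word_to_index.get(w, OOV_INDEX) for w in tokens][:max_sequences]
    return indexes + [PAD_INDEX] * (max_sequences - len(indexes))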
Example #53
0
# -*- coding: utf-8 -*-

from konlpy.tag import Twitter
import pymysql
import instagram
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
twitter = Twitter()

# connect to MySQL (host, user and password are placeholders)
conn = pymysql.connect(host='aws-server-name', port=3306, user='******', passwd='password', db='ematewha', charset ='utf8', use_unicode=True)
cur = conn.cursor()

# connection to instagram_api
client_id = 'id'
client_secret = 'secret'  # placeholder, like the other credentials above
access_token = 'given-token'
client_ip = 'ip'
api = instagram.InstagramAPI(client_id=client_id, client_secret=client_secret, client_ips=client_ip, access_token=access_token)

# fetch upso_id and name_refined from the upso_refined table in MySQL
cur.execute('SELECT upso_id from upso_refined')
upso_id = cur.fetchall()


cur.execute('SELECT name_refined from upso_refined')
upso_refine = cur.fetchall()

# search with upso_refine from instagram
Example #54
0
from collections import Counter
from konlpy.tag import Twitter


def count_wordfreq(data):
    twitter = Twitter()
    nouns = twitter.nouns(data)
    count = Counter(nouns)

    return count
Example #55
0
import pymysql


def main():
    # def job():
    conn = pymysql.connect(host='192.168.0.61',
                           user='******',
                           password='******',
                           db='one_db',
                           charset='utf8mb4')
    cursor = conn.cursor()

    sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s'
    cursor.execute(sql, 'N')

    original = cursor.fetchall()

    print('original data')
    print(original)

    # filter using the new-word (neologism) dictionary
    sql = 'SELECT word FROM tb_newdic'
    cursor.execute(sql)

    newdic = cursor.fetchall()

    # print('new-word dictionary')
    # print(newdic)

    # fetch the exception-dictionary entries
    sql = 'SELECT word FROM tb_excdic'
    cursor.execute(sql)

    excdic = cursor.fetchall()
    print('exception dictionary')
    print(excdic)

    originalList = []
    for data in original:
        dataList = list(data)

        for word in newdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(word[0], 'in', dataList[1], 'is a word in the new-word dictionary.')
                dataList[1] = dataList[1].replace(word[0], '')

                sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'
                cursor.execute(sql, (dataList[0], word[0], dataList[2]))
                conn.commit()

        for word in excdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(word[0], 'in', dataList[1], 'is a word in the exception dictionary.')
                dataList[1] = dataList[1].replace(word[0], '')

        originalList.append(dataList)

    original = originalList

    # analyze with the Twitter tagger
    from konlpy.tag import Twitter
    twitter = Twitter()

    tresult = []

    for data in original:
        tresult.append([data[0], twitter.nouns(data[1]), data[2]])
        print(twitter.pos(data[1]))

    # check the Twitter analysis results
    print('twitter result')
    print(tresult)

    # analyze with Komoran
    from konlpy.tag import Komoran
    komoran = Komoran()

    kresult = []

    for data in tresult:
        words = data[1]

        # flag for whether analysis and processing finished without problems: True on success, False on failure
        state = True

        for word in words:
            try:
                pos_type = komoran.pos(word)[0][1]  # renamed from `type` to avoid shadowing the built-in
                if pos_type == 'NNG' or pos_type == 'NNP':
                    kresult.append([data[0], komoran.morphs(word)[0]])

                    # flag for whether the word exists in the exception dictionary: True if present, False otherwise
                    exist = False
                    # words in the exception dictionary are filtered out before the INSERT
                    for exc in excdic:
                        sql = 'SELECT INSTR(%s, %s)'
                        cursor.execute(sql, (word, exc[0]))

                        count = cursor.fetchone()
                        if count[0] != 0:
                            print(word + ' matches ' + exc[0] + ' in the exception dictionary')
                            exist = True
                            break

                    if exist:
                        continue

                    # INSERT only NNG/NNP types into the DB
                    # rollback on exception, otherwise commit
                    sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'

                    try:
                        if len(komoran.morphs(word)[0]) != 1:
                            cursor.execute(
                                sql,
                                (data[0], komoran.morphs(word)[0], data[2]))

                    except Exception as err:
                        state = False
                        print('ERROR: failed to insert the komoran result for post ' + str(data[0]))
                        print(str(err))
                        conn.rollback()
                    else:
                        conn.commit()

            except Exception as err:
                state = False
                print('ERROR: error while analyzing keywords with komoran')
                continue

        ssql = 'UPDATE test_original SET state = %s WHERE ono = %s'
        state = 'Y' if state else 'E'
        cursor.execute(ssql, (state, data[0]))

        conn.commit()

    # check the Komoran analysis results
    print('komoran result')
    print(kresult)

    print('-----')
    print('done')


# schedule.every().day.at("").do(job)
#
# while 1:
#     schedule.run_pending()
#     time.sleep(1)
Example #56
0
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.corpus import kolaw
from konlpy.tag import Twitter

# 1. Open the comment file crawled in the previous post (read-only)
file = open('./test4.txt', 'r', encoding='utf-8')
lines = file.readlines()

# 2. Store all the comments in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

# 3. Create the Twitter morphological analyzer
twitter = Twitter()

# 4. Run morphological analysis on each sentence
sentences_tag = []
for sentence in okja:
    morph = twitter.pos(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. Keep only nouns and adjectives and collect them into a list
docs = []
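# A minimal sketch (assumption, not the original continuation) of step 5: keep only
# Noun/Adjective tokens from each tagged sentence.
for sentence in sentences_tag:
    docs.append([word for word, tag in sentence if tag in ('Noun', 'Adjective')])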
Example #57
0
# -*- coding:utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import json
import operator 
import urllib2


from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Twitter

twitter = Twitter()

#f = open("seoul_data2.txt", "r")
#f = open("polatics.txt", "r")
# current 500 articles
f = urllib2.urlopen("http://polatics.news/all").read().split('\n')
f.reverse()
f = f[0:400]

for i in f:
	print i 

print "line : %d" %(len(f))
f2 = open("polatics_out.txt", "w")

voca = {}
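
# A minimal sketch (assumption, not the original continuation): count noun frequencies
# over the fetched articles with the Twitter tagger and write the top words to the file.
for line in f:
    for noun in twitter.nouns(line):
        voca[noun] = voca.get(noun, 0) + 1

for word, freq in sorted(voca.items(), key=operator.itemgetter(1), reverse=True)[:50]:
    f2.write('%s\t%d\n' % (word, freq))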
Example #58
0
# -*- coding:utf-8 -*-
"""
Title: Parts of speech tagging using KoNLPy
Purpose: To compare the POS tagging results of KoNLPy's analyzers
"""

from konlpy.tag import Kkma
from konlpy.tag import Okt
from konlpy.tag import Hannanum
from konlpy.tag import Komoran
from konlpy.tag import Twitter

if __name__ == '__main__':

    kkma = Kkma()
    okt = Okt()
    komoran = Komoran()
    hannanum = Hannanum()
    twitter = Twitter()

    # Only Kkma can split sentences
    print("kkma sentence splitting : ", kkma.sentences("네 안녕하세요 반갑습니다."))

    # Comparison of the KoNLPy taggers' part-of-speech output
    print("okt morphological analysis : ", okt.pos(u"집에 가면 감자 좀 쪄줄래?"))  # --> OK
    print("kkma morphological analysis : ", kkma.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("hannanum morphological analysis : ", hannanum.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("komoran morphological analysis : ", komoran.pos(u"집에 가면 감자 좀 쪄줄래?"))
    print("twitter morphological analysis : ", twitter.pos(u"집에 가면 감자 좀 쪄줄래?"))  # --> OK
Example #59
0
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
# Open the file with utf-16 encoding and extract the text --- (※1)
fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
soup = BeautifulSoup(fp, "html.parser")
body = soup.select_one("body > text")
text = body.getText()
# Process the text line by line --- (※2)
twitter = Twitter()
word_dic = {}
lines = text.split("\n")
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == "Noun":  # check for nouns --- (※3)
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1  # count it
# Print the most frequently used nouns --- (※4)
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:
    print("{0}({1}) ".format(word, count), end="")
print()
Example #60
0
import pickle
from konlpy.tag import Kkma
from konlpy.tag import Twitter
import jpype
import os

twitter = Twitter()
f = open('pickle.pickle','rb')
data = pickle.load(f)

#wordbag = []
doc_list = []
termdoc = {}

for datum in data:
	doc_list.append(datum['no'])
	
#data = None
#gc.collect()
	
for datum in data:
	doc_id = datum['no']
	lec_no = datum['lec_no'] #
	pos = twitter.pos(datum['eval_content'],stem = True)
	for p in pos:
		tag = p[1]
		# the original `('Exclamation' or 'Josa' or ...) in tag` only ever tested 'Exclamation'
		if tag in ('Exclamation', 'Josa', 'Eomi', 'Suffix', 'Punctuation', 'Foreign', 'Alpha', 'Unknown', 'KoreanParticle', 'Hashtag', 'ScreenName'):
			continue
		if p[0] not in termdoc:
			termdoc[p[0]] = dict.fromkeys(doc_list,0)
		termdoc[p[0]][doc_id] += 1
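
# A minimal sketch (assumption, not part of the original): weight the raw counts in
# `termdoc` with TF-IDF, a common next step for a term-document structure like this one.
import math

n_docs = len(doc_list)
tfidf = {}
for term, doc_counts in termdoc.items():
    df = sum(1 for c in doc_counts.values() if c > 0)
    idf = math.log(float(n_docs) / (1 + df))
    tfidf[term] = {doc: tf * idf for doc, tf in doc_counts.items()}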