Example #1
class AnalysisDiction:
    """
    This class analyzes Korean text using the KoNLPy Kkma and Twitter taggers.
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate a Kkma and/or Twitter tagger instance.
        :param on_kkma: create a Kkma instance if True
        :param on_twitter: create a Twitter instance if True
        """
        if on_kkma:
            self.kkma = Kkma()
        if on_twitter:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze a string with Kkma. Its behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: one of 'morphs', 'nouns' or 'pos'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze a string with Twitter. Its behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: one of 'morphs', 'nouns', 'pos' or 'posmore'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
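A minimal usage sketch for the class above, assuming konlpy is installed and Kkma/Twitter are imported from konlpy.tag in the same module:

from konlpy.tag import Kkma, Twitter

# Build both taggers and run a couple of analyses.
analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
print(analyzer.analyzer_kkma("한국어 분석을 시작합니다", 'nouns'))
print(analyzer.analyzer_twitter("한국어 분석을 시작합니다", 'pos'))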
Example #2
def pos_tagging_noun(text):
    noun_terms_list = []

    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)

    for item in pos_list:
        if (item[1] == 'Noun'):
            noun_terms_list.append(item)

    return noun_terms_list
Example #3
def pos_tagging(text):
    available_terms_list = []

    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)

    for item in pos_list:
        if item[1] == 'Verb' or item[1] == 'Adjective':
            available_terms_list.append(item)

    return available_terms_list
Example #4
    def _twitter_parse(self, str_arr, tag_combine=True):
        """

        :param h5file:
        :return:
        """
        twitter = Twitter(jvmpath=None)
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(twitter.pos(str(data)), tag_combine=tag_combine)
        return return_arr
class KorDisintegrator:
    def __init__(self):
        self.ko_twitter = Twitter()

    def convert2simple(self, sentence="", norm=True, stem=True):
        disintegrated_sentence = self.ko_twitter.pos(sentence, norm=norm, stem=stem)
        convert_sentence = []

        for w, t in disintegrated_sentence:
            if t not in ["Eomi", "Josa", "KoreanParticle", "Punctuation"]:
                convert_sentence.append(w)
        return " ".join(convert_sentence)
Example #6
def create_wordbag(x):
	wordbag = []
	if(x['eval_content']) is None:
		return wordbag	
	twitter = Twitter()
	for text in twitter.pos(x['eval_content'], stem = True):
		tag = text[1]
		if tag in unneeded:
			continue

		word = text[0]
		wordbag.append(word)
	return wordbag
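The function above references a global list named unneeded and expects a dict with an 'eval_content' key; neither is shown in this excerpt. A minimal, hypothetical wiring sketch:

from konlpy.tag import Twitter

# Assumption: the original's unneeded set is not shown here; these tags are only a plausible guess.
unneeded = ['Josa', 'Eomi', 'Punctuation']

row = {'eval_content': '배송이 빠르고 좋았어요'}  # hypothetical review row
print(create_wordbag(row))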
Example #7
def main():
    """
        konlpy 사용시 주의 사항
        자바 설치 및 세팅 필요
        JAVA_HOME 세팅이 필요합니다.
        export JAVA_HOME=$(/usr/libexec/java_home)
    """
    konl = Twitter()
    file_path = '/Users/bongster/Downloads/20160528_jiana.csv'
    with open(file_path, 'rb') as csv_file:
        inforeader = csv.reader(csv_file)
        for row in inforeader:
            r = konl.pos(unicode(row[4], 'utf-8'), norm=True, stem=True)
            print '=' * 20
            for txt, post in r:
                print txt, post
            print '=' * 20
Example #8
def main(_):


    is_train = True  # if False then test
    
    if is_train :
        train()
              
    else:
        checklist=['Exclamation','Alpha','URL']
        twitter=Twitter()
        
        dic_file_x='data/xproject_class.dict.pkl'
        worddict_x = dict()

        worddict_x = load_dict(dic_file_x)   
        
        x, x_mask,prediction=build_test('/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')
        
        while 1:
            choice=raw_input("Me: ")
            if choice in ["Q","q"]: break
            #print choice
            
            choice=choice.decode('utf-8')
            
            sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True)  if s[1] not in checklist])
            
    
            words=(word_tokenize(sen.strip().lower()))
            #print ' '.join(words)
            seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
            seqs = [s if s<600 else 1 for s in seqs]
            seqs=[seqs]
            res=test_data(seqs,x, x_mask,prediction,'/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83')
            
            #print res
            print "class: "+str(res)
class Parser(object):
    def __init__(self, filename=None, nth=-1):
        self.filename = filename
        self.nth = nth
        self.twitter = Twitter()
        self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log')

    def parse_sentence(self, sentence):
        return self.twitter.pos(sentence)

    def parse_all_generator(self, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        if nth is None:
            nth = self.nth
        for row in ParseUtils.iter_csv(filename):
            try:
                parsed = self.parse_sentence(row[nth])
                concated = ' '.join([ParseUtils.concat_tuple(x) for x in parsed])
                row[nth] = concated
            except BaseException as e:
                msg = '{error:<80}  |  {data}'.format(error=str(e), data=ParseUtils.list_to_csv(row))
                self.logger.error(msg)
                continue
            yield row

    def extract_parsed(self, out_filename, filename=None, nth=None):
        if filename is None:
            filename = self.filename or click.prompt('file name is required')
        filelength = ParseUtils.file_length(filename)
        if nth is None:
            nth = self.nth
        with open(out_filename, 'w') as f:
            csv_writer = csv.writer(f)
            for row in Progress(self.parse_all_generator(), filelength, 10):
                csv_writer.writerow(row)
filter_below_frequency = 7

import collections
import csv

#read input file
if debug_mode:
    print("reading input.txt file")
with open(input_file_path, 'r') as myfile:
    input_content = myfile.read()
    input_content = input_content  #.decode('string-escape').decode("utf-8")

#get nouns, adjectives and verbs
if debug_mode:
    print("analysing text")
all_words = twitter.pos(input_content, norm=True, stem=True)

if debug_mode:
    print("filtering words")
filtered_words = []
for word, word_type in all_words:
    #filter only important words
    if (word_type == "Noun" or word_type == "Verb" or word_type == "Adjective"):
        #print(word)
        filtered_words.append(word)

if debug_mode:
    print("counting words")
words_counter = collections.Counter(filtered_words)

total_words_count = sum(words_counter.values())
Example #11
class DeepCNModule:
    def __init__(self):
        self.step = 1000
        self.batch_size = 100
        self.learning_rate = 0.01

        self.n_layers = 5
        self.n_rnn_hidden = 50
        self.n_rnn_output_size = 10
        self.n_hidden = 300

        self.epoch = 100

        self.twitter = Twitter()

        self.prod_len_max = 50

        self.data_path = './data.txt'
        if not os.path.exists(self.data_path):
            print('no data file')
            return

        self.vocab_path = './vocab'
        self.vocab_path_dict = {
            'name': 'name.txt',
            'model': 'model.txt',
            'price': 'price.txt',
            'maker': 'maker.txt',
            'cmpnycate': 'cmpnycate.txt',
            'img': 'img.txt',
            'cate': 'cate.txt'
        }

        # set up the vocabularies
        self.vocab = {}
        self.make_vocab()

        self.label_size = 0

    def make_vocab(self):
        self.vocab['name'] = {}
        self.vocab['model'] = {}
        self.vocab['price'] = {}
        self.vocab['maker'] = {}
        self.vocab['cmpnycate'] = {}
        self.vocab['img'] = {}
        self.vocab['cate'] = {}
        if os.path.exists(self.vocab_path):
            for data in self.vocab_path_dict:
                if os.path.exists(self.vocab_path + "/" +
                                  self.vocab_path_dict[data]):
                    os.remove(self.vocab_path + "/" +
                              self.vocab_path_dict[data])
            os.rmdir(self.vocab_path)

        os.makedirs(self.vocab_path)

        with open(self.data_path, 'rb') as f:
            while True:
                line = f.readline().decode('utf-8')
                if not line: break

                line = line.replace("\n", '')

                _, maker, model, prodname, lcatecode, price, cmpny_cate, img_code = line.split(
                    '\t')

                self.add_vocab(maker, 'maker')
                self.add_vocab(model, 'model')
                self.tokenize_word_data(prodname, 'name')
                self.add_vocab(lcatecode, 'cate')
                self.add_vocab(price, 'price')
                self.add_vocab(cmpny_cate, 'cmpnycate')
                self.add_vocab(img_code, 'img')
        self.show_dict_status()

        self.save_vocab()

    def tokenize_word_data(self, data, param):
        result = []
        for data in self.twitter.pos(data, norm=True, stem=True):
            if data[1] == 'Foreign':
                continue
            if data[1] == 'Punctuation':
                continue
            if data[1] == 'Josa':
                continue
            result.append(self.add_vocab(data[0], param))

        while True:
            if len(result) >= self.prod_len_max:
                break

            result.append(0)
        return result

    def add_vocab(self, data, param):
        vocab_code = 0
        if data not in self.vocab[param].keys():
            vocab_code = len(self.vocab[param]) + 1
            self.vocab[param][data] = vocab_code
        else:
            vocab_code = self.vocab[param][data]

        return vocab_code

    def save_vocab(self):
        for data in self.vocab_path_dict:
            with open(self.vocab_path + '/' + self.vocab_path_dict[data],
                      'w') as f:
                f.write(json.dumps(self.vocab[data]))

    def read_vocab(self, path):
        vocab = {}
        with open(self.vocab_path + "/" + path, 'r') as f:
            vocab = json.loads(f.read())

        return vocab

    def set_vocab(self):
        if os.path.exists(self.vocab_path):
            try:
                self.vocab['name'] = self.read_vocab(
                    self.vocab_path_dict['name'])
                self.vocab['model'] = self.read_vocab(
                    self.vocab_path_dict['model'])
                self.vocab['price'] = self.read_vocab(
                    self.vocab_path_dict['price'])
                self.vocab['maker'] = self.read_vocab(
                    self.vocab_path_dict['maker'])
                self.vocab['cmpnycate'] = self.read_vocab(
                    self.vocab_path_dict['cmpnycate'])
                self.vocab['img'] = self.read_vocab(
                    self.vocab_path_dict['img'])
                self.vocab['cate'] = self.read_vocab(
                    self.vocab_path_dict['cate'])

                self.show_dict_status()
            except Exception as e:
                print(str(e))
                self.make_vocab()
        else:
            self.make_vocab()

    def show_dict_status(self):
        print('name vocab = ', str(len(self.vocab['name'])))
        print('maker vocab = ', str(len(self.vocab['maker'])))
        print('model vocab = ', str(len(self.vocab['model'])))
        print('cate vocab = ', str(len(self.vocab['cate'])))
        print('price vocab = ', str(len(self.vocab['price'])))
        print('cmpnycate vocab = ', str(len(self.vocab['cmpnycate'])))
        print('img vocab = ', str(len(self.vocab['img'])))

    def make_model(self):
        print('make_model')
        # input layer
        graph1 = tf.Graph()

        with graph1.as_default():
            prodname = tf.placeholder(tf.float32, [None, None, 50],
                                      name='prodname')
            maker = tf.placeholder(tf.float32, [None, None, 1], name='maker')
            model = tf.placeholder(tf.float32, [None, None, 1], name='model')
            cate = tf.placeholder(tf.float32, [None, None, 1], name='cate')
            price = tf.placeholder(tf.float32, [None, None, 1], name='price')
            cmpnycate = tf.placeholder(tf.float32, [None, None, 1],
                                       name='cmpnycate')
            img = tf.placeholder(tf.float32, [None, None, 1], name='img')

            label = tf.placeholder(tf.int32, [None], name='label')

            w = tf.Variable(tf.random_normal([285, self.label_size]))
            b = tf.Variable(tf.random_normal([self.label_size]))

            # wire up the RNNs
            # RNN outputs
            rnn_list = []

            rnn_list.append(
                self._getoutput_data(prodname, 'prodname', hidden_size=50))
            rnn_list.append(self._getoutput_data(maker, 'maker',
                                                 hidden_size=1))
            rnn_list.append(self._getoutput_data(model, 'model',
                                                 hidden_size=1))
            rnn_list.append(self._getoutput_data(cate, 'cate', hidden_size=1))
            rnn_list.append(self._getoutput_data(price, 'price',
                                                 hidden_size=1))
            rnn_list.append(
                self._getoutput_data(cmpnycate, 'cmpnycate', hidden_size=1))
            rnn_list.append(self._getoutput_data(img, 'img', hidden_size=1))

            # concatenation layer
            concate_data = tf.concat(rnn_list, 1)

            hidden1 = tf.layers.dense(concate_data,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden2 = tf.layers.dense(hidden1,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden3 = tf.layers.dense(hidden2,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden4 = tf.layers.dense(hidden3,
                                      self.n_hidden,
                                      activation=tf.nn.relu)

            # output layer
            output = tf.layers.dense(hidden4,
                                     self.label_size,
                                     activation=tf.nn.softmax)
            output = tf.transpose(output, [1, 0, 2])
            output = output[-1]
            print(output)
            model = tf.matmul(output, w) + b

            cost = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model,
                                                               labels=label))
            print('cost')
            print(cost)
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
            print('optimizer')
            print(optimizer)
            train_step = optimizer.minimize(cost)

            predict = tf.cast(tf.argmax(model, 1), tf.int32)

            return output, cost, train_step, graph1, predict

    def _get_data(self, outputdata, variable_name):
        with tf.variable_scope(variable_name):
            # fully connected layer
            hidden1 = tf.layers.dense(outputdata,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden2 = tf.layers.dense(hidden1,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden3 = tf.layers.dense(hidden2,
                                      self.n_hidden,
                                      activation=tf.nn.relu)
            hidden4 = tf.layers.dense(hidden3,
                                      self.n_hidden,
                                      activation=tf.nn.relu)

            # output layer
            output = tf.layers.dense(hidden4,
                                     self.n_rnn_output_size,
                                     activation=tf.nn.softmax)
        return output

    def _getoutput_data(self, input_data, variable_name, hidden_size):
        with tf.variable_scope(variable_name):
            outputdata, _ = tf.nn.dynamic_rnn(
                self._build_cells(hidden_size=hidden_size),
                input_data,
                dtype=tf.float32)

        return self._get_data(outputdata, variable_name + "layers")

    def _cell(self, output_keep_prob, hidden_size):
        rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        rnn_cell = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell, output_keep_prob=output_keep_prob)
        return rnn_cell

    def _build_cells(self, hidden_size, output_keep_prob=0.5):
        enc_cell = tf.nn.rnn_cell.MultiRNNCell([
            self._cell(output_keep_prob, hidden_size)
            for _ in range(self.n_layers)
        ])
        return enc_cell

    def train(self):

        print('train')
        # load the training data
        prodname_list, maker_list, model_list, price_list, catecode_list, cmpnycate_list, imgcode_list, label_list = self.data_load(
            self.data_path)

        # build the model
        output, cost, train_step, graph, predict = self.make_model()

        with tf.Session(graph=graph) as sess:
            # TODO: add a saver later
            sess.run(tf.global_variables_initializer())
            print(type(prodname_list))
            print(type(maker_list))
            print(type(model_list))
            print(type(price_list))
            print(type(catecode_list))
            print(type(cmpnycate_list))
            print(type(imgcode_list))
            print(type(label_list))

            for i in range(self.epoch):
                print('epoch: ', str(i))
                _, cost_val = sess.run(  # avoid overwriting the cost tensor
                    [train_step, cost],
                    feed_dict={
                        'prodname:0': prodname_list,
                        'maker:0': maker_list,
                        'model:0': model_list,
                        'cate:0': catecode_list,
                        'price:0': price_list,
                        'cmpnycate:0': cmpnycate_list,
                        'img:0': imgcode_list,
                        'label:0': label_list
                    })

                print('cost: %f' % cost_val)

    def data_load(self, data_path):
        prodname_list = []
        maker_list = []
        model_list = []
        price_list = []
        catecode_list = []
        cmpnycate_list = []
        imgcode_list = []

        label_list = []

        try:
            with open(data_path, 'rb') as f:
                while True:
                    line = f.readline().decode('utf-8')
                    if not line: break

                    line = line.replace("\n", '')

                    prodcode, maker, model, prodname, lcatecode, price, cmpny_cate, img_code = line.split(
                        '\t')

                    maker_list.append([[self.add_vocab(maker, 'maker')]])
                    model_list.append([[self.add_vocab(model, 'model')]])
                    prodname_list.append(
                        [self.tokenize_word_data(prodname, 'name')])
                    catecode_list.append([[self.add_vocab(lcatecode, 'cate')]])
                    price_list.append([[self.add_vocab(price, 'price')]])
                    cmpnycate_list.append(
                        [[self.add_vocab(cmpny_cate, 'cmpnycate')]])
                    imgcode_list.append([[self.add_vocab(img_code, 'img')]])

                    label_list.append(prodcode)
            self.label_dic = {n: i for i, n in enumerate(label_list)}
            return_label = [i for i, n in enumerate(label_list)]

            self.show_dict_status()
            self.save_vocab()

        except Exception as e:
            print(str(e))
            return -1

        self.label_size = len(label_list)
        print(return_label)

        return np.array(prodname_list),\
               np.array(maker_list), \
               np.array(model_list), \
               np.array(price_list), \
               np.array(catecode_list), \
               np.array(cmpnycate_list),\
               np.array(imgcode_list),\
               np.array(return_label)

    def predict(self):
        pass
Example #12
from konlpy.tag import Twitter

twitter = Twitter()
malist = twitter.pos('아버지 가방에 들어가신다.', norm=True, stem=True)

print(malist)
    def tokenize(self, doc):
        pos_tagger = Twitter()
        return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
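A standalone sketch of the same tokenization, assuming konlpy is installed; it produces a list of 'morpheme/Tag' strings:

from konlpy.tag import Twitter

pos_tagger = Twitter()
tokens = ['/'.join(t) for t in pos_tagger.pos("아버지가 방에 들어가신다", norm=True, stem=True)]
print(tokens)  # each entry looks like 'morpheme/Tag'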
Example #14
    # params = urllib.parse.urlencode({
    #     "_callback":"",
    #     "q":ret
    # })
    # data = urllib.request.urlopen("https://m.search.naver.com/p/csearch/dcontent/spellchecker.nhn?" + params)
    # data = data.read().decode("utf-8")[1:-2]
    # data = json.loads(data)
    # data = data["message"]['result']["html"]
    # data = soup = BeautifulSoup(data,"html.parser").getText()
    # return data
    return ret


i = 0
words = []
while True:
    if i > 3:
        break
    f = open('news' + str(i) + '.txt', 'r', encoding='utf8')
    line = f.readline()
    twitter = Twitter()
    malist = twitter.pos(line, norm=True)
    for x in malist:
        if not x[1] in ["Punctuation"]:
            words.append(x[0])
        if x[0] == ".":
            words.append(x[0])
    i += 1
dic = make_dic(words)

print(make_sentence(dic))
Example #15
os.chdir('%s/' % currdir)
print currdir

with open("text8", 'r') as f:
    for line in f:
        sentences.append(line[:100])
        
print sentences 
'''      
with open("/home/chuckgu/Desktop/project/preprocessing/x-project/word2vec/namuwiki160229/namuwiki_20160229.json") as json_file:
    json_data = json.load(json_file)

for i,j in enumerate(json_data): 
    print i
    
    sentences=sent_tokenize(j["text"])
    
    if len(sentences)>5:
        for line in sentences:
            line=line.decode('utf-8')
            #txt.append(' '.join(twitter.morphs(line)))
            txt.extend([s[0]for s in twitter.pos(line,norm=True)  if s[1] not in checklist])
            
    if i==120000: break
    
#np.savetxt("namu.txt",txt,fmt='%s') 
import cPickle as pkl
f = open('namu_wo_josa.pkl', 'wb')
pkl.dump(txt, f, -1)
f.close()
print 'saved'
Example #16
    originalList.append(dataList)
    i = 1

original = originalList
# print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
# analyze with Twitter
from konlpy.tag import Twitter
twitter = Twitter()

tresult = []
# print('')
print('--- 형태소 분석 결과 : Twitter ---')
for data in original:
    tresult.append([data[0], twitter.nouns(data[1]), data[2]])
    print(twitter.pos(data[1]))

print('디딩')

# check the Twitter analysis results
print('')
print('===> 명사')
print(tresult[0][1])

print('')
print('--- 형태소 분석 결과 : Komoran ---')
# print(tresult)
# print(tresult[1][1])

# analyze with Komoran
from konlpy.tag import Komoran
Example #17
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np

pos_vectors = KeyedVectors.load_word2vec_format('pos.vec', binary=False)
pos_vectors.most_similar("('남자','Noun')")
twitter = Twitter()
word = "대통령이"
pos_list = twitter.pos(word, norm=True)
word_vector = np.sum([pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list], axis=0)
Example #18
train_sentence_list = []
train_tag_list = []

for i in temp_list:
	if i != temp_list[0]:
		a = i.split('\t')
		train_sentence_list.append(a[1])
		train_tag_list.append(int(a[2]))

twitter = Twitter()
# create the twitter object.

train_sentence_w2v = []
for i in train_sentence_list:    
	temp_bef = twitter.pos(i, norm=True, stem=True)
	temp_del_pos = []
	for j in temp_bef:
		if j[1] != 'Josa' and j[1] != 'Punctuation':  
			temp_del_pos.append(j[0])               # drop Josa and Punctuation since they are not needed.
	train_sentence_w2v.append(temp_del_pos)


################# train set above, test set below ##############################
file = open('./ratings_test.txt', encoding="utf-8")
temp_list = []
cutting = 0
for line in file:
	if cutting == 10:
		break
	cutting += 1
Example #19
    
    
elif mode=='te':
    if os.path.isfile(filepath): model.load(filepath)
    else: 
        raise IOError('loading error...')
    
    checklist=['Exclamation','Alpha','URL']
    twitter=Twitter()

    
    while 1:
        choice=raw_input("Me: ")
        if choice in ["Q","q"]: break
        #print choice
        
        choice=choice.decode('utf-8')
        
        sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True)  if s[1] not in checklist])
        

        words=(word_tokenize(sen.strip().lower()))
        #print ' '.join(words)
        seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words]
        seqs = [s if s<n_words else 1 for s in seqs]
        mask_set_x=np.ones((len(seqs))).astype('float32')
        res=model.predict(seqs,mask_set_x)
        
        #print res
        print "class: "+str(res)
with open("c:\\Users\\myeon\\Desktop\\network_text_문재인.txt", "rb") as f:
    headlines = pickle.load(f)

with open("c:\\Users\\myeon\\Desktop\\network_text_문재인뉴스.txt", "rb") as f:
    press = pickle.load(f)

press = list(map(lambda x: x.replace('언론사 선정', ''), press))

t = Twitter()

node_dic = defaultdict(int)
edges = []

for headline, news in zip(headlines, press):
    tags_ko = t.pos(headline)
    temp = []

    for word, pumsa in tags_ko:
        # print(word, pumsa)
        # if len(word) > 1 and (pumsa == 'Noun' or pumsa == 'Verb' or pumsa == 'Adjective'):
        # if len(word) > 1 and (pumsa == 'Noun' or pumsa == 'Verb'):
        if len(word) > 1 and (pumsa == 'Noun'):
            node_dic[word] += 1
            temp.append(word)

    edges.extend(generate_edges(temp, news))

sorted_nodes = sorted(node_dic, key=node_dic.get, reverse=True)
press_count = defaultdict(int)
for i in press:
Example #21
def pageCrawl(conn):
    url_input = "환율"
    plus_url = urllib.parse.quote_plus(url_input,
                                       safe='/',
                                       encoding='utf-8',
                                       errors='strict')
    pageNum = 1

    print()

    morphs = []
    cnt = 0
    while True:
        url = f'https://search.naver.com/search.naver?&where=news&query={plus_url}&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=0&ds=&de=&docid=&nso=so:dd,p:all,a:all&mynews=0&start={pageNum}'
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'html.parser')

        lis = soup.select(
            '#main_pack > section > div > div.group_news > ul li')

        pageNum += 10

        for i in lis:
            new_tit = ''
            news_link = ''
            news_name = ''
            news_date = ''
            news_article = ''

            new_tit = i.select('div.news_wrap.api_ani_send > div > a')

            if len(new_tit) == 0:
                continue
            else:
                new_tit[0].attrs['title']
                new_tit[0].attrs['href']

                if len(i.select("a.info")) == 2:

                    try:

                        i.select("a.info")[1].attrs['href']
                        url2 = i.select("a.info")[1].attrs['href']

                        headers = {
                            "user-agent":
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
                        }

                        html2 = requests.get(url2, headers=headers)
                        soup2 = BeautifulSoup(html2.text, 'html.parser')

                        # fetch the press name
                        news_name = soup2.select_one(
                            'div.press_logo > a.nclicks(atp_press) > img')

                        # fetch the date
                        news_date = soup2.find('span', {'class': 't11'})

                        news_article = soup2.select_one(
                            '#articleBodyContents').text
                        if news_article is None:
                            continue
                        else:
                            news_article = soup2.select_one(
                                '#articleBodyContents').text
                            news_article = news_article.replace(
                                '// flash 오류를 우회하기 위한 함수 추가', '')
                            news_article = news_article.replace(
                                'function _flash_removeCallback() {}', '')
                            news_article = news_article.replace('동영상 뉴스', '')
                            news_article = news_article.replace(
                                '무단전재 및 재배포 금지', '')
                            news_article = news_article.replace('\'', '')
                            news_article = news_article.strip()
                            pretty_news_article = re.sub(
                                '[가-힣]{2,3} *기자|▶.*|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|\[[가-힣]{2,5} *|\[[가-힣].*\]]',
                                '', news_article)  # idk

                        title = new_tit[0].attrs['title']
                        link = new_tit[0].attrs['href']
                        company = news_name.attrs['title']
                        upload_date = news_date.text
                        content = pretty_news_article

                        twitter = Twitter()
                        sentence = twitter.pos(pretty_news_article)

                        noun_adj_adv_list = []

                        for word, tag in sentence:
                            if tag in [
                                    "Noun"
                            ] and ("것" not in word) and ("내" not in word) and (
                                    "나"
                                    not in word) and ("수" not in word) and (
                                        "게" not in word) and ("말" not in word):
                                noun_adj_adv_list.append(word)
                        words = ",".join(noun_adj_adv_list)

                        cnt += 1

                        if cnt > 100:
                            naver_news_remove(conn)
                            return
                        else:
                            print(str(cnt), '기사입력중 : ', title)
                            naver_news_info(conn, title, link, company,
                                            upload_date, content, words)

                    except:
                        print('기사입력오류')
Example #22
# from nltk.corpus import stopwords

# text = "/Users/jeongyoonlee/Desktop/kakao.txt"
text = open('/Users/jeongyoonlee/Desktop/kakao.txt').read()

wordcloud = WordCloud(
    background_color="white",
    font_path='/Users/jeongyoonlee/Desktop/CookieRun Regular.ttf',
    max_font_size=100).generate(text)
# mask = np.array(Image.open('/Desktop/joy.png'))

twitter = Twitter()
morphs = []

for sentence in text.splitlines():  # iterate over lines rather than individual characters
    morphs.append(twitter.pos(sentence))
    print(morphs)

noun_adj_adv_list = []
for sentence in morphs:
    for word, tag in sentence:
        if tag in ['Noun'] and ("것" not in word) and ("내" not in word) and (
                "나" not in word) and ("수" not in word) and (
                    "게" not in word) and ("말" not in word):
            noun_adj_adv_list.append(word)

print(noun_adj_adv_list)

count = Counter(noun_adj_adv_list)
words = dict(count.most_common())
Example #23
#! /usr/bin/python
# -*- coding: utf-8 -*-

from konlpy.corpus import kobill
from konlpy.tag import Twitter; t = Twitter()
from matplotlib import pyplot as plt

pos = lambda x: ['/'.join(p) for p in t.pos(x)]
docs = [kobill.open(i).read() for i in kobill.fileids()]

# get global unique token counts
global_unique = []
global_unique_cnt = []
for doc in docs:
    tokens = pos(doc)
    unique = set(tokens)
    global_unique += list(unique)
    global_unique = list(set(global_unique))
    global_unique_cnt.append(len(global_unique))
    print(len(unique), len(global_unique))

# draw heap
plt.plot(global_unique_cnt)
plt.savefig('heap.png')
from konlpy.tag import Kkma
kkma=Kkma()
kkma.sentences('한국어 분석을 시작합니다 재미있어요--')
kkma.nouns('한국어 분석을 시작합니다 재미있어요--') 
kkma.pos('한국어 분석을 시작합니다 재미있어요--') # morphological analysis

from konlpy.tag import Hannanum
hannanum=Hannanum()
hannanum.nouns('한국어 분석을 시작합니다 재미있어요--')
hannanum.pos('한국어 분석을 시작합니다 재미있어요--')

from konlpy.tag import Twitter
t=Twitter()
t.nouns('한국어 분석을 시작합니다 재미있어요--')
t.morphs('한국어 분석을 시작합니다 재미있어요--')
t.pos('한국어 분석을 시작합니다 재미있어요--')

from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image
text=open('DataScience-master\\data\\09. alice.txt').read()
alice_mask=np.array(Image.open('DataScience-master\\data\\09. alice_mask.png'))
stopwords=set(STOPWORDS)
stopwords.add('said')

import matplotlib.pyplot as plt
import platform

path='c:\Windows\Fonts\malgun.ttf'
from matplotlib import font_manager, rc
if platform.system()=='Darwin':
def croll(urlinput):
    #url_set = set({}) # set of every crawled url, to avoid duplicates across fake, real and mix
    ##----- crawling ------##

    #list1=[]
    keywordlist = [
        "맛집 \"제공 받아\"", "맛집 \"제공 받고\"", "맛집 \"후원 받아\"", "맛집 \"후원 받고\"",
        "맛집 \"소정의\" \"받고\"", "맛집 \"소정의\" \"받아\"", "맛집 \"원고료를\"",
        "맛집 \"지원 받고\"", "맛집 \"지원 받아\"", "맛집 \"업체로부터\""
    ]

    datastring = ''

    #urlinput="http://blog.naver.com/jys2432176/221156543724"

    #print(len(list1))

    nlp = Twitter()  # use the Twitter library
    #list1 = list(set(list1)) # crude code that removes duplicate urls...
    negative = ('아니다', '절대', '검색', '그냥', '듯', '같다', '대부분', '어디서', '그렇다', '전혀')
    regex = r'[가-힣, \s ]+'
    data = open('reviews_rawdata_4.txt', 'w', encoding='UTF-8')

    if "m.blog.naver.com" in urlinput:
        url = urlinput
    elif "blog.naver.com" in urlinput:
        where = urlinput.find("blog")
        url = "http://m." + urlinput[where:]
    else:
        #print("Wrong URL")
        result_pop("WURL")
        sys.exit(1)
    '''
    http://blog.naver.com/sldkfjalskdfj
    blog.naver.com/sdlkfjlk
    '''

    try:
        datastring = ''
        source_code = requests.get(url, timeout=5)
        soup = BeautifulSoup(source_code.text, "html.parser")
        # tag info for the post title
        if soup.find_all("h3", class_="tit_h3"):
            title = soup.find_all("h3", class_="tit_h3")[0]
        else:
            title = soup.find_all("h3", class_="se_textarea")[0]
        post_title = ""
        for i in range(len(title.contents)):
            post_title = post_title + str(title.contents[i])
        # when the post title contains the keyword "맛집"
        if "맛집" in post_title and "위드블로그" not in post_title and "홍보" not in post_title and "광고" not in post_title and "앱" not in post_title and "어플" not in post_title:
            # find the blog post body
            if soup.find_all(
                    "div",
                    class_="se_component_wrap sect_dsc __se_component_area"):
                article = soup.find_all(
                    "div",
                    class_="se_component_wrap sect_dsc __se_component_area")[0]
            else:
                article = soup.find_all("div", class_="post_ct")[0]
            # delete the map javascript code, save the info
            restname = ''
            temp = soup.find_all("span", class_="_mapInfo")  ## Naver map 1
            temp2 = soup.find_all('a',
                                  class_='se_map_link __se_link')  ## Naver map 2
            if temp:
                restname = str(soup.find_all("a", class_="tit"))
                restname = onlytext(restname)
                temp[0].decompose()
            elif temp2:
                restname = str(soup.find_all("div", class_="se_title")[0])
                restname = onlytext(restname)
                temp2[0].decompose()
            # save photo and emoticon info
            photo = 0
            temp = soup.find_all("span", class_="_img")  # photo tag 1
            temp_ = soup.find_all("img", class_="fx")  # photo tag 1-2
            temp2 = soup.find_all('img',
                                  class_='se_mediaImage __se_img_el')  # photo tag 2
            if temp or temp_:
                photo = len(temp) + len(temp_)
            elif temp2:
                photo = len(temp2)
            imo = 0
            temp = soup.find_all('img', class_='_sticker_img')  # emoticons
            temp2 = soup.find_all('img', class_='__se_img_el')  # photos + emoticons
            if temp:
                imo = len(temp)
            elif temp2:
                imo = len(temp2) - photo
            # data cleansing
            post_article = str(article)
            index1 = 0
            index2 = 0
            mainarticle = onlytext(post_article)
            for Searchekyword in [
                    "제공받아", "제공 받고", "제공받고", "후원 받아", "후원받아", "후원 받고", "후원받고",
                    "소정의", "원고료를", "지원 받고", "지원받고", "지원 받아", "지원받아", "업체로부터",
                    "제공 받아", "제돈", "제 돈", "내돈", "내 돈", "오빠가 사준", "오빠가사준",
                    "개인사비", "개인 사비"
            ]:
                KWindex = mainarticle.find(Searchekyword)
                if KWindex != -1:
                    for a in range(KWindex, -1, -1):
                        if not re.findall(regex, mainarticle[a]):
                            index1 = a
                            break
                    for a in range(KWindex, len(mainarticle)):
                        if not re.findall(regex, mainarticle[a]):
                            index2 = a
                            break
                    sentence = mainarticle[index1 + 1:index2]
                    sentence = [
                        x[0] for x in nlp.pos(sentence, norm=True, stem=True)
                    ]
                    '''                
                    for neg in negative:
                        if neg in sentence:
                            errorset.add(url)
                            print(url, sentence)
                            break
                    '''
                    #break

            datastring = datastring + "what\t%s\t%s\t%s\t%d\t%d\t%s\n" % (
                url, enter_tab(post_title), mainarticle[:index1 + 1] + ' ' +
                mainarticle[index2 + 1:], photo, imo, restname)
            data.write(datastring)
    except:
        print("error")
    #print(len(url_set))

    data.close()
    end_time = time.time()
    print("모든 프로세스: %f 분" % ((end_time - start_time) / 60))
Example #26
import csv

from konlpy.tag import Twitter


reader = csv.reader(open("../sample/top_song.csv",'r'))
writer = csv.writer(open("../sample/top_song_lemma.csv",'w'))
twitter = Twitter()

lema = str()

for i in reader:
    s = twitter.pos(i[4],norm=True)
    x = [i[0] for i in s if i[1] in ['Noun','Verb','Adjective','Alpha'] and len(i[0])>1]
    print(i[4],"\n",x,"\n"," ".join(x),"\n")

    result = [seg for seg in i]
    result.append(" ".join(x))
    writer.writerow(result)
Example #27
en_stop = get_stop_words('en')

## Create p_stemmer of class PorterStemmer
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()  # removes suffixes

# pprint(en_stop)

## list for tokenized documents in loop
texts = []
## loop through document list
for i in lyrics_set:
    # clean and tokenize document string
    i0 = i.lower()
    tokens_pos = tw.pos(i0)
    stopped_tokens = []
    for pos in tokens_pos:
        # print(pos[0], pos[1])
        if pos[1] == "Alpha" and pos[0] in en_stop:
            #print(pos[0], pos[1])
            continue
        #elif pos[1] in ["Josa", "Suffix", "Punctuation", "Number", "Eomi"]:
            # particles, suffixes, punctuation, numbers, verb endings
        elif pos[1] not in ["Noun", "Verb", "Adjective", "Adverb"]:
            # nouns, verbs, adjectives, adverbs
            #print(pos[0], pos[1])
            continue
        ## stemming instead of lemmatization
        ## whether the Porter approach is the most effective method still needs discussion
        #print("insert value:", pos[0], pos[1])
Example #28
    return manu_word

def manufacture_word2(word):
    manu_word = str(word).replace("'", '').replace("(", '').replace(" ", '')
    manu_word = str(manu_word).replace("]", '').replace("[", '').replace(")", '').replace(",", '/')

    return manu_word

if __name__ == "__main__":
    init()

    file = open("example.txt", 'r')

    exmaple = file.readline()
    TDM = np.zeros([WORD_NUM,1])

    exmaple_set = twiiter.pos(exmaple)
    for word in exmaple_set:
        try:
            TDM[int(word_to_number_dic[str(manufacture_word2(word))])][0] += 1.
        except:
            print(str(manufacture_word2(word)))

    weight_matrix = np.load("Weight_matrix.npy")

    result = weight_matrix.dot(TDM)
    max_index = result.argmax()

    print("분류 결과는 : ", keyword_set[max_index] , " 입니다")
Example #29
from konlpy.tag import Twitter
"""
twitter = Twitter()
malist = twitter.pos("아버지 가방에 들어가신다.", norm=True, stem=True)
print(malist)

malist = twitter.pos("그래욬ㅋㅋ?", norm=True, stem=True)
print(malist)

malist = twitter.pos("그래욬ㅋㅋ?", norm=True, stem=False)
print(malist)

twitter = Twitter()
malist = twitter.pos("2분할 지도 띄워줘", norm=True, stem=True)
print(malist)

twitter = Twitter()
malist = twitter.pos("오늘자 보고서 만들어서 울산으로 메일 보내줘", norm=True, stem=True)
print(malist)

twitter = Twitter()
malist = twitter.pos("몇 번을 쓰러지더라도 몇 번을 무너지더라도 다시 일어나라", norm=True, stem=True)
print(malist)
"""

twitter = Twitter()
malist = twitter.pos("아버지가방에들어가신다", norm=True, stem=True)
print(malist)
Example #30
        input_text = ""
        text_file_name = date
        count = 1
        urls = get_url_list(date)
        df = {}
        word_dic = {}
        # news article
        for url in urls:
            if url_already_exist(url) is True:
                continue
            count += 1
            print_url = gen_print_url(url)
            html = get_html(print_url)
            new_input = ext_body(html)
            #print("date : %s, count : %d" % (date, count))
            pos_list = twitter.pos(new_input)
            for word in pos_list: # for loop inside word count list
                if word[1] == "Noun" and len(word[0]) != 1:
                    if not (word[0] in word_dic):
                        word_dic[word[0]] = 0
                    word_dic[word[0]] += 1 # word count
        keys = sorted(word_dic.items(), key = lambda x:x[1], reverse=True)
        with open("wordcount_ko.csv", 'w', encoding="utf-8") as csvfile:
            pencil = csv.writer(csvfile, delimiter = ' ')
            for word, count in keys[:1000]:
            # pencil.writerow([date])
                pencil.writerow([word, count])

                print("{0}({1}) ".format(word,count), end="")
        print()
Example #31
    return {'exists({})'.format(word): (word in set(doc)) for word in base}


#val = alldatalist[0]['name']
#val = dl.Trim(val)

# extract only word
t = Twitter()
#kkma = Kkma()
#tokens_ko = kkma.nouns(val)
wordlist = [dl.Trim(item['name']) for item in alldatalist]

tokens_ko = []
templist = []
for i in range(len(wordlist)):
    temp = t.pos(wordlist[i], norm=True, stem=True)
    categ = alldatalist[i]['cate']
    tokens_ko.append((temp, categ))

from gensim.models import Word2Vec
min_count = 2
size = 50
window = 4

model = Word2Vec(tokens_ko, min_count=min_count, size=size, window=window)
vocab = list(model.vocab.keys())
vocab[:10]

tokens = [t for d in tokens_ko for t in d[0]]
text = nltk.Text(tokens, name='NMSC')
temp = text.vocab().most_common(10)
Example #32
    filter = filter+scaling.most_common_words_filter(100)
    scaled = scaling.filtering_data(data, filter)
    '''
    f = open("most_common_korean_words.txt")
    most_common_words = []
    for i in range(deep):
        most_common_word = f.readline().rstrip('\n')
        most_common_words.append(most_common_word)

    return most_common_words


if __name__ == "__main__":

    text_file_directory = 'data/'

    text_file_number = 40030
    text_file_name = str(text_file_number) + '.txt'

    text_file = text_file_directory + text_file_name

    f = open(text_file)

    data = f.read()

    pos_data = T.pos(data, norm=True)

    for i in enumerate(pos_data):
        if pos_data[i[0]][1] == 'Josa':
            print(pos_data[i[0] - 1])
Example #33
from konlpy.tag import Twitter
from gensim.models import word2vec
from bs4 import BeautifulSoup
import codecs

file = open('./test.txt', 'r', encoding='utf-8')
lines = file.readlines()
twitter = Twitter()
result = []
for line in lines:
    r = []
    words = twitter.pos(line, norm=True)
    for word in words:
        if word[1] not in ["Punctuation", "Eomi", "Josa"]:
            r.append(word[0])
    result.append(" ".join(r).strip())

fileName = "toji.wakati"

with open(fileName, 'w') as fp:
    fp.write("\n".join(result))

data = word2vec.LineSentence(fileName)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
# size -> use 200-dimensional vectors
# window -> look at 10 words before and after the target word
# min_count -> drop words that occur fewer than 2 times
# sg -> choose Skip-Gram (the latter of CBOW and Skip-Gram)
# hs -> if 1, hierarchical softmax is used for training; if 0, negative sampling is used instead
model.save("toji.model")
Example #34
    return random.choice(list(keys))


# read in the sentences --- (※4)
toji_file = "toji.txt"
dict_file = "markov-toji.json"
if not os.path.exists(dict_file):
    # read the Toji text file
    fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
    soup = BeautifulSoup(fp, "html.parser")
    body = soup.select_one("body > text")
    text = body.getText()
    text = text.replace("…", "")  # 현재 koNLPy가 …을 구두점으로 잡지 못하는 문제 임시 해결
    # 형태소 분석
    twitter = Twitter()
    malist = twitter.pos(text, norm=True)
    words = []
    for word in malist:
        # exclude punctuation etc. (but keep periods)
        if not word[1] in ["Punctuation"]:
            words.append(word[0])
        if word[0] == ".":
            words.append(word[0])
    # build the dictionary
    dic = make_dic(words)
    json.dump(dic, open(dict_file, "w", encoding="utf-8"))
else:
    dic = json.load(open(dict_file, "r"))

# generate sentences --- (※6)
for i in range(3):
Example #35
import numpy as np
from predict_client.prod_client import ProdClient
from konlpy.tag import Twitter

twitter = Twitter()

input_sent = []
with open('input.log', 'r', encoding='utf-8') as content_file:
    for line in content_file:
        tag = twitter.pos(line)[:-1]
        input_sent.append([i[0] for i in tag])

output_sent = []
with open('output.log', 'r', encoding='utf-8') as content_file:
    for line in content_file:
        tag = twitter.pos(line)[:-1]
        output_sent.append([i[0] for i in tag])

vocab_list = []
with open('vocab.log', 'r', encoding='utf-8') as content_file:
    for line in content_file:
        vocab_list.append(line[:-1])

vocab_dict = {n: i for i, n in enumerate(vocab_list)}
num_dic = len(vocab_dict)

input_length = [len(i) for i in input_sent]
output_length = [len(o) for o in output_sent]
max_len_i = max(input_length)
max_len_o = max(output_length)
Example #36
#-*- coding:utf-8 -*-
from flask import Flask, jsonify, render_template, request
from flask.ext.restful import reqparse, abort, Api, Resource
from sklearn import linear_model
from sklearn.externals import joblib
import gensim, re
import numpy as np
from konlpy.tag import Twitter
t = Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d)]

app = Flask(__name__)
api = Api(app)
model = gensim.models.doc2vec.Doc2Vec.load(
    '{cname}.model'.format(cname='reviews'))
rs = model.random.get_state()
review_rmodel = joblib.load('{cname}.pkl'.format(cname='reviews'))
parser = reqparse.RequestParser()
parser.add_argument('query', type=unicode)


def over_filter(x):
    if x > 5: return 5.0
    elif x < 0.5: return 0.5
    else: return x


class Predict(Resource):
    global rs

    def post(self):
Example #37
twitter = Twitter()
f = open('practice/train.csv', 'r', encoding='utf-8')
rdr = csv.reader(f)
i = 0
worddict = {}
wordlist = []
wordfreq = []

for line in rdr:

    if (i != 0):

        message = list(line)
        sms = message[2]  # a single text message
        sms = sms.replace(' ', '')  # remove spaces
        divlist = list(twitter.pos(sms))  # extract meaningful morphemes

        # check the word list
        for (word, wordtype) in divlist:

            if (wordtype == 'Noun') or (wordtype == 'Adjective') or (wordtype == 'Number'):

                if (worddict.get(word) == None):
                    worddict[word] = 1
                else:
                    worddict[word] += 1
                pass

            pass
Example #38
sno = result[0][4]
# student email (or student ID)

#C1
if result[0][0] == '유기산':
    total_score = total_score + 1
#C2
if result[0][1] == '마늘':
    total_score = total_score + 1
#C3
if result[0][2] == '고춧가루':
    total_score = total_score + 1
#C4
twitter = Twitter()
example_pos = twitter.pos(result[0][3], norm=True, stem=True)
nltk_text = nltk.Text(example_pos, name='text')
nltk_result = nltk_text.vocab().most_common(30)

answer = [(('발효', 'Noun'), 1), (('부패', 'Noun'), 1), (('는', 'Josa'), 1),
          (('인간', 'Verb'), 1), (('유익하다', 'Adjective'), 1),
          (('해롭다', 'Adjective'), 1)]
n_answer = [
    (('안', 'VerbPrefix'), 1),
    (('않다', 'Verb'), 1),
]
score = 0

for w in nltk_result:
    if w in answer:
        score = score + 1
Example #39
def main():
    # def job():
    conn = pymysql.connect(host='192.168.0.61',
                           user='******',
                           password='******',
                           db='one_db',
                           charset='utf8mb4')
    cursor = conn.cursor()

    sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s'
    cursor.execute(sql, 'N')

    original = cursor.fetchall()

    print('original data')
    print(original)

    # filter neologisms (newly coined words)
    sql = 'SELECT word FROM tb_newdic'
    cursor.execute(sql)

    newdic = cursor.fetchall()

    # print('신조어 사전')
    # print(newdic)

    # fetch the exception-dictionary data
    sql = 'SELECT word FROM tb_excdic'
    cursor.execute(sql)

    excdic = cursor.fetchall()
    print('예외 사전')
    print(excdic)

    originalList = []
    for data in original:
        dataList = list(data)

        for word in newdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 신조어 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')

                sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'
                cursor.execute(sql, (dataList[0], word[0], dataList[2]))
                conn.commit()

        for word in excdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 예외 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')

        originalList.append(dataList)

    original = originalList

    # analyze with Twitter
    from konlpy.tag import Twitter
    twitter = Twitter()

    tresult = []

    for data in original:
        tresult.append([data[0], twitter.nouns(data[1]), data[2]])
        print(twitter.pos(data[1]))

    # check the Twitter analysis results
    print('twitter result')
    print(tresult)

    # analyze with Komoran
    from konlpy.tag import Komoran
    komoran = Komoran()

    kresult = []

    for data in tresult:
        words = data[1]

        # flag for whether analysis and processing finished without problems: True on success, False on failure
        state = True

        for word in words:
            try:
                type = komoran.pos(word)[0][1]
                if type == 'NNG' or type == 'NNP':
                    kresult.append([data[0], komoran.morphs(word)[0]])

                    # flag for presence in the exception dictionary: True if present, False if not
                    exist = False
                    # words in the exception dictionary are filtered out before the INSERT
                    for exc in excdic:
                        sql = 'SELECT INSTR(%s, %s)'
                        cursor.execute(sql, (word, exc[0]))

                        count = cursor.fetchone()
                        if count[0] != 0:
                            print(word + '은(는) 사전의 ' + exc[0] + '와(과) 일치')
                            exist = True
                            break

                    if exist:
                        continue

                    # INSERT only NNG and NNP types into the DB
                    # rollback on exception, otherwise commit
                    sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'

                    try:
                        if len(komoran.morphs(word)[0]) != 1:
                            cursor.execute(
                                sql,
                                (data[0], komoran.morphs(word)[0], data[2]))

                    except Exception as err:
                        state = False
                        print('ERROR : komoran result의 ' + str(data[0]) +
                              '번 글의 에서 insert 처리 중 오류 발생')
                        print(str(err))
                        conn.rollback()
                    else:
                        conn.commit()

            except Exception as err:
                state = False
                print('ERROR : komoran 키워드 분석 중 오류 발생')
                continue

        ssql = 'UPDATE test_original SET state = %s WHERE ono = %s'
        state = 'Y' if state == True else 'E'
        cursor.execute(ssql, (state, data[0]))

        conn.commit()

    # check the Komoran analysis results
    print('komoran result')
    print(kresult)

    print('-----')
    print('끝')


# schedule.every().day.at("").do(job)
#
# while 1:
#     schedule.run_pending()
#     time.sleep(1)
Example #40
        keyword += str(val[i])
    print(keyword)
    __cnt__ = keyword + 'count'
    result = []
    counter = 0
    for tweet in tweepy.Cursor(api.search, q= keyword + "  -filter:retweets", lang='ko').items(200):
        result.append([tweet.id_str, tweet.text, tweet.created_at])
    outname = keyword + '.txt'
    k = open(outname, 'w', encoding='utf-8')
    d = open(__cnt__ +'.txt', 'w', encoding='utf-8')
    malist = []
    for temp in result:
        counter+=1
        k.write("%s|" %temp[0])
        malist = twitter.pos(temp[1], norm=True, stem=True)
        word_dic = ""
        for word in malist:
            if word[1] != "Josa" and word[1] != "Conjunction" and word[1] != "Punctuation" and word[1] != "ScreenName" and word[1] != "URL":
                word_dic += word[0] + '/' + word[1] + " "
        k.write("%s\n" %word_dic)
    d.write("%d\n" %counter)
    k.close()
    d.close()
    #print(detail)
    
    name = keyword + '.txt'
    counter = keyword +'count.txt'
    test_data = read_data(name)
    d = open(counter,'r')
    cnt = int(d.readline())
Example #41
                max_tag = str(max_tag)

                for media_id in more_media:     # items from instagram media_id: str(media_id.id), str(media_id.get_standard_resolution_url()), str(media_id.caption), str(media_id.comments), str(media_id.tags)

                    # option 1) search by keyword_caption
                    taglist = str(media_id.caption).decode('utf-8')
                    print(media_id.caption)
                    num = num + 1
                    print num

                    # # option 2) search by keyword_hashtag
                    # taglist = str(media_id.tags).decode('utf-8')


                    # extract keywords from konlpy and insert into db ematewha
                    tags = twitter.pos(taglist, norm=True, stem=True)
                    for j in range(len(tags)):
                        if(tags[j][1] == 'Noun' or tags[j][1] == 'Adjective'):
                            print tags[j][0],"/",tags[j][1]
                            sql = 'INSERT IGNORE into keyword_caption values ("%s", "%s")' % (upso_id[n][0], tags[j][0])
                            cur.execute(sql)
                        elif(tags[j][1] == 'Hashtag'):
                            nohash = tags[j][0].replace("#", "")
                            nohash = nohash.replace("\"", "")
                            print nohash,"/",tags[j][1]
                            sql = 'INSERT IGNORE into keyword_caption values ("%s", "%s")' % (upso_id[n][0], nohash)
                            cur.execute(sql)
                counter += 1
            # check the number of posts processed
            print "number of posts"
            print num
Beispiel #42
0
def build_dataset(train_text, min_count, sampling_rate):
    words = list()
    with open(train_text, 'r', encoding='UTF8') as f:
        lines = f.readlines()
        for line in lines:
            sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', line).strip().split()
            if sentence:
                words.append(sentence)

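    # build the vocabulary: count word frequencies, keep an UNK bucket, drop words below min_count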
    word_counter = [['UNK', -1]]
    word_counter.extend(
        collections.Counter([word for sentence in words
                             for word in sentence]).most_common())
    word_counter = [
        item for item in word_counter
        if item[1] >= min_count or item[0] == 'UNK'
    ]

    word_dict = dict()
    for word, count in word_counter:
        word_dict[word] = len(word_dict)
    word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys()))

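    # map each vocabulary word id to its sequence of (morpheme, tag) pairs from the Twitter analyzer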
    word_to_pos_li = dict()
    pos_list = list()
    twitter = Twitter()
    for w in word_dict:
        w_pos_li = list()
        for pos in twitter.pos(w, norm=True):
            w_pos_li.append(pos)

        word_to_pos_li[word_dict[w]] = w_pos_li
        pos_list += w_pos_li

    pos_counter = collections.Counter(pos_list).most_common()

    pos_dict = dict()
    for pos, _ in pos_counter:
        pos_dict[pos] = len(pos_dict)

    pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys()))

    word_to_pos_dict = dict()

    for word_id, pos_li in word_to_pos_li.items():
        pos_id_li = list()
        for pos in pos_li:
            pos_id_li.append(pos_dict[pos])
        word_to_pos_dict[word_id] = pos_id_li

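    # encode each sentence as word ids, mapping out-of-vocabulary words to UNK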
    data = list()
    unk_count = 0
    for sentence in words:
        s = list()
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = word_dict['UNK']
                unk_count += 1
            s.append(index)
        data.append(s)
    word_counter[0][1] = max(1, unk_count)

    data = sub_sampling(data, word_counter, word_dict, sampling_rate)

    return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict
Beispiel #43
0
import os
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter

print('cwd: ' + os.getcwd())
os.chdir('./day5')
#exit()

# open the file with utf-16 encoding and read its text --- (※1)
fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
soup = BeautifulSoup(fp, "html.parser")
body = soup.select_one("body > text")
text = body.getText()
# process the text line by line --- (※2)
twitter = Twitter()
word_dic = {}
lines = text.split("\n")
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == "Noun":  #  명사 확인하기 --- (※3)
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
                word_dic[word[0]] += 1  # count occurrences
# print the most frequently used nouns --- (※4)
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:
    print("{0}({1}) ".format(word, count), end="")
print()
Beispiel #44
0
import json, pymongo, requests, os, sys
import time, dateutil.parser, codecs
from konlpy.tag import Twitter; t = Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d) if p[1] in ['Noun', 'Adjective', 'Determiner', 'Adverb', 'KoreanParticle']]

conn = pymongo.MongoClient("mongodb://localhost")
db = conn.watcha
cnames = db.collection_names()
collections = dict()

for cname in cnames:
	collections[cname] = eval('db.' + cname)

del collections['reviews']
del collections['system.indexes']

cursor = collections['comedy'].find()
length = collections['comedy'].count()
cnt = 0

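# write one review per line: rating, comment_id, then the POS-tagged tokens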
with codecs.open('D:\watcha_reviews\comedy.txt', 'w', encoding='utf-8') as fp:
	for row in cursor:
		cnt += 1
		if cnt % 1000 == 0:
			print str(cnt) + ' / ' + str(length)
		rating = row['rating']
		cid = row['comment_id']
		text = row['text']
		fp.write(' '.join([str(rating), str(cid)] + pos(text)) + '\n')
Beispiel #45
0
import sys
reload(sys)

sys.setdefaultencoding('utf-8')

print ("load")
#load from kobill 
from konlpy.corpus import kobill
#docs_ko =kobill.open('kobill/news.txt').read()
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]
print ("tokenize")

#tokenize
from konlpy.tag import Twitter; t = Twitter()
print ("tokenize1")
pos = lambda d:['/'.join(p) for p in t.pos(d,stem=True,norm=True)]
print ('tokenize2')
texts_ko = [pos(doc) for doc in docs_ko]
#texts_ko = pos(docs_ko)
print ("train")
import time
now_time = time.time()
#train
from gensim.models import word2vec
wv_model_ko = word2vec.Word2Vec(texts_ko,workers=16,negative=10,window=7,size=300)
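# precompute L2-normalized vectors in place; saves memory, but the model cannot be trained further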
wv_model_ko.init_sims(replace=True)


wv_model_ko.save('ko_word2vec_e.model')
print ("training time "+str(time.time()-now_time)+"sec")
print(wv_model_ko.most_similar(pos('서울대학교')))
Beispiel #46
0
from konlpy.tag import Twitter
twitter = Twitter()
file = open("/sample/data/loadofthering.txt", 'r')
lines = file.readlines()
word_dic = {}
count = 0
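# count noun frequencies over roughly the first 1000 lines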
for line in lines:
    malist = twitter.pos(line)
    #print(count, malist)
    for taeso, pumsa in malist:
        if pumsa == "Noun":
            if not (taeso in word_dic):
                word_dic[taeso] = 0
            word_dic[taeso] += 1
            
    if count > 1000:
        break
    count += 1
keys = sorted(word_dic.items(), key=lambda x:x[1], reverse=True)
for word, count in keys[:50]:
    print("{0}({1})".format(word, count), end="\n")
print()
Beispiel #47
0
data = pickle.load(f)

#wordbag = []
doc_list = []
termdoc = {}

for datum in data:
	doc_list.append(datum['no'])
	
#data = None
#gc.collect()
	
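# build a term-document frequency table keyed by morpheme, skipping particles and other non-content tags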
for datum in data:
	doc_id = datum['no']
	lec_no = datum['lec_no'] #
	pos = twitter.pos(datum['eval_content'],stem = True)
	for p in pos:
		tag = p[1]
		if tag in ('Exclamation', 'Josa', 'Eomi', 'Suffix', 'Punctuation', 'Foreign', 'Alpha', 'Unknown', 'KoreanParticle', 'Hashtag', 'ScreenName'):
			continue
		if p[0] not in termdoc:
			termdoc[p[0]] = dict.fromkeys(doc_list,0)
		termdoc[p[0]][doc_id] += 1
	print doc_id
'''
tmp = termdoc.keys()
for j in range(10):
	print doc_list[j],
print
for i in range(10):
	print tmp[i],
Beispiel #48
0
def tokenize(doc):  # split the document into morpheme/POS tokens
    tagger = Twitter()
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]
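A minimal usage sketch for a tokenizer of this kind, assuming konlpy and a Java runtime are installed; the sample sentence and the output shown in the comment are illustrative only, and in konlpy 0.5.0+ the Twitter class is exposed as Okt:

from konlpy.tag import Twitter  # renamed to Okt in konlpy >= 0.5.0

tagger = Twitter()
doc = u'아버지가 방에 들어가신다'
# each token is rendered as morpheme/POS, e.g. '아버지/Noun'
tokens = ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]
print(tokens)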
Beispiel #49
0
lines = file.readlines()

# 2. store all the comment lines again in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

# 3. import the Twitter tagger from the konlpy package
from konlpy.tag import Twitter
twitter = Twitter()

# 4. split each sentence into morphemes
sentences_tag = []
for sentence in okja:
    morph = twitter.pos(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. keep only nouns and adjectives in a list
noun_adj_list = []
for sentence1 in sentences_tag:
    for word, tag in sentence1:
        if tag in ['Noun', 'Adjective']:
            noun_adj_list.append(word)
Beispiel #50
0
from konlpy.tag import Twitter
twitter = Twitter()
#ex) print(twitter.pos(u'이것도 되나욬ㅋㅋ',norm=True, stem=True))

path='/Users/kims/'

# file1
file1 = pd.read_csv(path+'comments_17_df.csv')
file1.head()

# konlpy file1
text = []
len(file1)

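# POS-tag each comment in the 'value' column with normalization and stemming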
for i in range(0,len(file1)):
    text_spider = twitter.pos(file1.loc[i,'value'],norm=True, stem=True) 
    text.append(text_spider)

text
text_df=pd.DataFrame.from_records(text)
text_df=text_df.stack()

text_df.to_csv('text_17.csv', encoding='utf-8')

# file2
file2 = pd.read_csv(path+'comments_12_df.csv')
file2.head()

# konlpy file2
text = []
len(file2)
Beispiel #51
0
import json, pymongo, requests, sys
import time, dateutil.parser
import gensim, logging, os
from konlpy.tag import Twitter; t = Twitter()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename='review' + '.log')
pos = lambda d: ['/'.join(p) for p in t.pos(d)]

# class LSGenerator(object):
# 	def __init__(self, collname):
# 		self.conn = pymongo.MongoClient("mongodb://localhost")
# 		self.db = self.conn.watcha
# 		self.cnames = self.db.collection_names()
# 		self.collections = dict()
# 		self.collname = collname

# 		for cname in self.cnames:
# 			self.collections[cname] = eval('self.db.' + cname)

# 		del self.collections['reviews']
# 		del self.collections['system.indexes']

# 	def __iter__(self):
# 		for row in self.collections[self.collname].find():
# 			rating = row['rating']
# 			cid = row['comment_id']
# 			text = row['text']
# 			pos_text = pos(text)
# 			tags = [str(rating) + '_' + str(cid) + '_' + self.collname]
# 			yield gensim.models.doc2vec.TaggedDocument(words = pos_text, tags = tags)

class LSGenerator(object):
Beispiel #52
0
if not os.path.exists(target):
	os.mkdir(target)

y = raw_input('press y to clear data:')
if y == 'y':
	shutil.rmtree(target)
	os.mkdir(target)	


nonkorean = re.compile(u'[^ 가-힣]+',re.UNICODE)
nomean = re.compile(u'[ㄱ-ㅎ]|[가-힣\S]*대숲|[가-힣\S]*대나무숲|[0-9\S]*번째|[가-힣\S]*학교|#[가-힣\w0-9\S]+',re.UNICODE)
stopwords = [u'외대',u'한양대',u'고대', u'연대',u'중앙대',u'경북대',u'경희대',u'서울대',u'설대',u'성대',u'성균관대',u'서강대',u'서울시립대',u'댓글',u'시립대',u'오전',u'오후',u'외침',u'제보',u'숲',u'대숲',u'대나무숲',u'연대숲',u'서강대숲',u'이야기',u'대나무',u'서시대',u'오전',u'오후']


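# clean each post: strip hashtags and non-Korean text, then write space-separated stemmed tokens, skipping particles, stopwords and one-character words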
for dir in os.listdir(base):	
	if os.path.isdir(base + dir):
		os.mkdir(target + dir)
		for file in os.listdir(base + dir):
			text = open(base+dir+'/'+file,'r').read().decode('utf8')
			f = open(target + dir + '/' + file,'w')
			text = nonkorean.sub('',nomean.sub(u' ',text))
			for i in twitter.pos(text,norm=True,stem=True):
				if i[1] != "Josa" and i[0] not in stopwords and len(i[0]) > 1:
					f.write(i[0].encode('utf8') + ' ')
			f.flush()
		print dir+ ' done'

print 'compressing data'
shutil.make_archive(target, 'gztar', target)
print 'compression completed'