class AnalysisDiction:
    """
    This class is for analysis of Korean texts using the Kkma and Twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a Kkma or Twitter tagger instance.
        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method runs Kkma analysis. It behaves differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method runs Twitter analysis. It behaves differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
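# A minimal usage sketch for AnalysisDiction above. Hedged: it assumes konlpy is installed and
# that Kkma/Twitter come from konlpy.tag as in the class; the sample sentence is illustrative only.
from konlpy.tag import Kkma, Twitter

analyzer = AnalysisDiction(on_twitter=True)
print(analyzer.analyzer_twitter("아버지가 방에 들어가신다", 'pos'))      # list of (morpheme, tag) pairs
print(analyzer.analyzer_twitter("아버지가 방에 들어가신다", 'unknown'))  # False for an unsupported mode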
def pos_tagging_noun(text):
    """Return only the noun (word, tag) pairs produced by the Twitter tagger."""
    noun_terms_list = []
    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)
    for item in pos_list:
        if item[1] == 'Noun':
            noun_terms_list.append(item)
    return noun_terms_list

def pos_tagging(text):
    """Return only the verb and adjective (word, tag) pairs produced by the Twitter tagger."""
    available_terms_list = []
    twitter = Twitter()
    pos_list = twitter.pos(text, norm=True, stem=True)
    for item in pos_list:
        if item[1] == 'Verb' or item[1] == 'Adjective':
            available_terms_list.append(item)
    return available_terms_list
def _twitter_parse(self, str_arr, tag_combine=True):
    """
    Run the Twitter tagger over an array of strings and flatten the results.
    :param str_arr: iterable of strings to tag
    :param tag_combine: whether to combine each morpheme with its tag
    :return: flat list of tagged tokens
    """
    twitter = Twitter(jvmpath=None)
    return_arr = []
    for data in str_arr:
        return_arr = return_arr + self._flat(twitter.pos(str(data)), tag_combine=tag_combine)
    return return_arr
class KorDisintegrator:
    def __init__(self):
        self.ko_twitter = Twitter()

    def convert2simple(self, sentence="", norm=True, stem=True):
        disintegrated_sentence = self.ko_twitter.pos(sentence, norm=norm, stem=stem)
        convert_sentence = []
        for w, t in disintegrated_sentence:
            if t not in ["Eomi", "Josa", "KoreanParticle", "Punctuation"]:
                convert_sentence.append(w)
        return " ".join(convert_sentence)
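# A minimal usage sketch for KorDisintegrator. Hedged: the example sentence and the exact
# output depend on the installed konlpy/Twitter dictionary version.
disintegrator = KorDisintegrator()
print(disintegrator.convert2simple("한국어 분석을 시작합니다, 재미있어요!"))
# endings, particles and punctuation are dropped, leaving a space-joined string of stems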
def create_wordbag(x):
    """Collect words from x['eval_content'], skipping POS tags listed in the external `unneeded` set."""
    wordbag = []
    if x['eval_content'] is None:
        return wordbag
    twitter = Twitter()
    for text in twitter.pos(x['eval_content'], stem=True):
        tag = text[1]
        if tag in unneeded:
            continue
        word = text[0]
        wordbag.append(word)
    return wordbag
def main():
    """
    Notes on using konlpy:
    Java must be installed and configured, and JAVA_HOME must be set, e.g.
    export JAVA_HOME=$(/usr/libexec/java_home)
    """
    konl = Twitter()
    file_path = '/Users/bongster/Downloads/20160528_jiana.csv'
    with open(file_path, 'rb') as csv_file:
        inforeader = csv.reader(csv_file)
        for row in inforeader:
            r = konl.pos(unicode(row[4], 'utf-8'), norm=True, stem=True)
            print '=' * 20
            for txt, post in r:
                print txt, post
            print '=' * 20
def main(_): is_train = True # if False then test if is_train : train() else: checklist=['Exclamation','Alpha','URL'] twitter=Twitter() dic_file_x='data/xproject_class.dict.pkl' worddict_x = dict() worddict_x = load_dict(dic_file_x) x, x_mask,prediction=build_test('/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83') while 1: choice=raw_input("Me: ") if choice in ["Q","q"]: break #print choice choice=choice.decode('utf-8') sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True) if s[1] not in checklist]) words=(word_tokenize(sen.strip().lower())) #print ' '.join(words) seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words] seqs = [s if s<600 else 1 for s in seqs] seqs=[seqs] res=test_data(seqs,x, x_mask,prediction,'/home/chuckgu/Desktop/project/Alphabeta/save/model.ckpt-83') #print res print "class: "+str(res)
class Parser(object): def __init__(self, filename=None, nth=-1): self.filename = filename self.nth = nth self.twitter = Twitter() self.logger = ParseUtils.logger(self.__class__.__name__, './parse.log') def parse_sentence(self, sentence): return self.twitter.pos(sentence) def parse_all_generator(self, filename=None, nth=None): if filename is None: filename = self.filename or click.prompt('file name is required') if nth is None: nth = self.nth for row in ParseUtils.iter_csv(filename): try: parsed = self.parse_sentence(row[nth]) concated = ' '.join([ParseUtils.concat_tuple(x) for x in parsed]) row[nth] = concated except BaseException as e: msg = '{error:<80} | {data}'.format(error=str(e), data=ParseUtils.list_to_csv(row)) self.logger.error(msg) continue yield row def extract_parsed(self, out_filename, filename=None, nth=None): if filename is None: filename = self.filename or click.prompt('file name is required') filelength = ParseUtils.file_length(filename) if nth is None: nth = self.nth with open(out_filename, 'w') as f: csv_writer = csv.writer(f) for row in Progress(self.parse_all_generator(), filelength, 10): csv_writer.writerow(row)
filter_below_frequency = 7

import collections
import csv

# read the input file
if debug_mode:
    print("reading input.txt file")
with open(input_file_path, 'r') as myfile:
    input_content = myfile.read()
    # input_content = input_content.decode('string-escape').decode("utf-8")

# get nouns, adjectives and verbs
if debug_mode:
    print("analysing text")
all_words = twitter.pos(input_content, norm=True, stem=True)

if debug_mode:
    print("filtering words")
filtered_words = []
for word, word_type in all_words:
    # keep only the important word classes; the tagger emits "Adjective",
    # not "Adjectives" as in the original condition
    if word_type == "Noun" or word_type == "Verb" or word_type == "Adjective":
        # print(word)
        filtered_words.append(word)

if debug_mode:
    print("counting words")
words_counter = collections.Counter(filtered_words)
total_words_count = sum(words_counter.values())
class DeepCNModule: def __init__(self): self.step = 1000 self.batch_size = 100 self.learning_rate = 0.01 self.n_layers = 5 self.n_rnn_hidden = 50 self.n_rnn_output_size = 10 self.n_hidden = 300 self.epoch = 100 self.twitter = Twitter() self.prod_len_max = 50 self.data_path = './data.txt' if not os.path.exists(self.data_path): print('no data file') return self.vocab_path = './vocab' self.vocab_path_dict = { 'name': 'name.txt', 'model': 'model.txt', 'price': 'price.txt', 'maker': 'maker.txt', 'cmpnycate': 'cmpnycate.txt', 'img': 'img.txt', 'cate': 'cate.txt' } # 사전 세팅 self.vocab = {} self.make_vocab() self.label_size = 0 def make_vocab(self): self.vocab['name'] = {} self.vocab['model'] = {} self.vocab['price'] = {} self.vocab['maker'] = {} self.vocab['cmpnycate'] = {} self.vocab['img'] = {} self.vocab['cate'] = {} if os.path.exists(self.vocab_path): for data in self.vocab_path_dict: if os.path.exists(self.vocab_path + "/" + self.vocab_path_dict[data]): os.remove(self.vocab_path + "/" + self.vocab_path_dict[data]) os.rmdir(self.vocab_path) os.makedirs(self.vocab_path) with open(self.data_path, 'rb') as f: while True: line = f.readline().decode('utf-8') if not line: break line = line.replace("\n", '') _, maker, model, prodname, lcatecode, price, cmpny_cate, img_code = line.split( '\t') self.add_vocab(maker, 'maker') self.add_vocab(model, 'model') self.tokenize_word_data(prodname, 'name') self.add_vocab(lcatecode, 'cate') self.add_vocab(price, 'price') self.add_vocab(cmpny_cate, 'cmpnycate') self.add_vocab(img_code, 'img') self.show_dict_status() self.save_vocab() def tokenize_word_data(self, data, param): result = [] for data in self.twitter.pos(data, norm=True, stem=True): if data[1] == 'Foreign': continue if data[1] == 'Punctuation': continue if data[1] == 'Josa': continue result.append(self.add_vocab(data[0], param)) while True: if len(result) >= self.prod_len_max: break result.append(0) return result def add_vocab(self, data, param): vocab_code = 0 if data not in self.vocab[param].keys(): vocab_code = len(self.vocab[param]) + 1 self.vocab[param][data] = vocab_code else: vocab_code = self.vocab[param][data] return vocab_code def save_vocab(self): for data in self.vocab_path_dict: with open(self.vocab_path + '/' + self.vocab_path_dict[data], 'w') as f: f.write(json.dumps(self.vocab[data])) def read_vocab(self, path): vocab = {} with open(self.vocab_path + "/" + path, 'r') as f: vocab = json.loads(f.read()) return vocab def set_vocab(self): if os.path.exists(self.vocab_path): try: self.vocab['name'] = self.read_vocab( self.vocab_path_dict['name']) self.vocab['model'] = self.read_vocab( self.vocab_path_dict['model']) self.vocab['price'] = self.read_vocab( self.vocab_path_dict['price']) self.vocab['maker'] = self.read_vocab( self.vocab_path_dict['maker']) self.vocab['cmpnycate'] = self.read_vocab( self.vocab_path_dict['cmpnycate']) self.vocab['img'] = self.read_vocab( self.vocab_path_dict['img']) self.vocab['cate'] = self.read_vocab( self.vocab_path_dict['cate']) self.show_dict_status() except Exception as e: print(str(e)) self.make_vocab() else: self.make_vocab() def show_dict_status(self): print('name vocab = ', str(len(self.vocab['name']))) print('maker vocab = ', str(len(self.vocab['maker']))) print('model vocab = ', str(len(self.vocab['model']))) print('cate vocab = ', str(len(self.vocab['cate']))) print('price vocab = ', str(len(self.vocab['price']))) print('cmpnycate vocab = ', str(len(self.vocab['cmpnycate']))) print('img vocab = ', str(len(self.vocab['img']))) def make_model(self): 
print('make_model') # input layer graph1 = tf.Graph() with graph1.as_default(): prodname = tf.placeholder(tf.float32, [None, None, 50], name='prodname') maker = tf.placeholder(tf.float32, [None, None, 1], name='maker') model = tf.placeholder(tf.float32, [None, None, 1], name='model') cate = tf.placeholder(tf.float32, [None, None, 1], name='cate') price = tf.placeholder(tf.float32, [None, None, 1], name='price') cmpnycate = tf.placeholder(tf.float32, [None, None, 1], name='cmpnycate') img = tf.placeholder(tf.float32, [None, None, 1], name='img') label = tf.placeholder(tf.int32, [None], name='label') w = tf.Variable(tf.random_normal([285, self.label_size])) b = tf.Variable(tf.random_normal([self.label_size])) # RNN 연결 # RNN output rnn_list = [] rnn_list.append( self._getoutput_data(prodname, 'prodname', hidden_size=50)) rnn_list.append(self._getoutput_data(maker, 'maker', hidden_size=1)) rnn_list.append(self._getoutput_data(model, 'model', hidden_size=1)) rnn_list.append(self._getoutput_data(cate, 'cate', hidden_size=1)) rnn_list.append(self._getoutput_data(price, 'price', hidden_size=1)) rnn_list.append( self._getoutput_data(cmpnycate, 'cmpnycate', hidden_size=1)) rnn_list.append(self._getoutput_data(img, 'img', hidden_size=1)) # concatenation layer concate_data = tf.concat(rnn_list, 1) hidden1 = tf.layers.dense(concate_data, self.n_hidden, activation=tf.nn.relu) hidden2 = tf.layers.dense(hidden1, self.n_hidden, activation=tf.nn.relu) hidden3 = tf.layers.dense(hidden2, self.n_hidden, activation=tf.nn.relu) hidden4 = tf.layers.dense(hidden3, self.n_hidden, activation=tf.nn.relu) # output layer output = tf.layers.dense(hidden4, self.label_size, activation=tf.nn.softmax) output = tf.transpose(output, [1, 0, 2]) output = output[-1] print(output) model = tf.matmul(output, w) + b cost = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=label)) print('cost') print(cost) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate) print('optimizer') print(optimizer) train_step = optimizer.minimize(cost) predict = tf.cast(tf.argmax(model, 1), tf.int32) return output, cost, train_step, graph1, predict def _get_data(self, outputdata, variable_name): with tf.variable_scope(variable_name): # fully connected layer hidden1 = tf.layers.dense(outputdata, self.n_hidden, activation=tf.nn.relu) hidden2 = tf.layers.dense(hidden1, self.n_hidden, activation=tf.nn.relu) hidden3 = tf.layers.dense(hidden2, self.n_hidden, activation=tf.nn.relu) hidden4 = tf.layers.dense(hidden3, self.n_hidden, activation=tf.nn.relu) # output layer output = tf.layers.dense(hidden4, self.n_rnn_output_size, activation=tf.nn.softmax) return output def _getoutput_data(self, input_data, variable_name, hidden_size): with tf.variable_scope(variable_name): outputdata, _ = tf.nn.dynamic_rnn( self._build_cells(hidden_size=hidden_size), input_data, dtype=tf.float32) return self._get_data(outputdata, variable_name + "layers") def _cell(self, output_keep_prob, hidden_size): rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size) rnn_cell = tf.nn.rnn_cell.DropoutWrapper( rnn_cell, output_keep_prob=output_keep_prob) return rnn_cell def _build_cells(self, hidden_size, output_keep_prob=0.5): enc_cell = tf.nn.rnn_cell.MultiRNNCell([ self._cell(output_keep_prob, hidden_size) for _ in range(self.n_layers) ]) return enc_cell def train(self): print('train') # 학습 데이터 로드 prodname_list, maker_list, model_list, price_list, catecode_list, cmpnycate_list, imgcode_list, label_list = self.data_load( self.data_path) # 모델 구축 
output, cost, train_step, graph, predict = self.make_model() with tf.Session(graph=graph) as sess: # 추후 saver 추가 sess.run(tf.global_variables_initializer()) print(type(prodname_list)) print(type(maker_list)) print(type(model_list)) print(type(price_list)) print(type(catecode_list)) print(type(cmpnycate_list)) print(type(imgcode_list)) print(type(label_list)) for i in range(self.epoch): print('epoch: ', str(i)) _, cost = sess.run( [train_step, cost], feed_dict={ 'prodname:0': prodname_list, 'maker:0': maker_list, 'model:0': model_list, 'cate:0': catecode_list, 'price:0': price_list, 'cmpnycate:0': cmpnycate_list, 'img:0': imgcode_list, 'label:0': label_list }) print('cost: %f' % cost) def data_load(self, data_path): prodname_list = [] maker_list = [] model_list = [] price_list = [] catecode_list = [] cmpnycate_list = [] imgcode_list = [] label_list = [] try: with open(data_path, 'rb') as f: while True: line = f.readline().decode('utf-8') if not line: break line = line.replace("\n", '') prodcode, maker, model, prodname, lcatecode, price, cmpny_cate, img_code = line.split( '\t') maker_list.append([[self.add_vocab(maker, 'maker')]]) model_list.append([[self.add_vocab(model, 'model')]]) prodname_list.append( [self.tokenize_word_data(prodname, 'name')]) catecode_list.append([[self.add_vocab(lcatecode, 'cate')]]) price_list.append([[self.add_vocab(price, 'price')]]) cmpnycate_list.append( [[self.add_vocab(cmpny_cate, 'cmpnycate')]]) imgcode_list.append([[self.add_vocab(img_code, 'img')]]) label_list.append(prodcode) self.label_dic = {n: i for i, n in enumerate(label_list)} return_label = [i for i, n in enumerate(label_list)] self.show_dict_status() self.save_vocab() except Exception as e: print(str(e)) return -1 self.label_size = len(label_list) print(return_label) return np.array(prodname_list),\ np.array(maker_list), \ np.array(model_list), \ np.array(price_list), \ np.array(catecode_list), \ np.array(cmpnycate_list),\ np.array(imgcode_list),\ np.array(return_label) def predict(self): pass
from konlpy.tag import Twitter

twitter = Twitter()
malist = twitter.pos('아버지 가방에 들어가신다.', norm=True, stem=True)
print(malist)
def tokenize(self, doc):
    pos_tagger = Twitter()
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
# params = urllib.parse.urlencode({ # "_callback":"", # "q":ret # }) # data = urllib.request.urlopen("https://m.search.naver.com/p/csearch/dcontent/spellchecker.nhn?" + params) # data = data.read().decode("utf-8")[1:-2] # data = json.loads(data) # data = data["message"]['result']["html"] # data = soup = BeautifulSoup(data,"html.parser").getText() # return data return ret i = 0 words = [] while True: if i > 3: break f = open('news' + str(i) + '.txt', 'r', encoding='utf8') line = f.readline() twitter = Twitter() malist = twitter.pos(line, norm=True) for x in malist: if not x[1] in ["Punctuation"]: words.append(x[0]) if x[0] == ".": words.append(x[0]) i += 1 dic = make_dic(words) print(make_sentence(dic))
os.chdir('%s/' % currdir) print currdir with open("text8", 'r') as f: for line in f: sentences.append(line[:100]) print sentences ''' with open("/home/chuckgu/Desktop/project/preprocessing/x-project/word2vec/namuwiki160229/namuwiki_20160229.json") as json_file: json_data = json.load(json_file) for i,j in enumerate(json_data): print i sentences=sent_tokenize(j["text"]) if len(sentences)>5: for line in sentences: line=line.decode('utf-8') #txt.append(' '.join(twitter.morphs(line))) txt.extend([s[0]for s in twitter.pos(line,norm=True) if s[1] not in checklist]) if i==120000: break #np.savetxt("namu.txt",txt,fmt='%s') import cPickle as pkl f = open('namu_wo_josa.pkl', 'wb') pkl.dump(txt, f, -1) f.close() print 'saved'
originalList.append(dataList)
i = 1
original = originalList
# print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')

# analyze with Twitter
from konlpy.tag import Twitter
twitter = Twitter()
tresult = []
# print('')
print('--- 형태소 분석 결과 : Twitter ---')
for data in original:
    tresult.append([data[0], twitter.nouns(data[1]), data[2]])
    print(twitter.pos(data[1]))
    print('디딩')

# check the Twitter analysis results
print('')
print('===> 명사')
print(tresult[0][1])
print('')
print('--- 형태소 분석 결과 : Komoran ---')
# print(tresult)
# print(tresult[1][1])

# analyze with Komoran
from konlpy.tag import Komoran
from gensim.models.keyedvectors import KeyedVectors from konlpy.tag import Twitter import numpy as np pos_vectors = KeyedVectors.load_word2vec_format('pos.vec', binary=False) pos_vectors.most_similar("('남자','Noun')") twitter = Twitter() word = "대통령이" pos_list = twitter.pos(word, norm=True) word_vector = np.sum([pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list], axis=0)
train_sentence_list = []
train_tag_list = []
for i in temp_list:
    if i != temp_list[0]:
        a = i.split('\t')
        train_sentence_list.append(a[1])
        train_tag_list.append(int(a[2]))

twitter = Twitter()  # create the Twitter tagger object
train_sentence_w2v = []
for i in train_sentence_list:
    temp_bef = twitter.pos(i, norm=True, stem=True)
    temp_del_pos = []
    for j in temp_bef:
        if j[1] != 'Josa' and j[1] != 'Punctuation':
            temp_del_pos.append(j[0])  # Josa and Punctuation are not needed, so drop them
    train_sentence_w2v.append(temp_del_pos)

################# train set ends here; the test set starts below ##############################
file = open('./ratings_test.txt', encoding="utf-8")
temp_list = []
cutting = 0
for line in file:
    if cutting == 10:
        break
    cutting += 1
elif mode=='te': if os.path.isfile(filepath): model.load(filepath) else: raise IOError('loading error...') checklist=['Exclamation','Alpha','URL'] twitter=Twitter() while 1: choice=raw_input("Me: ") if choice in ["Q","q"]: break #print choice choice=choice.decode('utf-8') sen=' '.join([s[0]+'/'+s[1] for s in twitter.pos(choice,norm=True) if s[1] not in checklist]) words=(word_tokenize(sen.strip().lower())) #print ' '.join(words) seqs = [worddict_x[w] if w in worddict_x.keys() else 1 for w in words] seqs = [s if s<n_words else 1 for s in seqs] mask_set_x=np.ones((len(seqs))).astype('float32') res=model.predict(seqs,mask_set_x) #print res print "class: "+str(res)
with open("c:\\Users\\myeon\\Desktop\\network_text_문재인.txt", "rb") as f: headlines = pickle.load(f) with open("c:\\Users\\myeon\\Desktop\\network_text_문재인뉴스.txt", "rb") as f: press = pickle.load(f) press = list(map(lambda x: x.replace('언론사 선정', ''), press)) t = Twitter() node_dic = defaultdict(int) edges = [] for headline, news in zip(headlines, press): tags_ko = t.pos(headline) temp = [] for word, pumsa in tags_ko: # print(word, pumsa) # if len(word) > 1 and (pumsa == 'Noun' or pumsa == 'Verb' or pumsa == 'Adjective'): # if len(word) > 1 and (pumsa == 'Noun' or pumsa == 'Verb'): if len(word) > 1 and (pumsa == 'Noun'): node_dic[word] += 1 temp.append(word) edges.extend(generate_edges(temp, news)) sorted_nodes = sorted(node_dic, key=node_dic.get, reverse=True) press_count = defaultdict(int) for i in press:
def pageCrawl(conn): url_input = "환율" plus_url = urllib.parse.quote_plus(url_input, safe='/', encoding='utf-8', errors='strict') pageNum = 1 print() morphs = [] cnt = 0 while True: url = f'https://search.naver.com/search.naver?&where=news&query={plus_url}&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=0&ds=&de=&docid=&nso=so:dd,p:all,a:all&mynews=0&start={pageNum}' html = urllib.request.urlopen(url).read() soup = BeautifulSoup(html, 'html.parser') lis = soup.select( '#main_pack > section > div > div.group_news > ul li') pageNum += 10 for i in lis: new_tit = '' news_link = '' news_name = '' news_date = '' news_article = '' new_tit = i.select('div.news_wrap.api_ani_send > div > a') if len(new_tit) == 0: continue else: new_tit[0].attrs['title'] new_tit[0].attrs['href'] if len(i.select("a.info")) == 2: try: i.select("a.info")[1].attrs['href'] url2 = i.select("a.info")[1].attrs['href'] headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36" } html2 = requests.get(url2, headers=headers) soup2 = BeautifulSoup(html2.text, 'html.parser') # 신문사 불러오기 news_name = soup2.select_one( 'div.press_logo > a.nclicks(atp_press) > img') # 날짜 불러오기 news_date = soup2.find('span', {'class': 't11'}) news_article = soup2.select_one( '#articleBodyContents').text if news_article is None: continue else: news_article = soup2.select_one( '#articleBodyContents').text news_article = news_article.replace( '// flash 오류를 우회하기 위한 함수 추가', '') news_article = news_article.replace( 'function _flash_removeCallback() {}', '') news_article = news_article.replace('동영상 뉴스', '') news_article = news_article.replace( '무단전재 및 재배포 금지', '') news_article = news_article.replace('\'', '') news_article = news_article.strip() pretty_news_article = re.sub( '[가-힣]{2,3} *기자|▶.*|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|\[[가-힣]{2,5} *|\[[가-힣].*\]]', '', news_article) # idk title = new_tit[0].attrs['title'] link = new_tit[0].attrs['href'] company = news_name.attrs['title'] upload_date = news_date.text content = pretty_news_article twitter = Twitter() sentence = twitter.pos(pretty_news_article) noun_adj_adv_list = [] for word, tag in sentence: if tag in [ "Noun" ] and ("것" not in word) and ("내" not in word) and ( "나" not in word) and ("수" not in word) and ( "게" not in word) and ("말" not in word): noun_adj_adv_list.append(word) words = ",".join(noun_adj_adv_list) cnt += 1 if cnt > 100: naver_news_remove(conn) return else: print(str(cnt), '기사입력중 : ', title) naver_news_info(conn, title, link, company, upload_date, content, words) except: print('기사입력오류')
# from nltk.corpus import stopwords
# text = "/Users/jeongyoonlee/Desktop/kakao.txt"
text = open('/Users/jeongyoonlee/Desktop/kakao.txt').read()
wordcloud = WordCloud(
    background_color="white",
    font_path='/Users/jeongyoonlee/Desktop/CookieRun Regular.ttf',
    max_font_size=100).generate(text)
# mask = np.array(Image.open('/Desktop/joy.png'))
twitter = Twitter()
morphs = []
# tag line by line; iterating over the raw string (as in the original) would tag single characters
for sentence in text.splitlines():
    morphs.append(twitter.pos(sentence))
print(morphs)
noun_adj_adv_list = []
for sentence in morphs:
    for word, tag in sentence:
        if tag in ['Noun'] and ("것" not in word) and ("내" not in word) and \
           ("나" not in word) and ("수" not in word) and \
           ("게" not in word) and ("말" not in word):
            noun_adj_adv_list.append(word)
print(noun_adj_adv_list)
count = Counter(noun_adj_adv_list)
words = dict(count.most_common())
#! /usr/bin/python # -*- coding: utf-8 -*- from konlpy.corpus import kobill from konlpy.tag import Twitter; t = Twitter() from matplotlib import pyplot as plt pos = lambda x: ['/'.join(p) for p in t.pos(x)] docs = [kobill.open(i).read() for i in kobill.fileids()] # get global unique token counts global_unique = [] global_unique_cnt = [] for doc in docs: tokens = pos(doc) unique = set(tokens) global_unique += list(unique) global_unique = list(set(global_unique)) global_unique_cnt.append(len(global_unique)) print(len(unique), len(global_unique)) # draw heap plt.plot(global_unique_cnt) plt.savefig('heap.png')
from konlpy.tag import Kkma kkma=Kkma() kkma.sentences('한국어 분석을 시작합니다 재미있어요--') kkma.nouns('한국어 분석을 시작합니다 재미있어요--') kkma.pos('한국어 분석을 시작합니다 재미있어요--') #형태소 분석 from konlpy.tag import Hannanum hannanum=Hannanum() hannanum.nouns('한국어 분석을 시작합니다 재미있어요--') hannanum.pos('한국어 분석을 시작합니다 재미있어요--') from konlpy.tag import Twitter t=Twitter() t.nouns('한국어 분석을 시작합니다 재미있어요--') t.morphs('한국어 분석을 시작합니다 재미있어요--') t.pos('한국어 분석을 시작합니다 재미있어요--') from wordcloud import WordCloud, STOPWORDS import numpy as np from PIL import Image text=open('DataScience-master\\data\\09. alice.txt').read() alice_mask=np.array(Image.open('DataScience-master\\data\\09. alice_mask.png')) stopwords=set(STOPWORDS) stopwords.add('said') import matplotlib.pyplot as plt import platform path='c:\Windows\Fonts\malgun.ttf' from matplotlib import font_manager, rc if platform.system()=='Darwin':
def croll(urlinput): #url_set = set({}) # 크롤하는 모든 url 모음 set. fake, real, mix간 중복이 없기 위함 ##-----크롤링------## #list1=[] keywordlist = [ "맛집 \"제공 받아\"", "맛집 \"제공 받고\"", "맛집 \"후원 받아\"", "맛집 \"후원 받고\"", "맛집 \"소정의\" \"받고\"", "맛집 \"소정의\" \"받아\"", "맛집 \"원고료를\"", "맛집 \"지원 받고\"", "맛집 \"지원 받아\"", "맛집 \"업체로부터\"" ] datastring = '' #urlinput="http://blog.naver.com/jys2432176/221156543724" #print(len(list1)) nlp = Twitter() # Twitter 라이브러리 사용 #list1 = list(set(list1)) # 중복된 url 제거하는 원시적인 코드... negative = ('아니다', '절대', '검색', '그냥', '듯', '같다', '대부분', '어디서', '그렇다', '전혀') regex = r'[가-힣, \s ]+' data = open('reviews_rawdata_4.txt', 'w', encoding='UTF-8') if "m.blog.naver.com" in urlinput: url = urlinput elif "blog.naver.com" in urlinput: where = urlinput.find("blog") url = "http://m." + urlinput[where:] else: #print("Wrong URL") result_pop("WURL") sys.exit(1) ''' http://blog.naver.com/sldkfjalskdfj blog.naver.com/sdlkfjlk ''' try: datastring = '' source_code = requests.get(url, timeout=5) soup = BeautifulSoup(source_code.text, "html.parser") # 글 제목 태그 정보 if soup.find_all("h3", class_="tit_h3"): title = soup.find_all("h3", class_="tit_h3")[0] else: title = soup.find_all("h3", class_="se_textarea")[0] post_title = "" for i in range(len(title.contents)): post_title = post_title + str(title.contents[i]) # 글 제목에 맛집이 포함될 때 if "맛집" in post_title and "위드블로그" not in post_title and "홍보" not in post_title and "광고" not in post_title and "앱" not in post_title and "어플" not in post_title: #블로그 본문 내용 찾기 if soup.find_all( "div", class_="se_component_wrap sect_dsc __se_component_area"): article = soup.find_all( "div", class_="se_component_wrap sect_dsc __se_component_area")[0] else: article = soup.find_all("div", class_="post_ct")[0] # 지도 javascript code 삭제, 정보 저장 restname = '' temp = soup.find_all("span", class_="_mapInfo") ##네이버 지도1 temp2 = soup.find_all('a', class_='se_map_link __se_link') ##네이버 지도2 if temp: restname = str(soup.find_all("a", class_="tit")) restname = onlytext(restname) temp[0].decompose() elif temp2: restname = str(soup.find_all("div", class_="se_title")[0]) restname = onlytext(restname) temp2[0].decompose() # 사진,이모티콘 정보 저장 photo = 0 temp = soup.find_all("span", class_="_img") # 사진테그1 temp_ = soup.find_all("img", class_="fx") # 사진테그1-2 temp2 = soup.find_all('img', class_='se_mediaImage __se_img_el') # 사진테그2 if temp or temp_: photo = len(temp) + len(temp_) elif temp2: photo = len(temp2) imo = 0 temp = soup.find_all('img', class_='_sticker_img') # 이모티콘 temp2 = soup.find_all('img', class_='__se_img_el') # 사진 + 이모티콘 if temp: imo = len(temp) elif temp2: imo = len(temp2) - photo # 데이터 클렌징 post_article = str(article) index1 = 0 index2 = 0 mainarticle = onlytext(post_article) for Searchekyword in [ "제공받아", "제공 받고", "제공받고", "후원 받아", "후원받아", "후원 받고", "후원받고", "소정의", "원고료를", "지원 받고", "지원받고", "지원 받아", "지원받아", "업체로부터", "제공 받아", "제돈", "제 돈", "내돈", "내 돈", "오빠가 사준", "오빠가사준", "개인사비", "개인 사비" ]: KWindex = mainarticle.find(Searchekyword) if KWindex != -1: for a in range(KWindex, -1, -1): if not re.findall(regex, mainarticle[a]): index1 = a break for a in range(KWindex, len(mainarticle)): if not re.findall(regex, mainarticle[a]): index2 = a break sentence = mainarticle[index1 + 1:index2] sentence = [ x[0] for x in nlp.pos(sentence, norm=True, stem=True) ] ''' for neg in negative: if neg in sentence: errorset.add(url) print(url, sentence) break ''' #break datastring = datastring + "what\t%s\t%s\t%s\t%d\t%d\t%s\n" % ( url, enter_tab(post_title), mainarticle[:index1 + 1] + ' ' + mainarticle[index2 + 1:], photo, imo, restname) 
        data.write(datastring)
    except:
        print("error")

# print(len(url_set))
data.close()
end_time = time.time()
print("모든 프로세스: %f 분" % ((end_time - start_time) / 60))
import csv from konlpy.tag import Twitter reader = csv.reader(open("../sample/top_song.csv",'r')) writer = csv.writer(open("../sample/top_song_lemma.csv",'w')) twitter = Twitter() lema = str() for i in reader: s = twitter.pos(i[4],norm=True) x = [i[0] for i in s if i[1] in ['Noun','Verb','Adjective','Alpha'] and len(i[0])>1] print(i[4],"\n",x,"\n"," ".join(x),"\n") result = [seg for seg in i] result.append(" ".join(x)) writer.writerow(result)
en_stop = get_stop_words('en')

## Create p_stemmer of class PorterStemmer (suffix stripping)
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
# pprint(en_stop)

## list for tokenized documents in loop
texts = []

## loop through document list
for i in lyrics_set:
    # clean and tokenize document string
    i0 = i.lower()
    tokens_pos = tw.pos(i0)

    stopped_tokens = []
    for pos in tokens_pos:
        # print(pos[0], pos[1])
        if pos[1] == "Alpha" and pos[0] in en_stop:
            # print(pos[0], pos[1])
            continue
        # elif pos[1] in ["Josa", "Suffix", "Punctuation", "Number", "Eomi"]:  # particles, suffixes, punctuation, numbers, endings
        elif pos[1] not in ["Noun", "Verb", "Adjective", "Adverb"]:  # keep nouns, verbs, adjectives, adverbs
            # print(pos[0], pos[1])
            continue
        ## stemming instead of lemmatization
        ## whether the Porter approach is the most effective method needs further discussion
        # print("insert value:", pos[0], pos[1])
    return manu_word

def manufacture_word2(word):
    manu_word = str(word).replace("'", '').replace("(", '').replace(" ", '')
    manu_word = str(manu_word).replace("]", '').replace("[", '').replace(")", '').replace(",", '/')
    return manu_word

if __name__ == "__main__":
    init()
    file = open("example.txt", 'r')
    example = file.readline()
    TDM = np.zeros([WORD_NUM, 1])
    # `twiiter` is presumably a module-level konlpy Twitter() instance set up in init()
    example_set = twiiter.pos(example)
    for word in example_set:
        try:
            TDM[int(word_to_number_dic[str(manufacture_word2(word))])][0] += 1.
        except:
            print(str(manufacture_word2(word)))
    weight_matrix = np.load("Weight_matrix.npy")
    result = weight_matrix.dot(TDM)
    max_index = result.argmax()
    print("분류 결과는 : ", keyword_set[max_index], " 입니다")
from konlpy.tag import Twitter """ twitter = Twitter() malist = twitter.pos("아버지 가방에 들어가신다.", norm=True, stem=True) print(malist) malist = twitter.pos("그래욬ㅋㅋ?", norm=True, stem=True) print(malist) malist = twitter.pos("그래욬ㅋㅋ?", norm=True, stem=False) print(malist) twitter = Twitter() malist = twitter.pos("2분할 지도 띄워줘", norm=True, stem=True) print(malist) twitter = Twitter() malist = twitter.pos("오늘자 보고서 만들어서 울산으로 메일 보내줘", norm=True, stem=True) print(malist) twitter = Twitter() malist = twitter.pos("몇 번을 쓰러지더라도 몇 번을 무너지더라도 다시 일어나라", norm=True, stem=True) print(malist) """ twitter = Twitter() malist = twitter.pos("아버지가방에들어가신다", norm=True, stem=True) print(malist)
input_text = ""
text_file_name = date
count = 1
urls = get_url_list(date)
df = {}
word_dic = {}

# news articles
for url in urls:
    if url_already_exist(url) is True:
        continue
    count += 1
    print_url = gen_print_url(url)
    html = get_html(print_url)
    new_input = ext_body(html)
    # print("date : %s, count : %d" % (date, count))
    pos_list = twitter.pos(new_input)
    for word in pos_list:
        # count every noun longer than one character
        if word[1] == "Noun" and len(word[0]) != 1:
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1

# word count
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
with open("wordcount_ko.csv", 'w', encoding="utf-8") as csvfile:
    pencil = csv.writer(csvfile, delimiter=' ')
    for word, count in keys[:1000]:
        # pencil.writerow([date])
        pencil.writerow([word, count])  # the original wrote the whole keys[:1000] list on every row
        print("{0}({1}) ".format(word, count), end="")
print()
return {'exists({})'.format(word): (word in set(doc)) for word in base} #val = alldatalist[0]['name'] #val = dl.Trim(val) # extract only word t = Twitter() #kkma = Kkma() #tokens_ko = kkma.nouns(val) wordlist = [dl.Trim(item['name']) for item in alldatalist] tokens_ko = [] templist = [] for i in range(len(wordlist)): temp = t.pos(wordlist[i], norm=True, stem=True) categ = alldatalist[i]['cate'] tokens_ko.append((temp, categ)) from gensim.models import Word2Vec min_count = 2 size = 50 window = 4 model = Word2Vec(tokens_ko, min_count=min_count, size=size, window=window) vocab = list(model.vocab.keys()) vocab[:10] tokens = [t for d in tokens_ko for t in d[0]] text = nltk.Text(tokens, name='NMSC') temp = text.vocab().most_common(10)
filter = filter+scaling.most_common_words_filter(100) scaled = scaling.filtering_data(data, filter) ''' f = open("most_common_korean_words.txt") most_common_words = [] for i in range(deep): most_common_word = f.readline().rstrip('\n') most_common_words.append(most_common_word) return most_common_words if __name__ == "__main__": text_file_directory = 'data/' text_file_number = 40030 text_file_name = str(text_file_number) + '.txt' text_file = text_file_directory + text_file_name f = open(text_file) data = f.read() pos_data = T.pos(data, norm=True) for i in enumerate(pos_data): if pos_data[i[0]][1] == 'Josa': print(pos_data[i[0] - 1])
from konlpy.tag import Twitter
from gensim.models import word2vec
from bs4 import BeautifulSoup
import codecs

file = open('./test.txt', 'r', encoding='utf-8')
lines = file.readlines()

twitter = Twitter()
result = []
for line in lines:
    r = []
    words = twitter.pos(line, norm=True)
    for word in words:
        if word[1] not in ["Punctuation", "Eomi", "Josa"]:
            r.append(word[0])
    result.append(" ".join(r).strip())

fileName = "toji.wakati"
with open(fileName, 'w') as fp:
    fp.write("\n".join(result))

data = word2vec.LineSentence(fileName)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
# size      -> use 200-dimensional vectors
# window    -> look at up to 10 words before and after the target word
# min_count -> drop words that occur fewer than 2 times
# sg        -> use Skip-Gram rather than CBOW
# hs        -> 1 uses hierarchical softmax during training; with 0, negative sampling is used instead
model.save("toji.model")
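# A hedged follow-up sketch: reloading the saved model and querying it. The query word is
# illustrative only and must occur in toji.wakati at least min_count times; on very old gensim
# versions, model.most_similar(...) may be needed instead of model.wv.most_similar(...).
from gensim.models import word2vec

model = word2vec.Word2Vec.load("toji.model")
print(model.wv.most_similar("집", topn=5))  # nearest neighbours of an example word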
    return random.choice(list(keys))

# Read in the sentences --- (※4)
toji_file = "toji.txt"
dict_file = "markov-toji.json"
if not os.path.exists(dict_file):
    # read the Toji text file
    fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
    soup = BeautifulSoup(fp, "html.parser")
    body = soup.select_one("body > text")
    text = body.getText()
    text = text.replace("…", "")  # temporary workaround: koNLPy currently fails to treat … as punctuation
    # morphological analysis
    twitter = Twitter()
    malist = twitter.pos(text, norm=True)
    words = []
    for word in malist:
        # exclude punctuation and the like (but keep the period)
        if not word[1] in ["Punctuation"]:
            words.append(word[0])
        if word[0] == ".":
            words.append(word[0])
    # build the dictionary
    dic = make_dic(words)
    json.dump(dic, open(dict_file, "w", encoding="utf-8"))
else:
    dic = json.load(open(dict_file, "r"))

# Generate sentences --- (※6)
for i in range(3):
import numpy as np from predict_client.prod_client import ProdClient from konlpy.tag import Twitter twitter = Twitter() input_sent = [] with open('input.log', 'r', encoding='utf-8') as content_file: for line in content_file: tag = twitter.pos(line)[:-1] input_sent.append([i[0] for i in tag]) output_sent = [] with open('output.log', 'r', encoding='utf-8') as content_file: for line in content_file: tag = twitter.pos(line)[:-1] output_sent.append([i[0] for i in tag]) vocab_list = [] with open('vocab.log', 'r', encoding='utf-8') as content_file: for line in content_file: vocab_list.append(line[:-1]) vocab_dict = {n: i for i, n in enumerate(vocab_list)} num_dic = len(vocab_dict) input_length = [len(i) for i in input_sent] output_length = [len(o) for o in output_sent] max_len_i = max(input_length) max_len_o = max(output_length)
#-*- coding:utf-8 -*- from flask import Flask, jsonify, render_template, request from flask.ext.restful import reqparse, abort, Api, Resource from sklearn import linear_model from sklearn.externals import joblib import gensim, re import numpy as np from konlpy.tag import Twitter t = Twitter() pos = lambda d: ['/'.join(p) for p in t.pos(d)] app = Flask(__name__) api = Api(app) model = gensim.models.doc2vec.Doc2Vec.load( '{cname}.model'.format(cname='reviews')) rs = model.random.get_state() review_rmodel = joblib.load('{cname}.pkl'.format(cname='reviews')) parser = reqparse.RequestParser() parser.add_argument('query', type=unicode) def over_filter(x): if x > 5: return 5.0 elif x < 0.5: return 0.5 else: return x class Predict(Resource): global rs def post(self):
twitter = Twitter()
f = open('practice/train.csv', 'r', encoding='utf-8')
rdr = csv.reader(f)
i = 0
worddict = {}
wordlist = []
wordfreq = []
for line in rdr:
    if i != 0:
        message = list(line)
        sms = message[2]            # a single SMS message
        sms = sms.replace(' ', '')  # remove spaces
        divlist = list(twitter.pos(sms))  # extract meaningful tokens
        # inspect the token list
        for (word, wordtype) in divlist:
            # 'Numver' in the original is a typo for the 'Number' tag
            if wordtype == 'Noun' or wordtype == 'Adjective' or wordtype == 'Number':
                if worddict.get(word) is None:
                    worddict[word] = 1
                else:
                    worddict[word] += 1
sno = result[0][4] #학생 이메일(or 학번) #C1 if result[0][0] == '유기산': total_score = total_score + 1 #C2 if result[0][1] == '마늘': total_score = total_score + 1 #C3 if result[0][2] == '고춧가루': total_score = total_score + 1 #C4 twitter = Twitter() example_pos = twitter.pos(result[0][3], norm=True, stem=True) nltk_text = nltk.Text(example_pos, name='text') nltk_result = nltk_text.vocab().most_common(30) answer = [(('발효', 'Noun'), 1), (('부패', 'Noun'), 1), (('는', 'Josa'), 1), (('인간', 'Verb'), 1), (('유익하다', 'Adjective'), 1), (('해롭다', 'Adjective'), 1)] n_answer = [ (('안', 'VerbPrefix'), 1), (('않다', 'Verb'), 1), ] score = 0 for w in nltk_result: if w in answer: score = score + 1
def main(): # def job(): conn = pymysql.connect(host='192.168.0.61', user='******', password='******', db='one_db', charset='utf8mb4') cursor = conn.cursor() sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s' cursor.execute(sql, 'N') original = cursor.fetchall() print('original data') print(original) # 신조어 필터링 sql = 'SELECT word FROM tb_newdic' cursor.execute(sql) newdic = cursor.fetchall() # print('신조어 사전') # print(newdic) # 예외사전 데이터 가져오기 sql = 'SELECT word FROM tb_excdic' cursor.execute(sql) excdic = cursor.fetchall() print('예외 사전') print(excdic) originalList = [] for data in original: dataList = list(data) for word in newdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (dataList[1], word[0])) count = cursor.fetchone() if count[0] != 0: print(dataList[1], '에서', word[0], '은(는) 신조어 사전에 존재하는 단어입니다.') dataList[1] = dataList[1].replace(word[0], '') sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)' cursor.execute(sql, (dataList[0], word[0], dataList[2])) conn.commit() for word in excdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (dataList[1], word[0])) count = cursor.fetchone() if count[0] != 0: print(dataList[1], '에서', word[0], '은(는) 예외 사전에 존재하는 단어입니다.') dataList[1] = dataList[1].replace(word[0], '') originalList.append(dataList) original = originalList # 트위터로 분석 from konlpy.tag import Twitter twitter = Twitter() tresult = [] for data in original: tresult.append([data[0], twitter.nouns(data[1]), data[2]]) print(twitter.pos(data[1])) # 트위터 분석 결과 확인 print('twitter result') print(tresult) # 코모란으로 분석 from konlpy.tag import Komoran komoran = Komoran() kresult = [] for data in tresult: words = data[1] # 문제 없이 분석과 처리과 완료되었는지 체크용, 완료 성공 시 True, 실패 시 False state = True for word in words: try: type = komoran.pos(word)[0][1] if type == 'NNG' or type == 'NNP': kresult.append([data[0], komoran.morphs(word)[0]]) # 예외 사전에 존재 유무 체크용, True가 있는경우, False가 없는경우 exist = False # 예외 사전에 있는 단어는 INSERT 전에 필터링 for exc in excdic: sql = 'SELECT INSTR(%s, %s)' cursor.execute(sql, (word, exc[0])) count = cursor.fetchone() if count[0] != 0: print(word + '은(는) 사전의 ' + exc[0] + '와(과) 일치') exist = True break if exist: continue # NNG, NNP 타입만 DB에 INSERT # 예외 발생 시 rollback, 아닌 경우 commit으로 처리 sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)' try: if len(komoran.morphs(word)[0]) != 1: cursor.execute( sql, (data[0], komoran.morphs(word)[0], data[2])) except Exception as err: state = False print('ERROR : komoran result의 ' + str(data[0]) + '번 글의 에서 insert 처리 중 오류 발생') print(str(err)) conn.rollback() else: conn.commit() except Exception as err: state = False print('ERROR : komoran 키워드 분석 중 오류 발생') continue ssql = 'UPDATE test_original SET state = %s WHERE ono = %s' state = 'Y' if state == True else 'E' cursor.execute(ssql, (state, data[0])) conn.commit() # 코모란 분석 결과 확인 print('komoran result') print(kresult) print('-----') print('끝') # schedule.every().day.at("").do(job) # # while 1: # schedule.run_pending() # time.sleep(1)
keyword += str(val[i]) print(keyword) __cnt__ = keyword + 'count' result = [] counter = 0 for tweet in tweepy.Cursor(api.search, q= keyword + " -filter:retweets", lang='ko').items(200): result.append([tweet.id_str, tweet.text, tweet.created_at]) outname = keyword + '.txt' k = open(outname, 'w', encoding='utf-8') d = open(__cnt__ +'.txt', 'w', encoding='utf-8') malist = [] for temp in result: counter+=1 k.write("%s|" %temp[0]) malist = twitter.pos(temp[1], norm=True, stem=True) word_dic = "" for word in malist: if word[1] != "Josa" and word[1] != "Conjunction" and word[1] != "Punctuation" and word[1] != "ScreenName" and word[1] != "URL": word_dic += word[0] + '/' + word[1] + " " k.write("%s\n" %word_dic) d.write("%d\n" %counter) k.close() d.close() #print(detail) name = keyword + '.txt' counter = keyword +'count.txt' test_data = read_data(name) d = open(counter,'r') cnt = int(d.readline())
max_tag = str(max_tag) for media_id in more_media: # items from instagram media_id: str(media_id.id), str(media_id.get_standard_resolution_url()), str(media_id.caption), str(media_id.comments), str(media_id.tags) # option 1) search by keyword_caption taglist = str(media_id.caption).decode('utf-8') print(media_id.caption) num = num + 1 print num # # option 2) search by keyword_hashtag # taglist = str(media_id.tags).decode('utf-8') # extract keywords from konlpy and insert into db ematewha tags = twitter.pos(taglist, norm=True, stem=True) for j in range(len(tags)): if(tags[j][1] == 'Noun' or tags[j][1] == 'Adjective'): print tags[j][0],"/",tags[j][1] sql = 'INSERT IGNORE into keyword_caption values ("%s", "%s")' % (upso_id[n][0], tags[j][0]) cur.execute(sql) elif(tags[j][1] == 'Hashtag'): nohash = tags[j][0].replace("#", "") nohash = nohash.replace("\"", "") print nohash,"/",tags[j][1] sql = 'INSERT IGNORE into keyword_caption values ("%s", "%s")' % (upso_id[n][0], nohash) cur.execute(sql) counter += 1 # 게시물 갯수 확인 print "게시물 수" print num
def build_dataset(train_text, min_count, sampling_rate): words = list() with open(train_text, 'r', encoding='UTF8') as f: lines = f.readlines() for line in lines: sentence = re.sub(r"[^ㄱ-힣a-zA-Z0-9]+", ' ', line).strip().split() if sentence: words.append(sentence) word_counter = [['UNK', -1]] word_counter.extend( collections.Counter([word for sentence in words for word in sentence]).most_common()) word_counter = [ item for item in word_counter if item[1] >= min_count or item[0] == 'UNK' ] word_dict = dict() for word, count in word_counter: word_dict[word] = len(word_dict) word_reverse_dict = dict(zip(word_dict.values(), word_dict.keys())) word_to_pos_li = dict() pos_list = list() twitter = Twitter() for w in word_dict: w_pos_li = list() for pos in twitter.pos(w, norm=True): w_pos_li.append(pos) word_to_pos_li[word_dict[w]] = w_pos_li pos_list += w_pos_li pos_counter = collections.Counter(pos_list).most_common() pos_dict = dict() for pos, _ in pos_counter: pos_dict[pos] = len(pos_dict) pos_reverse_dict = dict(zip(pos_dict.values(), pos_dict.keys())) word_to_pos_dict = dict() for word_id, pos_li in word_to_pos_li.items(): pos_id_li = list() for pos in pos_li: pos_id_li.append(pos_dict[pos]) word_to_pos_dict[word_id] = pos_id_li data = list() unk_count = 0 for sentence in words: s = list() for word in sentence: if word in word_dict: index = word_dict[word] else: index = word_dict['UNK'] unk_count += 1 s.append(index) data.append(s) word_counter[0][1] = max(1, unk_count) data = sub_sampling(data, word_counter, word_dict, sampling_rate) return data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict
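# A minimal sketch of calling build_dataset above. Hedged: 'corpus.txt' is a hypothetical UTF-8
# text file, and sub_sampling is assumed to be defined elsewhere in the same module, as used above.
data, word_dict, word_reverse_dict, pos_dict, pos_reverse_dict, word_to_pos_dict = \
    build_dataset('corpus.txt', min_count=5, sampling_rate=1e-3)
print('word vocabulary size:', len(word_dict), '| POS vocabulary size:', len(pos_dict))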
import os
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter

print('cwd: ' + os.getcwd())
os.chdir('./day5')
# exit()

# Open the file with utf-16 encoding and print the text --- (※1)
fp = codecs.open("BEXX0003.txt", "r", encoding="utf-16")
soup = BeautifulSoup(fp, "html.parser")
body = soup.select_one("body > text")
text = body.getText()

# Process the text line by line --- (※2)
twitter = Twitter()
word_dic = {}
lines = text.split("\n")
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == "Noun":  # keep only nouns --- (※3)
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1  # count the noun

# Print the most frequently used nouns --- (※4)
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:
    print("{0}({1}) ".format(word, count), end="")
print()
import json, pymongo, requests, os, sys import time, dateutil.parser, codecs from konlpy.tag import Twitter; t = Twitter() pos = lambda d: ['/'.join(p) for p in t.pos(d) if p[1] in ['Noun', 'Adjective', 'Determiner', 'Adverb', 'KoreanParticle']] conn = pymongo.MongoClient("mongodb://localhost") db = conn.watcha cnames = db.collection_names() collections = dict() for cname in cnames: collections[cname] = eval('db.' + cname) del collections['reviews'] del collections['system.indexes'] cursor = collections['comedy'].find() length = collections['comedy'].count() cnt = 0 with codecs.open('D:\watcha_reviews\comedy.txt', 'w', encoding='utf-8') as fp: for row in cursor: cnt += 1 if cnt % 1000 == 0: print str(cnt) + ' / ' + str(length) rating = row['rating'] cid = row['comment_id'] text = row['text'] fp.write(' '.join([str(rating), str(cid)] + pos(text)) + '\n')
import sys reload(sys) sys.setdefaultencoding('utf-8') print ("load") #load from kobill from konlpy.corpus import kobill #docs_ko =kobill.open('kobill/news.txt').read() docs_ko = [kobill.open(i).read() for i in kobill.fileids()] print ("tokenize") #tokenize from konlpy.tag import Twitter; t = Twitter() print ("tokenize1") pos = lambda d:['/'.join(p) for p in t.pos(d,stem=True,norm=True)] print ('tokenize2') texts_ko = [pos(doc) for doc in docs_ko] #texts_ko = pos(docs_ko) print ("train") import time now_time = time.time() #train from gensim.models import word2vec wv_model_ko = word2vec.Word2Vec(texts_ko,workers=16,negative=10,window=7,size=300) wv_model_ko.init_sims(replace=True) wv_model_ko.save('ko_word2vec_e.model') print ("training time "+str(time.time()-now_time)+"sec") print (wv_model_ko.most_similar((pos('서울대학교'))))
from konlpy.tag import Twitter twitter = Twitter() file = open("/sample/data/loadofthering.txt", 'r') lines = file.readlines() word_dic = {} count = 0 for line in lines: malist = twitter.pos(line) #print(count, malist) for taeso, pumsa in malist: if pumsa == "Noun": if not (taeso in word_dic): word_dic[taeso] = 0 word_dic[taeso] += 1 if count > 1000: break count += 1 keys = sorted(word_dic.items(), key=lambda x:x[1], reverse=True) for word, count in keys[:50]: print("{0}({1})".format(word, count), end="\n") print()
    data = pickle.load(f)

# wordbag = []
doc_list = []
termdoc = {}
for datum in data:
    doc_list.append(datum['no'])
# data = None
# gc.collect()
for datum in data:
    doc_id = datum['no']
    lec_no = datum['lec_no']
    pos = twitter.pos(datum['eval_content'], stem=True)
    for p in pos:
        tag = p[1]
        # the original `('Exclamation' or 'Josa' or ...) in tag` only ever tested 'Exclamation';
        # check the tag against the full list of unwanted classes instead
        if tag in ('Exclamation', 'Josa', 'Eomi', 'Suffix', 'Punctuation', 'Foreign',
                   'Alpha', 'Unknown', 'KoreanParticle', 'Hashtag', 'ScreenName'):
            continue
        if p[0] not in termdoc:
            termdoc[p[0]] = dict.fromkeys(doc_list, 0)
        termdoc[p[0]][doc_id] += 1
    print doc_id
'''
tmp = termdoc.keys()
for j in range(10):
    print doc_list[j],
print
for i in range(10):
    print tmp[i],
def tokenize(doc):
    # splitting into morphemes
    tagger = Twitter()
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]
lines = file.readlines()

# 2. store all of the comments again in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

# 3. import the Twitter module from the konlpy package
from konlpy.tag import Twitter
twitter = Twitter()

# 4. split each sentence into morphemes
sentences_tag = []
for sentence in okja:
    morph = twitter.pos(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. keep only the nouns and adjectives in a list
noun_adj_list = []
for sentence1 in sentences_tag:
    for word, tag in sentence1:
        if tag in ['Noun', 'Adjective']:
            noun_adj_list.append(word)
from konlpy.tag import Twitter twitter = Twitter() #ex) print(twitter.pos(u'이것도 되나욬ㅋㅋ',norm=True, stem=True)) path='/Users/kims/' # file1 file1 = pd.read_csv(path+'comments_17_df.csv') file1.head() # konlpy file1 text = [] len(file1) for i in range(0,len(file1)): text_spider = twitter.pos(file1.loc[i,'value'],norm=True, stem=True) text.append(text_spider) text text_df=pd.DataFrame.from_records(text) text_df=text_df.stack() text_df.to_csv('text_17.csv', encoding='utf-8') # file2 file2 = pd.read_csv(path+'comments_12_df.csv') file2.head() # konlpy file2 text = [] len(file2)
import json, pymongo, requests, sys import time, dateutil.parser import gensim, logging, os from konlpy.tag import Twitter; t = Twitter() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename='review' + '.log') pos = lambda d: ['/'.join(p) for p in t.pos(d)] # class LSGenerator(object): # def __init__(self, collname): # self.conn = pymongo.MongoClient("mongodb://localhost") # self.db = self.conn.watcha # self.cnames = self.db.collection_names() # self.collections = dict() # self.collname = collname # for cname in self.cnames: # self.collections[cname] = eval('self.db.' + cname) # del self.collections['reviews'] # del self.collections['system.indexes'] # def __iter__(self): # for row in self.collections[self.collname].find(): # rating = row['rating'] # cid = row['comment_id'] # text = row['text'] # pos_text = pos(text) # tags = [str(rating) + '_' + str(cid) + '_' + self.collname] # yield gensim.models.doc2vec.TaggedDocument(words = pos_text, tags = tags) class LSGenerator(object):
if not os.path.exists(target): os.mkdir(target) y = raw_input( 'press y to clear data:') if y == 'y': shutil.rmtree(target) os.mkdir(target) nonkorean = re.compile(u'[^ 가-힣]+',re.UNICODE) nomean = re.compile(u'[ㄱ-ㅎ]|[가-힣\S]*대숲|[가-힣\S]*대나무숲|[0-9\S]*번째|[가-힣\S]*학교|#[가-힣\w0-9\S]+',re.UNICODE) stopwords = [u'외대',u'한양대',u'고대', u'연대',u'중앙대',u'경북대',u'경희대',u'서울대',u'설대',u'성대',u'성균관대',u'서강대',u'서울시립대',u'댓글',u'시립대',u'오전',u'오후',u'외침',u'제보',u'숲',u'대숲',u'대나무숲',u'연대숲',u'서강대숲',u'이야기',u'대나무',u'서시대',u'오전',u'오후'] for dir in os.listdir(base): if os.path.isdir(base + dir): os.mkdir(target + dir) for file in os.listdir(base + dir): text = open(base+dir+'/'+file,'r').read().decode('utf8') f = open(target + dir + '/' + file,'w') text = nonkorean.sub('',nomean.sub(u' ',text)) for i in twitter.pos(text,norm=True,stem=True): if i[1] != "Josa" and i[0] not in stopwords and len(i[0]) > 1: f.write(i[0].encode('utf8') + ' ') f.flush() print dir+ ' done' print 'compressing data' shutil.make_archive(target, 'gztar', target) print 'compression completed'