def accuracy(name):
    # Select the morphological analyzer by name; assumes module-level
    # `texts`, `arr`, `similarty` and a `Jaccard_similarty` helper exist.
    if name == 'kkma':
        mode = Kkma()
    elif name == 'okt':
        mode = Okt()
    elif name == 'komoran':
        mode = Komoran()
    else:
        return 0
    myline = input("문장을 입력해 주세요: ")
    print("형태소분석기", name, "정확도 분석을 시작합니다.")
    print('\n')
    acc = mode.morphs(myline)  # morphological analysis of the input sentence
    for sentence in texts:
        arr.append(sentence)
        sp_text = mode.morphs(sentence)  # analyze the corpus one sentence at a time
        Jaccard_similarty(acc, sp_text)  # score the pair with Jaccard similarity
    # Sort sentence indices by similarity and keep the top n
    n = 5
    top_indices = sorted(range(len(similarty)), key=lambda i: similarty[i], reverse=True)[:n]
    for k, i in enumerate(top_indices, start=1):
        print(k, "번째로 유사도가 높은 문장입니다. : ", arr[i],
              "유사도는 다음과 같습니다. : ", similarty[i])
        print('\n')
    # Rebinding `similarty = []` here would shadow the module-level list
    # (and raise UnboundLocalError above), so clear it in place instead.
    similarty.clear()
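# `Jaccard_similarty` is not defined in this snippet. A minimal sketch
# consistent with how it is called above (scoring two token lists and
# appending to the module-level `similarty` list); the body is an
# assumption, not the original implementation.
def Jaccard_similarty(tokens_a, tokens_b):
    set_a, set_b = set(tokens_a), set(tokens_b)
    union = set_a | set_b
    # Jaccard similarity: |A ∩ B| / |A ∪ B|
    similarty.append(len(set_a & set_b) / len(union) if union else 0.0)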
def morphy(review):
    # Stopwords: frequent fragments, review boilerplate, and star ratings
    stop = [
        "있다", "다는", "은데", "특히", "있었", "동안", "면서", "을까", "해하", "어떤",
        "한때", "어야", "듯이", "ㄴ다", 'Story', "cinepark", "co", "kr", "Review",
        "★★★★★", "★★★★", "★★★", "★★", "★",
        '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람',
        '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오',
        '말', '일', '그렇', '위하', '때문', '그것', '두', '말하', '알', '그러나', '받',
        '못하', '일', '그런', '또', '문제', '더', '사회', '많', '그리고', '좋', '크',
        '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', '그러',
        '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내',
        '내', '경우', '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번',
        '나', '다른', '어떻', '개', '전', '들', '사실', '이렇', '점', '싶', '말',
        '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]
    morphs_complete = []  # initialized here so the return below is safe on error
    try:
        kkma = Kkma()
        morphs = kkma.morphs(review)
        for i in morphs:
            if i not in stop and len(i) > 1:
                morphs_complete.append(i)
    except TypeError:
        print("TypeError has occurred")
    return morphs_complete
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma 시작')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma 끝 - %s 초' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)
    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
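# `write_list` and `write_pos` are assumed helpers here; minimal sketches
# matching how run_kkma uses them (one item per line), not the originals:
def write_list(items, fstream):
    for item in items:
        fstream.write(item + '\n')

def write_pos(pairs, fstream):
    # kkma.pos returns (morpheme, tag) tuples
    for word, tag in pairs:
        fstream.write('%s\t%s\n' % (word, tag))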
class Analyze:
    def __init__(self, string):
        self.string = u"%s" % string
        self.kkma = Kkma()

    def parse_phrase_to_morphemes(self):
        return self.kkma.morphs(self.string)

    def noun_extractor(self):
        return self.kkma.nouns(self.string)
class AnalysisDiction:
    """
    This class analyzes Korean texts using the kkma and twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a kkma and/or twitter tagger instance.
        :param on_kkma: if True, create a kkma instance
        :param on_twitter: if True, create a twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze string data with kkma; the behavior depends on the mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns', 'pos'
        :return: the analysis result, or False for an unrecognized mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        # Compare strings with ==, not `is` (identity is not guaranteed)
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze string data with twitter; the behavior depends on the mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns', 'pos', 'posmore'
        :return: the analysis result, or False for an unrecognized mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
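# A hypothetical usage sketch of the class above (not from the source):
ad = AnalysisDiction(on_kkma=True)
print(ad.analyzer_kkma("한국어 형태소 분석", 'morphs'))
print(ad.analyzer_kkma("한국어 형태소 분석", 'pos'))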
def pos_tag(sentences):
    # KoNLPy morphological analyzer (other taggers could be swapped in here)
    tagger = Kkma()
    # Collected POS-processed sentences
    sentences_pos = []
    for sentence in sentences:
        # Strip special characters
        sentence = re.sub(RE_FILTER, "", sentence)
        # Join the morpheme list back into a space-separated string
        sentence = " ".join(tagger.morphs(sentence))
        sentences_pos.append(sentence)
    return sentences_pos
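# `RE_FILTER` is assumed to be a precompiled pattern of characters to strip;
# a plausible definition (an assumption, not the original) would be:
import re
RE_FILTER = re.compile(r"[.,!?\"':;~()]")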
def prepro_like_morphlized(data):
    # Create the morphological analyzer object
    morph_analyzer = Kkma()
    # List that collects the tokenized sentences
    result_data = list()
    # Tokenize every sentence in the data
    for seq in data:
        # Kkma.morphs returns a list of morphemes; rejoin it into a single
        # space-separated string. (The original comment said Twitter.morphs,
        # but the code actually uses Kkma.)
        morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlized_seq)
    return result_data
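# Hypothetical usage sketch (the sentence is illustrative only):
print(prepro_like_morphlized(["안녕하세요 반갑습니다"]))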
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                                  FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print(max_document_length)
    print(type(x_text))
    kkma = Kkma()
    x_text = [" ".join(kkma.morphs(x2)) for x2 in x_text]
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print(x)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
def tagMORPH(filename):
    # Read the file as UTF-8 text (Python 3: no manual decode needed)
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tagging
    from konlpy.tag import Kkma
    #from konlpy.utils import pprint
    kkma = Kkma()
    print('now tagging morphemes...')
    tagged = kkma.morphs(text)

    # Write the tagged file, one morpheme per line
    (path, fnameExt) = os.path.split(filename)
    (fname, fext) = os.path.splitext(fnameExt)
    tagged_file = fname + '_' + 'morph' + fext
    with open(tagged_file, 'w', encoding='utf-8') as fw:
        for line in tagged:
            fw.write(line + "\n")
    print('%s is created' % tagged_file)
def translate(model: TransformerTransModel,
              source_sentence: str,
              kor2idx: dict,
              eng2idx: dict,
              idx2eng: dict,
              device=torch.device('cpu'),
              max_length: int = 67):
    kkma = Kkma()
    tokenized_sentence = ["<sos>"] + kkma.morphs(source_sentence) + ["<eos>"]
    encoded_sentence = [
        kor2idx[morph] if morph in kor2idx else kor2idx["<unk>"]
        for morph in tokenized_sentence
    ]
    source = torch.LongTensor(encoded_sentence).unsqueeze(1).to(device)
    target = torch.LongTensor([eng2idx["<sos>"]]).unsqueeze(1).to(device)

    model.eval()
    # Greedy decoding: repeatedly append the most likely next token
    for _ in range(max_length):
        with torch.no_grad():
            output = model(source, target)
        best_guess = output.argmax(2)[-1, :]
        last_word = best_guess.item()
        best_guess = best_guess.unsqueeze(1)
        target = torch.cat((target, best_guess), 0)
        if last_word == eng2idx["<eos>"]:
            break

    translated_sentence = [idx2eng[idx] for idx in target.squeeze(1).cpu().numpy()]
    # Strip the <sos> token, and the trailing <eos> if decoding terminated
    if translated_sentence[-1] != "<eos>":
        return translated_sentence[1:]
    else:
        return translated_sentence[1:-1]
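# Hypothetical usage sketch, assuming a trained model and the vocabulary
# dictionaries already exist in scope:
tokens = translate(model, "안녕하세요", kor2idx, eng2idx, idx2eng)
print(" ".join(tokens))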
from konlpy.tag import Kkma

kkma = Kkma()
stems = kkma.morphs('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.')
print(stems)

t_text = list(open("data/korean-english-park.train.ko", "r", encoding='UTF8').readlines())
with open("./data/korean-english-park.train_stem.ko", "w", encoding='UTF8') as f:
    for sent in t_text:
        print(sent)
        stems = kkma.morphs(sent)
        print(stems)
        f.write(" ".join(stems) + "\n")

t_text = list(open("data/korean-english-park.dev.ko", "r", encoding='UTF8').readlines())
with open("./data/korean-english-park.dev_stem.ko", "w", encoding='UTF8') as f:
    for sent in t_text:
        print(sent)
        stems = kkma.morphs(sent)
        print(stems)
        f.write(" ".join(stems) + "\n")
text = args.text
print("-" * 5, "원본 텍스트", "-" * 5)
print(text)
print("-" * 5, "Mecab", "-" * 5)
print(mecab.morphs(text))
print("-" * 5, "Okt", "-" * 5)
print(okt.morphs(text))
print("-" * 5, "Komoran", "-" * 5)
print(komoran.morphs(text))
print("-" * 5, "Hannanum", "-" * 5)
print(hannanum.morphs(text))
print("-" * 5, "Kkma", "-" * 5)
print(kkma.morphs(text))
print("-" * 5, "Khaiii", "-" * 5)
tokens = []
for word in khaiii.analyze(text):
    tokens.extend([str(m).split('/')[0] for m in word.morphs])
print(tokens)
print("-" * 5, "bert-base-multilingual-cased", "-" * 5)
print(tokenizer.tokenize(text))
# Okt morphological analyzer
print('Okt 형태소 분석기')
from konlpy.tag import Okt
okt = Okt()
print(okt.morphs(example))  # morpheme extraction
print(okt.pos(example))     # POS tagging
print(okt.nouns(example))   # noun extraction

# Kkma morphological analyzer
print('꼬꼬마 형태소 분석기')
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(example))
print(kkma.pos(example))
print(kkma.nouns(example))

# However, text with many out-of-vocabulary (OOV) words causes problems for
# off-the-shelf morphological analyzers. As an example, take the following
# sentence from a bridge inspection report, full of construction-domain terms:
#
# > **국부적인 아스콘 패임과 전반적인 골재마모가 조사되었으며, 골재마모가 상대적으로 심한 구간에서는 포장체의 표면이 거칠거나 경미한 골재탈리가 진행되는 상태이다.**
#
# This is an actual sentence from an inspection report. Sentences containing
# technical terms such as 아스콘, 골재마모 (골재 + 마모), and 골재탈리 (골재 + 탈리)
# are often analyzed incorrectly by the KoNLPy tokenizers, as shown below.

example = '국부적인 아스콘 패임과 전반적인 골재마모가 조사되었으며, 골재마모가 상대적으로 심한 구간에서는 포장체의 표면이 거칠거나 경미한 골재탈리가 진행되는 상태이다.'
print()
print(example)
print(okt.pos(example))
print(kkma.pos(example))
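# One common mitigation for such OOV terms (a sketch, not part of the
# original) is a user dictionary. Komoran accepts one at construction time
# as a file of tab-separated "word<TAB>POS" lines, e.g.:
#   아스콘	NNG
#   골재마모	NNG
#   골재탈리	NNG
from konlpy.tag import Komoran
komoran_user = Komoran(userdic='./user_dic.txt')  # hypothetical path
print(komoran_user.pos(example))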
def test(keyword):
    # Parameters
    # ==================================================
    #tf.flags.DEFINE_string("checkpoint_dir", "./EvilTest/runs/1573754967/checkpoints/", "Data source for the positive data.")

    # Data Parameters
    tf.flags.DEFINE_string("positive_data_file", "./EvilTest/data/rt-polaritydata/rt-polarity.pos",
                           "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", "./EvilTest/data/rt-polaritydata/rt-polarity.neg",
                           "Data source for the negative data.")

    # Eval Parameters
    tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
    tf.flags.DEFINE_string("checkpoint_dir", "./EvilTest/runs/1573754967/checkpoints/",
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

    FLAGS = tf.flags.FLAGS
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    #print(x_raw)
    #print(y_test)
    x_raw = [keyword]
    y = [[1, 0]]
    y_test = np.concatenate([y], 0)
    print(y_test)
    y_test = np.argmax(y_test, axis=1)
    kkma = Kkma()
    x_raw = [" ".join(kkma.morphs(x2)) for x2 in x_raw]

    # Map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_raw)))

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                print(batch_predictions)
                all_predictions = np.concatenate([all_predictions, batch_predictions])
    return "text"
""" DATA Part will be create other *.json file """ # sample data train_data = [["이것 좀 잡아줘", "GRAB"], ["이것 좀 잡아봐", "GRAB"], ["야 잡아", "GRAB"], ["이거", "GRAB"], ["저것 좀 잡아봐", "GRAB"], ["이것 좀 저어줘", "TOOL"], ["젓고 있어", "TOOL"], ["휘저어줘", "TOOL"]] test_data = [["잡아봐", "GRAB"], ["야 이거 잡아", "GRAB"], ["이거 잡아", "GRAB"]["저어봐", "TOOL"], ["계속 젓고 있어"]] # preprocessing train_X, train_y = list(zip(*train_data)) train_X = [kor_tagger.morphs(x) for x in train_X] # Tokenize train_X word2index = {'<unk>': 0} for x in train_X: for token in x: if word2index.get(token) == None: word2index[token] = len(word2index) class2index = {'GRAB': 0, 'TOOL': 1} # print(word2index) # print(class2index) len(word2index) word2index.get("패스트")
xy = np.loadtxt('sample.csv', delimiter=',', dtype=str)  # np.str is deprecated
x_data = xy[:, 0:1]
y_data = xy[:, 1:2]
x = []
y = []
p = []
#print(x_data)
#print(y_data)
#print(x_data[0][0])
#x = kkma.morphs(x_data[0][0])
#sen = ["네 안녕하세요 반갑습니다"]
#print(sen[0])
#print(kkma.morphs(sen[0]))

# Tokenize every input cell into a morpheme list
for i in range(len(x_data)):
    for j in range(len(x_data[i])):
        k = kkma.morphs(x_data[i][j])
        x.insert(i, k)

# Parse every label cell into a one-element int list
for i in range(len(y_data)):
    for j in range(len(y_data[i])):
        k = [int(y_data[i][j])]
        y.insert(i, k)

print(x)
print(y)

# Build an identity matrix sized to the first tokenized sentence,
# i.e. one one-hot vector per morpheme of x[0]
a = 0
x_one_hot = [[0 for _ in range(len(x[a]))] for _ in range(len(x[a]))]
print(x[0])
print(x_one_hot)
for i in range(len(x[a])):
    x_one_hot[i][i] = 1
        line[1] = 'NEG'

# Create Dictionary

# In[724]:

kkma = Kkma()

word_to_index = {}
index_to_word = []

for line in parsed_lines:
    # 0: title, 1: rating, 2: comment
    tokens = kkma.morphs(line[2])
    bulid_dictionary(tokens, word_to_index, index_to_word)

print(len(word_to_index))

# Count word within class

# In[725]:

# Initialize the word count dictionary of each class with bias (= 1)
cnt_dic_pos = {}
cnt_dic_neut = {}
cnt_dic_neg = {}

# Laplace smoothing (add one)
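# The add-one estimate the comment above refers to is
#   P(w | class) = (count(w, class) + 1) / (total_tokens_in_class + |V|).
# A minimal sketch (an assumption; the original continuation is not shown):
def smoothed_prob(word, cnt_dic, total_tokens, vocab_size):
    return (cnt_dic.get(word, 0) + 1) / (total_tokens + vocab_size)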
from konlpy.tag import Kkma
from soynlp.tokenizer import MaxScoreTokenizer


class Tokenizer:
    def __init__(self):
        self.t = Kkma()

    def tokenize(self, sentence, score_dic):
        # Tokenize with MaxScoreTokenizer; fall back to Kkma morphs
        # for any token not in the score dictionary
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        token_list = []
        for t in token:
            if t in scores:
                token_list.append(t)
            else:
                token_list += self.t.morphs(t)
        return token_list

    def noun_extract(self, sentence, score_dic):
        # Extract nouns, dropping ones confirmed neither by Kkma on the
        # whole sentence nor by the score dictionary
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)
        for t in token:
            if t in scores:
                noun_list.append(t)
            else:
                noun_list += self.t.nouns(t)
        diff_noun_list = set(noun_list) - set(compared_noun_list) - set(score_dic.keys())
        # Filter in one pass rather than popping while iterating
        noun_list = [n for n in noun_list if n not in diff_noun_list]
        return noun_list

    def noun_extract_dup(self, sentence, score_dic):
        # Same as noun_extract, but deduplicates the result
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)
        for t in token:
            if t in scores:
                noun_list.append(t)
            else:
                noun_list += self.t.nouns(t)
        diff_noun_list = set(noun_list) - set(compared_noun_list) - set(score_dic.keys())
        noun_list = list(set(noun_list) - diff_noun_list)
        return noun_list

    def noun_counter(self, sentence, score_dic, word):
        # Count occurrences of `word` among the extracted nouns
        noun_list = self.noun_extract(sentence, score_dic)
        return noun_list.count(word)
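# Hypothetical usage sketch: `score_dic` maps known words to scores for
# soynlp's MaxScoreTokenizer (the values here are illustrative only).
tok = Tokenizer()
print(tok.tokenize("파스타가 좋아요", {"파스타": 0.7}))
print(tok.noun_extract("파스타가 좋아요", {"파스타": 0.7}))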
print(text_to_word_sequence(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))  # "don't" is kept as a single token

text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
# "home-based" stays one token and "doesn't" becomes "does n't",
# the same behavior as a standard word tokenizer
print(tokenizer.tokenize(text))

sentence = ("His barber kept his word. But keeping such a huge secret to himself was driving him crazy. "
            "Finally, the barber went up a mountain and almost to the edge of a cliff. "
            "He dug a hole in the midst of some reeds. He looked about, to make sure no one was near.")
print(sent_tokenize(sentence))

korean_sentence = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"
print(kss.split_sentences(korean_sentence))

print(pos_tag(tokenizer.tokenize(text)))

print(okt.morphs(korean_sentence))
print(okt.pos(korean_sentence))
print(okt.nouns(korean_sentence))
print("====================================================")
print(kkma.morphs(korean_sentence))
print(kkma.pos(korean_sentence))
print(kkma.nouns(korean_sentence))
man_cate_names = []
cateid_kr = pd.DataFrame(cate1)
for cateid in cateid_kr.index:
    for cate_name in splitter(cateid):
        man_cate_names.append(cate_name)
man_cate_names = set(man_cate_names)

# Kkma
kkma = Kkma()
cate_names = []
for cateid in cateid_kr.index:
    for cate_name in kkma.morphs(cateid):
        cate_names.append(cate_name)
cate_names = set(cate_names)

fin_cate_names = man_cate_names | set(cate_names)

should_del = ['-', '[', ']', '(', ')', '/', '+', '', '[시리얼]']
_cate_names = []
for name in fin_cate_names:
    if name not in should_del:
        _cate_names.append(name)
fin_cate_names = _cate_names

new_cate_names = []
p1 = []
q1 = []
s = ""
r = ""
# `n1` and `m1` are assumed to be initialized earlier, analogous to
# `p1`/`q1` above, which serve the second file.

#############################################################################
with open(filename1, 'r') as f:
    for sentence in f:
        s += sentence
m = s.split('\n')
for i in m:
    n1.append(kkma.pos(i))
for i in m:
    m1.append(kkma.morphs(i))
# Normalize sentence endings: declarative endings (EFN) become '다',
# and a '?' after an interrogative ending (EFQ) turns that ending into '까'
for i in range(len(n1)):
    for j in range(len(n1[i])):
        m1[i][j] = [n1[i][j][0], n1[i][j][1]]
        if m1[i][j][1] == 'EFN':
            m1[i][j][0] = '다'
        if m1[i][j][0] == '?':
            if m1[i][j - 1][1] == 'EFQ':
                m1[i][j - 1][0] = '까'
for i in range(len(m1)):
    for j in range(len(m1[i])):
        m1[i][j] = m1[i][j][0]
n = m1

#############################################################################
with open(filename2, 'r') as f:
# Lowercase all words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

# Remove stopwords
swords = open(r'C:\Users\Z\Desktop\NI\한국어불용어100.txt', encoding='UTF8').read()
stop_words = re.findall('[가-힣]+', swords)

# Count morphemes across all narratives
dataset['morphed'] = range(len(dataset.Narrative))
instance = Counter([])
for i in range(len(dataset.Narrative)):
    instance += Counter(kkma.morphs(dataset.Narrative[i]))
tags = dict(instance)

# Drop single-character morphemes (snapshot the keys before deleting)
for key in list(tags.keys()):
    if len(key) < 2:
        del tags[key]

wc = WordCloud(font_path="NanumGothic", width=1200, height=800,
from konlpy.tag import Mecab, Kkma, Okt
from konlpy.utils import pprint

mecab = Mecab()
kkma = Kkma()
twitter = Okt()

string = '동해물과백두산이마르고닳도록'

print('# Mecab 형태소 분석')
pprint(mecab.morphs(string))
print('# 꼬꼬마 형태소 분석')
pprint(kkma.morphs(string))
print('# 트위터 형태소 분석')
pprint(twitter.morphs(string))
print('# 트위터 문구 추출')
pprint(twitter.phrases(string))
tmp = {}
while True:
    try:
        line = file.readline()
    except UnicodeDecodeError:
        print(path + txt[:-1])
        continue
    if not line:
        file.close()
        break
    if len(line[:-1]) <= 10:
        continue
    try:
        morphs = kkma.morphs(line[:-1])
    except:
        # Skip lines the analyzer cannot handle; without this `continue`
        # the previous line's morphs would be counted again
        print(line[:-1] + " line end")
        continue
    for m in morphs:
        file_terms_cnt += 1
        if m not in s:
            s.add(m)
        if m not in terms:
            terms.append(m)
            term_file_cnt.append(0)
        term_cnt[terms.index(m)] = term_cnt.get(terms.index(m), 0) + 1

for k, v in term_cnt.items():
    tmp[k] = v / file_terms_cnt
# Okt (the Twitter tokenizer was renamed to Okt in v0.5.0)
from konlpy.tag import Okt
okt = Okt()
okt_tokens = okt.morphs(text)
print(okt_tokens)

# In[12]:

# Kkma
from konlpy.tag import Kkma
kkma = Kkma()
kkma_tokens = kkma.morphs(text)
print(kkma_tokens)

# # 2. POS tagging
# - Attach POS information to each token
# - Lets us drop unneeded POS classes and filter for the needed ones

# In[13]:

# Komoran
komoranTag = []
for token in komoran_tokens:
    komoranTag += komoran.pos(token)
print(komoranTag)
pr.to_file('./pr_report.html')

# NLTK English tokenization demo (disabled):
"""
import nltk
from nltk.tokenize import word_tokenize

text = "I am actively looking for Ph.D. Students. and you are a Ph.D. student."
print(word_tokenize(text))

from nltk.tag import pos_tag
x = word_tokenize(text)
print(pos_tag(x))
"""

import konlpy
from konlpy.tag import Okt

okt = Okt()
print(okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # morpheme extraction
print(okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # POS tagging
print(okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # noun extraction

from konlpy.tag import Kkma

kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
# Extract only adjectives (PA) from the text
tagged_text = han.pos(text, ntags=22)
[t[0] for t in tagged_text if t[1] == 'PA']

# In[10]:

# To extract only nouns: han.nouns()
han.nouns(text)

# ### 3.2 Kkma

# In[11]:

from konlpy.tag import Kkma
kkma = Kkma(max_heap_size=1024)  # used to raise the JVM heap size
print(kkma.morphs(text))  # morphological analysis only

# In[12]:

# POS tagging
print(kkma.pos(text))  # kkma uses a 56-tag set

# In[13]:

# Extract only common nouns (NNG)
tagged_text = kkma.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[14]:

# To extract only nouns: kkma.nouns()
# Compare against the shorter sentence
if len(short_line) > len(long_line):
    short_line, long_line = long_line, short_line

print("입력 짧은 문장 : ", short_line)
print("입력 긴 문장 : ", long_line)

# Build the syllable n-gram set of the short sentence
set_syl_short = set(syll_ngram(short_line, n))

# Count syllable n-grams
cnt_syllToShort = len(syll_ngram(short_line, n))
cnt_syllToCommon = count_common_syll_ngram(long_line, n, set_syl_short)

# Syllable-level similarity (plain / already yields a float in Python 3)
similar_syll = cnt_syllToCommon / cnt_syllToShort

# Morphological analyzer
kkma = Kkma()

# Morphological analysis
short_kkma = list(kkma.morphs(short_line))
long_kkma = list(kkma.morphs(long_line))

# Count morphemes
cnt_morpToShort = len(short_kkma)
cnt_morpToCommon = count_common_syll_morp(long_kkma, short_kkma)

# Morpheme-level similarity
similar_morp = cnt_morpToCommon / cnt_morpToShort

print('음절 N-gram 유사도 : {0:.3f}%'.format(similar_syll * 100))
print('형태소 유사도 : {0:.3f}%'.format(similar_morp * 100))
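# The three helpers above are not defined in this snippet; minimal sketches
# consistent with how they are called (assumptions, not the originals):
def syll_ngram(line, n):
    # All syllable n-grams of the sentence, whitespace removed
    s = line.replace(' ', '')
    return [s[i:i + n] for i in range(len(s) - n + 1)]

def count_common_syll_ngram(line, n, ngram_set):
    # Distinct n-grams of `line` that also appear in `ngram_set`
    return len(set(syll_ngram(line, n)) & ngram_set)

def count_common_syll_morp(long_morphs, short_morphs):
    # Distinct morphemes shared by the two analyses
    return len(set(long_morphs) & set(short_morphs))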
# In[3]:

for x in range(0, len(onlyfiles)):
    # print(onlyfiles[x])
    path = srcDir + onlyfiles[x]
    # print('data read : ' + srcDir + inputFileName)
    f = io.open(path, 'r', encoding='utf-8')
    data += f.read()

len(data)

# In[4]:

kkma = Kkma()
token = kkma.morphs(data)
len(token)

# In[11]:

wordDic = {}
colDic = {}
# Kkma returns str in Python 3, so no .encode('utf-8') is needed here
w2 = token[0]
wordDic[w2] = 1

# In[12]:

for x in range(1, len(token)):
4. https://konlpy.org/ko/latest/data/#corpora (corpora)
5. https://konlpy.org/ko/latest/examples/ (usage examples)
'''

# Hannanum class
from konlpy.tag import Hannanum
hannanum = Hannanum()
print(hannanum.analyze(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

# Kkma class
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(u'공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

# Komoran class
from konlpy.tag import Komoran
komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

# Mecab class (separate MeCab installation needed)
from konlpy.tag import Mecab
mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

# Twitter class (deprecated in favor of Okt)
# from konlpy.tag import Twitter
# twitter = Twitter()
# print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
from konlpy.tag import Kkma

# Create the Kkma morphological analyzer object
kkma = Kkma()

text = "파이썬 라이브러리 꼬꼬마 형태소 분석기 사용"

# Extract morphemes
morphs = kkma.morphs(text)
print(morphs)

# Extract morphemes with POS tags
pos = kkma.pos(text)
print(pos)

# Extract nouns only
nouns = kkma.nouns(text)
print(nouns)

# Split sentences
sentences = "파이썬을 배워봐요. 흥미로운 언어에요."
s = kkma.sentences(sentences)
print(s)