Example #1
def accuracy(name):
    result = []
    if name == 'kkma':
        mode = Kkma()
    elif name == 'okt':
        mode = Okt()
    elif name == 'komoran':
        mode = Komoran()
    else:
        return 0

    mylin = input("Please enter a sentence: ")

    print("Starting accuracy analysis for the", name, "morphological analyzer.")
    print('\n')
    acc = mode.morphs(mylin)  # morphological analysis of the input sentence
    for sentence in texts:  # texts, arr and similarty are module-level lists (see the sketch below)
        arr.append(sentence)
        sp_text = mode.morphs(sentence)  # analyze each sentence, one line at a time
        Jaccard_similarty(acc, sp_text)  # score the pair with Jaccard similarity

    n = 5
    # sort the sentence indices by similarity, highest first, and keep the top n
    Sortsimilarty = sorted(range(len(similarty)), key=lambda i: similarty[i], reverse=True)[:n]

    k = 0
    for i in Sortsimilarty:
        k += 1
        print("Rank", k, "most similar sentence:", arr[i], "| similarity:", similarty[i])

    print('\n')
    Sortsimilarty = []
    similarty.clear()  # reset the shared similarity list for the next call
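# Module-level state and the Jaccard_similarty() helper used above are not
# shown in this example. A minimal sketch, assuming the helper appends one
# score per compared sentence to the shared `similarty` list:
texts = []       # sentences to compare against (assumed)
arr = []         # sentences in the order they were scored
similarty = []   # one Jaccard score per entry of texts

def Jaccard_similarty(acc, sp_text):
    # Jaccard similarity between two morpheme lists
    set_a, set_b = set(acc), set(sp_text)
    union = set_a | set_b
    score = len(set_a & set_b) / len(union) if union else 0.0
    similarty.append(score)
    return score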
Example #2
def morphy(review):
    stop = [
        "있다", "다는", "은데", "특히", "있었", "동안", "면서", "을까", "해하", "어떤", "한때", "어야",
        "듯이", "ㄴ다", 'Story', "cinepark", "co", "kr", "Review", "★★★★★", "★★★★",
        "★★★", "★★", "★", '있', '하', '것', '들', '그', '되', '수', '이', '보', '않',
        '없', '나', '사람', '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지',
        '대하', '오', '말', '일', '그렇', '위하', '때문', '그것', '두', '말하', '알', '그러나',
        '받', '못하', '일', '그런', '또', '문제', '더', '사회', '많', '그리고', '좋', '크', '따르',
        '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', '그러', '속', '하나', '집',
        '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우', '명', '생각',
        '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '개', '전', '들',
        '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]

    morphs_complete = []
    try:
        kkma = Kkma()
        morphs = kkma.morphs(review)
        for i in morphs:
            if i not in stop and len(i) > 1:
                morphs_complete.append(i)

    except TypeError:
        print("TypeError has occurred")
    return morphs_complete
Example #3
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma 시작')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma 끝 - %s 초' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)

    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
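# write_list() and write_pos() are not defined in this snippet (nor is news1,
# which is assumed to be a Korean news article string loaded earlier). Minimal
# sketches of the two helpers, assuming they write one item per line:
def write_list(items, fstream):
    for item in items:
        fstream.write(item + '\n')

def write_pos(pairs, fstream):
    for token, tag in pairs:
        fstream.write('%s\t%s\n' % (token, tag))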
Example #4
class Analyze:
    def __init__(self, string):
        self.string = u"%s" %string
        self.kkma = Kkma()

    def parse_phrase_to_morphemes(self):
        return self.kkma.morphs(self.string)
    
    def noun_extractor(self):
        return self.kkma.nouns(self.string)
Example #5
class AnalysisDiction:
    """
    This class is for analysis of korean texts using kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate a kkma and/or twitter dictionary instance
        :param on_kkma: whether to create a Kkma instance
        :param on_twitter: whether to create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It acts differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: which kkma analysis to run ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: which twitter analysis to run ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
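# Usage sketch for Example #5, assuming konlpy (and its Kkma/Twitter classes)
# is installed and imported:
diction = AnalysisDiction(on_kkma=True)
print(diction.analyzer_kkma('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.', 'nouns'))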
Example #6
def pos_tag(sentences):

    # Set up the KoNLPy morphological analyzer // other analyzers are worth trying as well
    tagger = Kkma()

    # initialize the list that will hold the processed sentences
    sentences_pos = []

    # iterate over every sentence
    for sentence in sentences:
        # remove special characters
        sentence = re.sub(RE_FILTER, "", sentence)

        # join the morpheme list returned by the analyzer into a space-separated string
        sentence = " ".join(tagger.morphs(sentence))
        sentences_pos.append(sentence)

    return sentences_pos
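# RE_FILTER is not defined in this snippet. A plausible definition, assuming
# it is simply a compiled pattern for the punctuation to strip (hypothetical):
import re
RE_FILTER = re.compile("[.,!?\"':;~()]")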
Example #7
def prepro_like_morphlized(data):
    # Create the morphological analyzer object.
    morph_analyzer = Kkma()
    # Create the list that will collect the tokenized sentences.
    result_data = list()
    # Loop over every sentence in the data so each one gets tokenized.
    for seq in data:
        # Take the token list returned by the analyzer's morphs() call and
        # join it back into a single space-separated string.
        morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlized_seq)

    return result_data
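# Usage sketch for Example #7:
sample = ['가을 하늘이 참 맑습니다', '형태소 분석기를 비교해 봅시다']
print(prepro_like_morphlized(sample))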
Example #8
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                                  FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print(max_document_length)
    print(type(x_text))
    kkma = Kkma()
    x_text = [" ".join(kkma.morphs(x2)) for x2 in x_text]
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print(x)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
Example #9
def tagMORPH(filename):
    # Read file as UTF-8
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    # tagging
    from konlpy.tag import Kkma
    #from konlpy.utils import pprint
    kkma = Kkma()
    print('now tagging morphemes...')
    tagged = kkma.morphs(text)

    # Write tagged file
    (path, fnameExt) = os.path.split(filename)
    (fname, fext) = os.path.splitext(fnameExt)
    tagged_file = fname + '_morph' + fext
    with open(tagged_file, 'w', encoding='utf-8') as fw:
        for line in tagged:
            fw.write(line + "\n")
    print('%s is created' % tagged_file)
Example #11
def translate(model: TransformerTransModel,
              source_sentence: str,
              kor2idx: dict,
              eng2idx: dict,
              idx2eng: dict,
              device=torch.device('cpu'),
              max_length: int = 67):
    kkma = Kkma()
    tokenized_sentence = ["<sos>"] + kkma.morphs(source_sentence) + ["<eos>"]
    encoded_sentence = [
        kor2idx[morph] if morph in kor2idx else kor2idx["<unk>"]
        for morph in tokenized_sentence
    ]
    source = torch.LongTensor(encoded_sentence).unsqueeze(1).to(device)
    target = torch.LongTensor([eng2idx["<sos>"]]).unsqueeze(1).to(device)

    model.eval()
    for _ in range(max_length):
        with torch.no_grad():
            output = model(source, target)

        best_guess = output.argmax(2)[-1, :]
        last_word = best_guess.item()
        best_guess = best_guess.unsqueeze(1)
        target = torch.cat((target, best_guess), 0)

        if last_word == eng2idx["<eos>"]:
            break

    translated_sentence = [
        idx2eng[idx] for idx in target.squeeze(1).cpu().numpy()
    ]

    if translated_sentence[-1] != "<eos>":
        return translated_sentence[1:]
    else:
        return translated_sentence[1:-1]
Example #12
from konlpy.tag import Kkma
kkma = Kkma()

stems = kkma.morphs('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.')
print(stems)

t_text = list(
    open("data/korean-english-park.train.ko", "r",
         encoding='UTF8').readlines())
with open("./data/korean-english-park.train_stem.ko", "w",
          encoding='UTF8') as f:
    for sent in t_text:
        print(sent)
        stems = kkma.morphs(sent)
        print(stems)
        f.write(" ".join(stems) + "\n")

t_text = list(
    open("data/korean-english-park.dev.ko", "r", encoding='UTF8').readlines())
with open("./data/korean-english-park.dev_stem.ko", "w", encoding='UTF8') as f:
    for sent in t_text:
        print(sent)
        stems = kkma.morphs(sent)
        print(stems)
        f.write(" ".join(stems) + "\n")
text = args.text

print("-"*5,"원본 텍스트", "-"*5)
print(text)

print("-"*5, "Mecab", "-"*5)
print(mecab.morphs(text))

print("-"*5, "Okt", "-"*5)
print(okt.morphs(text))

print("-"*5, "Komoran", "-"*5)
print(komoran.morphs(text))

print("-"*5, "Hannanum", "-"*5)
print(hannanum.morphs(text))

print("-"*5, "Kkma", "-"*5)
print(kkma.morphs(text))

print("-"*5, "Khaiii", "-"*5)
tokens = []
for word in khaiii.analyze(text):
    tokens.extend([str(m).split('/')[0] for m in word.morphs])
print(tokens)

print("-"*5, "bert-base-multilingual-cased", "-"*5)
print(tokenizer.tokenize(text))


# Okt morphological analyzer (example is assumed to be a Korean sentence defined earlier)
print('Okt morphological analyzer')
from konlpy.tag import Okt
okt = Okt()

print(okt.morphs(example))  # extract morphemes
print(okt.pos(example))  # POS tagging
print(okt.nouns(example))  # extract nouns

# Kkma morphological analyzer
print('Kkma morphological analyzer')
from konlpy.tag import Kkma
kkma = Kkma()

print(kkma.morphs(example))
print(kkma.pos(example))
print(kkma.nouns(example))

# However, when a text with many out-of-vocabulary words is analyzed with an off-the-shelf morphological analyzer, several problems appear. As an example, let's run a morphological analyzer on the following sentence from a bridge inspection report, which is full of construction-domain terminology.
#
# > **국부적인 아스콘 패임과 전반적인 골재마모가 조사되었으며, 골재마모가 상대적으로 심한 구간에서는 포장체의 표면이 거칠거나 경미한 골재탈리가 진행되는 상태이다.**
#
# This is an actual sentence taken from an inspection report. Sentences containing technical terms such as 아스콘 (asphalt concrete), 골재마모 (골재 + 마모, aggregate wear) and 골재탈리 (골재 + 탈리, aggregate detachment) are often analyzed incorrectly by the KoNLPy tokenizers, as shown below.

example = '국부적인 아스콘 패임과 전반적인 골재마모가 조사되었으며, 골재마모가 상대적으로 심한 구간에서는 포장체의 표면이 거칠거나 경미한 골재탈리가 진행되는 상태이다.'
print()
print(example)
print(okt.pos(example))
print(kkma.pos(example))
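# A common workaround for such domain terms is a user dictionary. A minimal
# sketch using Komoran's userdic option, assuming a hypothetical file
# userdic.txt that lists terms such as 골재마모 and 골재탈리 (tab-separated
# word and POS tag, one per line):
from konlpy.tag import Komoran
komoran_user = Komoran(userdic='userdic.txt')
print(komoran_user.pos(example))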
def test(keyword):
    # Parameters
    # ==================================================

    #tf.flags.DEFINE_string("checkpoint_dir", "./EvilTest/runs/1573754967/checkpoints/", "Data source for the positive data.")
    
    # Data Parameters
    tf.flags.DEFINE_string("positive_data_file", "./EvilTest/data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", "./EvilTest/data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

    # Eval Parameters
    tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
    tf.flags.DEFINE_string("checkpoint_dir", "./EvilTest/runs/1573754967/checkpoints/", "Checkpoint directory from training run")
    tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
    FLAGS = tf.flags.FLAGS
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    #print(x_raw)
    #print(y_test)
    x_raw = [keyword]
    y = [[1,0]]
    y_test = np.concatenate([y], 0)
    print(y_test)
    y_test = np.argmax(y_test, axis=1)

    kkma = Kkma()
    x_raw = [" ".join(kkma.morphs(x2)) for x2 in x_raw]


    # Map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_raw)))

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()



    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                print(batch_predictions)
                all_predictions = np.concatenate([all_predictions, batch_predictions])



    return "text"
Example #16
"""
DATA Part will be create other *.json file
"""

# sample data
train_data = [["이것 좀 잡아줘", "GRAB"], ["이것 좀 잡아봐", "GRAB"], ["야 잡아", "GRAB"],
              ["이거", "GRAB"], ["저것 좀 잡아봐", "GRAB"], ["이것 좀 저어줘", "TOOL"],
              ["젓고 있어", "TOOL"], ["휘저어줘", "TOOL"]]

test_data = [["잡아봐", "GRAB"], ["야 이거 잡아", "GRAB"], ["이거 잡아", "GRAB"],
             ["저어봐", "TOOL"], ["계속 젓고 있어"]]

# preprocessing
train_X, train_y = list(zip(*train_data))
train_X = [kor_tagger.morphs(x) for x in train_X]  # Tokenize
train_X

word2index = {'<unk>': 0}
for x in train_X:
    for token in x:
        if word2index.get(token) is None:
            word2index[token] = len(word2index)

class2index = {'GRAB': 0, 'TOOL': 1}
# print(word2index)
# print(class2index)

len(word2index)

word2index.get("패스트")
Example #17
xy = np.loadtxt('sample.csv', delimiter=',', dtype=str)
x_data = xy[:,0:1]
y_data = xy[:,1:2]
x = []
y = []
p = []
#print(x_data)
#print(y_data)
#print(x_data[0][0])
#x = kkma.morphs(x_data[0][0])
#sen = ["네 안녕하세요 반갑습니다"]
#print(sen[0])
#print(kkma.morphs(sen[0]))
for i in range(len(x_data)):
    for j in range(len(x_data[i])):
        k = kkma.morphs(x_data[i][j])
        x.insert(i, k)
for i in range(len(y_data)):
    for j in range(len(y_data[i])):
        k = [int(y_data[i][j])]
        y.insert(i, k)
print(x)
print(y)

# build an identity-style one-hot matrix sized by the first tokenized sentence
x_one_hot = [[0 for _ in range(len(x[0]))] for _ in range(len(x[0]))]
print(x[0])
print(x_one_hot)
for i in range(len(x[0])):
    x_one_hot[i][i] = 1

# Create Dictionary

# In[724]:

kkma = Kkma()

word_to_index = {}
index_to_word = []

for line in parsed_lines:
    # 0: title
    # 1: rating
    # 2: comment
    tokens = kkma.morphs(line[2])
    bulid_dictionary(tokens, word_to_index, index_to_word)

print(len(word_to_index))

# Count word within class

# In[725]:

# initialize word count dictionary of each class with bias ( =1 )

cnt_dic_pos = {}
cnt_dic_neut = {}
cnt_dic_neg = {}

# Laplace smoothing (add one)
Example #19
class Tokenizer:
    def __init__(self):
        self.t = Kkma()
        
    def tokenize(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        token_list = []
        for tok in token:
            if tok in scores:
                token_list.append(tok)
            else:
                token_list = token_list + self.t.morphs(tok)
        return token_list
    
    def noun_extract(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)

        for tok in token:
            if tok in scores:
                noun_list.append(tok)
            else:
                noun_list = noun_list + self.t.nouns(tok)

        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))

        # drop nouns that appear in neither the score dictionary nor the kkma
        # noun analysis (popping inside the enumerate loop skipped elements)
        noun_list = [noun for noun in noun_list if noun not in diff_noun_list]

        return noun_list
            
    def noun_extract_dup(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)

        for tok in token:
            if tok in scores:
                noun_list.append(tok)
            else:
                noun_list = noun_list + self.t.nouns(tok)

        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
        noun_list = list(set(noun_list) - set(diff_noun_list))
        return noun_list
    
    def noun_counter(self, sentence, score_dic, word):
        noun_list = self.noun_extract(sentence, score_dic)
        return noun_list.count(word)
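# Usage sketch for Example #19, assuming konlpy's Kkma and soynlp's
# MaxScoreTokenizer are imported and a word-score dictionary is at hand:
score_dic = {'형태소': 0.9, '분석기': 0.8}
t = Tokenizer()
print(t.tokenize('형태소 분석기를 사용해 봅니다', score_dic))
print(t.noun_extract('형태소 분석기를 사용해 봅니다', score_dic))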
Example #20
print(
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # "don't" is kept as a single token

text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."  # "home-based" stays one token and "doesn't" becomes "does n't" -- same as a standard word tokenizer

print(tokenizer.tokenize(text))

sentence = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print(sent_tokenize(sentence))

korean_sentence = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"

print(kss.split_sentences(korean_sentence))

print(pos_tag(tokenizer.tokenize(text)))

print(okt.morphs(korean_sentence))

print(okt.pos(korean_sentence))

print(okt.nouns(korean_sentence))

print("====================================================")

print(kkma.morphs(korean_sentence))

print(kkma.pos(korean_sentence))

print(kkma.nouns(korean_sentence))
Example #21

man_cate_names = []
cateid_kr = pd.DataFrame(cate1)
for cateid in cateid_kr.index:
    for cate_name in splitter(cateid):
        man_cate_names.append(cate_name)

man_cate_names = set(man_cate_names)

# Kkma
kkma = Kkma()

cate_names = []
for cateid in cateid_kr.index:
    for cate_name in kkma.morphs(cateid):
        cate_names.append(cate_name)

cate_names = set(cate_names)

fin_cate_names = man_cate_names | set(cate_names)

should_del = ['-', '[', ']', '(', ')', '/', '+', '', '[시리얼]']
_cate_names = []
for name in fin_cate_names:
    if name not in should_del:
        _cate_names.append(name)
fin_cate_names = _cate_names

new_cate_names = []
Example #22
p1 = []
q1 = []
# n1 and m1 are used below but not initialized in this snippet; they are
# assumed to be empty lists here.
n1 = []
m1 = []

s = ""
r = ""

#############################################################################
with open(filename1, 'r') as f:
    for sentence in f:
        s += sentence
m = s.split('\n')

for i in m:
    n1.append(kkma.pos(i))
for i in m:
    m1.append(kkma.morphs(i))
for i in range(len(n1)):
    for j in range(len(n1[i])):
        m1[i][j] = [n1[i][j][0], n1[i][j][1]]
        if m1[i][j][1] == 'EFN':
            m1[i][j][0] = '다'
        if m1[i][j][0] == '?':
            if m1[i][j - 1][1] == 'EFQ':
                m1[i][j - 1][0] = '까'
for i in range(len(m1)):
    for j in range(len(m1[i])):
        m1[i][j] = m1[i][j][0]
n = m1

#############################################################################
with open(filename2, 'r') as f:
Example #23
# convert all words to lowercase
### news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

######################
## remove stop words
swords = open(r'C:\Users\Z\Desktop\NI\한국어불용어100.txt', encoding='UTF8').read()
stop_words = re.findall('[가-힣]+', swords)


#######

dataset['morphed']=range(len(dataset.Narrative))

instance=Counter([])
for i in range(len(dataset.Narrative)):
    instance+=Counter(kkma.morphs(dataset.Narrative[i]))


tags=dict(instance)

tags_copy=tags

keys=tags_copy.keys()


for i in list(keys):
    if len(i) < 2:
        del tags[i]

######
wc = WordCloud(font_path="NanumGothic", width=1200, height=800,
Example #24
from konlpy.tag import Mecab, Kkma, Okt
from konlpy.utils import pprint
mecab = Mecab()
kkma = Kkma()
twitter = Okt()
string = '동해물과백두산이마르고닳도록'
print('# Mecab morphological analysis')
pprint(mecab.morphs(string))
print('# Kkma morphological analysis')
pprint(kkma.morphs(string))
print('# Twitter morphological analysis')
pprint(twitter.morphs(string))
print('# Twitter phrase extraction')
pprint(twitter.phrases(string))
    tmp = {}
    while (True):
        try:
            line = file.readline()
        except UnicodeDecodeError:
            print(path + txt[:-1])
            continue

        if not line:
            file.close()
            break

        if (len(line[:-1]) <= 10):
            continue
        try:
            morphs = kkma.morphs(line[:-1])
        except Exception:
            print(line[:-1] + " line end")
            continue

        for m in morphs:
            file_terms_cnt += 1
            if (m not in s):
                s.add(m)
            if (m not in terms):
                terms.append(m)
                term_file_cnt.append(0)

            term_cnt[terms.index(m)] = term_cnt.get(terms.index(m), 0) + 1

    for k, v in term_cnt.items():
        tmp[k] = v / file_terms_cnt
# Okt (the Twitter tokenizer was renamed to Okt as of KoNLPy v0.5.0)

from konlpy.tag import Okt
okt = Okt()
okt_tokens = okt.morphs(text)
print(okt_tokens)


# In[12]:


# Kkma
from konlpy.tag import Kkma
kkma = Kkma()
kkma_tokens = kkma.morphs(text)
print(kkma_tokens)


# # 2. Part-of-speech (POS) tagging
# - attach POS information to each token
# - remove parts of speech that are not needed for the analysis and filter for the ones that are

# In[13]:


# Komoran
komoranTag = []
for token in komoran_tokens:
    komoranTag += komoran.pos(token)
print(komoranTag)
Example #27
pr.to_file('./pr_report.html')
"""
"""

import nltk
from nltk.tokenize import word_tokenize
text = "I am actively looking for Ph.D. Students. and you are a Ph.D. student."
print(word_tokenize(text))


from nltk.tag import pos_tag
x = word_tokenize(text)
pos_tag(x)
print(pos_tag(x))

"""
import konlpy
from konlpy.tag import Okt

okt = Okt()
print(okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # extract morphemes
print(okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # POS tagging
print(okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))  # extract nouns

from konlpy.tag import Kkma

kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요!"))
Example #28
# extract only adjectives (PA) from the text
tagged_text = han.pos(text, ntags=22)
[t[0] for t in tagged_text if t[1] == 'PA']

# In[10]:

# to extract only nouns: han.nouns()
han.nouns(text)

# ### 3.2 Kkma

# In[11]:

from konlpy.tag import Kkma
kkma = Kkma(max_heap_size=1024)  # use this to increase the JVM heap size
print(kkma.morphs(text))  # morphological analysis only

# In[12]:

# POS tagging
print(kkma.pos(text))  #kkma(ntags=56)

# In[13]:

# extract only common nouns (NNG)
tagged_text = kkma.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[14]:

# to extract only nouns: kkma.nouns()
Example #29
    # make sure short_line is the shorter of the two sentences
    if len(short_line) > len(long_line):
        short_line, long_line = long_line, short_line

    print("shorter input sentence :", short_line)
    print("longer  input sentence :", long_line)

    # build the syllable n-gram set of the short sentence
    set_syl_short = set(syll_ngram(short_line, n))
    # count the syllable n-grams
    cnt_syllToShort = len(syll_ngram(short_line, n))
    cnt_syllToCommon = count_common_syll_ngram(long_line, n, set_syl_short)
    # compute the syllable similarity
    similar_syll = float(cnt_syllToCommon / cnt_syllToShort)

    
    # create the morphological analyzer
    kkma = Kkma()
    # morphological analysis
    short_kkma = list(kkma.morphs(short_line))
    long_kkma = list(kkma.morphs(long_line))
    # count the morphemes
    cnt_morpToShort = len(short_kkma)
    cnt_morpToCommon = count_common_syll_morp(long_kkma, short_kkma)
    # compute the morpheme similarity
    similar_morp = float(cnt_morpToCommon / cnt_morpToShort)

    

    print('syllable n-gram similarity : {0:.3f}%'.format(similar_syll * 100))
    print('morpheme similarity : {0:.3f}%'.format(similar_morp * 100))
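# The helpers syll_ngram(), count_common_syll_ngram() and
# count_common_syll_morp() are not shown in Example #29. Minimal sketches of
# what they plausibly do (assumptions):
def syll_ngram(line, n):
    # all syllable n-grams of the line
    return [line[i:i + n] for i in range(len(line) - n + 1)]

def count_common_syll_ngram(long_line, n, set_syl_short):
    # how many n-grams of the long line also occur in the short line's set
    return sum(1 for g in syll_ngram(long_line, n) if g in set_syl_short)

def count_common_syll_morp(long_kkma, short_kkma):
    # how many morphemes of the long sentence also occur in the short one
    short_set = set(short_kkma)
    return sum(1 for m in long_kkma if m in short_set)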
Example #30
# In[3]:

for x in range(0, len(onlyfiles)):
    #    print (onlyfiles[x])
    path = srcDir + onlyfiles[x]
    #    print ('data read : ' + srcDir + inputFileName)
    with io.open(path, 'r', encoding='utf-8') as f:
        data += f.read()
    #f = open(path, 'r')
len(data)

# In[4]:

kkma = Kkma()
token = kkma.morphs(data)
len(token)

# In[ ]:

# In[11]:

wordDic = {}
colDic = {}

w2 = token[0].encode('utf-8')
wordDic[w2] = 1

# In[12]:

for x in range(1, len(token)):
Example #31
4. https://konlpy.org/ko/latest/data/#corpora
Corpora

5. https://konlpy.org/ko/latest/examples/
Usage examples
'''

# Hannanum Class
from konlpy.tag import Hannanum
hannanum = Hannanum()
print(hannanum.analyze(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

#Kkma Class
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(u'공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

# Komoran Class
from konlpy.tag import Komoran
komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

# MeCab installation needed
from konlpy.tag import Mecab
mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

# Twitter Class
# from konlpy.tag import Twitter
# twitter = Twitter()
# print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
Example #32
from konlpy.tag import Kkma

# create a Kkma morphological analyzer object
kkma = Kkma()

text = "파이썬 라이브러리 꼬꼬마 형태소 분석기 사용"

# extract morphemes
morphs = kkma.morphs(text)
print(morphs)

# extract morphemes with their POS tags
pos = kkma.pos(text)
print(pos)

# extract only nouns
nouns = kkma.nouns(text)
print(nouns)

# split into sentences
sentences = "파이썬을 배워봐요. 흥미로운 언어에요."
s = kkma.sentences(sentences)
print(s)