Code example #1
File: dataset.py  Project: rlagywns0213/BAN-KVQA
    def __init__(self, split, dictionary, dataroot='data', tokenizer='sp'):
        super(KvqaFeatureDataset, self).__init__()
        assert split in ['train', 'val', 'test']
        self.dataroot = dataroot

        ans2label_path = os.path.join(dataroot, 'cache',
                                      'trainval_ans2label.kvqa.pkl')
        label2ans_path = os.path.join(dataroot, 'cache',
                                      'trainval_label2ans.kvqa.pkl')
        self.ans2label = cPickle.load(open(ans2label_path, 'rb'))
        self.label2ans = cPickle.load(open(label2ans_path, 'rb'))
        self.num_ans_candidates = len(self.ans2label)

        self.dictionary = dictionary

        self.img_id2idx = cPickle.load(
            open(os.path.join(dataroot, '%s_imgid2idx.kvqa.pkl' % split),
                 'rb'))

        h5_path = os.path.join(dataroot, '%s_kvqa.hdf5' % split)

        print('loading features from h5 file')
        with h5py.File(h5_path, 'r') as hf:
            self.features = np.array(hf.get('image_features'))
            self.spatials = np.array(hf.get('spatial_features'))
            self.pos_boxes = np.array(hf.get('pos_boxes'))

        self.entries, self.type2idx, self.idx2type = _load_kvqa(
            dataroot, split, self.img_id2idx)

        if tokenizer == 'sp':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-multilingual-cased', do_lower_case=False)
            self.dictionary = self.tokenizer.vocab
        elif tokenizer == 'mecab':
            self.tokenizer = Mecab()
        elif tokenizer == 'kkma':
            self.tokenizer = Kkma()

        self.tokenize()
        self.tensorize()
        self.v_dim = self.features.size(1)
        self.s_dim = self.spatials.size(1)
Code example #2
File: naver.py  Project: JoonyoungYi/project-news
def get_json():
    import json
    articles = get_articles('http://m.news.naver.com/')
    kkma = Kkma()
    article_results = []
    for article_dict in articles:

        article_result = dict()

        # download and parse the full article
        article = Article(article_dict['url'])
        article.download()
        article.parse()

        # extract the title, body text, and top image URL
        title = article_dict['title']
        text = article.text
        top_img_url = article.top_image

        print(title)
        print(top_img_url)

        # split into sentences and pick a representative one
        sentences = trim_sentences(kkma.sentences(text))
        target_index = forasterisk_algorithm(sentences)

        sentence = sentences[target_index].strip()
        print(sentence)

        article_result['title'] = title
        if article_dict['img_url'] != u'http://mimgnews2.naver.net/image/navernews_200x200_new.jpg':
            article_result['img_url'] = article_dict['img_url']
        article_result['sentence'] = sentence
        article_result['press'] = article_dict['press']
        article_result['url'] = article_dict['url']

        article_results.append(article_result)

    result = dict()
    result['articles'] = article_results

    return json.dumps(result)
Code example #3
def tokenizer(sentence: str) -> str:
    """텍스트를 토큰화 시킨다. Mecab을 사용해서 토큰화시킨다.

    Return:
        s (string): "안녕/NNG 부산/NNG ..."
    """
    #### Mecab##############
    tag = Okt()
    tag = Komoran()
    tag = Hannanum()
    tag = Kkma()
    tag = Mecab()
    pos = tag.pos(sentence)
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])

    s = ' '.join(temp)
    return s
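
A minimal call to the tokenizer() above might look like the sketch below; the exact tags depend on the Mecab dictionary installed locally, so the output in the comment is only indicative.

# Hypothetical usage of tokenizer(); requires konlpy plus a local Mecab install.
print(tokenizer("안녕 부산"))  # e.g. "안녕/NNG 부산/NNP", tags vary by dictionary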
Code example #4
File: contents.py  Project: fpem123/word-cloud
    def _mk_word_cloud_korean(self):
        target = ' '.join(self.text)

        kkma = Kkma()
        n = kkma.nouns(target)

        n = [temp for temp in n if len(temp) != 1 if not temp.isdecimal()]

        text = nltk.Text(n)
        data = text.vocab()
        data500 = data.most_common(500)

        dic = dict(data500)

        # Make word cloud object
        wc = WordCloud(font_path='/Library/Fonts/Arial Unicode.ttf', max_font_size=80, min_font_size=10,
                       background_color=self.color, mask=self.mask)

        self.wordcloud = wc.generate_from_frequencies(dic)
Code example #5
def get_wordcloud(dbconn, cursor):
    kkma = Kkma()
    except_word_list = []
    except_keyword_list = []
    in_result_data = []

    today_date = datetime.today().strftime("%Y-%m-%d")
    newsList = TblTotalCarNewsList.objects.all().filter(
        write_date__icontains=today_date)
    news_title_group = []
    for idx in range(len(newsList)):
        news_title_group.append(
            remove_sc(
                remove_html(newsList.values()[idx].get('news_title'))).replace(
                    '&44;', '').replace('&8220;',
                                        '').replace('\r',
                                                    '').replace('\n', ''))

    get_morpheme_words(get_morphemes(news_title_group), today_date)
Code example #6
def preprocessing(data):
    try:
        from konlpy.tag import Okt, Kkma
        import khaiii
        khaiii_api = khaiii.KhaiiiApi(opt.khaiii_so_path)
        khaiii_api.open(opt.khaiii_path)

        kkma = Kkma()
        kkma_tokenizer = kkma.nouns
        twitter = Okt()
        okt_tokenizer = twitter.nouns

        cls, data_path_list, div, out_path, begin_offset, end_offset = data
        data = cls()
        data.load_y_vocab()
        data.preprocessing(data_path_list, div, begin_offset, end_offset,
                           out_path, okt_tokenizer, khaiii_api, kkma_tokenizer)
    except Exception:
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
Code example #7
File: train.py  Project: rokrokss/KAIST-Classes
def predict(text, model, label_list):
    assert isinstance(text, str)
    model.eval()
    kkma = Kkma()
    text = utils.clean_str(text)
    w2vec = KeyedVectors.load_word2vec_format('../kor-word2vec-kkma-200.bin',
                                              binary=True)
    tokens = [
        p[0] + '/' + p[1] for p in kkma.pos(text)
        if p[0] + '/' + p[1] in w2vec.vocab
    ]
    embed = parser.sentenceToEmbedding(tokens, 112, w2vec)
    x = torch.from_numpy(embed).float()
    x = x.view(1, x.size()[0], x.size()[1])
    with torch.no_grad():
        logit = model(x)
        logit = F.softmax(logit, dim=1)
    _, predicted = torch.max(logit, 1)
    return parser.idxToLabel(predicted[0], label_list)
Code example #8
def insert_qna_content(dbconn, cursor):
    file_path = f'qna_set.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']

    all_values = []
    for row in load_ws.rows:
        row_value = []
        for cell in row:
            row_value.append(cell.value)
        all_values.append(row_value)

    kkma = Kkma()
    for idx, values in enumerate(all_values):
        q_text = values[0]
        a_text = values[1]
        q_type = values[2]
        q_nouns = kkma.nouns(q_text)
        if len(q_nouns) > 0:
            q_nouns = str(q_nouns)
        else:
            q_nouns = '[]'
        print(q_nouns)
        try:
            cursor.execute(f"""
				INSERT IGNORE INTO TBL_QNA_CHAT_SET_LIST 
				(
					Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE, UPDATE_DATE
				)
				VALUES
				(
					"{q_text}", "{a_text}", "{q_nouns}", {q_type}, NOW()
				)
			""")
        except Exception as e:
            print(f'error! >> insert_qna_content >> {e}')
        finally:
            print(
                f'[{idx}/{len(all_values)}({round((idx / len(all_values) * 100), 2)}%)] complete!!'
            )
            dbconn.commit()
            time.sleep(0.1)
Code example #9
File: tokenizer.py  Project: kjyggg-sketch/DAlmaden
 def __init__(self, nlpEngine = "Mecab"):
     '''e
     원하는 형태소 분석기 엔진으로 형태소 분석기 생성
     :param nlpEngine: 형태소 분석기 이름(첫글자 대문자) str
     '''
     self.nlpEngine = nlpEngine
     if nlpEngine == "Okt":
         self.nlp = Okt()
     elif nlpEngine == "Komoran":
         self.nlp = Komoran()
     elif nlpEngine == "Kkma":
         self.nlp = Kkma()
     elif nlpEngine == "Hannanum":
         self.nlp = Hannanum()
     elif nlpEngine == "Mecab":
         self.nlp = Mecab()
     elif nlpEngine == "Twitter":
         self.nlp = Twitter()
     else:
         raise NameError("unknown nlp name")
Code example #10
    def __init__(self, name, line_analyze=None):
        self.name = name
        self.talkdays = []
        self.people = People()
        self._words = Words()
        self.tot_msg = 0
        self.tot_person = {}
        self.line_analyze = None

        if line_analyze == 'Kkma':
            try:
                from konlpy.tag import Kkma
                self.kkma = Kkma()
                self.line_analyze = self.kkma_analyzer
            except Exception:
                print("Please install the konlpy package.")
                line_analyze = None

        if not line_analyze:
            self.line_analyze = self.line_spliter
Code example #11
def makingpos(katoc):
    pos_data = {
        'N': '',
        'V': '',
        'M': '',
        'I': '',
        'J': '',
        'E': '',
        'X': '',
        'S': '',
        'U': '',
        'O': ''
    }
    kkma = Kkma()
    temp = kkma.pos(katoc)

    for i in temp:
        pum = i[1]  # full POS tag of the token
        pos_data[pum[0]] = i[0]  # store the token under the first letter of its tag
    return pos_data
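
A hedged usage sketch for makingpos(): each bucket key is the first letter of a Kkma POS tag (N, V, J, ...), and later tokens of the same class overwrite earlier ones, so each bucket keeps only the last token seen. The call below is hypothetical and needs konlpy's Kkma plus a JVM.

buckets = makingpos("나는 학교에 간다")
print(buckets["N"], buckets["V"], buckets["J"])  # last noun, verb stem, and particle found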
Code example #12
File: NLP_dial.py  Project: JoJalJoJal/C4T
    def AC_mapping(ac_list):
        new_ac_list = []
        family_list = ['엄마', '아빠', '할머니', '할아버지', '아이', '아들', '딸', '조부',
                       '가족', '친척', '조카', '부모님', '유아', '어머니', '아버지', '아기', '어른']
        couple_list = [
            '연인', '여자친구', '여친', '남자친구', '남친', '애인', '신랑', '부인', '여자', '남자'
        ]
        friend_list = [
            '여동생', '남동생', '오빠', '형님', '형', '친구', '누나', '언니', '누나', '동생'
        ]
        for ac in ac_list:
            words = Kkma().pos(ac)
            for word in words:
                if (word[1] not in ['NNM', 'NR', 'JC']) and (word[0] != '명'):
                    if word[0] in family_list: new_ac_list.append('가족')
                    if word[0] in couple_list: new_ac_list.append('연인')
                    if word[0] in friend_list: new_ac_list.append('친구')
                    if '혼자' in word: new_ac_list.append('혼자')
        return list(set(new_ac_list))
Code example #13
    def get_nouns(self, text, isPositive, keyword):
        spliter = Kkma()
        isnouns = ['NNG', 'NNP']
        tags = spliter.pos(text)
        # positive sentiment
        if isPositive == 1:
            for i in tags:
                if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                    self.nouns.append(i[0])
                    self.positive_nouns.append(i[0])
        # negative sentiment
        elif isPositive == -1:
            for i in tags:
                if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                    self.nouns.append(i[0])
                    self.negative_nouns.append(i[0])
        else:
            for i in tags:
                if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                    self.nouns.append(i[0])
Code example #14
def get_noun(msg_txt):
    kkma = Kkma()
    nouns = list()
    # filter out ㅋㅋ, ㅠㅠ, ㅎㅎ and similar jamo-only strings, plus emoji
    pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ"
                         "\U0001F600-\U0001F64F"  # emoticons
                         "\U0001F300-\U0001F5FF"  # symbols & pictographs
                         "\U0001F680-\U0001F6FF"  # transport & map symbols
                         "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                         "]+")
    ## TODO: fix the crash that occurs when processing 1000 posts
    msg_txt = re.sub(pattern, "", msg_txt).strip()
    if len(msg_txt) > 0:
        pos = kkma.pos(msg_txt)
        for keyword, tag in pos:
            # common noun (NNG) or proper noun (NNP)
            if tag == "NNG" or tag == "NNP":
                nouns.append(keyword)
        #print(msg_txt, "->", nouns)
    return nouns
Code example #15
def dataset_iterator(example_dict, word_dict, verb_dict, batch_size):
    tokenizer = Kkma()
    left_context, verbs, right_context = [], [], []
    for word, example in example_dict.items():
        # split data
        l_c, vb, r_c = example[0], word, example[1]
        # convert to indices
        l_c = [word_dict[word[0]] if word[0] in word_dict else word_dict[UNK] for word in tokenizer.pos(l_c)]
        vb = verb_dict[vb]
        r_c = [word_dict[word[0]] if word[0] in word_dict else word_dict[UNK] for word in tokenizer.pos(r_c)]
        # add to list
        left_context.append(l_c)
        verbs.append(vb)
        right_context.append(r_c)
        # yield batched dataset
        if len(left_context) == batch_size:
            yield build_batch_dataset(left_context, verbs, right_context)
            left_context, verbs, right_context = [], [], []
    if len(left_context) > 0:
        yield build_batch_dataset(left_context, verbs, right_context)
Code example #16
    def __init__(self, srl='framenet', language='ko', only_lu=True):
        self.srl = srl
        self.language = language
        self.only_lu = only_lu

        if self.language == 'ko':
            from konlpy.tag import Kkma
            self.kkma = Kkma()

            with open(target_dir + '/data/targetdic-1.1.json', 'r') as f:
                targetdic = json.load(f)
            self.targetdic = targetdic
        else:
            import nltk
            self.lemmatizer = nltk.WordNetLemmatizer()
            self.pos_tagger = nltk.pos_tag

            with open(target_dir + '/data/targetdic-FN1.7.json', 'r') as f:
                targetdic = json.load(f)
            self.targetdic = targetdic
Code example #17
	def parse_konlpy(self, text):
		from konlpy.tag import Kkma
		from konlpy.tag import Twitter

		kkma = Kkma()
		twitter = Twitter()

		sentence_list = kkma.sentences(text)

		parsing = []
		for sentence in sentence_list:
			parsed_sentence = {}
			parsed_sentence['text'] = sentence
			parsed_sentence['morp'] = kkma.pos(sentence)
			parsed_sentence['phrase'] = twitter.phrases(sentence)
			parsing.append(parsed_sentence)

		return parsing
Code example #18
def remove_particle(training_args):
    """
    remove particle

    Args:
        training_args
    """
    # load tokenizer
    mecab = Mecab()
    kkma = Kkma()
    hannanum = Hannanum()
    # load prediction file
    with open(os.path.join(training_args.output_dir, "predictions.json"),
              "r") as f:
        prediction_json = json.load(f)

    prediction_dict = dict()
    for mrc_id in prediction_json.keys():
        final_predictions = prediction_json[mrc_id]
        pos_tag = mecab.pos(final_predictions)

        # drop a trailing particle (josa) if one is attached
        if final_predictions[-1] == "의":
            min_len = min(len(kkma.pos(final_predictions)[-1][0]),
                          len(mecab.pos(final_predictions)[-1][0]),
                          len(hannanum.pos(final_predictions)[-1][0]))
            if min_len == 1:
                final_predictions = final_predictions[:-1]
        elif pos_tag[-1][-1] in {
                "JX", "JKB", "JKO", "JKS", "ETM", "VCP", "JC"
        }:
            final_predictions = final_predictions[:-len(pos_tag[-1][0])]

        prediction_dict[str(mrc_id)] = final_predictions

    # save final results
    with open(os.path.join(training_args.output_dir, "final_predictions.json"),
              'w',
              encoding='utf-8') as make_file:
        json.dump(prediction_dict, make_file, indent="\t", ensure_ascii=False)
    print(prediction_dict)
Code example #19
File: statistical.py  Project: NUGO-NLP/web-demo
    def __init__(self, region):
        # Region can only be 'gs' or 'jl'
        assert region == 'gs' or region == 'jl', 'region should be \'gs\' or \'jl\''

        self.kkma = Kkma()
        self.region = region
        self.sent_dict = dict()
        self.word_dict = dict()
        self.sent_dict_subword = dict()
        self.word_dict_subword = dict()
        self.additional_rule_dict = dict()

        self.sentence_data_filename = os.path.join(
            parent_path, 'data/sent_' + region + '_train.json')
        self.word_data_filename = os.path.join(
            parent_path, 'data/word_' + region + '_train.json')
        self.sentence_data_ex_filename = os.path.join(
            parent_path, 'data/ex/sent_' + region + '_train.json')
        self.word_data_ex_filename = os.path.join(
            parent_path, 'data/ex/word_' + region + '_train.json')

        self.sentence_dict_filename = os.path.join(
            current_path, 'save/statistical_sent_dict_' + region + '.json')
        self.word_dict_filename = os.path.join(
            current_path, 'save/statistical_word_dict_' + region + '.json')
        self.sentence_dict_ex_filename = os.path.join(
            current_path, 'save/ex/statistical_sent_dict_' + region + '.json')
        self.word_dict_ex_filename = os.path.join(
            current_path, 'save/ex/statistical_word_dict_' + region + '.json')

        self.additional_rule_filename = os.path.join(
            current_path, 'save/additional_rule_' + region + '.json')

        # If there is a dictionary created
        if os.path.isfile(self.sentence_dict_filename) and os.path.isfile(self.word_dict_filename) and \
            os.path.isfile(self.sentence_dict_ex_filename) and os.path.isfile(self.word_dict_ex_filename):
            self.load_dict()
            print('Load dictionary for %s' % self.region)
        else:
            self.create_dict()
            print('Create and load dictionary for %s' % self.region)
Code example #20
def tag_all_reviews(norm, stem):
    kkma = Kkma()
    recommend_categories = set()
    nouns = dict()

    for filename in glob.glob('reviews/*.json'):
        with open(filename, 'r') as raw_file:
            print('parsing %s...' % filename)
            raw_data = json.load(raw_file)

            for review in raw_data:
                raw_tags = kkma.pos(review['text'])
                review['tagged'] = list()
                for tag in raw_tags:
                    if tag[1][0] in ['N', 'V']:
                        review['tagged'].append(tag)
                    if tag[1][0] == 'N':
                        if tag[0] in nouns:
                            nouns[tag[0]] += 1
                        else:
                            nouns[tag[0]] = 1  # count the first occurrence too
                recommend_categories.update(list(review['recommend'].keys()))

            new_filename = 'tagged_reviews/%s' % filename.split('/')[1]
            with open(new_filename, 'w') as tagged_file:
                json.dump(raw_data,
                          tagged_file,
                          ensure_ascii=False,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

    c = 0
    with open('nouns.csv', 'w') as nouns_file:
        nf = csv.writer(nouns_file)
        for key in nouns.keys():
            if nouns[key] >= 100:
                c += 1
                nf.writerow([key, nouns[key]])
    print(c)
    return recommend_categories
Code example #21
def pos_tag(sentences, labels):
    kkma = Kkma()
    significant_tags = [
        'NNG', 'NNP', 'NNB', 'VV', 'VA', 'VX', 'MAG', 'MAJ', 'XSV', 'XSA'
    ]
    # common noun, proper noun, bound noun, verb, adjective, auxiliary predicate, general adverb, conjunctive adverb, verb-derivational suffix, adjective-derivational suffix

    s, l = [], []
    for sent, label in zip(sentences, labels):
        tmp = []
        for word, tag in kkma.pos(sent):
            print(word + tag + ' ')
            if tag in significant_tags:
                tmp.append(word + '/' + tag)
        s.append(stemming_text(tmp))
        l.append(label)

    result = pd.DataFrame([x for x in zip(s, l)],
                          columns=['sentences', 'labels'])

    return result
Code example #22
File: server.py  Project: noti-dropper/backend
def post_action():
    if request.method == 'POST':
        # receive the JSON payload
        json_data = request.get_json()
        print(f'json_data : {json_data}')

        # create a Kkma instance
        kkma = Kkma()
        # extract nouns with the Kkma API
        nouns_list = kkma.nouns(json_data['sentence'])
        print(nouns_list)
        # convert the Python list to a JSON string
        nouns_json = json.dumps(nouns_list)

        nouns_json = '{"result":' + nouns_json + "}"

        print(nouns_json)

        # return the nouns
        return nouns_json
    return 'none POST!'
Code example #23
def ttr_check(txt):
    # create a Kkma instance
    kkma = Kkma()
    # extract morphemes and their tags
    pos = kkma.pos(txt)
    # count frequencies into a dict
    count = Counter(pos)
    # pprint(count)

    # number of tokens
    ttr_token = sum(count.values())
    # number of types
    ttr_type = len(count.keys())
    # type-token ratio as a percentage
    ttr = (ttr_type / ttr_token) * 100

    # debug output
    # print(ttr_token, ttr_type)
    # print('TTR은 : {} 입니다.'.format(ttr))

    return round(ttr, 2)
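
A minimal, hypothetical call to ttr_check() (it needs konlpy's Kkma and Counter from collections); the returned value is the share of distinct (morpheme, tag) pairs among all tagged morphemes, expressed as a percentage and rounded to two decimals.

score = ttr_check("나는 밥을 먹고 나는 물을 마셨다")
print(score)  # a float between 0 and 100; repeated morphemes lower the score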
Code example #24
File: analyzer.py  Project: WoohyunNoh/Edu_AI
    def set_tagger(self, tagger):
        tag = None
        if tagger == "mecab":
            tag = Mecab()
        elif tagger == 'komoran':
            from konlpy.tag import Komoran
            tag = Komoran()
        elif tagger == 'kkma':
            from konlpy.tag import Kkma
            tag = Kkma()
        elif tagger == 'hannanum':
            from konlpy.tag import Hannanum
            tag = Hannanum()
        elif tagger == 'okt':
            from konlpy.tag import Okt
            tag = Okt()
        elif tagger == 'twitter':
            from konlpy.tag import Twitter
            tag = Twitter()

        return tag
Code example #25
File: Navernews.py  Project: linkz73/idol_rank
def make_content(url_list, news_content_list, content_summarize_list, title_list):
    for url in url_list:
        try:
            kkma = Kkma()
            news = Article(url, language='ko')
            news.download()
            news.parse()
            title_list.append(news.title)
            news.text = kkma.sentences(news.text)
            news.text = " ".join(news.text)
            news_content_list.append(news.text)
            # print(news.text)
            # print(type(news.text))
            summary_content = summarize(news.text, word_count=100, ratio=0.5)
            if summary_content:
                content_summarize_list.append(summary_content)
            else:
                content_summarize_list.append("요약 할 기사의 내용이 없습니다.")
        except Exception as e:
            print("exceptions is ", e)
            pass
Code example #26
File: dataio.py  Project: MrBananaHuman/frameBERT
def remove_josa(phrase):
    # Strip a trailing particle (josa, i.e. a Kkma tag starting with 'J')
    # from the last whitespace-separated token of the phrase.
    from konlpy.tag import Kkma
    kkma = Kkma()
    import jpype
    jpype.attachThreadToJVM()

    tokens = phrase.split(' ')

    result = []
    for i in range(len(tokens)):
        token = tokens[i]
        if i < len(tokens)-1:
            result.append(token)
        else:
            m = kkma.pos(tokens[i])
            if m[-1][-1].startswith('J'):
                m.pop(-1)
                token = ''.join([t for t,p in m])
            result.append(token)
    result = ' '.join(result)
    return result
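
A hypothetical call to remove_josa(), assuming Kkma tags the trailing 에서 as a particle (a tag starting with 'J'); the actual result depends on Kkma's dictionary.

print(remove_josa("서울 시청에서"))  # expected "서울 시청" if 에서 is tagged as a particle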
Code example #27
def get_morphemes(post_detail_cont) : 
	kkma = Kkma()
	results = []
	except_word_list = []
	# remove special characters
	detail = remove_sc(str(post_detail_cont))
	# split into eojeol (space-separated words), deduplicated, as a list
	origin_word_list = list(dict.fromkeys(regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', f'{detail}')))
	if len(origin_word_list) > 100:
		del origin_word_list[100:len(origin_word_list)]

	# origin_word_list = ['안녕하세요', '문의사항이', '있습니다']
	for origin_word in origin_word_list:
		if origin_word not in except_word_list:
			for morpheme in kkma.pos(origin_word):
				in_result = []	
				in_result.append(origin_word)
				in_result.append(morpheme)
				results.append(in_result)

	# results = [['자동차', ('자동차', 'NNG')], ['고장', ('고장', 'NNG')], ['진단', ('진단', 'NNG')], ['APP', ('APP', 'OL')], ['개발', ('개발', 'NNG')], ['데이터', ('데이터', 'NNG')], ['수집', ('수집', 'NNG')]]
	return results 
Code example #28
def analyseText(text_data, results_file_name="results.txt"):
    print("\n---------------------------------------------")
    print("Step2 : 단어별 형태소 및 빈도를 분석합니다... 기다려 주세요")
    kkma = Kkma()
    data_pos = kkma.pos(text_data)
    data_arr = []

    stop_words_file = open("stop_words.txt", "r", encoding="utf-8")
    stop_words = [
        x.replace("\n", "").strip() for x in stop_words_file.readlines()
    ]
    stop_words_file.close()

    print("명사만 필터링하는 중...")
    for word_pos in data_pos:
        word = word_pos[0]
        pos = word_pos[1]
        if pos == "NNG" or pos == "VV" or pos == "VA":  #명사만 필터링함. 동사도 포함하려면 or pos=="VV" (VA는 형용사) 추가할 것
            data_arr.append(word)

    print("단어별 발생빈도를 정렬하고 파일에 저장하는 중...")
    counter = Counter(data_arr).most_common()
    keywords_and_frequency_for_wc = {}
    keywords_and_frequency = []

    print("한 글자 이상 단어, 빈도수 2 이상인 것만 필터링하는 중...")
    for keyword in counter:
        word = keyword[0]
        freq = keyword[1]
        # keep only words longer than one character, with frequency above 2, that are not stopwords
        if len(word) > 1 and freq > 2 and word not in stop_words:
            keywords_and_frequency_for_wc[word] = freq
            keywords_and_frequency.append({"단어": word, "빈도": freq})

    df = pd.DataFrame(keywords_and_frequency)
    df.to_excel(excel_writer=results_file_name)
    print("형태소 및 빈도 분석 완료!")
    return keywords_and_frequency_for_wc
Code example #29
def crawler(base, root):
    # return dictionary of {sentence:tag}
    import requests
    from bs4 import BeautifulSoup

    # get into each link in the index page
    page = requests.get(root)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.select('body .mw-category-group a')
    linkurls = [link for link in links]

    # get information of the book & get text
    ret = {}
    txt = ''

    for linkurl in linkurls:
        try:
            print('getting into ' + linkurl.text)
            linkurl = linkurl['href']
            page = requests.get(base + linkurl)
            soup = BeautifulSoup(page.text, 'html.parser')
            title = soup.find('span', {'id': 'header_title_text'}).text
            author = soup.find('span', {'class': 'fn'}).text
            tag = title.replace(" ", '') + author.replace(" ", '')
            txt = soup.select(' .mw-parser-output p')[:-1]
        except Exception as e:
            print(e, title, author)
            continue
        # create the Kkma splitter once per article rather than once per paragraph
        sentencer = Kkma()
        for string in txt:
            string = string.text.replace("\n", '')
            sentence = sentencer.sentences(string)
            # refine each sentence and store it with its tag
            for s in sentence:
                s = sentenceModifierSTR(s)
                ret[s] = tag
                #print(s + 'appended')
            print('Store done for ' + tag)
    return ret
Code example #30
def makeCloudTag(file_name):
    df = pd.read_excel(file_name)
    contents = list(df.content_main)
    result = ""
    for content in contents:
        result = result + str(content) + " "

    kkma = Kkma()
    m = result.split("\r\n")
    nouns_list = []
    for a in m:
        nouns_list.append(kkma.nouns(a))

    nouns_list_s = []
    for i in nouns_list:
        for j in i:
            nouns_list_s.append(j)

    c = collections.Counter(nouns_list_s)
    x = c.most_common(100)
    d = make_tags(x, maxsize=100)
    create_tag_image(d, "star.jpg", size=(1000, 500), fontname='hangle')