Code example #1
def wordRank():
    # Retrieve text from Elasticsearch
    results = es.get(index='nkdb',
                     doc_type='nkdb',
                     id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)

    # split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)

    # normalize the text
    texts = [normalize(text, number=True) for text in sentences]

    wordrank_extractor = KRWordRank(
        min_count=3,  # Minimum frequency of word
        max_length=10,  # Maximum length of word
        verbose=True)

    beta = 0.85  # Decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    result = []
    dic = {}
    # Make a dictionary [word, weight]
    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:30]:
        dic["y"] = r
        dic["label"] = word
        result.append(dic)
        dic = {}

    return json.dumps(result, ensure_ascii=False)
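The snippet above assumes an Elasticsearch client `es` and the krwordrank helpers are already in scope. A minimal, hypothetical setup for it might look like this (the host URL is a placeholder; the index and document id are the ones hard-coded in wordRank()):

import json
import re

from elasticsearch import Elasticsearch
from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank

# Hypothetical client pointing at a local cluster.
es = Elasticsearch('http://localhost:9200')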
Code example #2
def normalizeCF(input_fname, output_fname):
    texts = get_texts(input_fname)
    with open(output_fname, 'w', encoding='utf-8') as f:
        for text in texts:
            text = normalize(text, english=True, number=True)
            text0 = text

            # noun extraction, variant 1
            noun_text = expect_noun_text(text)
            text1 = ' '.join(noun_text)
            text = text0 + text1
            #
            # noun extraction, variant 2 (kept commented out)
            # noun_text = extract_noun.findKoNoun(text)
            # noun_text_list = noun_text[0] + noun_text[1]
            # text2 = ' '.join(noun_text_list)
            # text = text0 + ' ' +  text1 + ' ' + text2

            if text.strip() == '':
                continue

            print('*' * 10, text)

            f.write('%s\n' % (text))
    return texts
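normalizeCF() relies on project helpers (get_texts, expect_noun_text) that are not shown here. Assuming those exist, a hypothetical call with placeholder file names would be:

# Placeholder paths; get_texts() and expect_noun_text() are assumed to be
# defined elsewhere in the project.
raw_texts = normalizeCF('raw_corpus.txt', 'normalized_corpus.txt')
print(len(raw_texts), 'raw documents read')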
Code example #3
def preprocessing_text(texts):
    # On the first run, install all of the pip packages below!
    #     !pip install git+https://github.com/ssut/py-hanspell.git
    #     !pip install konlpy
    #     !pip install krwordrank
    # If an error occurs, see: https://data-scientist-brian-kim.tistory.com/79

    print("preprocessing_text")
    from hanspell import spell_checker
    from tqdm.notebook import tqdm
    from konlpy.tag import Twitter
    from collections import Counter
    from krwordrank.hangle import normalize

    nlpy = Twitter()

    #     lines = [line.rstrip('\n') for line in texts]
    lines = texts.splitlines()
    nouns_word = []  # extracted noun words
    normalized_lines = []
    for each_line in tqdm(lines):
        each_line = each_line.replace("\x0c", "")  # remove the form-feed character left over from loading the JSON
        each_line = normalize(each_line, english=True, number=True)  # remove special characters
        each_line = spell_checker.check(each_line).checked  # fix spelling mistakes, if any
        nouns_word = nouns_word + nlpy.nouns(each_line)  # extract noun words
        normalized_lines.append(each_line)

    return lines, nouns_word, normalized_lines
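A hedged usage sketch for preprocessing_text(), counting noun frequencies the same way code example #14 below does (the sample string is made up, and the spell checker needs network access):

from collections import Counter

sample = '첫 번째 문장입니다.\n두 번째 문장입니다.'  # made-up two-line input
lines, nouns_word, normalized_lines = preprocessing_text(sample)
print(Counter(nouns_word).most_common(10))  # ten most frequent nouns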
Code example #4
def load_comments(movie_id_list):
    ### Collect the comments for every movie
    comments_list = []

    # fetch the comments for each movie id from the database
    for idx in movie_id_list[:]:
        # db = pymysql.connect(host='localhost', port=3306, db='project_db', user='******', passwd='python',
        #                      charset='utf8')
        db = pymysql.connect(host='localhost',
                             port=3306,
                             db='recommend_db',
                             user='******',
                             passwd='1234',
                             charset='utf8')

        query = 'select comment from movieapp_comment where movie_id={}'.format(
            idx)

        try:
            # select, update
            with db.cursor() as cursor:
                cursor.execute(query)
                result_list = cursor.fetchall()  # fetch the query results from the cursor
        finally:
            db.close()

        texts = [row[0] for row in result_list]
        # keep only English, Hangul, and digits
        texts = [normalize(text, english=True, number=True) for text in texts]
        comments_list.append(texts)
    print('comments_list count: ', len(comments_list))
    return comments_list
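load_comments() assumes pymysql and krwordrank are importable and that the recommend_db database above is reachable; a hypothetical invocation (movie ids are placeholders) would be:

import pymysql
from krwordrank.hangle import normalize

# Placeholder movie ids; each entry of the result is the list of normalized
# comments for one movie.
comments_list = load_comments([1, 2, 3])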
Code example #5
def keyword_normalize(lyricData):
    lyrics = []

    for data in lyricData:
        texts = []
        lyric = data[1].split('\n')

        for l in lyric:
            if not bool(l.strip()):
                continue
            texts.append(normalize(l))

        texts = list(filter(lambda v: v and v != '중국어 병음', texts))
        lyrics.append({'album_id': data[0], 'lyric': texts})

    return lyrics
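keyword_normalize() expects (album_id, lyric_text) pairs; a hypothetical input and the roughly expected output:

# Made-up input pair; the '중국어 병음' line is filtered out, as in the code above.
lyric_data = [(1, '첫 줄\n둘째 줄\n중국어 병음')]
print(keyword_normalize(lyric_data))
# expected roughly: [{'album_id': 1, 'lyric': ['첫 줄', '둘째 줄']}]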
Code example #6
def get_keywords(title, text):
    """

    :param title: title of article
    :param text: body of article
    :return: key_words
    """
    texts = text
    texts = [texts]
    texts = [normalize(text, english=True, number=True) for text in texts]

    wordrank_extractor = KRWordRank(
        min_count=2,  # minimum word frequency (for building the graph)
        max_length=10,  # maximum word length
        verbose=True)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # keep only noun keywords, using their rank
    tagger = Komoran()
    stopword_list = ['있', '하', '되']  # verb stems to drop from the title keywords
    keyword_list = []
    for i in keywords:
        noun = tagger.nouns(i)
        if noun != []:
            keyword_list.append([noun[0], keywords[i]])

    keywords = []
    for i in keyword_list[:5]:
        keywords.append(i[0])

    title_keywords = []

    for j in keywords:
        if j in title:
            title_keywords.append(j)

    # drop stopwords (a for-loop with remove() would skip items while mutating the list)
    title_keywords = [word for word in title_keywords if word not in stopword_list]

    return title_keywords
Code example #7
File: test_krwordrank.py Project: ldong94/Itsme
def test_normalize():
    input_str = '한글과 alphabet 으로 이뤄진 20글자에 가까운..   문장이에요'
    form = '\npassed case: {}\ninput : {}\noutput: {}'
    settings = [('Hangle', False, False, False, '한글과 으로 이뤄진 글자에 가까운 문장이에요'),
                ('Hangle + English', True, False, False,
                 '한글과 alphabet 으로 이뤄진 글자에 가까운 문장이에요'),
                ('Hangle + English + Number', True, True, False,
                 '한글과 alphabet 으로 이뤄진 20글자에 가까운 문장이에요'),
                ('Hangle + English + Number + Punctuation', True, True, True,
                 '한글과 alphabet 으로 이뤄진 20글자에 가까운.. 문장이에요')]
    for name, english, number, punctuation, expected in settings:
        pattern = initialize_pattern(english,
                                     number,
                                     punctuation,
                                     remains=None)
        output_str = normalize(input_str, pattern=pattern)
        assert output_str == expected
        message = form.format(name, input_str, output_str)
        print(message)
Code example #8
def keyword_extraction(txt):
    # split the raw text into whitespace-separated tokens
    texts = txt.split()
    texts = [normalize(text, english=True, number=True) for text in texts]

    wordrank_extractor = KRWordRank(
        min_count=5,  # minimum word frequency (for building the graph)
        max_length=10,  # maximum word length
        verbose=True)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # collect the five highest-ranked keywords
    result_list = []
    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:5]:
        result_list.append(word)
    return result_list
Code example #9
    def _construct_word_graph(self, docs):
        def normalize(graph):
            graph_ = defaultdict(lambda: defaultdict(lambda: 0))
            for from_, to_dict in graph.items():
                sum_ = sum(to_dict.values())
                for to_, w in to_dict.items():
                    graph_[to_][from_] = w / sum_
            graph_ = {t: dict(fd) for t, fd in graph_.items()}
            return graph_

        graph = defaultdict(lambda: defaultdict(lambda: 0))
        for doc in docs:

            tokens = doc.split()

            if not tokens:
                continue

            links = []
            for token in tokens:
                links += self._intra_link(token)

            if len(tokens) > 1:
                tokens = [tokens[-1]] + tokens + [tokens[0]]
                links += self._inter_link(tokens)

            links = self._check_token(links)
            if not links:
                continue

            links = self._encode_token(links)
            for l_node, r_node in links:
                graph[l_node][r_node] += 1
                graph[r_node][l_node] += 1

        # reverse the edges to form the inbound graph; each weight is normalized by its source node's outbound sum
        graph = normalize(graph)
        return graph
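The nested normalize() helper above reverses every edge and divides its weight by the source node's outbound sum. A small standalone check of that idea, on a made-up two-edge graph, could look like:

from collections import defaultdict

# Made-up graph: node 0 has outbound weights 3 (to node 1) and 1 (to node 2).
graph = defaultdict(lambda: defaultdict(lambda: 0))
graph[0][1] = 3
graph[0][2] = 1

# Same normalization idea as the nested helper: edges are reversed and
# scaled by the source node's total outbound weight.
inbound = defaultdict(dict)
for from_, to_dict in graph.items():
    total = sum(to_dict.values())
    for to_, w in to_dict.items():
        inbound[to_][from_] = w / total

print(dict(inbound))  # {1: {0: 0.75}, 2: {0: 0.25}}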
Code example #10
df

# In[2]:

df['TEXT']

# In[3]:

texts = df['TEXT'].values.tolist()
print(texts[0])

# In[4]:

from krwordrank.hangle import normalize

texts = [normalize(str(text), english=True, number=True) for text in texts]

# In[5]:

from krwordrank.word import KRWordRank

wordrank_extractor = KRWordRank(
    min_count=5,  # minimum word frequency (for building the graph)
    max_length=10,  # maximum word length
    verbose=True)

beta = 0.85  # decaying factor beta of PageRank
max_iter = 10

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
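extract() returns a {word: rank} dictionary; the same top-keyword printout used in code examples #1 and #8 can follow this cell (the format string is just illustrative):

# Print the 30 highest-ranked words, as in the other examples on this page.
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))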
Code example #11
    def normalize(self, text):
        text = self.remove_keyboard_out_chractor(text)
        text = normalize(text, english=True, number=True, punctuation=True)
        return text
Code example #12
File: w2vft_util.py Project: jjeaby/w2v-visual
def normalizeText(text):
    text = normalize(text, english=True, number=True, punctuation=True)
    return text
Code example #13
File: summary.py Project: LuterGS/ICT_COC
def __get_splited_sentence(whole_text, sentence_tokenizer="re"):
    """
    :param whole_text: 기사의 원문
    :param sentence_tokenizer: 어떤 tokenizer를 사용할 것인지
        re : . ? ! 3개를 기준으로 분리
        enter : 엔터로 구분
        jum : . 하나로 구분
        kss : kss 토크나이저 사용
        만약 특수기호가 있을 경우 특수기호 분류 후 kss 사용
    :return:
    """
    # print(whole_text)
    # if the input is already a list
    if type(whole_text) == list:
        splited = [
            normalize(sentence, english=True, number=True)
            for sentence in whole_text
        ]
        splited_num = len(splited)
        return splited, splited_num

    # if the string contains special symbols
    special_character = ['○', '□', '▣', '※', '①', '②', '③', '◇', '●', '★', '-']
    special_counter = 0
    for special in special_character:
        special_counter += whole_text.count(special)
    if special_counter > 0:
        special_splited = re.split("[○●□▣※◇①②③★-]",
                                   whole_text.replace("\n", "")[:-1])
        splited = []
        for sentence in special_splited:
            sentence = kss.split_sentences(sentence)
            if type(sentence) == list:
                for sentence_splited in sentence:
                    splited.append(sentence_splited)
            else:
                splited.append(sentence)
        sentence_tokenizer = "special_character"
        # print("TEST ", splited)

    if sentence_tokenizer == "re":
        splited = re.split("[.!?]", whole_text.replace("\n", "")[:-1])
    elif sentence_tokenizer == "kss":
        splited = kss.split_sentences(whole_text.replace("\n", ""))
    elif sentence_tokenizer == "enter":  # 아직 구현중, normalize 부분으로 찾으면 될거같은데...
        splited = whole_text.split("\n")
    elif sentence_tokenizer == "jum":
        splited = whole_text.split(".")
    elif sentence_tokenizer == "kss + re":
        kss_splited = kss.split_sentences(whole_text.replace("\n", ""))
        splited = []
        for sentence in kss_splited:
            print("kss1: ", sentence)
            re_splited_sentences = re.split("[.!?]",
                                            sentence.replace("\n", "")[:-1])
            if type(re_splited_sentences
                    ) == list and len(re_splited_sentences) > 1:
                print("if: ", re_splited_sentences)
                for re_splited_sentence in re_splited_sentences:
                    splited.append(re_splited_sentence)
            else:
                print("else: ", sentence)
                splited.append(sentence)
        print("rekss : ", splited)

    splited = __get_limited_length_sentence(splited, 5, 600)
    keyword_splited = [
        normalize(sentence, english=True, number=False) for sentence in splited
    ]
    print(len(splited), splited)
    splited_num = len(splited)

    return splited, keyword_splited, splited_num
Code example #14
from hanspell import spell_checker
from tqdm.notebook import tqdm
from konlpy.tag import Twitter
from collections import Counter
from krwordrank.hangle import normalize

nlpy = Twitter()

lines = [line.rstrip('\n') for line in texts]  # split the txt file on newline characters

nouns_word = []  # extracted noun words
normalized_lines = []
for each_line in tqdm(lines):
    each_line = each_line.replace("\x0c", "")  # remove the form-feed character left over from loading the JSON
    each_line = normalize(each_line, english=True, number=True)  # remove special characters
    each_line = spell_checker.check(each_line).checked  # fix spelling mistakes, if any
    nouns_word = nouns_word + nlpy.nouns(each_line)  # extract noun words
    normalized_lines.append(each_line)

# In[8]:

normalized_lines

# In[9]:

# count noun frequencies
from collections import Counter
count = Counter(nouns_word)

tag_count = []
Code example #15



□ 자동차 압류내역 조회 인터넷 사이트(열람 무료)
  ⊙ 정부24 [ https://www.gov.kr/portal/main ] - 자동차 등록원부등본(초본) 발급·열람신청

□ 차량번호로 전국의 주정차위반 과태료 조회 인터넷 사이트
  ⊙ 위택스( www.wetax.go.kr/ ) ☞ 납부하기 ☞ 지방세외수입 ☞ 차량번호 조회

□ 경찰서의 속도 및 신호위반 과태료(범칙금) 조회 및 납부 인터넷사이트  

  ⊙ 경찰청교통민원24 [ 이파인 https://www.efine.go.kr/  ☎ 182 ]"""
okt_test = Okt()
splited = gisa.split("\n\n")
splited = [normalize(text, english=True, number=True) for text in splited]

# splited = kss.split_sentences(gisa.replace("\n", ""))
# splited = gisa.replace("\n", "").split(".")[:-1]
# splited = re.split("[.!?] ", gisa.replace("\n", "")[:-1])


split_num = len(splited)
print(split_num)

texts = [normalize(text, english=True, number=True) for text in splited]
print(texts, len(texts))
wordrank_extractor = KRWordRank(
    min_count=5,  # minimum word frequency (for building the graph)
    max_length=10,  # maximum word length
    verbose=True