def tokenize_okt(df):
    """Tokenize the 'content' and 'title' columns of *df* and store the
    results in new 'content_token' / 'title_token' columns.

    A ckonlpy Twitter tagger is extended with user-defined nouns, two
    stopword files are merged, and the actual token extraction is
    delegated to ``text_tokenize``.  Returns the mutated frame.
    """
    tagger = Twitter()
    tagger.add_dictionary(call_userword(), 'Noun')

    # Merge the general Korean stopword list with the screening list.
    merged = load_wordset('./tokenizer/korean_stopword.txt').union(
        load_wordset('./tokenizer/korean_screen.txt'))
    merged = list(merged)

    # Apply the same tokenization to both text columns, content first.
    for src, dst in (('content', 'content_token'), ('title', 'title_token')):
        # col=src binds the column name now (avoids late-binding closure).
        df[dst] = df.progress_apply(
            lambda row, col=src: text_tokenize(row[col], tagger, merged),
            axis=1)
    return df
def tokenize_okt(df):
    """Tokenize the 'content' and 'title' columns of *df* with a
    customized Okt (Twitter) tagger and store the results in
    'content_token' / 'title_token' columns.

    Tokens are the stemmed surface forms of nouns, verbs and adjectives,
    excluding stopwords and single-character tokens.  Returns the
    mutated frame.
    """
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')

    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    # Fix: keep stopwords as a set -- the original converted it to a list,
    # making every 'not in' test below an O(n) scan per token.

    # Hoisted out of the per-token filter (was a list literal rebuilt and
    # linearly scanned for every token).
    keep_tags = ('Noun', 'Verb', 'Adjective')

    def _tokens(text):
        # Stemmed POS tagging; keep only content-word tags, then drop
        # stopwords and single-character tokens.  This was previously
        # duplicated inline for both columns.
        return [word for word, tag in okt.pos(text, stem=True)
                if tag in keep_tags
                and word not in stopwords
                and len(word) != 1]

    df['content_token'] = df.progress_apply(
        lambda x: _tokens(x['content']), axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: _tokens(x['title']), axis=1)
    return df
import re
from fileIO import openJsonFile, closeJsonFile, saveError
from dbIO import readDB, insertDB
import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# One-time NLTK resource downloads; run manually if the corpora are missing.
# nltk.download('punkt')
# nltk.download('stopwords')

# Module-level tagger instances: plain konlpy Okt, plus a ckonlpy Twitter
# wrapper that accepts user dictionaries.
okt = Okt()
twitter = Twitter()

# Korean stopwords and a custom English stopword file merged with NLTK's
# built-in English list.
# NOTE(review): 'ANSI' is not a standard Python codec name -- confirm that
# load_wordset maps it to a real codec (e.g. 'cp1252'/'cp949').
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt', encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt', encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))

# N-gram patterns and user-defined nouns registered on the custom tagger.
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def getJobGroups():
    # Fetch the Wanted job-listing page (category 518, all years/locations)
    # and parse it.
    # NOTE(review): 'requests' and 'BeautifulSoup' are used here but are not
    # imported above -- verify they are imported elsewhere in this file.
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    # NOTE(review): function body appears truncated in this view -- no
    # return statement is visible.
import re
from itertools import chain
import time, csv
import json
from dbIO import readDB, insertDB
import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# Plain konlpy tagger; the ckonlpy custom tagger setup is currently disabled.
okt = Okt()
# twitter = Twitter()
# stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt', encoding='ANSI')

# English stopwords: custom file merged with NLTK's built-in list.
# NOTE(review): 'ANSI' is not a standard Python codec name -- confirm that
# load_wordset maps it to a real codec.
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt', encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))

# ngrams = load_ngram('cleansing_data/korean_ngram.txt')
# userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
# twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def connectWebDriver(web):
    # Configure a headless Korean-locale Chrome driver.
    # NOTE(review): 'webdriver' (selenium) is not imported above -- verify it
    # is imported elsewhere in this file.
    options = webdriver.ChromeOptions()
    options.add_argument("disable-gpu")
    options.add_argument("headless")
    options.add_argument("lang=ko_KR")
    # Element structure can vary with the browser window size (media queries
    # etc.), so pin the window size before starting.
    options.add_argument('--start-maximized')
    # NOTE(review): function body appears truncated in this view -- the
    # driver is never constructed or returned here.
from konlpy.tag import Okt
from konlpy.utils import pprint
from collections import Counter
from ckonlpy.tag import Postprocessor
from ckonlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim as gensimvis
import codecs

from ckonlpy.utils import load_wordset
# Word lists for the ckonlpy Postprocessor: tokens to always keep
# (passwords) and tokens to drop (stopwords).
passwords = load_wordset('postprocess/passwords.txt')
stopwords = load_wordset('postprocess/stopwords.txt')

from ckonlpy.utils import load_replace_wordpair
# Token replacement pairs (original -> replacement).
replace = load_replace_wordpair('postprocess/replace.txt')

from ckonlpy.utils import load_ngram
ngrams = load_ngram('postprocess/ngrams.txt')

# NOTE(review): this rebinds the class name 'Okt' to an instance, shadowing
# the imported class -- later code in this module can no longer construct
# another Okt tagger.  Consider renaming the instance (e.g. 'okt').
Okt = Okt()
twitter = Twitter()

# User-defined nouns, one per line, to be added to the custom tagger.
new_nouns = []
with open('preprocess/dictionary.txt', encoding='utf8') as fd:
    for line in fd:
        new_nouns.append(line.strip('\n'))