def twitter_tokenizer(table, input_col, token_col_name='tokens', pos_col_name='pos',
                      stemming=False, normalization=False, morpheme=None):
    processor = TwitterKoreanProcessor(stemming=stemming, normalization=normalization)
    out_table = table.copy()
    tokens_col_data = []
    pos_col_data = []
    for i in out_table.index:
        try:
            sentence = out_table.at[i, input_col]
            tokenize = processor.tokenize(sentence)
            tokens_list = []
            pos_list = []
            for token in tokenize:
                if morpheme is None or token.pos in morpheme:
                    tokens_list.append(token.text)
                    pos_list.append(token.pos)
            if tokens_list == []:
                out_table.drop(i, inplace=True)
            else:
                tokens_col_data.append(tokens_list)
                pos_col_data.append(pos_list)
        except:
            out_table.drop(i, inplace=True)
    out_table[token_col_name] = tokens_col_data
    out_table[pos_col_name] = pos_col_data
    return {'out_table': out_table}
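# A minimal usage sketch for twitter_tokenizer above (not from the original
# source): the DataFrame, the 'text' column name, and the sample sentence are
# made up for illustration; pandas and twkorean are assumed to be installed.
import pandas as pd

df = pd.DataFrame({'text': [u"한국어를 처리하는 예시입니다"]})
result = twitter_tokenizer(df, input_col='text', morpheme=['Noun', 'Verb'])
print(result['out_table'][['tokens', 'pos']])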
def analysis_test():
    """Smoke test for TwitterKoreanProcessor tokenization."""
    text = "#오션파라다이스주소 주소 PKK558,COM 르 돈 승 상 팅 며 진 운 액 진 넘 본 천 어 정 때 낮 은 있 무 장 총 회 직 보 양 라쿠텐 아 크루즈 급 솔레어카지노 바"
    processor = TwitterKoreanProcessor(normalization=False, stemming=False)
    tokens = processor.tokenize(text)
    p = PrintString()
    p.print_tokens_plain(tokens)
def apriory_similarity_test():
    """
    Similarity check between two texts using the apriori property.
    Counterpart to the POS-based similarity comparison above.
    """
    text = "#오션파라다이스주소 주소 PKK558,COM 르 돈 승 상 팅 며 진 운 액 진 넘 본 천 어 정 때 낮 은 있 무 장 총 회 직 보 양 라쿠텐 아 크루즈 급 솔레어카지노 바"
    text2 = "#릴게임바다 주소 W W W , S S H 9 9 6, C O M 세 아 법 카 블 게 입 요 분 쪽 올 뾻 임 팅 양 액 며 광 업 것 러 심 돈 스 띄 망 미소 업 카지노게임설명 븐 소프 입"
    processor = TwitterKoreanProcessor(normalization=False, stemming=False)
    tokens = processor.tokenize(text)
    tokens2 = processor.tokenize(text2)
    apriori_item_search(tokens, 3)
def user_analyze():
    sess = Session()
    ps = PrintString()
    processor = TwitterKoreanProcessor()
    users = sess.query(User)\
        .filter(User.language_type == None)\
        .filter(User.tweet_collected_date != None)\
        .all()
    for user in users:
        user_id = user.id
        noun_usage_dict = OrderedDict()
        pos_set = set([])
        tweet_counter = 0
        tweets = sess.query(Tweet)\
            .filter(Tweet.user == user_id)\
            .order_by(desc(Tweet.id))\
            .limit(200)\
            .all()
        for tweet in tweets:
            tweet_counter += 1
            tokens = processor.tokenize(tweet.text)
            for token in tokens:
                pos_set.add(token.pos)
                if token.pos == 'Noun':
                    if token.text in noun_usage_dict:
                        noun_usage_dict[token.text] += 1
                    else:
                        noun_usage_dict[token.text] = 1
        if len(tweets) < 200:
            print("%d is Unknown User. Tweet Count : %d" % (user_id, len(tweets)))
            user.language_type = -1
        else:
            if 'Noun' not in pos_set:
                print("%d is Foreigner User" % user_id)
                user.language_type = 0
            else:
                print("%d is Korean User" % user_id)
                user.language_type = 1
        # print(tweet_counter, pos_set)
    sess.commit()
    sess.close()
    return True
def analysis_tweets_without_bot(analysis_type, tweet_list, bot_list):
    sess = Session()
    processor = TwitterKoreanProcessor()
    word_dict = dict()
    word_count_dict = dict()
    temp_count = 0
    for tweet in tweet_list:
        if tweet.user in bot_list:
            continue
        tokens = processor.tokenize(tweet.text)
        for token in tokens:
            if token.pos == 'URL':
                continue
            if token.text not in word_dict:
                word_cache_query = sess.query(WordTable).filter(WordTable.word == token.text)\
                    .filter(WordTable.pos == token.pos)
                word_cache = word_cache_query.first()
                if word_cache is None:
                    sess.add(WordTable(token.text, token.pos, token.unknown))
                    sess.commit()
                    word_cache = word_cache_query.first()
                word = Word(word_cache.word, word_cache.pos, word_cache.id)
                word_dict[token.text] = word
            word = word_dict[token.text]
            if word.id not in word_count_dict:
                word_count_dict[word.id] = 1
            else:
                word_count_dict[word.id] += 1
            temp_count += 1
            if temp_count % 5000 == 0:
                print("{0} words counted".format(temp_count))
    tweet_type_data = analysis_type.make_query(sess).first()
    print("Word Count Dict generated")
    if tweet_type_data is None:
        raise Exception('We need tweet search log type')
    # NOTE: DELETE
    sess.query(WordAnalysisLogWithoutBot).filter(WordAnalysisLogWithoutBot.search_log_type == tweet_type_data.id)\
        .delete(synchronize_session='evaluate')
    count = 0
    for key, value in word_count_dict.iteritems():
        sess.add(WordAnalysisLogWithoutBot(key, value, tweet_type_data.id))
        count += value
    sess.commit()
    sess.close()
    return count
class LMScoring:
    def __init__(self, model_file):
        self.model = kenlm.Model(model_file)
        self.tokenizer = TwitterKoreanProcessor(stemming=False)

    def query(self, sentence):
        tokens = self.tokenizer.tokenize_to_strings(sentence)
        tokenized_sentence = " ".join(tokens)
        score = self.model.score(tokenized_sentence, bos=True, eos=True)
        return score
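# A minimal usage sketch for LMScoring (not part of the original snippet). The
# model path "korean.klm" is a hypothetical KenLM binary; query() returns the
# sentence's log10 probability under that model, so less negative means more
# fluent.
scorer = LMScoring("korean.klm")  # hypothetical model file
print(scorer.query(u"한국어를 처리하는 예시입니다"))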
def _tokenizer_kor(texts, normalization=True, stemming=True, pos_extraction=None):
    from twkorean import TwitterKoreanProcessor as Tw
    tokenizer = Tw(normalization=normalization, stemming=stemming)
    tagged_doc_list = [tokenizer.tokenize(text) for text in texts]
    pos_doc_list = []
    for tagged_list in tagged_doc_list:
        pos_list = []
        for tagged in tagged_list:
            for pos in pos_extraction:
                if tagged[1] == pos:
                    pos_list = pos_list + [tagged[0]]
        pos_doc_list.append(pos_list)
    return pos_doc_list
def apriori_item_search(tweet_list, min_sup_value):
    """
    Frequent-itemset search over tokens using apriori items.
    The tweet_list passed here must be a per-user tweet list.
    """
    apriori_support = AprioriSupport()
    processor = TwitterKoreanProcessor(normalization=False, stemming=False)
    list_of_tokens = list()
    for tweet in tweet_list:
        tokens = processor.tokenize(tweet.text)
        list_of_tokens.append(tokens)
    for tokens in list_of_tokens:
        for item in tokens:
            candidate = AnalyzeItem(len(item.text), item.pos, text=item.text)
            apriori_support.add(candidate)
    apriori_support.prune(min_sup_value)
    print("item_set after prune : ", apriori_support.item_set)
    for tokens in list_of_tokens:
        for item in tokens:
            candidate = AnalyzeItem(len(item.text), item.pos)
            apriori_support.map_new_itemset(candidate)
    print("candidate_set after map_new_itemset : ", apriori_support.candidate_set)
    print("item_set after map_new_itemset: ", apriori_support.item_set)
    apriori_support.reset_apriori_variables()
    apriori_support.move_itemset()
    print("item_set after move_itemset : ", apriori_support.item_set)
    while len(apriori_support.item_set) != 0:
        apriori_support.reset_apriori_variables()
        for tokens in list_of_tokens:
            for token in tokens:
                item = AnalyzeItem(len(token.text), token.pos, text=token.text)
                apriori_support.search_add(item)
        print("candidate_set after search_add : ", apriori_support.candidate_set)
        apriori_support.prune(min_sup_value)
        print("candidate_set after prune : ", apriori_support.candidate_set)
        apriori_support.itemset_generate()
        print("item_set after prune : ", apriori_support.item_set)
    print("candidate_set after apriori : ", apriori_support.candidate_set)
    return apriori_support.candidate_set
class Ongari:
    def __init__(self):
        self.tkp = TwitterKoreanProcessor()

    def tokenize(self, text):
        tokens = self.tkp.tokenize(text)
        rtText = ""
        for tk in tokens:
            if tk[1] != "Space":
                rtText += tk[0] + " : " + tk[1] + "\n"
        return rtText
def test_user_tweet():
    from twkorean import TwitterKoreanProcessor
    from util import PrintString
    ps = PrintString()
    try:
        tss = TweetSearchSupport()
        ts = tss.get_ts()
        today = datetime.datetime.now().date()
        tso = tss.generate_user_order("Twins", today)
        count = 0
        foreign_tweet_counter = 0
        hash_tags = set()
        processor = TwitterKoreanProcessor()
        for tweet in ts.search_tweets_iterable(tso):
            tweet_text = ('%s @%s tweeted: %s' % (tweet['created_at'],
                                                  tweet['user']['screen_name'],
                                                  tweet['text']))
            print tweet_text
            tokens = processor.tokenize(tweet['text'])
            new_tokens = []
            # Reset the flag once per tweet so a tweet containing several
            # Foreign tokens is only counted once.
            foreign_flag = False
            for token in tokens:
                if token.pos == 'Foreign':
                    if not foreign_flag:
                        foreign_tweet_counter += 1
                        foreign_flag = True
                elif token.pos == 'Hashtag':
                    hash_tags.add(token.text.encode('utf-8'))
                new_tokens.append((token.text.encode('utf-8'), token.pos))
            ps.print_tokens(new_tokens)
            count += 1
        print hash_tags
        return api_bp.make_response(status=API_STATUS_OK,
                                    result={"result": True,
                                            "count": count,
                                            "foreign_count": foreign_tweet_counter,
                                            "used_hashtags": list(hash_tags)})
    except TwitterSearchException as e:
        print e
        return api_bp.make_response(status=API_STATUS_UNKNOWN, result=dict())
def _twitter_tokenizer(table, input_col, token_col_name='tokens', pos_col_name='pos',
                       stemming=False, normalization=False, morphemes=None):
    if morphemes is None:
        morphemes = TWITTER_MORPHEMES
    processor = TwitterKoreanProcessor(stemming=stemming, normalization=normalization)
    out_table = table.copy()
    document = out_table[input_col]
    token_series = document.apply(
        lambda _: _split_token(_, processor=processor, morphemes=morphemes))
    out_table[[token_col_name, pos_col_name]] = pd.DataFrame(data=token_series.tolist())
    return {'out_table': out_table}
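# _split_token and TWITTER_MORPHEMES are referenced above but not shown in this
# snippet. A hedged sketch of what they might look like, assuming _split_token
# returns one (token_list, pos_list) pair per document, restricted to the
# allowed morpheme classes; the default filter below is an assumption.
TWITTER_MORPHEMES = ['Noun', 'Verb', 'Adjective']  # assumed default POS filter

def _split_token(text, processor, morphemes):
    kept = [(t.text, t.pos) for t in processor.tokenize(text) if t.pos in morphemes]
    return [w for w, _ in kept], [p for _, p in kept]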
def makeDataFromSubs(koreanSRTFile, englishSRTFile, directory):
    """
    Usage: Takes Korean and English SRT files
    Returns a list of lists composed of
    [line#, timestamp, duration, engsentence, koreansentence, [indvWords]]
    """
    os.chdir(directory)  # Need a way to catch errors here
    koreanSubs = pysrt.open(koreanSRTFile)
    englishSubs = pysrt.open(englishSRTFile)
    # Set up the word tokenizer and configure it
    # so that punctuation, josa, etc. are not extracted
    indvWordProcessor = TwitterKoreanProcessor()
    desiredTypes = ['Noun', 'Verb', 'Adjective']
    # Make containers for info
    returndata = []
    for i in range(0, len(koreanSubs)):
        dataFromCurIteration = []
        indvWordsContainer = []
        # Append the index of Korean sub
        dataFromCurIteration.append(koreanSubs[i].index)
        # Perform actions to convert sub start time
        # to data that can be later used with the wave module
        allTime = str(koreanSubs[i].start).split(':')
        minuteMili = str(allTime[2]).split(',')
        hour = int(allTime[0])
        minute = int(allTime[1])
        # whole seconds plus the millisecond fraction
        start = (hour * 3600) + (minute * 60) + int(minuteMili[0]) + (int(minuteMili[1]) / 1000)
        dataFromCurIteration.append(start)
        # Do the same conversions to get the sub's duration
        # TO DO: Does it matter that I use the same variable names?
        allTime = str(koreanSubs[i].duration).split(':')
        minuteMili = str(allTime[2]).split(',')
        hour = int(allTime[0])
        minute = int(allTime[1])
        duration = (hour * 3600) + (minute * 60) + int(minuteMili[0]) + (int(minuteMili[1]) / 1000)
        dataFromCurIteration.append(duration)
        # Attempt to find an English translation for the Korean
        # sub that was just processed.
        # This is accomplished by only adding English translation data
        # from the same timestamp in the sub file.
        subIncrement = 0
        while englishSubs[subIncrement].start.ordinal < koreanSubs[i].start.ordinal:
            subIncrement += 1
        if englishSubs[subIncrement].start.ordinal == koreanSubs[i].start.ordinal:
            dataFromCurIteration.append(englishSubs[subIncrement].text)
        else:
            dataFromCurIteration.append('Missing English Translation')
        dataFromCurIteration.append(koreanSubs[i].text)
        # Break sentences into indv components using twkorean module
        indvWords = indvWordProcessor.tokenize(koreanSubs[i].text)
        for item in range(0, len(indvWords)):
            if any(word in indvWords[item] for word in desiredTypes):
                indvWordsContainer.append(str(indvWords[item][0]))
        dataFromCurIteration.append(indvWordsContainer)
        returndata.append(dataFromCurIteration)
    print('Successfully made data from subs. Returning data...')
    return returndata
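# A hypothetical invocation of makeDataFromSubs (file names and directory are
# made up for illustration); the Korean and English SRT files are expected to
# share timestamps so translations can be matched up.
subtitle_data = makeDataFromSubs('episode1.ko.srt', 'episode1.en.srt', '/path/to/subs')
for entry in subtitle_data[:3]:
    print(entry)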
# Support Chinese
import jieba
# Support Japanese
import MeCab
# Support Thai
import pythainlp
# Support Vietnamese
from pyvi import ViTokenizer
# Support Arabic
import pyarabic.araby as araby
# Support Korean
from twkorean import TwitterKoreanProcessor
import chardet
from config import *

ko_tokenizer = TwitterKoreanProcessor()
ja_tagger = MeCab.Tagger()  # no additional dict, for local debug purpose


def tokenize(text, language):
    """
    Tokenize text based on language
    """
    if language in SPACE_SEPARATED_LANGUAGES:
        return text.split()
    elif language == 'vi':
        temp = ViTokenizer.tokenize(text)
        temp = temp.split()
        return temp
    elif language == 'th':
        return pythainlp.tokenize.word_tokenize(text)
    elif language == 'zh_tw' or language == 'zh_cn':
        return list(jieba.cut(text, HMM=False))
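# The tokenize() above cuts off before the Korean and Japanese branches, even
# though ko_tokenizer and ja_tagger are prepared for them. A hedged sketch of
# standalone equivalents (assumptions for illustration, not the original code):
def tokenize_ko(text):
    # twkorean: return the surface strings of each token
    return ko_tokenizer.tokenize_to_strings(text)

def tokenize_ja(text):
    # MeCab: walk the node list and collect surface forms
    words = []
    node = ja_tagger.parseToNode(text)
    while node:
        if node.surface:
            words.append(node.surface)
        node = node.next
    return words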
# Recursive helper that pretty-prints token lists/tuples. Only the tail of the
# original helper was present in this fragment; the function header and the
# per-element loop are a reconstruction (assumed) so the example below runs.
def print_tokens(tokens, end="\n"):
    if isinstance(tokens, list):
        print("[", end="")
    elif isinstance(tokens, tuple):
        print("(", end="")
    for t in tokens:
        elem_end = "" if t == tokens[-1] else ", "
        if isinstance(t, (list, tuple)):
            print_tokens(t, end=elem_end)
        else:
            print(t, end=elem_end)
    if isinstance(tokens, list):
        print("]", end=end)
    elif isinstance(tokens, tuple):
        print(")", end=end)


text = u"한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ"

# Tokenize with normalization + stemmer
processor = TwitterKoreanProcessor()

# output: [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ]
tokens = processor.tokenize_to_strings(text)
print_tokens(tokens)

# output: [
#   (한국어, Noun, 0), (를, Josa, 0), (처리, Noun, 0), (하다, Verb, 0),
#   (예시, Noun, 0), (이다, Adjective, 0), (ㅋㅋ, KoreanParticle, 0)
# ]
tokens = processor.tokenize(text)
print_tokens(tokens)

# Tokenize without stemmer
processor = TwitterKoreanProcessor(stemming=False)

# output: [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ]
tokens = processor.tokenize_to_strings(text)