Example No. 1
def twitter_tokenizer(table, input_col, token_col_name='tokens', pos_col_name='pos',
                      stemming=False, normalization=False, morpheme=None):
    processor = TwitterKoreanProcessor(stemming=stemming, normalization=normalization)
    out_table = table.copy()
    tokens_col_data = []
    pos_col_data = []
    for i in out_table.index:
        try:
            sentence = out_table.at[i, input_col]
            tokens = processor.tokenize(sentence)
            tokens_list = []
            pos_list = []
            for token in tokens:
                # Keep all tokens, or only those whose POS is in the morpheme filter.
                if morpheme is None or token.pos in morpheme:
                    tokens_list.append(token.text)
                    pos_list.append(token.pos)

            if not tokens_list:
                # Drop rows that yield no tokens after filtering.
                out_table.drop(i, inplace=True)
            else:
                tokens_col_data.append(tokens_list)
                pos_col_data.append(pos_list)
        except Exception:
            # Drop rows whose text cannot be tokenized.
            out_table.drop(i, inplace=True)
    out_table[token_col_name] = tokens_col_data
    out_table[pos_col_name] = pos_col_data

    return {'out_table': out_table}
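A minimal usage sketch for the function above, assuming pandas and twkorean are installed; the column name and sample rows are illustrative only:

import pandas as pd

sample = pd.DataFrame({'text': ["한국어를 처리하는 예시입니다", "트위터 한국어 처리기 테스트"]})
result = twitter_tokenizer(sample, input_col='text', morpheme=['Noun', 'Verb'])
print(result['out_table'][['tokens', 'pos']])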
Example No. 2
    def analysis_test():
        """ TwitterKoreanProcessor의 동작 테스트
        """
        text = "#오션파라다이스주소 주소 PKK558,COM  르 돈 승 상 팅 며 진 운 액 진 넘 본 천 어 정 때 낮 은 있 무 장 총 회 직 보 양 라쿠텐 아 크루즈 급 솔레어카지노 바"

        processor = TwitterKoreanProcessor(normalization=False, stemming=False)
        tokens = processor.tokenize(text)
        p = PrintString()
        p.print_tokens_plain(tokens)
Example No. 3
    def apriory_similarity_test():
        """ apriori property를 이용한 두 text의 유사성검사.
            위의 pos기반 similarity 비교에 대한 대응군.
        """

        text = "#오션파라다이스주소 주소 PKK558,COM  르 돈 승 상 팅 며 진 운 액 진 넘 본 천 어 정 때 낮 은 있 무 장 총 회 직 보 양 라쿠텐 아 크루즈 급 솔레어카지노 바"
        text2 = "#릴게임바다 주소 W W W , S S H 9 9 6, C O M  세 아 법 카 블 게 입 요 분 쪽 올 뾻 임 팅 양 액 며 광 업 것 러 심 돈 스 띄 망 미소 업 카지노게임설명 븐 소프 입"

        processor = TwitterKoreanProcessor(normalization=False, stemming=False)
        tokens = processor.tokenize(text)
        tokens2 = processor.tokenize(text2)

        apriori_item_search(tokens, 3)
Example No. 4
def user_analyze():
    sess = Session()
    ps = PrintString()

    processor = TwitterKoreanProcessor()

    users = sess.query(User)\
                .filter(User.language_type == None)\
                .filter(User.tweet_collected_date != None)\
                .all()

    for user in users:
        user_id = user.id
        noun_usage_dict = OrderedDict()
        pos_set = set([])
        tweet_counter = 0
        tweets = sess.query(Tweet)\
                 .filter(Tweet.user == user_id)\
                 .order_by(desc(Tweet.id))\
                 .limit(200)\
                 .all()

        for tweet in tweets:
            tweet_counter += 1
            tokens = processor.tokenize(tweet.text)
            for token in tokens:
                pos_set.add(token.pos)
                if token.pos == 'Noun':
                    if token.text in noun_usage_dict:
                        noun_usage_dict[token.text] += 1
                    else:
                        noun_usage_dict[token.text] = 1
        if len(tweets) < 200:
            print(("%d is Unknown User. Tweet Count : %d") % (user_id, len(tweets)))
            user.language_type = -1
        else:
            if 'Noun' not in pos_set:
                print(("%d is Foreigner User") % user_id)
                user.language_type = 0
            else:
                print(("%d is Korean User") % user_id)
                user.language_type = 1
        #print(tweet_counter, pos_set)
        sess.commit()
    sess.close()
    return True
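The noun-usage counting at the heart of this routine can be reproduced without the database layer; a minimal sketch with illustrative texts (collections.Counter stands in for the OrderedDict bookkeeping):

from collections import Counter
from twkorean import TwitterKoreanProcessor

processor = TwitterKoreanProcessor()
texts = ["한국어를 처리하는 예시입니다", "트위터에서 수집한 한국어 트윗"]
noun_usage = Counter(
    token.text
    for text in texts
    for token in processor.tokenize(text)
    if token.pos == 'Noun'
)
print(noun_usage.most_common(10))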
Example No. 5
def analysis_tweets_without_bot(analysis_type, tweet_list, bot_list):
    sess = Session()
    processor = TwitterKoreanProcessor()
    word_dict = dict()
    word_count_dict = dict()
    temp_count = 0
    for tweet in tweet_list:
        if tweet.user in bot_list:
            continue
        tokens = processor.tokenize(tweet.text)
        for token in tokens:
            if token.pos == 'URL':
                continue
            if token.text not in word_dict:
                word_cache_query = sess.query(WordTable).filter(WordTable.word == token.text)\
                                     .filter(WordTable.pos == token.pos)
                word_cache = word_cache_query.first()
                if word_cache is None:
                    sess.add(WordTable(token.text, token.pos, token.unknown))
                    sess.commit()
                    word_cache = word_cache_query.first()
                word = Word(word_cache.word, word_cache.pos, word_cache.id)
                word_dict[token.text] = word
            word = word_dict[token.text]
            if word.id not in word_count_dict:
                word_count_dict[word.id] = 1
            else:
                word_count_dict[word.id] += 1
            temp_count += 1
            if temp_count % 5000 == 0:
                print("{0} words counted".format(temp_count))
    tweet_type_data = analysis_type.make_query(sess).first()
    print("Word Cound Dict generated")
    if tweet_type_data is None:
        raise Exception('We need tweet search log type')

    #NOTE : DELETE
    sess.query(WordAnalysisLogWithoutBot).filter(WordAnalysisLogWithoutBot.search_log_type == tweet_type_data.id)\
                               .delete(synchronize_session='evaluate')
    count = 0
    for key, value in word_count_dict.items():
        sess.add(WordAnalysisLogWithoutBot(key, value, tweet_type_data.id))
        count += value
    sess.commit()
    sess.close()
    return count
Example No. 6
class LMScoring:
    def __init__(self, model_file):
        self.model = kenlm.Model(model_file)
        self.tokenizer = TwitterKoreanProcessor(stemming=False)

    def query(self, sentence):
        tokens = self.tokenizer.tokenize_to_strings(sentence)
        tokenized_sentence = " ".join(tokens)
        score = self.model.score(tokenized_sentence, bos=True, eos=True)
        return score
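A usage sketch for the class above; the model file name is a placeholder for any KenLM binary or ARPA file you have built:

scorer = LMScoring('korean_tweets.klm')  # hypothetical model path
print(scorer.query("한국어를 처리하는 예시입니다"))  # log10 probability reported by KenLM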
Example No. 7
def _tokenizer_kor(texts,
                   normalization=True,
                   stemming=True,
                   pos_extraction=None):

    from twkorean import TwitterKoreanProcessor as Tw

    tokenizer = Tw(normalization=normalization, stemming=stemming)
    tagged_doc_list = [tokenizer.tokenize(text) for text in texts]
    pos_doc_list = []

    for tagged_list in tagged_doc_list:
        # Keep only tokens whose POS tag is listed in pos_extraction (e.g. ['Noun', 'Verb']).
        pos_list = [tagged[0] for tagged in tagged_list if tagged[1] in pos_extraction]
        pos_doc_list.append(pos_list)

    return pos_doc_list
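A usage sketch with illustrative inputs; note that pos_extraction must be an explicit list of POS tags, since the default None would fail in the filtering loop:

docs = ["한국어를 처리하는 예시입니다", "트위터 한국어 처리기"]
tokens_per_doc = _tokenizer_kor(docs, pos_extraction=['Noun', 'Verb'])
print(tokens_per_doc)  # one list of extracted tokens per input text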
Example No. 8
def apriori_item_search(tweet_list, min_sup_value):
    """ apriori item으로 tokens를 frequent depend search.
        apriori에 들어오는 tweet_list는 유저별 tweet_list여야함
    """
    apriori_support = AprioriSupport()
    processor = TwitterKoreanProcessor(normalization=False, stemming=False)
    list_of_tokens = list()
    for tweet in tweet_list:
        tokens = processor.tokenize(tweet.text)
        list_of_tokens.append(tokens)

    for tokens in list_of_tokens:
        for item in tokens:
            candidate = AnalyzeItem(len(item.text), item.pos, text=item.text)
            apriori_support.add(candidate)
    apriori_support.prune(min_sup_value)
    print( "item_set after prune : " , apriori_support.item_set )
    for tokens in list_of_tokens:
        for item in tokens:
            candidate = AnalyzeItem(len(item.text), item.pos)
            apriori_support.map_new_itemset(candidate)

    print( "candidate_set after map_new_itemset : ", apriori_support.candidate_set )
    print( "item_set after map_new_itemset: ", apriori_support.item_set )
    apriori_support.reset_apriori_variables()
    apriori_support.move_itemset()
    print( "item_set after move_itemset : ", apriori_support.item_set )

    while len(apriori_support.item_set) != 0:
        apriori_support.reset_apriori_variables()
        for tokens in list_of_tokens:
            for token in tokens:
                item = AnalyzeItem(len(token.text), token.pos, text=token.text)
                apriori_support.search_add(item)
        print ("candidate_set after search_add : ", apriori_support.candidate_set)
        apriori_support.prune(min_sup_value)
        print ("candidate_set after prune : ", apriori_support.candidate_set)
        apriori_support.itemset_generate()
        print("item_set after prune : " ,apriori_support.item_set)
    print("candidate_set after apriori : " , apriori_support.candidate_set )
    return apriori_support.candidate_set
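A sketch of how the function is driven, assuming the project's AprioriSupport and AnalyzeItem classes are importable; the stand-in tweet type below only needs the .text attribute used above:

from collections import namedtuple

FakeTweet = namedtuple('FakeTweet', 'text')  # hypothetical stand-in for the ORM Tweet
user_tweets = [
    FakeTweet("#오션파라다이스주소 주소 PKK558,COM"),
    FakeTweet("#릴게임바다 주소 카지노게임설명"),
]
frequent_items = apriori_item_search(user_tweets, min_sup_value=2)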
Example No. 9
class Ongari:
    def __init__(self):
        self.tkp = TwitterKoreanProcessor()

    def tokenize(self, text):
        tokens = self.tkp.tokenize(text)

        rtText = ""

        for tk in tokens:
            if tk[1] != "Space":
                rtText += tk[0] + " : " + tk[1] + "\n"

        return rtText
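A minimal usage sketch (the sample text is the one from the twkorean README example further below):

ongari = Ongari()
print(ongari.tokenize(u"한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ"))
# prints one "text : POS" line per non-Space token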
Example No. 10
def test_user_tweet():
    from twkorean import TwitterKoreanProcessor
    from util import PrintString
    ps = PrintString()

    try:
        tss = TweetSearchSupport()
        ts = tss.get_ts()
        today = datetime.datetime.now().date()
        tso = tss.generate_user_order("Twins", today)
        count = 0
        foreign_tweet_counter = 0
        hash_tags = set()
        processor = TwitterKoreanProcessor()
        for tweet in ts.search_tweets_iterable(tso):
            tweet_text = ('%s @%s tweeted: %s' % (tweet['created_at'], tweet['user']['screen_name'], tweet['text']))
            print(tweet_text)
            tokens = processor.tokenize(tweet['text'])
            new_tokens = []
            foreign_flag = False
            for token in tokens:
                if token.pos == 'Foreign':
                    # Count each tweet containing foreign tokens only once.
                    if not foreign_flag:
                        foreign_tweet_counter += 1
                        foreign_flag = True
                elif token.pos == 'Hashtag':
                    hash_tags.add(token.text.encode('utf-8'))

                new_tokens.append((token.text.encode('utf-8'), token.pos))
            ps.print_tokens(new_tokens)
            count += 1
        print(hash_tags)
        return api_bp.make_response(status=API_STATUS_OK, result={"result": True, "count": count, "foreign_count": foreign_tweet_counter, "used_hashtags": list(hash_tags)})
    except TwitterSearchException as e:
        print(e)
        return api_bp.make_response(status=API_STATUS_UNKNOWN, result=dict())
Example No. 11
def _twitter_tokenizer(table,
                       input_col,
                       token_col_name='tokens',
                       pos_col_name='pos',
                       stemming=False,
                       normalization=False,
                       morphemes=None):

    if morphemes is None:
        morphemes = TWITTER_MORPHEMES

    processor = TwitterKoreanProcessor(stemming=stemming,
                                       normalization=normalization)
    out_table = table.copy()
    document = out_table[input_col]

    token_series = document.apply(
        lambda _: _split_token(_, processor=processor, morphemes=morphemes))
    out_table[[token_col_name,
               pos_col_name]] = pd.DataFrame(data=token_series.tolist())

    return {'out_table': out_table}
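This variant delegates the per-row work to a _split_token helper and a TWITTER_MORPHEMES default that are not shown in the snippet; a plausible sketch of the helper, written here for illustration only:

def _split_token(text, processor, morphemes):
    # Hypothetical reconstruction: tokenize one document and keep only the
    # requested morpheme (POS) classes, returning (token_texts, pos_tags).
    kept = [(t.text, t.pos) for t in processor.tokenize(text) if t.pos in morphemes]
    return [t for t, _ in kept], [p for _, p in kept]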
Example No. 12
 def __init__(self):
     self.tkp = TwitterKoreanProcessor()
Example No. 13
def makeDataFromSubs(koreanSRTFile, englishSRTFile, directory):
    """
    Usage: Takes Korean and English SRT files
    Returns a list of lists composed of
    [line#, timestamp, duration, engsentence, koreansentence,[indvWords]]
    """
    os.chdir(directory) # Need a way to catch errors here

    koreanSubs = pysrt.open(koreanSRTFile)
    englishSubs = pysrt.open(englishSRTFile)

    # Set up the word tokenizer and configure it
    # so that punctuation, josa, etc. are not extracted
    indvWordProcessor = TwitterKoreanProcessor()
    desiredTypes = ['Noun', 'Verb', 'Adjective']

    # Make containers for info
    returndata = []

    for i in range(0, len(koreanSubs)):
        dataFromCurIteration = []
        indvWordsContainer = []
        # Append the index of Korean sub
        dataFromCurIteration.append(koreanSubs[i].index)
        # Perform actions to convert sub start time
        # to data that can be later used with the wave module
        allTime = str(koreanSubs[i].start).split(':')
        minuteMili = str(allTime[2]).split(',')
        hour = int(allTime[0])
        minute = int(allTime[1])
        start = (hour * 3600) + (minute * 60) + int(minuteMili[0]) + (int(minuteMili[1]) / 1000)
        dataFromCurIteration.append(start)
        # Do the same conversions to get the sub's duration
        # TO DO: Does it matter that I use the same variable names?
        allTime = str(koreanSubs[i].duration).split(':')
        minuteMili = str(allTime[2]).split(',')
        hour = int(allTime[0])
        minute = int(allTime[1])
        duration = (hour * 3600) + (minute * 60) + int(minuteMili[0]) + (int(minuteMili[1]) / 1000)
        dataFromCurIteration.append(duration)
        # Attempt to find an English translation for the Korean
        # sub that was just processed.
        # This is accomplished by only adding English translation data
        # from the same timestamp in the sub file.
        subIncrement = 0
        while (subIncrement < len(englishSubs)
               and englishSubs[subIncrement].start.ordinal < koreanSubs[i].start.ordinal):
            subIncrement += 1
        if (subIncrement < len(englishSubs)
                and englishSubs[subIncrement].start.ordinal == koreanSubs[i].start.ordinal):
            dataFromCurIteration.append(englishSubs[subIncrement].text)
        else:
            dataFromCurIteration.append('Missing English Translation')
        dataFromCurIteration.append(koreanSubs[i].text)
        # Break sentences into indv components using twkorean module
        indvWords = indvWordProcessor.tokenize(koreanSubs[i].text)
        for token in indvWords:
            if token.pos in desiredTypes:
                indvWordsContainer.append(token.text)
        dataFromCurIteration.append(indvWordsContainer)

        returndata.append(dataFromCurIteration)

    print('Successfully made data from subs. Returning data...')
    return returndata
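A usage sketch; the file names and directory below are placeholders:

subtitle_data = makeDataFromSubs('episode01.ko.srt', 'episode01.en.srt', '/path/to/subs')
for line_no, start, duration, eng, kor, words in subtitle_data[:3]:
    print(line_no, start, duration, words)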
Example No. 14
 def __init__(self, model_file):
     self.model = kenlm.Model(model_file)
     self.tokenizer = TwitterKoreanProcessor(stemming=False)
Example No. 15
import jieba
# Support Japanese
import MeCab
# Support Thai
import pythainlp
# Support Vietnamese
from pyvi import ViTokenizer
# Support Arabic
import pyarabic.araby as araby
# Support Korean
from twkorean import TwitterKoreanProcessor

import chardet
from config import *

ko_tokenizer = TwitterKoreanProcessor()
ja_tagger = MeCab.Tagger()  # no additional dictionary, for local debugging purposes

def tokenize(text, language):
    """ Tokenize text based on language """

    if language in SPACE_SEPARATED_LANGUAGES:
        return text.split()
    elif language == 'vi':
        temp = ViTokenizer.tokenize(text)
        temp = temp.split()
        return temp
    elif language == 'th':
        return pythainlp.tokenize.word_tokenize(text)
    elif language == 'zh_tw' or language == 'zh_cn':
        return list(jieba.cut(text, HMM=False))
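The snippet above is cut off before its remaining branches; the Korean case presumably dispatches to the module-level ko_tokenizer, roughly as sketched here (a reconstruction, not the original code):

    elif language == 'ko':
        # Hypothetical continuation of tokenize(): reuse the twkorean processor created above.
        return ko_tokenizer.tokenize_to_strings(text)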
Example No. 16
        if isinstance(t, (list, tuple)):
            print_tokens(t, end=elem_end)
        else:
            print(t, end=elem_end)

    if isinstance(tokens, list):
        print("]", end=end)
    elif isinstance(tokens, tuple):
        print(")", end=end)


text = u"한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ"

# Tokenize with normalization + stemmer
processor = TwitterKoreanProcessor()
# output: [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ]
tokens = processor.tokenize_to_strings(text)
print_tokens(tokens)

# output: [
#     (한국어, Noun, 0), (를, Josa, 0), (처리, Noun, 0), (하다, Verb, 0),
#     (예시, Noun, 0), (이다, Adjective, 0), (ㅋㅋ, KoreanParticle, 0)
# ]
tokens = processor.tokenize(text)
print_tokens(tokens)

# Tokenize without stemmer
processor = TwitterKoreanProcessor(stemming=False)
# output: [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ]
tokens = processor.tokenize_to_strings(text)