Example #1
    def __init__(self):
        
        # Pretrained Word2Vec Model (ko.bin)
        self.load("ko.bin")
        
        # label    -> tokenized labels
        # label_nt -> raw (untokenized) labels
        label = list() 
        label_nt = list()
        
        # Priority Score
        scores = {
            '메일': 1, '이메일': 1, '교수님': 1, '교수': 0.8, '학식': 1, '기식': 1,
            '오늘': 0.9, '넘버': 0.7, '소웨': 0.8, '연락처': 1, '전화번호': 1,
            '번호': 0.8, '핸드폰': 1, '휴대폰': 1, '전화': 0.8, '전번': 0.5,
            '사무실': 1, '연구실': 1, '랩실': 1, '렙실': 1, '어디': 1,
            '학생식당': 1, '기숙사식당': 1, '학과사무실': 1, '과사': 0.8,
            '과사무실': 1.0, '위치': 0.8, '소중사': 1.0, '소프트웨어중심사업단': 1.0,
        }

        # Soynlp Tokenizer
        tokenizer = MaxScoreTokenizer(scores=scores)
        
        # Read intent labels (one label per line)
        with open("intend_label.txt", 'r') as f:
            for line in f:
                line = line.rstrip("\n")
                label_nt.append(line)
                label.append(tokenizer.tokenize(line))
        
        self.tokenizer = tokenizer
        self.label = label
        self.label_nt = label_nt
        self.files = Files()
        self.prep = Preprocess()
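For reference, a minimal standalone sketch of the pattern used above: build a MaxScoreTokenizer from a score dictionary and tokenize one query. The scores and the sample sentence here are illustrative, not taken from the project.

from soynlp.tokenizer import MaxScoreTokenizer

# illustrative priority scores; higher-scored subwords win when spans overlap
demo_scores = {'교수님': 1.0, '연락처': 1.0, '번호': 0.8}
demo_tokenizer = MaxScoreTokenizer(scores=demo_scores)

# subwords without a score are returned as leftover segments
print(demo_tokenizer.tokenize('교수님 연락처 알려줘'))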
Example #2
    def __init__(self,
                 domain_dictionary_folders=None,
                 use_base_dictionary=True,
                 dictionary_word_mincount=3,
                 evaluator=None,
                 sents=None,
                 lrgraph=None,
                 lrgraph_lmax=12,
                 lrgraph_rmax=8,
                 base_tokenizer=None,
                 preference=None,
                 verbose=False):

        self.dictionary = Dictionary(domain_dictionary_folders,
                                     use_base_dictionary,
                                     dictionary_word_mincount,
                                     verbose=verbose)
        self.evaluator = evaluator if evaluator else LREvaluator()
        self.preference = preference if preference else {}
        self.lrgraph = lrgraph if lrgraph else {}

        if (not self.lrgraph) and (sents):
            self.lrgraph = _build_lrgraph(sents, lrgraph_lmax, lrgraph_rmax)

        self.lrgraph_norm, self.lcount, self.cohesion_l, self.droprate_l\
            = self._initialize_scores(self.lrgraph)

        if base_tokenizer:
            self.base_tokenizer = base_tokenizer
        else:
            # default to whitespace splitting, then prefer a cohesion-based
            # MaxScoreTokenizer when the cohesion scores are usable
            self.base_tokenizer = lambda x: x.split()
            try:
                self.base_tokenizer = MaxScoreTokenizer(scores=self.cohesion_l)
            except Exception as e:
                print('MaxScoreTokenizer(cohesion) exception: {}'.format(e))
Example #3
    def __init__(self, config):
        self.basic_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        if not ( os.path.isfile(config.vocab_path) and  os.path.isfile(config.word2vec_path) ):
            assert config.data_path and os.path.isfile(config.data_path), '[{}] 위치에 학습할 파일이 없습니다.'.format(config.data_path)
            noun_dict, word2vec_model = build_vocab(config)
        else:
            noun_dict = open_pickle(config.vocab_path)
            word2vec_model = Word2Vec.load(config.word2vec_path)

        self.config = config
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))

        self.idx2tags = ['I', 'B']
        self.tags2idx = {name: idx for idx, name in enumerate(self.idx2tags)}
        self.tag_size = len(self.idx2tags)
        self.vocab_size = self.basic_tokenizer.vocab_size

        self.pad_token_id = self.basic_tokenizer.pad_token_id
        self.unk_token_id = self.basic_tokenizer.unk_token_id
Example #4
    def _extract_compound_nouns(self, eojeols, nouns, suffix):
        def parse_compound(tokens):
            for token in tokens[:-1]:
                if token[3] <= 0:
                    return None
            # Noun* + Josa
            if len(tokens) >= 3 and (tokens[-1][0] in suffix):
                return ''.join(t[0] for t in tokens[:-1])
            # all tokens are noun
            if tokens[-1][3] > 0:
                return ''.join(t[0] for t in tokens)
            # else, not compound
            return None

        tokenizer = MaxScoreTokenizer(
            scores={noun: 1
                    for noun in nouns if len(noun) > 1})

        compounds, removals = {}, set()
        for word, count in eojeols.items():
            # format: [(word, begin, end, score, length)]
            tokens = tokenizer.tokenize(word, flatten=False)[0]
            noun = parse_compound(tokens)
            if noun is not None:
                compounds[noun] = compounds.get(noun, 0) + count
                removals.add(word)

        return compounds, removals
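The compound check above relies on the tuple form that flatten=False returns. A small illustrative sketch (the noun scores and the sample word are made up):

from soynlp.tokenizer import MaxScoreTokenizer

# hypothetical noun scores; in the method above every known noun gets score 1
noun_scores = {'데이터': 1, '센터': 1}
demo = MaxScoreTokenizer(scores=noun_scores)

# flatten=False returns one token list per whitespace unit (eojeol);
# each token is a (subword, begin, end, score, length) tuple
for subword, begin, end, score, length in demo.tokenize('데이터센터', flatten=False)[0]:
    print(subword, begin, end, score, length)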
Example #5
    def fit(self, sentences):
        self.word_extractor.train(sentences)
        scores = self.word_extractor.extract()
        # combine cohesion and branching entropy into a single word score;
        # MaxScoreTokenizer expects a {word: score} dict, not a list of tuples
        scores = {word: (score.cohesion_forward + score.cohesion_backward) *
                        (score.left_branching_entropy + score.right_branching_entropy)
                  for word, score in scores.items()}
        self.scores = scores
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)
Example #6
    def extract_compounds(self, candidates, prediction_scores, min_noun_score=0.3):

        noun_scores = {noun:len(noun) for noun, score in prediction_scores.items()
                       if score[1] > min_noun_score and len(noun) > 1}

        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {l:rdict.get('', 0) for l,rdict in self.lrgraph._lr_origin.items()
            if (len(l) >= 4) and not (l in noun_scores)}

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage), flush=True, end='')

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:

                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts

                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0,0))[1] for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score)
                compounds_counts[noun] = compounds_counts.get(noun,0) + count

                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]

                    if not subword in candidates:
                        continue

                    candidates[subword] = candidates.get(subword, 0) - count

                # eojeol coverage
                self.lrgraph.remove_eojeol(word)

        if self.verbose:
            print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format(
                len(compounds_scores)))

        compounds = {noun:(score, compounds_counts.get(noun,0))
             for noun, score in compounds_scores.items()}

        self._compounds_components = compounds_components

        return compounds
Example #7
    def tokenize(self, sentence, score_dic):
        tokenizer = MaxScoreTokenizer(scores=score_dic)
        token_list = []
        for token in tokenizer.tokenize(sentence):
            if token in score_dic:
                # keep dictionary words as-is
                token_list.append(token)
            else:
                # fall back to KKMA morpheme analysis for unknown tokens
                token_list.extend(self.t.morphs(token))
        return token_list
Example #8
    def find_pro_name(self, inp):
        name_dic = {'Yenewondim Sinshaw', '강경란', '고영배', '고정길', '김도형',\
           '김동윤','김민구','김성수','김승운','노병희','류기열','변광준',\
           '손경아','안정섭','오상윤','위규범','윤대균','이석원','이정태',\
           '이택균','이환용','임재성','정크리스틴','정태선','조영종',\
           '최경희','최영준','최재영','한경식','황원준','Paul rajib','학과사무실','과사','과사무실',\
                '경란','경란이','영배','정길','정길이','도형','도형이','동윤',\
               '민구','성수','승운','승운이','병희','기열','기열이','광준','광준이',\
               '경아','정섭','정섭이','상윤','상윤이','규범','규범이','대균',\
              '대균이','석원','석원이','정태','택균','택균이','환용','환용이',\
              '정크','정크리','크리스틴','태선','태선이','영종','영종이',\
              '경희','영준','영준이','재영','재영이','경식','경식이','원준',\
              '원준이','예나원딤','에나원딤','yenewondim','Yenewondim',\
              'yenawondim','Yenawondim','라집','폴라집','Paul','폴',\
              'Paulrajib','paulrajib','제작자','소중사','소프트웨어중심사업단',\
               'Ibrahim Mohd Ali Alsofyani','Ran Rong','감동근','구형일','권익진',\
                '김도희','김상배','김상완','김상인','김영길','김영진',\
                '김재현','나상신','박성진','박용배','박익모','선우명훈','양상식',\
                '양회석','오성근','윤원식','이교범','이기근','이상용','이재진',\
                '이정원','이종욱','이채우','이해영','정기현','조성준','조위덕',\
                '조중열','좌동경','지동우','허용석','허준석','홍송남','홍영대',\
                '곽진','김강석','김기형','김상곤','김재훈','손태식','예홍진',\
                '유재희','홍만표','경민호','고욱','김지은','김현희','김효동',\
                '석혜정','신현준','오규환','이경원','이윤진','이주엽','임유상',\
                '장우진','정태영','최정주','구자열','박승규','백호기','이병묵',\
                '이태공','홍성표','란롱'}
        
        scores_name = {name: 1.0 for name in name_dic}
        tokenizer_name = MaxScoreTokenizer(scores=scores_name)
        c = tokenizer_name.tokenize(inp)
        c = self.prep.replace(c)

        # Check for a professor name among the tokens
        professor_name = "0"  # "0" means no name was found
        check = 0
        for name in name_dic:
            for token in c:
                if token == name:
                    professor_name = name
                    check += 1
        c = self.tokenizer.tokenize(inp)
        if professor_name == "0":
            for token in c:
                if self.find_extra_name(token):
                    professor_name = "1"  # a wrong (unrecognized) name
                    break
        if check > 1:
            professor_name = "2"  # more than one professor name was found

        return professor_name
Example #9
    def tokenize(self, sentence, score_dic):
        tokenizer = MaxScoreTokenizer(scores=score_dic)
        token_list = []

        for token in tokenizer.tokenize(sentence):
            # keep morphemes that are not josa, punctuation, or Korean particles
            for morph, pos in self.t.pos(token, norm=True, stem=True):
                if pos not in ("Josa", "Punctuation", "KoreanParticle"):
                    token_list.append(morph)

        return token_list
Example #10
def rankFunction(texts):
    scores = {
        '선박명': 0.5,
        '총톤수는': 0.7,
        '년': 0.5,
        '월': 0.5,
        '일': 0.5,
        '시': 0.5,
        '분': 0.5,
        '울산': 0.5,
        '예정': 0.5
    }
    tokenizer = MaxScoreTokenizer(scores=scores)
    keywords = tokenizer.tokenize(texts)
    return keywords
Example #11
    def reset_tokenizer(self, data):
        noun_dict, word2vec_model = build_vocab(self.config, data)
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))
Example #12
    def noun_extract_dup(self, sentence, score_dic):
        tokenizer = MaxScoreTokenizer(scores=score_dic)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)

        for token in tokenizer.tokenize(sentence):
            if token in score_dic:
                # dictionary words are taken as nouns directly
                noun_list.append(token)
            else:
                # otherwise use the tagger's noun extraction
                noun_list = noun_list + self.t.nouns(token)

        # keep only nouns confirmed by the tagger or present in the score dictionary
        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
        noun_list = list(set(noun_list) - set(diff_noun_list))
        return noun_list
Example #13
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(args.corpus_fname)  # {noun: NounScore}
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)
    })

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
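The saved tokenizer can later be restored with pickle and used directly; a hedged sketch (the path is a placeholder for args.tokenizer_path, and soynlp must be importable when unpickling):

import pickle

with open('tokenizer.pkl', 'rb') as f:  # placeholder path
    tokenizer = pickle.load(f)

print(tokenizer.tokenize('예시 문장입니다'))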
Example #14
    def soynlp_tokenizer(self):
        def word_score(score): return (score.cohesion_forward * math.exp(score.right_branching_entropy))

        if self.mode == 'serve':
            with open(self.data_path, 'r') as file:
                word_score_dict = json.load(file)
        elif self.mode == 'train':
            word_extractor = WordExtractor()
            word_extractor.train(self.train_corpus)
            words = word_extractor.extract()
            word_score_dict = {word: word_score(score) for word, score in words.items()}

            with open('./models/word_dict.json', 'w') as file:
                json.dump(word_score_dict, file)
        else:
            pass
        
        tokenizer = MaxScoreTokenizer(scores=word_score_dict)
        return tokenizer
Example #15
def train_extractor(begin_d=None,
                    end_d=None,
                    sections: list = None,
                    base_dir='./out',
                    tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # {noun: NounScore}
    noun_score = dict([(key, val.score) for key, val in nouns.items()])
    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class
Example #16
def build_vocab(config, data=None):
    if data is not None:
        sents = MyIterator(data)
    else:
        sents = MyIterator(config.data_path)

    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(sents)

    noun_dict = {}
    for noun, score in nouns.items():
        if score.frequency >= config.min_frequency and score.score >= config.min_score and len(noun) > config.min_length:
            noun_dict[noun] = score.score

    vocab_path = os.path.join(config.save_path,'vocab.pkl')
    config.vocab_path = vocab_path
    #save_pickle(vocab_path, noun_dict)

    tokenizer = MaxScoreTokenizer(noun_dict)

    if data is not None:
        word2vec_corpus = Word2VecCorpus(data, tokenizer)
    else:
        word2vec_corpus = Word2VecCorpus(config.data_path, tokenizer)

    word2vec_model = Word2Vec(
        word2vec_corpus,
        size=config.word_hidden_size,
        alpha=0.025,
        window=5,
        min_count=config.min_frequency,
        sg=0,
        negative=5)

    word2vec_path = os.path.join(config.save_path, 'word2vec{}.model'.format(config.word_hidden_size))
    config.word2vec_path = word2vec_path
    #word2vec_model.save(word2vec_path)

    return noun_dict, word2vec_model
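Word2VecCorpus is defined elsewhere in that project. As an assumption, a minimal version would be a restartable iterable that yields token lists, since gensim's Word2Vec accepts any such iterable:

class Word2VecCorpus:
    # minimal sketch of a corpus iterator: tokenize each line on the fly
    def __init__(self, data_or_path, tokenizer):
        self.data_or_path = data_or_path
        self.tokenizer = tokenizer

    def __iter__(self):
        if isinstance(self.data_or_path, str):
            with open(self.data_or_path, encoding='utf-8') as f:
                for line in f:
                    yield self.tokenizer.tokenize(line.strip())
        else:
            for line in self.data_or_path:
                yield self.tokenizer.tokenize(line.strip())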
Example #17
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError(
            "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
                regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".
            format(ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n')
Example #18
    def __init__(self, scores=None):
        from soynlp.tokenizer import MaxScoreTokenizer
        self.inst = MaxScoreTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]
Example #19
""" 파일 순서 - 3 - 
주요 활동 시간을 정했다면 다음에 필요한 것은 해당 유저가 관심있어하는 
관심사를 가지고 공격할 주제를 만드는 것입니다. 
관심사는 글의 특정 단어 빈도수로 측정합니다."""


from personal_data import *
import sys
import numpy as np
import re
from soynlp.tokenizer import MaxScoreTokenizer

boundmorpheme = ["은", "는", "이", "가", "을", "를", "로써", "에서", "에게서", "부터", "까지", "에게", "한테", "께", "와", "과", "의", "로서", "으로서", "로", "으로"] # 조사
exceptions = boundmorpheme

scores = {'티켓이': 0.3, '티켓': 0.7, '좋아요': 0.2, '좋아':0.5}
tokenizer = MaxScoreTokenizer(scores=scores)

def isHangul(text):
    #Check the Python Version
    pyVer3 =  sys.version_info >= (3, 0)

    if pyVer3 : # for Ver 3 or later
        encText = text
    else: # for Ver 2.x
        if type(text) is not unicode:
            encText = text.decode('utf-8')
        else:
            encText = text

    hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
    return hanCount > 0
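The docstring above says interests are measured by word frequency. A hedged sketch of that counting step, reusing tokenizer, exceptions, and isHangul from this example (the posts argument and the function name word_frequencies are illustrative):

from collections import Counter

def word_frequencies(posts):
    # count Hangul tokens across posts, skipping particles listed in `exceptions`
    counter = Counter()
    for post in posts:
        for token in tokenizer.tokenize(post):
            if token in exceptions or not isHangul(token):
                continue
            counter[token] += 1
    return counter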
Example #20
    def __init__(self, config):
        with open(config.soynlp_scores, "r") as f:
            scores = [line.strip().split("\t") for line in f]
            scores = {word: float(score) for word, score in scores}
        self.tokenizer = MaxScoreTokenizer(scores=scores)
Example #21
import tomotopy as tp

extracted_words = []
for word, score in words.items():
    '''
    print('%s     (%d, %.3f, %.3f)' % (
            word,
            score.leftside_frequency,
            score.cohesion_forward,
            score.right_branching_entropy
            )
         )
    '''
    extracted_words.append(word)

cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = MaxScoreTokenizer(scores=cohesion_score)

# ================= LDA train start ========================
# Generate LDAModel
# k = the number of topics
# alpha = document-topic Dirichlet prior
# eta = topic-word Dirichlet prior
# min_cf = minimum collection frequency of a word
model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5)

for i in raw_chat:
    model.add_doc(tokenizer.tokenize(i))

# check the number of words and the vocabulary
# prepare for training (train(0) only initializes the model)
model.train(0)
Example #22
    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)
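The matching save side is not shown; assuming the state dict only needs the scores (the method name state_dict and its placement are assumptions), it could be as small as:

    def state_dict(self):
        # the tokenizer is rebuilt from scores on load, so only scores are persisted
        return {'scores': self.scores}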