Example #1
 def fit(self, sentences):
     self.word_extractor.train(sentences)
     scores = self.word_extractor.extract()
     # MaxScoreTokenizer expects a dict mapping each word to its score,
     # so build the combined score as a dict rather than a list of tuples.
     scores = {word: (score.cohesion_forward + score.cohesion_backward) *
               (score.left_branching_entropy + score.right_branching_entropy)
               for word, score in scores.items()}
     self.scores = scores
     self.tokenizer = MaxScoreTokenizer(scores=self.scores)
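
Examples #1 and #12 turn WordExtractor statistics into a single per-word score before building the tokenizer. Below is a minimal end-to-end sketch of that flow, assuming soynlp is installed; the toy corpus and the score combination mirror the example and are illustrative only.

from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer

corpus = ['데이터 센터에 데이터가 쌓인다'] * 100   # toy corpus; use real sentences in practice

word_extractor = WordExtractor()
word_extractor.train(corpus)
word_scores = word_extractor.extract()

# combine cohesion and branching entropy into one score per word, as in fit() above
scores = {word: (s.cohesion_forward + s.cohesion_backward) *
                (s.left_branching_entropy + s.right_branching_entropy)
          for word, s in word_scores.items()}

tokenizer = MaxScoreTokenizer(scores=scores)
print(tokenizer.tokenize('데이터가 쌓인다'))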
Example #2
    def __init__(self,
                 domain_dictionary_folders=None,
                 use_base_dictionary=True,
                 dictionary_word_mincount=3,
                 evaluator=None,
                 sents=None,
                 lrgraph=None,
                 lrgraph_lmax=12,
                 lrgraph_rmax=8,
                 base_tokenizer=None,
                 preference=None,
                 verbose=False):

        self.dictionary = Dictionary(domain_dictionary_folders,
                                     use_base_dictionary,
                                     dictionary_word_mincount,
                                     verbose=verbose)
        self.evaluator = evaluator if evaluator else LREvaluator()
        self.preference = preference if preference else {}
        self.lrgraph = lrgraph if lrgraph else {}

        if (not self.lrgraph) and (sents):
            self.lrgraph = _build_lrgraph(sents, lrgraph_lmax, lrgraph_rmax)

        self.lrgraph_norm, self.lcount, self.cohesion_l, self.droprate_l\
            = self._initialize_scores(self.lrgraph)

        self.base_tokenizer = base_tokenizer if base_tokenizer else lambda x: x.split()
        if not base_tokenizer:
            try:
                self.base_tokenizer = MaxScoreTokenizer(scores=self.cohesion_l)
            except Exception as e:
                print('MaxScoreTokenizer(cohesion) exception: {}'.format(e))
Example #3
    def __init__(self):
        
        # Pretrained Word2Vec Model (ko.bin)
        self.load("ko.bin")
        
        # label -> For Tokenized
        # label_nt -> For No Tokenized
        label = list() 
        label_nt = list()
        
        # Priority Score
        scores = {'메일' : 1, '이메일' : 1,'교수님' : 1 , '교수': 0.8,'학식':1,'기식':1,'오늘':0.9,'넘버':0.7,'소웨':0.8,\
         '연락처' : 1, '전화번호' : 1, '번호' : 0.8, '핸드폰' : 1, '휴대폰' : 1, '전화' : 0.8,'전번' : 0.5,\
         '사무실' : 1, '연구실' : 1, '랩실' : 1, '렙실' : 1, '어디':1,'학생식당':1,'기숙사식당':1,'학과사무실':1,'과사':0.8,'과사무실':1.0,'위치':0.8,'소중사':1.0,'소프트웨어중심사업단':1.0}

        # Soynlp Tokenizer
        tokenizer = MaxScoreTokenizer(scores=scores)
        
        # Read Data
        f = open("intend_label.txt", 'r')
        while True:
            line = f.readline()
            if not line: break
            line = line.replace("\n","")
            label_nt.append(line)
            b = tokenizer.tokenize(line)
            label.append(b)
        f.close()
        
        self.tokenizer = tokenizer
        self.label = label
        self.label_nt = label_nt
        self.files = Files()
        self.prep = Preprocess()
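
The chatbot above relies on a hand-written priority dictionary rather than learned scores. A small sketch of that idea follows, reusing a few keys from the dictionary above; the query string is illustrative.

from soynlp.tokenizer import MaxScoreTokenizer

scores = {'메일': 1, '이메일': 1, '교수님': 1, '연구실': 1, '위치': 0.8}
tokenizer = MaxScoreTokenizer(scores=scores)

# words listed in `scores` are split out even when written without spaces
print(tokenizer.tokenize('교수님연구실위치알려줘'))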
Example #4
    def _extract_compound_nouns(self, eojeols, nouns, suffix):
        def parse_compound(tokens):
            for token in tokens[:-1]:
                if token[3] <= 0:
                    return None
            # Noun* + Josa
            if len(tokens) >= 3 and (tokens[-1][0] in suffix):
                return ''.join(t[0] for t in tokens[:-1])
            # all tokens are noun
            if tokens[-1][3] > 0:
                return ''.join(t[0] for t in tokens)
            # else, not compound
            return None

        tokenizer = MaxScoreTokenizer(
            scores={noun: 1
                    for noun in nouns if len(noun) > 1})

        compounds, removals = {}, set()
        for word, count in eojeols.items():
            # format: [(word, begin, end, score, length)]
            tokens = tokenizer.tokenize(word, flatten=False)[0]
            noun = parse_compound(tokens)
            if noun is not None:
                compounds[noun] = compounds.get(noun, 0) + count
                removals.add(word)

        return compounds, removals
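
parse_compound depends on the positional token format returned by tokenize(..., flatten=False). A short sketch of that format, with an illustrative noun set and eojeol:

from soynlp.tokenizer import MaxScoreTokenizer

nouns = {'데이터', '센터', '작업'}
tokenizer = MaxScoreTokenizer(scores={noun: 1 for noun in nouns if len(noun) > 1})

# each token is a (word, begin, end, score, length) tuple per eojeol
tokens = tokenizer.tokenize('데이터센터작업', flatten=False)[0]
for word, begin, end, score, length in tokens:
    print(word, begin, end, score, length)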
Example #5
    def __init__(self, config):
        self.basic_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        if not ( os.path.isfile(config.vocab_path) and  os.path.isfile(config.word2vec_path) ):
            assert config.data_path and os.path.isfile(config.data_path), '[{}] 위치에 학습할 파일이 없습니다.'.format(config.data_path)
            noun_dict, word2vec_model = build_vocab(config)
        else:
            noun_dict = open_pickle(config.vocab_path)
            word2vec_model = Word2Vec.load(config.word2vec_path)

        self.config = config
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        # np.float was removed in recent NumPy releases; use float64 explicitly
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))

        self.idx2tags = ['I', 'B']
        self.tags2idx = {name: idx for idx, name in enumerate(self.idx2tags)}
        self.tag_size = len(self.idx2tags)
        self.vocab_size = self.basic_tokenizer.vocab_size

        self.pad_token_id = self.basic_tokenizer.pad_token_id
        self.unk_token_id = self.basic_tokenizer.unk_token_id
Example #6
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') 
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError("regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
            regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests have succeeded\n\n')
Example #7
    def extract_compounds(self, candidates, prediction_scores, min_noun_score=0.3):

        noun_scores = {noun:len(noun) for noun, score in prediction_scores.items()
                       if score[1] > min_noun_score and len(noun) > 1}

        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {l:rdict.get('', 0) for l,rdict in self.lrgraph._lr_origin.items()
            if (len(l) >= 4) and not (l in noun_scores)}

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage), flush=True, end='')

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:

                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts

                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0,0))[1] for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score)
                compounds_counts[noun] = compounds_counts.get(noun,0) + count

                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]

                    if not subword in candidates:
                        continue

                    candidates[subword] = candidates.get(subword, 0) - count

                # eojeol coverage
                self.lrgraph.remove_eojeol(word)

        if self.verbose:
            print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format(
                len(compounds_scores)))

        compounds = {noun:(score, compounds_counts.get(noun,0))
             for noun, score in compounds_scores.items()}

        self._compounds_components = compounds_components

        return compounds
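
extract_compounds scores each known noun by its length, so the internal decomposer prefers the longest known pieces when splitting a candidate eojeol. A sketch of that trick with illustrative nouns:

from soynlp.tokenizer import MaxScoreTokenizer

noun_scores = {'소프트웨어': 5, '중심': 2, '사업단': 3}   # score = len(noun)
decomposer = MaxScoreTokenizer(scores=noun_scores)

tokens = decomposer.tokenize('소프트웨어중심사업단', flatten=False)[0]
print([t[0] for t in tokens])   # candidate compound parts
print([t[3] for t in tokens])   # part scores; 0 marks an unknown part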
Example #8
 def tokenize(self, sentence, score_dic):
     scores = score_dic
     tokenizer = MaxScoreTokenizer(scores=scores)
     token = tokenizer.tokenize(sentence)
     token_list = []
     for num, input in enumerate(token):
         if token[num] in scores:
             token_list.append(token[num])
         else:
             kkma_token = self.t.morphs(token[num])
             token_list = token_list + kkma_token
     return token_list
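
The method above keeps tokens found in the score dictionary and sends everything else to a morphological analyzer (`self.t`, presumably a KoNLPy tagger). A standalone sketch of the same hybrid scheme, assuming KoNLPy's Okt analyzer; the score dictionary and query are illustrative.

from konlpy.tag import Okt
from soynlp.tokenizer import MaxScoreTokenizer

analyzer = Okt()
scores = {'연구실': 1.0, '교수님': 1.0}
tokenizer = MaxScoreTokenizer(scores=scores)

def hybrid_tokenize(sentence):
    tokens = []
    for token in tokenizer.tokenize(sentence):
        if token in scores:
            tokens.append(token)                    # keep known keywords whole
        else:
            tokens.extend(analyzer.morphs(token))   # analyze the remainder
    return tokens

print(hybrid_tokenize('교수님연구실이 어디인가요'))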
Example #9
    def find_pro_name(self, inp):
        name_dic = {'Yenewondim Sinshaw', '강경란', '고영배', '고정길', '김도형',\
           '김동윤','김민구','김성수','김승운','노병희','류기열','변광준',\
           '손경아','안정섭','오상윤','위규범','윤대균','이석원','이정태',\
           '이택균','이환용','임재성','정크리스틴','정태선','조영종',\
           '최경희','최영준','최재영','한경식','황원준','Paul rajib','학과사무실','과사','과사무실',\
                '경란','경란이','영배','정길','정길이','도형','도형이','동윤',\
               '민구','성수','승운','승운이','병희','기열','기열이','광준','광준이',\
               '경아','정섭','정섭이','상윤','상윤이','규범','규범이','대균',\
              '대균이','석원','석원이','정태','택균','택균이','환용','환용이',\
              '정크','정크리','크리스틴','태선','태선이','영종','영종이',\
              '경희','영준','영준이','재영','재영이','경식','경식이','원준',\
              '원준이','예나원딤','에나원딤','yenewondim','Yenewondim',\
              'yenawondim','Yenawondim','라집','폴라집','Paul','폴',\
              'Paulrajib','paulrajib','제작자','소중사','소프트웨어중심사업단',\
               'Ibrahim Mohd Ali Alsofyani','Ran Rong','감동근','구형일','권익진',\
                '김도희','김상배','김상완','김상인','김영길','김영진',\
                '김재현','나상신','박성진','박용배','박익모','선우명훈','양상식',\
                '양회석','오성근','윤원식','이교범','이기근','이상용','이재진',\
                '이정원','이종욱','이채우','이해영','정기현','조성준','조위덕',\
                '조중열','좌동경','지동우','허용석','허준석','홍송남','홍영대',\
                '곽진','김강석','김기형','김상곤','김재훈','손태식','예홍진',\
                '유재희','홍만표','경민호','고욱','김지은','김현희','김효동',\
                '석혜정','신현준','오규환','이경원','이윤진','이주엽','임유상',\
                '장우진','정태영','최정주','구자열','박승규','백호기','이병묵',\
                '이태공','홍성표','란롱'}
        
        name_dic_list = list(name_dic)
        scores_name = {str(name_dic_list[step]) : 1.0 for step, inputs in enumerate(name_dic_list)}
        tokenizer_name = MaxScoreTokenizer(scores=scores_name)
        c = tokenizer_name.tokenize(inp)
        c = self.prep.replace(c)

        # Check Professor name
        professor_name = "0" # initial number
        check = 0
        for step, inputs in enumerate(name_dic):
            for i in range(len(c)):
                if c[i] == inputs:
                    professor_name = inputs
                    check = check + 1
        c = self.tokenizer.tokenize(inp)            
        if professor_name == "0" :
            for i in range(len(c)):
                if self.find_extra_name(c[i])==True:
                    professor_name = "1" # Wrong name
                    break;
        if check > 1 :            
            professor_name = "2"  # More than two Professor names

        return professor_name
Example #10
    def tokenize(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        token_list = []

        for num, input in enumerate(token):
            twit_token = self.t.pos(token[num], norm=True, stem=True)
            for i in range(0, len(twit_token), 1):
                if twit_token[i][1] != "Josa" and twit_token[i][1] != "Punctuation" and \
                twit_token[i][1] != "KoreanParticle" :
                    token_list.append(twit_token[i][0])

        return token_list
Example #11
def rankFunction(texts):
    scores = {
        '선박명': 0.5,
        '총톤수는': 0.7,
        '년': 0.5,
        '월': 0.5,
        '일': 0.5,
        '시': 0.5,
        '분': 0.5,
        '울산': 0.5,
        '예정': 0.5
    }
    tokenizer = MaxScoreTokenizer(scores=scores)
    keywords = tokenizer.tokenize(texts)
    return keywords
Example #12
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """
    def __init__(self):
        self.tokenizer = None
        self.scores = dict()
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def fit(self, sentences):
        self.word_extractor.train(sentences)
        scores = self.word_extractor.extract()
        # MaxScoreTokenizer expects a dict mapping each word to its score,
        # so build the combined score as a dict rather than a list of tuples.
        scores = {word: (score.cohesion_forward + score.cohesion_backward) *
                  (score.left_branching_entropy + score.right_branching_entropy)
                  for word, score in scores.items()}
        self.scores = scores
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def state_dict(self):
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
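
Since the fitted scores are a plain word-to-score dict, the state_dict / load_state_dict pair above round-trips through JSON. A usage sketch, assuming the SoyNLPTokenizer class above and a sentence corpus `sentences`; the file name is illustrative.

import json

tok = SoyNLPTokenizer()
tok.fit(sentences)

with open('soynlp_tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tok.state_dict(), f, ensure_ascii=False)

restored = SoyNLPTokenizer()
with open('soynlp_tokenizer.json', 'r', encoding='utf-8') as f:
    restored.load_state_dict(json.load(f))

print(restored.tokenize('데이터센터의 데이터'))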
Example #13
class MaxScoreTokenizerKorean(SpecialTokenizer):
    def __init__(self, scores=None):
        from soynlp.tokenizer import MaxScoreTokenizer
        self.inst = MaxScoreTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
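
A brief usage sketch for the wrapper above, assuming the SpecialTokenizer base class imposes no extra construction requirements; the scores and input string are illustrative.

tok = MaxScoreTokenizerKorean(scores={'데이터': 0.4, '데이터센터': 0.38})
print(tok('데이터센터의 데이터'))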
Example #14
 def noun_extract_dup(self, sentence, score_dic):
     scores = score_dic
     tokenizer = MaxScoreTokenizer(scores=scores)
     token = tokenizer.tokenize(sentence)
     noun_list = []
     compared_noun_list = self.t.nouns(sentence)
     
     for num, input in enumerate(token):
         if token[num] in scores:
             noun_list.append(token[num])
         else:
             twit_token = self.t.nouns(token[num])
             noun_list = noun_list + twit_token
     
     diff_noun_list = list(set(noun_list) - set(compared_noun_list))
     diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
     noun_list = list(set(noun_list) - set(diff_noun_list))
     return noun_list
Example #15
    def reset_tokenizer(self, data):
        noun_dict, word2vec_model = build_vocab(self.config, data)
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        # np.float was removed in recent NumPy releases; use float64 explicitly
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))
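
The two lines above prepend an all-zero row for the unknown token so that index 0 in word2index lines up with row 0 of the embedding matrix. A self-contained sketch of that layout; the dimensionality, vectors, and '<unk>' marker are illustrative stand-ins.

import numpy as np
import torch

vec_dim = 4                                # illustrative dimensionality
w2v_vectors = np.random.rand(3, vec_dim)   # stands in for word2vec_model.wv.vectors
index2word = ['<unk>', '데이터', '센터', '교수']

unknown_emb = np.zeros((1, vec_dim), dtype=float)
embedding = torch.from_numpy(
    np.concatenate([unknown_emb, w2v_vectors], axis=0).astype(np.float64))

word2index = {word: index for index, word in enumerate(index2word)}
assert bool(torch.all(embedding[word2index['<unk>']] == 0))   # id 0 -> zero vector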
Example #16
    def extract_compounds(self, candidates, prediction_scores, minimum_noun_score=0.3):

        noun_scores = {noun:len(noun) for noun, score in prediction_scores.items()
                       if score[0] > minimum_noun_score and len(noun) > 1}
        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items()
            if (len(l) >= 4) and not (l in noun_scores)}

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage), flush=True, end='')

            # skip if candidate is substring of longer compound
            if candidates.get(word, 0) <= 0:
                continue

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:
                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts
                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0,0))[0] for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score)
                compounds_counts[noun] = compounds_counts.get(noun,0) + count
                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]
                    if not subword in candidates:
                        continue
                    candidates[subword] = candidates.get(subword, 0) - count
                # eojeol coverage
                self.lrgraph.remove_eojeol(word)
                self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format(
                len(compounds_scores)))

        compounds = {noun:(score, compounds_counts.get(noun,0))
             for noun, score in compounds_scores.items()}

        self._compounds_components = compounds_components

        return compounds
Example #17
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """
    def __init__(self, config):
        with open(config.soynlp_scores, "r") as f:
            scores = [line.strip().split("\t") for line in f]
            scores = {word: float(score) for word, score in scores}
        self.tokenizer = MaxScoreTokenizer(scores=scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
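
This variant reads its scores from a tab-separated file with one word and score per line. A sketch of creating and loading such a file; the path and contents are illustrative.

from soynlp.tokenizer import MaxScoreTokenizer

with open('soynlp_scores.tsv', 'w', encoding='utf-8') as f:
    f.write('데이터\t0.4\n데이터센터\t0.38\n데이\t0.35\n')

with open('soynlp_scores.tsv', 'r', encoding='utf-8') as f:
    scores = [line.strip().split('\t') for line in f]
    scores = {word: float(score) for word, score in scores}

print(MaxScoreTokenizer(scores=scores).tokenize('데이터센터의 데이터'))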
Example #18
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError(
            "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
                regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".
            format(ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests have succeeded\n')
Example #19
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(args.corpus_fname)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)
    })

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
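
main() pickles the fitted tokenizer, so reloading it later only requires pickle. A short sketch, with an illustrative path standing in for args.tokenizer_path.

import pickle

with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

print(tokenizer.tokenize('데이터센터의 데이터'))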
Example #20
    def soynlp_tokenizer(self):
        def word_score(score): return (score.cohesion_forward * math.exp(score.right_branching_entropy))

        if self.mode == 'serve':
            with open(self.data_path, 'r') as file:
                word_score_dict = json.load(file)
        elif self.mode == 'train':
            word_extractor = WordExtractor()
            word_extractor.train(self.train_corpus)
            words = word_extractor.extract()
            word_score_dict = { word:word_score(score) for word, score, in words.items()}

            with open('./models/word_dict.json', 'w') as file:
                json.dump(word_score_dict, file)
        else:
            pass
        
        tokenizer = MaxScoreTokenizer(scores=word_score_dict)
        return tokenizer
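
In serve mode the method expects the word-score dictionary saved during training. A sketch of that path using the same './models/word_dict.json' location written above.

import json
from soynlp.tokenizer import MaxScoreTokenizer

with open('./models/word_dict.json', 'r') as file:
    word_score_dict = json.load(file)

tokenizer = MaxScoreTokenizer(scores=word_score_dict)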
Example #21
def build_vocab(config, data=None):
    if data is not None:
        sents = MyIterator(data)
    else:
        sents = MyIterator(config.data_path)

    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(sents)

    noun_dict = {}
    for noun, score in nouns.items():
        if score.frequency >= config.min_frequency and score.score >= config.min_score and len(noun) > config.min_length:
            noun_dict[noun] = score.score

    vocab_path = os.path.join(config.save_path,'vocab.pkl')
    config.vocab_path = vocab_path
    #save_pickle(vocab_path, noun_dict)

    tokenizer = MaxScoreTokenizer(noun_dict)

    if data is not None:
        word2vec_corpus = Word2VecCorpus(data, tokenizer)
    else:
        word2vec_corpus = Word2VecCorpus(config.data_path, tokenizer)

    word2vec_model = Word2Vec(
        word2vec_corpus,
        size=config.word_hidden_size,
        alpha=0.025,
        window=5,
        min_count=config.min_frequency,
        sg=0,
        negative=5)

    word2vec_path = os.path.join(config.save_path, 'word2vec{}.model'.format(config.word_hidden_size))
    config.word2vec_path = word2vec_path
    #word2vec_model.save(word2vec_path)

    return noun_dict, word2vec_model
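
build_vocab filters the nouns returned by LRNounExtractor_v2 before handing them to MaxScoreTokenizer. A reduced sketch of that step, assuming soynlp is installed; the toy corpus and thresholds are illustrative.

from soynlp.noun import LRNounExtractor_v2
from soynlp.tokenizer import MaxScoreTokenizer

sents = ['소프트웨어중심사업단 사무실은 어디인가요'] * 100   # toy corpus

noun_extractor = LRNounExtractor_v2(verbose=False)
nouns = noun_extractor.train_extract(sents)   # {noun: NounScore(frequency, score)}

noun_dict = {noun: s.score for noun, s in nouns.items()
             if s.frequency >= 5 and s.score >= 0.3 and len(noun) > 1}

word_tokenizer = MaxScoreTokenizer(noun_dict)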
Example #22
def train_extractor(begin_d=None,
                    end_d=None,
                    sections: list = None,
                    base_dir='./out',
                    tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])
    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class
Example #23
 def __init__(self, scores=None):
     from soynlp.tokenizer import MaxScoreTokenizer
     self.inst = MaxScoreTokenizer(scores=scores)
     self.OUT_TYPE = [list, str]
Example #24
""" 파일 순서 - 3 - 
주요 활동 시간을 정했다면 다음에 필요한 것은 해당 유저가 관심있어하는 
관심사를 가지고 공격할 주제를 만드는 것입니다. 
관심사는 글의 특정 단어 빈도수로 측정합니다."""


from personal_data import *
import numpy as np
import re
import sys  # required by isHangul() below
from soynlp.tokenizer import MaxScoreTokenizer

boundmorpheme = ["은", "는", "이", "가", "을", "를", "로써", "에서", "에게서", "부터", "까지", "에게", "한테", "께", "와", "과", "의", "로서", "으로서", "로", "으로"]  # Josa (postpositional particles)
exceptions = boundmorpheme

scores = {'티켓이': 0.3, '티켓': 0.7, '좋아요': 0.2, '좋아':0.5}
tokenizer = MaxScoreTokenizer(scores=scores)

def isHangul(text):
    #Check the Python Version
    pyVer3 =  sys.version_info >= (3, 0)

    if pyVer3 : # for Ver 3 or later
        encText = text
    else: # for Ver 2.x
        if type(text) is not unicode:
            encText = text.decode('utf-8')
        else:
            encText = text

    hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
    return hanCount > 0
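
A short usage sketch for the helpers above: filter out non-Korean input with isHangul before tokenizing; the sentence is illustrative.

text = '티켓이 좋아요!'
if isHangul(text):
    print(tokenizer.tokenize(text))   # uses the module-level tokenizer defined above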
Example #25
class LRNounExtractor_v2:
    def __init__(self, l_max_length=10, r_max_length=9, predictor_headers=None,
        verbose=True, min_num_of_features=1, max_count_when_noun_is_eojeol=30,
        eojeol_counter_filtering_checkpoint=0, extract_compound=True,
        extract_determiner=False, extract_josa=False):

        self.l_max_length = l_max_length
        self.r_max_length = r_max_length
        self.lrgraph = None
        self.verbose = verbose
        self.min_num_of_features = min_num_of_features
        self.max_count_when_noun_is_eojeol = max_count_when_noun_is_eojeol
        self.eojeol_counter_filtering_checkpoint = eojeol_counter_filtering_checkpoint
        self.extract_compound = extract_compound

        if not predictor_headers:
            predictor_headers = self._set_default_predictor_header()
        self._load_predictor(predictor_headers)

    @property
    def is_trained(self):
        return self.lrgraph

    def _set_default_predictor_header(self):

        if self.verbose:
            print('[Noun Extractor] use default predictors')

        dirname = '/'.join(os.path.abspath(__file__).replace('\\', '/').split('/')[:-2])
        predictor_header = ['{}/trained_models/noun_predictor_ver2'.format(dirname)]

        return predictor_header

    def _load_predictor(self, headers):

        if type(headers) == str:
            headers = [headers]
        
        pos, neg = set(), set()
        for header in headers:

            # load positive features such as Josa
            pos_path = '{}_pos'.format(header)
            with open(pos_path, encoding='utf-8') as f:
                pos.update({feature.strip() for feature in f})

            # load negative features such as ending (Eomi)
            neg_path = '{}_neg'.format(header)
            with open(neg_path, encoding='utf-8') as f:
                neg.update({feature.strip() for feature in f})

        # common features such as -은 (Josa/Eomi), -라고 (Josa/Eomi)
        common = pos.intersection(neg)

        # remove common features from pos and neg
        pos = {feature for feature in pos if not (feature in common)}
        neg = {feature for feature in neg if not (feature in common)}

        if self.verbose:
            print('[Noun Extractor] num features: pos={}, neg={}, common={}'.format(
                len(pos), len(neg), len(common)))

        self._pos_features = pos
        self._neg_features = neg
        self._common_features = common

    def train_extract(self, sentences, minimum_noun_score=0.3,
        min_count=1, min_eojeol_count=1, reset_lrgraph=True):

        self.train(sentences, min_eojeol_count)

        return self.extract(minimum_noun_score, min_count, reset_lrgraph)

    def train(self, sentences, min_eojeol_count=1):

        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        eojeol_counter = EojeolCounter(sentences, min_eojeol_count,
            max_length=self.l_max_length + self.r_max_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose)
        self._num_of_eojeols = eojeol_counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')
        self.lrgraph = eojeol_counter.to_lrgraph(
            self.l_max_length, self.r_max_length)

        if self.verbose:
            print('[Noun Extractor] has been trained. mem={} Gb'.format(
                '%.3f'%get_process_memory()))

    def _extract_determiner(self):
        raise NotImplementedError

    def _extract_josa(self):
        raise NotImplementedError

    def extract(self, minimum_noun_score=0.3, min_count=1, reset_lrgraph=True):

        # reset covered eojeol count
        self._num_of_covered_eojeols = 0

        # base prediction
        noun_candidates = self._noun_candidates_from_positive_features()
        prediction_scores = self._batch_prediction_order_by_word_length(
            noun_candidates, minimum_noun_score)

        # E = N*J+ or N*Posi+
        if self.extract_compound:
            candidates = {l:sum(rdict.values()) for l,rdict in
                self.lrgraph._lr.items() if len(l) >= 4}
            compounds = self.extract_compounds(
                candidates, prediction_scores, minimum_noun_score)
        else:
            compounds = {}

        # combine single nouns and compounds
        nouns = {noun:score for noun, score in prediction_scores.items()
            if score[0] >= minimum_noun_score}
        nouns.update(compounds)

        # frequency filtering
        nouns = {noun:score for noun, score in nouns.items()
            if score[1] >= min_count}

        nouns = self._post_processing(nouns, prediction_scores, compounds)

        if self.verbose:
            print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format(
                len(nouns), len(compounds), min_count), flush=True)

            coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                / self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True)

        if self.verbose:
            print('[Noun Extractor] flushing ... ', flush=True, end='')

        self._nouns = nouns
        if reset_lrgraph:
            # when extracting predicates, do not reset lrgraph.
            # the remained lrgraph is predicate (root - ending) graph
            self.lrgraph.reset_lrgraph()
        if self.verbose:
            print('done. mem={} Gb'.format('%.3f'%get_process_memory()))

        nouns_ = {noun:NounScore(score[1], score[0]) for noun, score in nouns.items()}
        return nouns_

    def _get_nonempty_features(self, word, features):
        return [r for r, _ in features if (
            ( (r in self._pos_features) and (not self._exist_longer_pos(word, r)) ) or
            ( (r in self._neg_features) and (not self._exist_longer_neg(word, r)) ) )]

    def _exist_longer_pos(self, word, r):
        for e in range(len(word)-1, -1, -1):
            if (word[e:]+r) in self._pos_features:
                return True
        return False

    def _exist_longer_neg(self, word, r):
        for e in range(len(word)-1, -1, -1):
            if (word[e:]+r) in self._neg_features:
                return True
        return False

    def predict(self, word, minimum_noun_score=0.3, debug=False):

        # scoring
        features = self.lrgraph.get_r(word, -1)
        pos, common, neg, unk, end = self._predict(word, features)

        base = pos + neg
        score = 0 if base == 0 else (pos - neg) / base
        support = pos + end + common if score >= minimum_noun_score else neg + end + common

        # debug code
        if debug:
            print(pos, common, neg, unk, end)

        features_ = self._get_nonempty_features(word, features)
        if len(features_) > self.min_num_of_features:        
            return score, support
        else:
            # exception case
            sum_ = pos + common + neg + unk + end
            if sum_ == 0:
                return 0, support

            # exception. frequent nouns may have various positive R such as Josa
            if ((end > self.max_count_when_noun_is_eojeol) and (neg >= pos) ):
                return score, support

            if (common > 0 or pos > 0) and (end / sum_ >= 0.3) and (common >= neg):
                # e.g. 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)]: '은' is a common feature, most occurrences are a bare eojeol, '측은' is unknown
                # 아이엠텍 + [('은', 2), ('', 2)]
                support = pos + common + end
                return (support / sum_, support)

            # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에}
            first_chars = set()
            for r, _ in features:
                if not r:
                    continue
                if r in self._pos_features or r in self._common_features:
                    if not self._exist_longer_pos(word, r):
                        first_chars.add(r[0])
                if not (r in self._pos_features or r in self._common_features):
                    first_chars.add(r[0])

            if len(first_chars) >= 2:
                support = pos + common + end
                return (support / sum_, support)

            # Handling for post-processing in NounExtractor
            # Case 1.
            # 아이러브영주사과 -> 아이러브영주사 + [(과,1)] (a case where the minimum r-feature rule must apply): compound noun
            # since it appears as 아이러브영주사과 + [('', 1)], '아이러브영주사' is removed from the candidates in post-processing
            # Case 2.
            # 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r-feature rule applied)
            # post-processed as "Noun + unknown R"
            return (0, support)

    def _predict(self, word, features):

        pos, common, neg, unk, end = 0, 0, 0, 0, 0

        for r, freq in features:
            if r == '':
                end += freq
                continue
            if self._exist_longer_pos(word, r): # ignore
                continue
            if self._exist_longer_neg(word, r): # negative -다고
                neg += freq
                continue
            if r in self._common_features:
                common += freq
            elif r in self._pos_features:            
                pos += freq
            elif r in self._neg_features:
                neg += freq
            else:
                unk += freq

        return pos, common, neg, unk, end

    def _noun_candidates_from_positive_features(self, condition=None):

        def satisfy(word, e):
            return word[:e] == condition

        # noun candidates from positive featuers such as Josa
        N_from_J = {}
        for r in self._pos_features:
            for l, c in self.lrgraph.get_l(r, -1):
                # candidates filtering for debugging
                # condition is first chars in L
                if not condition:
                    N_from_J[l] = N_from_J.get(l,0) + c
                    continue
                # for debugging
                if not satisfy(l, len(condition)):
                    continue
                N_from_J[l] = N_from_J.get(l,0) + c

        # sort by length of word
        N_from_J = sorted(N_from_J.items(), key=lambda x:-len(x[0]))

        return N_from_J

    def _batch_prediction_order_by_word_length(self,
        noun_candidates, minimum_noun_score=0.3):

        prediction_scores = {}

        n = len(noun_candidates)
        for i, (word, _) in enumerate(noun_candidates):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i+1) / n)
                print('\r  -- batch prediction {} % of {} words'.format(
                    percentage, n), flush=True, end='')

            # base prediction
            score, support = self.predict(word, minimum_noun_score)
            prediction_scores[word] = (score, support)

            # if their score is higher than minimum_noun_score,
            # remove eojeol pattern from lrgraph
            if score >= minimum_noun_score:
                for r, count in self.lrgraph.get_r(word, -1):
                    if r == '' or (r in self._pos_features) or (r in self._common_features):
                        self.lrgraph.remove_eojeol(word+r, count)
                        self._num_of_covered_eojeols += count
        if self.verbose:
            print('\r[Noun Extractor] batch prediction was completed for {} words'.format(
                n), flush=True)

        return prediction_scores

    def extract_compounds(self, candidates, prediction_scores, minimum_noun_score=0.3):

        noun_scores = {noun:len(noun) for noun, score in prediction_scores.items()
                       if score[0] > minimum_noun_score and len(noun) > 1}
        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items()
            if (len(l) >= 4) and not (l in noun_scores)}

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage), flush=True, end='')

            # skip if candidate is substring of longer compound
            if candidates.get(word, 0) <= 0:
                continue

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:
                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts
                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0,0))[0] for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score)
                compounds_counts[noun] = compounds_counts.get(noun,0) + count
                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]
                    if not subword in candidates:
                        continue
                    candidates[subword] = candidates.get(subword, 0) - count
                # eojeol coverage
                self.lrgraph.remove_eojeol(word)
                self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format(
                len(compounds_scores)))

        compounds = {noun:(score, compounds_counts.get(noun,0))
             for noun, score in compounds_scores.items()}

        self._compounds_components = compounds_components

        return compounds

    def decompose_compound(self, word):

        tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
        compound_parts = self._parse_compound(tokens)

        return (word, ) if not compound_parts else compound_parts

    def _parse_compound(self, tokens):
        """Check Noun* or Noun*Josa"""

        # format: (word, begin, end, score, length)
        for token in tokens[:-1]:
            if token[3] <= 0:
                return None
        # Noun* + Josa
        if len(tokens) >= 3 and tokens[-1][0] in self._pos_features:
            return tuple(t[0] for t in tokens[:-1])
        # all tokens are noun
        if tokens[-1][3] > 0:
            return tuple(t[0] for t in tokens)
        # else, not compound
        return None

    def _post_processing(self, nouns, prediction_scores, compounds):
        # TODO
        # Not Implemented
        return nouns
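
A usage sketch for the extractor class above, assuming it is the version shipped with soynlp (importable from soynlp.noun); the corpus is illustrative and the results depend on the data.

from soynlp.noun import LRNounExtractor_v2

sentences = ['소프트웨어중심사업단 사무실은 어디인가요'] * 50   # toy corpus

noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(sentences)   # {noun: NounScore}

for noun, score in sorted(nouns.items(), key=lambda x: -x[1].frequency)[:10]:
    print(noun, score.frequency, score.score)

# decompose a compound eojeol with the internal MaxScoreTokenizer-based decomposer
print(noun_extractor.decompose_compound('소프트웨어중심사업단'))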
Example #26
 def load_state_dict(self, state_dict):
     self.scores = state_dict['scores']
     self.tokenizer = MaxScoreTokenizer(scores=self.scores)
Example #27
class LRNounExtractor_v2:
    def __init__(self,
                 max_left_length=10,
                 max_right_length=9,
                 predictor_headers=None,
                 verbose=True,
                 min_num_of_features=1,
                 max_frequency_when_noun_is_eojeol=30,
                 eojeol_counter_filtering_checkpoint=200000,
                 min_eojeol_frequency=1,
                 extract_compound=True,
                 extract_pos_feature=False,
                 extract_determiner=False,
                 postprocessing=None,
                 logpath=None):

        self.max_left_length = max_left_length
        self.max_right_length = max_right_length
        self.lrgraph = None
        self.verbose = verbose
        self.min_num_of_features = min_num_of_features
        self.max_frequency_when_noun_is_eojeol = max_frequency_when_noun_is_eojeol
        self.eojeol_counter_filtering_checkpoint = eojeol_counter_filtering_checkpoint
        self.min_eojeol_frequency = min_eojeol_frequency
        self.extract_compound = extract_compound
        self.extract_pos_feature = extract_pos_feature
        self.extract_determiner = extract_determiner
        self.logpath = logpath

        if logpath:
            check_dirs(logpath)

        if not postprocessing:
            postprocessing = [
                'detaching_features', 'ignore_features', 'ignore_NJ'
            ]
        elif isinstance(postprocessing, str):
            postprocessing = [postprocessing]

        self.postprocessing = postprocessing

        if not predictor_headers:
            predictor_headers = self._set_default_predictor_header()

        self._load_predictor(predictor_headers)

    @property
    def is_trained(self):
        return self.lrgraph

    def _set_default_predictor_header(self):

        if self.verbose:
            print('[Noun Extractor] use default predictors')

        dirname = '/'.join(
            os.path.abspath(__file__).replace('\\', '/').split('/')[:-2])
        predictor_header = [
            '{}/trained_models/noun_predictor_ver2'.format(dirname)
        ]

        return predictor_header

    def _load_predictor(self, headers):

        if type(headers) == str:
            headers = [headers]

        pos, neg = set(), set()
        for header in headers:

            # load positive features such as Josa
            pos_path = '{}_pos'.format(header)
            with open(pos_path, encoding='utf-8') as f:
                pos.update({feature.strip() for feature in f})

            # load negative features such as ending (Eomi)
            neg_path = '{}_neg'.format(header)
            with open(neg_path, encoding='utf-8') as f:
                neg.update({feature.strip() for feature in f})

        # common features such as -은 (Josa/Eomi), -라고 (Josa/Eomi)
        common = pos.intersection(neg)

        # remove common features from pos and neg
        pos = {feature for feature in pos if not (feature in common)}
        neg = {feature for feature in neg if not (feature in common)}

        if self.verbose:
            print('[Noun Extractor] num features: pos={}, neg={}, common={}'.
                  format(len(pos), len(neg), len(common)))

        self._pos_features = pos
        self._neg_features = neg
        self._common_features = common

    def _append_features(self, feature_type, features):
        def check_feature_size():
            return (len(self._pos_features), len(self._neg_features),
                    len(self._common_features))

        # size before
        n_pos, n_neg, n_common = check_feature_size()

        if feature_type == 'pos':
            commons = {f for f in features if (f in self._neg_features)}
            self._pos_features.update(
                {f
                 for f in features if not (f in commons)})

        elif feature_type == 'neg':
            commons = {f for f in features if (f in self._pos_features)}
            self._neg_features.update(
                {f
                 for f in features if not (f in commons)})

        elif feature_type == 'common':
            commons = features

        else:
            raise ValueError(
                'Feature type was wrong. Choice = [pos, neg, common]')

        self._common_features.update(commons)

        # size after
        n_pos_, n_neg_, n_common_ = check_feature_size()

        if self.verbose:
            message = 'pos={} -> {}, neg={} -> {}, common={} -> {}'.format(
                n_pos, n_pos_, n_neg, n_neg_, n_common, n_common_)
            print('[Noun Extractor] features appended. {}'.format(message))

    def train_extract(self,
                      sentences,
                      min_noun_score=0.3,
                      min_noun_frequency=1,
                      min_eojeol_frequency=1,
                      reset_lrgraph=True):

        self.train(sentences)

        return self.extract(min_noun_score, min_noun_frequency, reset_lrgraph)

    def train(self, sentences):

        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        eojeol_counter = EojeolCounter(
            sentences,
            self.min_eojeol_frequency,
            max_length=self.max_left_length + self.max_right_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose)

        self._num_of_eojeols = eojeol_counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')

        self.lrgraph = eojeol_counter.to_lrgraph(self.max_left_length,
                                                 self.max_right_length)

        if self.verbose:
            print('[Noun Extractor] has been trained. mem={} Gb'.format(
                '%.3f' % get_process_memory()))

    def _extract_determiner(self):
        raise NotImplementedError

    def extract_domain_pos_features(self,
                                    append_extracted_features=True,
                                    noun_candidates=None,
                                    ignore_features=None,
                                    min_noun_score=0.3,
                                    min_noun_frequency=100,
                                    min_pos_score=0.3,
                                    min_pos_feature_frequency=1000,
                                    min_num_of_unique_lastchar=4,
                                    min_entropy_of_lastchar=0.5,
                                    min_noun_entropy=1.5):

        if self.verbose:
            print(
                '[Noun Extractor] batch prediction for extracting pos feature')

        if not noun_candidates:
            noun_candidates = self._noun_candidates_from_positive_features()

        prediction_scores = self._batch_predicting_nouns(
            noun_candidates, min_noun_score)

        self.lrgraph.reset_lrgraph()

        self._pos_features_extracted = extract_domain_pos_features(
            prediction_scores, self.lrgraph, self._pos_features,
            ignore_features, min_noun_score, min_noun_frequency, min_pos_score,
            min_pos_feature_frequency, min_num_of_unique_lastchar,
            min_entropy_of_lastchar, min_noun_entropy)

        if append_extracted_features:
            self._append_features('pos', self._pos_features_extracted)

        if self.verbose:
            print('[Noun Extractor] {} pos features were extracted'.format(
                len(self._pos_features_extracted)))

    def extract(self,
                min_noun_score=0.3,
                min_noun_frequency=1,
                reset_lrgraph=True):

        # reset covered eojeol count
        self._num_of_covered_eojeols = 0

        # base prediction
        noun_candidates = self._noun_candidates_from_positive_features()

        if self.extract_pos_feature:
            if self.verbose:
                print('[Noun Extractor] extract and append pos features')

            self.extract_domain_pos_features(noun_candidates)

        prediction_scores = self._batch_predicting_nouns(
            noun_candidates, min_noun_score)

        if self.logpath:
            with open(self.logpath + '_prediction_score.log',
                      'w',
                      encoding='utf-8') as f:
                f.write('noun score frequency\n')

                for word, score in sorted(prediction_scores.items(),
                                          key=lambda x: -x[1][1]):
                    f.write('{} {} {}\n'.format(word, score[0], score[1]))

        # E = N*J+ or N*Posi+
        if self.extract_compound:
            candidates = {
                l: sum(rdict.values())
                for l, rdict in self.lrgraph._lr.items() if len(l) >= 4
            }
            compounds = self.extract_compounds(candidates, prediction_scores,
                                               min_noun_score)

        else:
            compounds = {}

        # combine single nouns and compounds
        nouns = {
            noun: score
            for noun, score in prediction_scores.items()
            if score[1] >= min_noun_score
        }

        nouns.update(compounds)

        # frequency filtering
        nouns = {
            noun: score
            for noun, score in nouns.items() if score[0] >= min_noun_frequency
        }

        nouns = self._post_processing(nouns, prediction_scores, compounds)

        if self.verbose:
            print(
                '[Noun Extractor] {} nouns ({} compounds) with min frequency={}'
                .format(len(nouns), len(compounds), min_noun_frequency),
                flush=True)
            print('[Noun Extractor] flushing ... ', flush=True, end='')

        self._check_covered_eojeols(nouns)

        self._nouns = nouns

        if reset_lrgraph:
            # when extracting predicates, do not reset lrgraph.
            # the remained lrgraph is predicate (stem - ending) graph
            self.lrgraph.reset_lrgraph()

        nouns_ = {
            noun: NounScore(score[0], score[1])
            for noun, score in nouns.items()
        }
        return nouns_

    def _get_nonempty_features(self, word, features):
        return [
            r for r, _ in features
            if (((r in self._pos_features) and
                 (not self._exist_longer_pos(word, r))) or (
                     (r in self._neg_features) and
                     (not self._exist_longer_neg(word, r))))
        ]

    def _exist_longer_pos(self, word, r):
        for e in range(len(word) - 1, -1, -1):
            if (word[e:] + r) in self._pos_features:
                return True
        return False

    def _exist_longer_neg(self, word, r):
        for e in range(len(word) - 1, -1, -1):
            if (word[e:] + r) in self._neg_features:
                return True
        return False

    def predict(self, word, min_noun_score=0.3, debug=False):

        # scoring
        features = self.lrgraph.get_r(word, -1)
        pos, common, neg, unk, end = self._predict(word, features)

        base = pos + neg
        score = 0 if base == 0 else (pos - neg) / base
        support = pos + end + common if score >= min_noun_score else neg + end + common

        features_ = self._get_nonempty_features(word, features)
        n_features_ = len(features_)

        # debug code
        if debug:
            print('pos={}, common={}, neg={}, unk={}, end={}, n_features_={}'.
                  format(pos, common, neg, unk, end, n_features_))

        if n_features_ > self.min_num_of_features:
            return support, score

        else:
            # exception case
            sum_ = pos + common + neg + unk + end
            if sum_ == 0:
                return support, 0

            # exception. frequent nouns may have various positive R such as Josa
            if ((end > self.max_frequency_when_noun_is_eojeol)
                    and (pos >= neg)):
                return support, score

            if (common > 0
                    or pos > 0) and (end / sum_ >= 0.3) and (common >= neg):
                # e.g. 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)]: '은' is a common feature, most occurrences are a bare eojeol, '측은' is unknown
                # 아이엠텍 + [('은', 2), ('', 2)]
                support = pos + common + end
                return (support, support / sum_)

            # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에}
            first_chars = set()
            for r, _ in features:
                if not r:
                    continue
                if r in self._pos_features or r in self._common_features:
                    if not self._exist_longer_pos(word, r):
                        first_chars.add(r[0])
                if not (r in self._pos_features or r in self._common_features):
                    first_chars.add(r[0])

            if len(first_chars) >= 2:
                support = pos + common + end
                return (support, support / sum_)

            # Handling for post-processing in NounExtractor
            # Case 1.
            # 아이러브영주사과 -> 아이러브영주사 + [(과,1)] (a case where the minimum r-feature rule must apply): compound noun
            # since it appears as 아이러브영주사과 + [('', 1)], '아이러브영주사' is removed from the candidates in post-processing
            # Case 2.
            # 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r-feature rule applied)
            # post-processed as "Noun + unknown R"
            return (support, 0)

    def _predict(self, word, features):

        pos, common, neg, unk, end = 0, 0, 0, 0, 0

        for r, freq in features:
            if r == '':
                end += freq
                continue
            if self._exist_longer_pos(word, r):  # ignore
                continue
            if self._exist_longer_neg(word, r):  # negative -다고
                neg += freq
                continue
            if r in self._common_features:
                common += freq
            elif r in self._pos_features:
                pos += freq
            elif r in self._neg_features:
                neg += freq
            else:
                unk += freq

        return pos, common, neg, unk, end

    def _noun_candidates_from_positive_features(self, condition=None):
        def satisfy(word, e):
            return word[:e] == condition

        # noun candidates from positive featuers such as Josa
        N_from_J = {}

        for r in self._pos_features:
            for l, c in self.lrgraph.get_l(r, -1):

                # candidates filtering for debugging
                # condition is first chars in L
                if not condition:
                    N_from_J[l] = N_from_J.get(l, 0) + c
                    continue

                # for debugging
                if not satisfy(l, len(condition)):
                    continue

                N_from_J[l] = N_from_J.get(l, 0) + c

        return N_from_J

    def _batch_predicting_nouns(self, noun_candidates, min_noun_score=0.3):

        prediction_scores = {}

        n = len(noun_candidates)
        for i, word in enumerate(sorted(noun_candidates,
                                        key=lambda x: -len(x))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i + 1) / n)
                print('\r  -- batch prediction {} % of {} words'.format(
                    percentage, n),
                      flush=True,
                      end='')

            # base prediction
            support, score = self.predict(word, min_noun_score)
            prediction_scores[word] = (support, score)

            # if their score is higher than min_noun_score,
            # remove eojeol pattern from lrgraph
            if score >= min_noun_score:
                for r, count in self.lrgraph.get_r(word, -1):
                    # remove all eojeols that include the word on their left side.
                    # we have to assume that pos, neg features are incomplete
                    self.lrgraph.remove_eojeol(word + r, count)
                    # if (r == '' or
                    #    (r in self._pos_features) or
                    #    (r in self._common_features)):
                    #    self.lrgraph.remove_eojeol(word+r, count)

        if self.verbose:
            print(
                '\r[Noun Extractor] batch prediction was completed for {} words'
                .format(n),
                flush=True)

        return prediction_scores

    def extract_compounds(self,
                          candidates,
                          prediction_scores,
                          min_noun_score=0.3):

        noun_scores = {
            noun: len(noun)
            for noun, score in prediction_scores.items()
            if score[1] > min_noun_score and len(noun) > 1
        }

        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {
            l: sum(rdict.values())
            for l, rdict in self.lrgraph._lr.items()
            if (len(l) >= 4) and not (l in noun_scores)
        }

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(
                sorted(candidates.items(), key=lambda x: -len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage),
                      flush=True,
                      end='')

            # skip if candidate is substring of longer compound
            if candidates.get(word, 0) <= 0:
                continue

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:

                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts

                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0, 0))[1]
                                      for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun, 0),
                                             compound_score)
                compounds_counts[noun] = compounds_counts.get(noun, 0) + count

                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]

                    if not subword in candidates:
                        continue

                    candidates[subword] = candidates.get(subword, 0) - count

                # eojeol coverage
                self.lrgraph.remove_eojeol(word)

        if self.verbose:
            print(
                '\r[Noun Extractor] checked compounds. discovered {} compounds'
                .format(len(compounds_scores)))

        compounds = {
            noun: (score, compounds_counts.get(noun, 0))
            for noun, score in compounds_scores.items()
        }

        self._compounds_components = compounds_components

        return compounds

    def decompose_compound(self, word):

        tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
        compound_parts = self._parse_compound(tokens)

        return (word, ) if not compound_parts else compound_parts

    def _parse_compound(self, tokens):
        """Check Noun* or Noun*Josa"""

        # format: (word, begin, end, score, length)
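        # e.g. with noun scores proportional to length, a candidate such as
        # [('소프트웨어', 0, 5, 5.0, 5), ('중심', 5, 7, 2.0, 2)] is returned as
        # the compound parts ('소프트웨어', '중심')  (illustrative values)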
        for token in tokens[:-1]:
            if token[3] <= 0:
                return None

        # Noun* + Josa
        if len(tokens) >= 3 and tokens[-1][0] in self._pos_features:
            return tuple(t[0] for t in tokens[:-1])

        # all tokens are noun
        if tokens[-1][3] > 0:
            return tuple(t[0] for t in tokens)

        # else, not compound
        return None

    def _post_processing(self, nouns, prediction_scores, compounds):
        def print_status(method, nouns, removals):
            n_after = len(nouns)
            n_before = n_after + len(removals)
            print('[Noun Extractor] postprocessing {} : {} -> {}'.format(
                method, n_before, n_after))

        logpath = self.logpath + '_postprocessing.log' if self.logpath else None

        # initialize
        if logpath:
            with open(logpath, 'w', encoding='utf-8') as f:
                f.write('')

        for method in self.postprocessing:

            if method == 'detaching_features':

                logheader = '## Ignore noun candidates from detaching pos features\n'
                nouns, removals = detaching_features(nouns, self._pos_features,
                                                     logpath, logheader)

                if self.verbose:
                    print_status(method, nouns, removals)

            elif method == 'ignore_features':

                features = {f for f in self._pos_features}
                # features.update(self._neg_features)
                features.update(self._common_features)
                nouns, removals = ignore_features(nouns, features, logpath)

                if self.verbose:
                    print_status(method, nouns, removals)

            elif method == 'ignore_NJ':

                nouns, removals = check_N_is_NJ(nouns,
                                                self.lrgraph,
                                                logpath=logpath)

                if self.verbose:
                    print_status(method, nouns, removals)

        return nouns

    def _check_covered_eojeols(self, nouns):

        self.lrgraph.reset_lrgraph()

        noun_candidates = self._noun_candidates_from_positive_features()

        n = len(noun_candidates)
        for i, word in enumerate(sorted(noun_candidates,
                                        key=lambda x: -len(x))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i + 1) / n)
                print(
                    '\r[Noun Extractor] flushing ...  {} %'.format(percentage),
                    flush=True,
                    end='')

            if not (word in nouns):
                continue

            if len(word) > 1:
                for r, count in self.lrgraph.get_r(word, -1):
                    # remove all eojeols that include the word on their left side.
                    # we have to assume that pos, neg features are incomplete
                    self.lrgraph.remove_eojeol(word + r, count)
                    self._num_of_covered_eojeols += count
            else:
                # a single-syllable noun is an exception;
                # remove only "N + positive feature" eojeols
                for r, count in self.lrgraph.get_r(word, -1):
                    if (r == '' or (r in self._pos_features)
                            or (r in self._common_features)):
                        self.lrgraph.remove_eojeol(word + r, count)
                        self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format(
                '%.3f' % get_process_memory(), ' ' * 20),
                  flush=True)
            coverage = '%.2f' % (100 * self._num_of_covered_eojeols /
                                 self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
                  flush=True)
    '''
    print('%s     (%d, %.3f, %.3f)' % (
            word, 
            score.leftside_frequency, 
            score.cohesion_forward,
            score.right_branching_entropy
            )
         )
    '''
    extracted_words.append(word)

cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = MaxScoreTokenizer(scores=cohesion_score)

#================= LDA train start ========================
# Generate LDAModel
# k = the number of topics
# alpha = document-topic Dirichlet prior
# eta = topic-word Dirichlet prior
# min_cf = minimum collection frequency of a word to be kept in the vocabulary
model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5)

for i in raw_chat:
    model.add_doc(tokenizer.tokenize(i))

# train(0) only initializes the model, so the number of words and the
# vocabulary can be inspected before the actual training
model.train(0)
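
# A minimal sketch of how training could continue from here (standard tomotopy
# API calls; not part of the original snippet): run more sampling iterations
# and inspect the resulting topics.
for _ in range(10):
    model.train(100)
    print('log-likelihood per word: {:.4f}'.format(model.ll_per_word))

for topic_id in range(model.k):
    print(topic_id, model.get_topic_words(topic_id, top_n=10))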
Example #29
0
 def __init__(self, config):
     with open(config.soynlp_scores, "r") as f:
         scores = [line.strip().split("\t") for line in f]
         scores = {word: float(score) for word, score in scores}
     self.tokenizer = MaxScoreTokenizer(scores=scores)
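
# The scores file is assumed (from the parsing above) to hold tab-separated
# "word<TAB>score" pairs, e.g. cohesion scores exported from WordExtractor.
# A minimal in-memory sketch of the same parsing, with made-up values:
import io
from soynlp.tokenizer import MaxScoreTokenizer

sample = io.StringIO('데이터\t0.87\n데이터마이닝\t0.65\n')
scores = {word: float(score) for word, score in
          (line.strip().split('\t') for line in sample)}
tokenizer = MaxScoreTokenizer(scores=scores)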
Example #30
0
The idea is to take the target's interests and build topics to attack them with.
Interests are measured by the frequency of particular words in their writing."""

from personal_data import *
import sys  # used by isHangul below
import numpy as np
import re
from soynlp.tokenizer import MaxScoreTokenizer

boundmorpheme = [
    "은", "는", "이", "가", "을", "를", "로써", "에서", "에게서", "부터", "까지", "에게", "한테",
    "께", "와", "과", "의", "로서", "으로서", "로", "으로"
]  # Josa (postpositional particles)
exceptions = boundmorpheme

scores = {'티켓이': 0.3, '티켓': 0.7, '좋아요': 0.2, '좋아': 0.5}
tokenizer = MaxScoreTokenizer(scores=scores)
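
# Illustrative check (not in the original snippet): MaxScoreTokenizer greedily
# keeps the highest-scoring subwords inside each eojeol, so '티켓' (0.7) wins
# over '티켓이' (0.3) and '좋아' (0.5) wins over '좋아요' (0.2).
print(tokenizer.tokenize('티켓이좋아요'))  # expected roughly: ['티켓', '이', '좋아', '요']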


def isHangul(text):
    #Check the Python Version
    pyVer3 = sys.version_info >= (3, 0)

    if pyVer3:  # for Ver 3 or later
        encText = text
    else:  # for Ver 2.x
        if type(text) is not unicode:
            encText = text.decode('utf-8')
        else:
            encText = text

    hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
Example #31
0
class ErineTokenizer(object):
    def __init__(self, config):
        self.basic_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        if not ( os.path.isfile(config.vocab_path) and  os.path.isfile(config.word2vec_path) ):
            assert config.data_path and os.path.isfile(config.data_path), 'No training file found at [{}].'.format(config.data_path)
            noun_dict, word2vec_model = build_vocab(config)
        else:
            noun_dict = open_pickle(config.vocab_path)
            word2vec_model = Word2Vec.load(config.word2vec_path)

        self.config = config
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))

        self.idx2tags = ['I', 'B']
        self.tags2idx = {name: idx for idx, name in enumerate(self.idx2tags)}
        self.tag_size = len(self.idx2tags)
        self.vocab_size = self.basic_tokenizer.vocab_size

        self.pad_token_id = self.basic_tokenizer.pad_token_id
        self.unk_token_id = self.basic_tokenizer.unk_token_id

    def reset_tokenizer(self, data):
        noun_dict, word2vec_model = build_vocab(self.config, data)
        self.word_tokenizer = MaxScoreTokenizer(noun_dict)

        self.index2word = [unk_token] + word2vec_model.wv.index2word
        word2index = {}
        for index, word in enumerate(self.index2word):
            word2index[word] = index

        self.word2index = word2index
        self.pad_word_id = 0

        self.word_vec_dim = word2vec_model.vector_size
        self.word_vocab_size = len(word2index)

        unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float)
        embedding = word2vec_model.wv.vectors
        self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float64))



    def tokenize(self, sentence: str):
        tokens = []
        temp = ''
        for i in range(len(sentence)):
            if sentence[i] == spc_token:
                temp += sentence[i]
                continue

            temp += sentence[i]
            tokens.append(temp)
            temp = ''

        return tokens

    def encode(self, tokens: list, max_length=None):

        temp_tokens = []
        for token in tokens:
            token = token.replace(spc_token, '')
            temp_tokens.append(token)
        sentence = spc_token.join(temp_tokens)

        if max_length is not None:
            token_ids = self.basic_tokenizer.encode(sentence, max_length=max_length, pad_to_max_length=True, add_special_tokens=False, truncation=True)
        else:
            token_ids = self.basic_tokenizer.encode(sentence, add_special_tokens=False)

        return token_ids

    def word_encode(self, sentence, max_length=None):
        sentence = sentence.replace(spc_token, '')
        word_tokens = self.word_tokenizer.tokenize(sentence)

        word_token_ids = []
        for idx, word in enumerate(word_tokens):

            temp_ids = []
            if word in self.word2index:
                temp_ids.append(self.word2index[word])

            padding = [self.pad_word_id] * (len(word) - len(temp_ids))
            temp_ids = temp_ids + padding

            word_token_ids.extend(temp_ids)

        if max_length is not None:
            word_token_ids = word_token_ids[:max_length]
            padding = [self.pad_word_id] * (max_length - len(word_token_ids))
            word_token_ids = word_token_ids + padding

        return word_token_ids


    def decode(self, token_ids: list, labels: list):
        sentences = []
        for word_tokens, word_labels in zip(token_ids, labels):
            sentence = ""
            for token_id, label in zip(word_tokens, word_labels):
                if self.idx2tags[label] == 'B':
                    sentence += " "
                if token_id == self.basic_tokenizer.cls_token_id or token_id == self.basic_tokenizer.sep_token_id:
                    continue
                sentence += self.basic_tokenizer.convert_ids_to_tokens(token_id)
            sentences += [sentence]
        return sentences

    def get_labels(self, tokens: list, max_length=None):
        labels = [1 if spc_token in token else 0 for token in tokens]
        if max_length is not None:
            labels = labels[:max_length]

        length = len(labels)

        if max_length is not None and len(labels) < max_length:
            pad = [0] * (max_length - len(labels))
            labels = labels + pad
        return labels, length

    def parse(self, sentence: str, max_length=None):
        sentence = sentence.strip()
        tokens = self.tokenize(sentence)
        token_ids = self.encode(tokens, max_length)
        word_token_ids = self.word_encode(sentence, max_length)
        labels, length = self.get_labels(tokens, max_length)
        return token_ids, word_token_ids, labels, length

    def get_id(self, token: str):
        if token in self.basic_tokenizer.get_vocab():
            return self.basic_tokenizer.get_vocab()[token]
        else:
            return self.basic_tokenizer.unk_token_id
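

# A standalone sketch (not part of the class above) of the space-labeling scheme
# implemented by tokenize() / get_labels(), assuming spc_token is a single space:
# every non-space character becomes a token, and a token is labeled 1 ('B') when
# a space preceded it, 0 ('I') otherwise.
def space_labels(sentence, spc_token=' '):
    tokens, labels, temp = [], [], ''
    for ch in sentence:
        if ch == spc_token:
            temp += ch
            continue
        temp += ch
        tokens.append(temp)
        labels.append(1 if spc_token in temp else 0)
        temp = ''
    return tokens, labels


print(space_labels('그는 책을 읽었다'))
# expected: (['그', '는', ' 책', '을', ' 읽', '었', '다'], [0, 0, 1, 0, 1, 0, 0])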
Example #32
0
class LRMaxScoreTagger:
    def __init__(self, domain_dictionary_folders=None, use_base_dictionary=True,
                 dictionary_word_mincount=3,
                 evaluator=None, sents=None, lrgraph=None, 
                 lrgraph_lmax=12, lrgraph_rmax=8,
                 base_tokenizer=None, preference=None, verbose=False
                ):
        
        self.dictionary = Dictionary(domain_dictionary_folders, use_base_dictionary, dictionary_word_mincount, verbose=verbose)
        self.evaluator = evaluator if evaluator else LREvaluator()
        self.preference = preference if preference else {}
        self.lrgraph = lrgraph if lrgraph else {}
        
        if (not self.lrgraph) and (sents):
            self.lrgraph = _build_lrgraph(sents, lrgraph_lmax, lrgraph_rmax)
            
        self.lrgraph_norm, self.lcount, self.cohesion_l, self.droprate_l\
            = self._initialize_scores(self.lrgraph)

        self.base_tokenizer = base_tokenizer if base_tokenizer else lambda x:x.split()
        if not base_tokenizer:
            try:
                self.base_tokenizer = MaxScoreTokenizer(scores=self.cohesion_l)
            except Exception as e:
                print('MaxScoreTokenizer(cohesion) exception: {}'.format(e))
        
    def _build_lrgraph(self, sents, lmax=12, rmax=8):
        from collections import Counter
        from collections import defaultdict
        eojeols = Counter((eojeol for sent in sents for eojeol in sent.split() if eojeol))
        lrgraph = defaultdict(lambda: defaultdict(int))
        for eojeol, count in eojeols.items():
            n = len(eojeol)
            for i in range(1, min(n, lmax)+1):
                (l, r) = (eojeol[:i], eojeol[i:])
                if len(r) > rmax:
                    continue
                lrgraph[l][r] += count
                
        return lrgraph
    
    def _initialize_scores(self, lrgraph):
        def to_counter(dd):
            return {k:sum(d.values()) for k,d in dd.items()}
        def to_normalized_graph(dd):
            normed = {}
            for k,d in dd.items():
                sum_ = sum(d.values())
                normed[k] = {k1:c/sum_ for k1,c in d.items()}
            return normed

        lrgraph_norm = to_normalized_graph(lrgraph)
        lcount = to_counter(lrgraph)
        cohesion_l = {w:pow(c/lcount[w[0]], 1/(len(w)-1)) for w, c in lcount.items() if len(w) > 1}
        droprate_l = {w:c/lcount[w[:-1]] for w, c in lcount.items() if len(w) > 1 and w[:-1] in lcount}
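        # cohesion_l(w) = (count(w) / count(w[0])) ** (1 / (len(w) - 1)),
        #   e.g. count('데')=200, count('데이')=100, count('데이터')=90
        #        -> cohesion_l('데이터') = (90 / 200) ** (1 / 2) ≈ 0.67
        # droprate_l(w) = count(w) / count(w[:-1]),
        #   e.g. droprate_l('데이터') = 90 / 100 = 0.9  (illustrative counts)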
        
        return lrgraph_norm, lcount, cohesion_l, droprate_l
    
    def pos(self, sent, flatten=True, debug=False):
        sent_ = [self._pos(eojeol, debug) for eojeol in sent.split() if eojeol]
        if flatten:
            sent_ = [word for words in sent_ for word in words]
        return sent_

    def _pos(self, eojeol, debug=False):
        candidates = self._initialize(eojeol)
        scores = self._scoring(candidates)
        best = self._find_best(scores)
        if best:
            post = self._postprocessing(eojeol, best)
        else:
            post = self._base_tokenizing_subword(eojeol, 0)
            
        if not debug:
            post = [w for lr in post for w in lr[:2] if w[0]]
        return post
    
    def _initialize(self, t):
        candidates = self._initialize_L(t)
        candidates = self._initialize_LR(t, candidates)
        return candidates

    def _initialize_L(self, t):
        n = len(t)
        candidates = []
        for b in range(n):
            for e in range(b+2, min(n, b+self.dictionary._lmax)+1):
                l = t[b:e]
                l_pos = self.dictionary.pos_L(l)
                if not l_pos:
                    continue

                candidates.append([l,      # 0: L
                                   l_pos,  # 1: pos tag of L
                                   b,      # 2: begin index
                                   e,      # 3: end index
                                   e-b,    # 4: length of L
                                  ])

        candidates = self._remove_l_subsets(candidates)
        return sorted(candidates, key=lambda x:x[2])

    def _remove_l_subsets(self, candidates):
        candidates_ = []
        for pos in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Exclamation']:
            # Sort by len_L
            sorted_ = sorted(filter(lambda x:x[1] == pos, candidates), key=lambda x:-x[4])
            while sorted_:
                candidates_.append(sorted_.pop(0))
                (b, e) = (candidates_[-1][2], candidates_[-1][3])
    #             removals = [i for i, c in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap
                removals = [i for i, c in enumerate(sorted_) if b <= c[2] and e >= c[3]] # Subset (Contain)
                for idx in reversed(removals):
                    del sorted_[idx]
        return candidates_

    def _initialize_LR(self, t, candidates, threshold_prop=0.001, threshold_count=2):
        n = len(t)
        expanded = []

        for (l, pos, b, e, len_l) in candidates:        
            for len_r in range(min(self.dictionary._rmax, n-e)+1):

                r = t[e:e+len_r]
                lr_prop = self.lrgraph_norm.get(l, {}).get(r, 0)
                lr_count = self.lrgraph.get(l, {}).get(r, 0)

                if (r) and ((lr_prop <= threshold_prop) or (lr_count <= threshold_count)):
                    continue

                expanded.append([(l, pos),
                                 (r, None if not r else self.dictionary.pos_R(r)),
                                 b,
                                 e,
                                 e + len_r,
                                 len_r,
                                 len_l + len_r,
                                 lr_prop,
                                 lr_count
                                ])

        expanded = self._remove_r_subsets(expanded)
        return sorted(expanded, key=lambda x:x[2])

    def _remove_r_subsets(self, expanded):
        expanded_ = []
        for pos in ['Josa', 'Verb', 'Adjective', None]:
            # Sort by len_R
            sorted_ = sorted(filter(lambda x:x[1][1] == pos, expanded), key=lambda x:-x[5])
            while sorted_:
                expanded_.append(sorted_.pop(0))
                (b, e) = (expanded_[-1][3], expanded_[-1][4])
    #             removals = [i for i, c in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap
                removals = [i for i, c in enumerate(sorted_) if b <= c[3] and e >= c[4]] # Subset (Contain)
                for idx in reversed(removals):
                    del sorted_[idx]
        expanded_ = [[L, R, p0, p2, len_LR, prop, count] for L, R, p0, p1, p2, len_R, len_LR, prop, count in expanded_]
        return expanded_
    
    def _scoring(self, candidates):
        candidates = [self._to_table(c) for c in candidates]
        scores = self.evaluator.evaluate(candidates, self.preference if self.preference else None)
        return scores

    def _to_table(self, c):
        return Table(c[0], c[1], c[2], c[3], c[4], c[5], c[6], 
                     self.cohesion_l.get(c[0][0], 0),
                     self.droprate_l.get(c[0][0], 0),
                     self.lcount.get(c[0][0], 0)
                    )
    
    def _find_best(self, scores):
        best = []
        sorted_ = sorted(scores, key=lambda x:-x[-1])
        while sorted_:
            best.append(sorted_.pop(0)[0])
            (b, e) = (best[-1][2], best[-1][3])
            removals = [i for i, (c, _) in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap
            for idx in reversed(removals):
                del sorted_[idx]
        return sorted(best, key=lambda x:x[2])
    
    def _postprocessing(self, t, words):
        n = len(t)
        adds = []
        if words and words[0][2] > 0:
            adds += self._add_first_subword(t, words)
        if words and words[-1][3] < n:
            adds += self._add_last_subword(t, words, n)
        adds += self._add_inter_subwords(t, words)
        post = [w for w in words] + [self._to_table(a) for a in adds]
        return sorted(post, key=lambda x:x[2])

    def _infer_subword_information(self, subword):
        pos = self.dictionary.pos_L(subword)
        prop = self.lrgraph_norm.get(subword, {}).get('', 0.0)
        count = self.lrgraph.get(subword, {}).get('', 0)    
        if not pos:
            pos = self.dictionary.pos_R(subword)
        return (pos, prop, count)

    def _add_inter_subwords(self, t, words):
        adds = []        
        for i, base in enumerate(words[:-1]):
            if base[3] == words[i+1][2]:
                continue

            b = base[3]
            e = words[i+1][2]
            subword = t[b:e]
            #(pos, prop, count) = self._infer_subword_information(subword)
            #adds.append([(subword, pos), ('', None), b, e, e-b, prop, count, 0.0])
            adds += self._base_tokenizing_subword(subword, b)
        return adds

    def _add_last_subword(self, t, words, n):
        b = words[-1][3]
        subword = t[b:]
        #(pos, prop, count) = self._infer_subword_information(subword)
        #return [[(subword, pos), ('', None), b, n, n-b, prop, count, 0.0]]
        return self._base_tokenizing_subword(subword, b)

    def _add_first_subword(self, t, words):    
        e = words[0][2]
        subword = t[0:e]
        #(pos, prop, count) = self._infer_subword_information(subword)
        #return [[(subword, pos), ('', None), 0, e, e, prop, count, 0.0]]
        return self._base_tokenizing_subword(subword, 0)
    
    def _base_tokenizing_subword(self, t, b):
        subwords = []
        _subwords = self.base_tokenizer.tokenize(t, flatten=False)
        if not _subwords:
            return []
        for w in _subwords[0]:
            (pos, prop, count) = self._infer_subword_information(w[0])
            subwords.append([(w[0], pos), ('', None), b+w[1], b+w[2], w[2]-w[1], prop, count, 0.0])
        return subwords
    
    def add_words_into_dictionary(self, words, tag):
        if not (tag in self.dictionary._pos):
            raise ValueError('{} does not exist in the base dictionary'.format(tag))
        self.dictionary.add_words(words, tag)
        
    def remove_words_from_dictionary(self, words, tag):
        if not (tag in self.dictionary._pos):
            raise ValueError('{} does not exist in the base dictionary'.format(tag))
        self.dictionary.remove_words(words, tag)
    
    def save_domain_dictionary(self, folder, head=None):
        self.dictionary.save_domain_dictionary(folder, head)
    
    def set_word_preference(self, words, tag, preference=10):
        if type(words) == str:
            words = {words}
        preference_table = self.preference.get(tag, {})
        preference_table.update({word:preference for word in words})
        self.preference[tag] = preference_table
    
    def save_tagger(self, fname):
        raise NotImplementedError
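

# A minimal standalone sketch (not part of the class above) of the L-R graph
# built by _build_lrgraph and of the cohesion_l score computed in
# _initialize_scores, simplified (no lmax/rmax caps) and run on a made-up corpus:
from collections import Counter, defaultdict

sents = ['데이터 분석', '데이터마이닝 공부', '데이터 시각화']
eojeols = Counter(e for sent in sents for e in sent.split() if e)

lrgraph = defaultdict(lambda: defaultdict(int))
for eojeol, count in eojeols.items():
    for i in range(1, len(eojeol) + 1):
        l, r = eojeol[:i], eojeol[i:]
        lrgraph[l][r] += count

lcount = {l: sum(rdict.values()) for l, rdict in lrgraph.items()}
cohesion_l = {w: pow(c / lcount[w[0]], 1 / (len(w) - 1))
              for w, c in lcount.items() if len(w) > 1}
print(cohesion_l['데이터'])  # (3 / 3) ** (1 / 2) = 1.0 in this toy corpus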
Example #33
0
class LRNounExtractor_v2:
    def __init__(self, l_max_length=10, r_max_length=9, predictor_headers=None,
        verbose=True, min_num_of_features=1, max_count_when_noun_is_eojeol=30):

        self.l_max_length = l_max_length
        self.r_max_length = r_max_length
        self.lrgraph = None
        self.verbose = verbose
        self.min_num_of_features = min_num_of_features
        self.max_count_when_noun_is_eojeol = max_count_when_noun_is_eojeol

        if not predictor_headers:
            predictor_headers = self._set_default_predictor_header()
        self._load_predictor(predictor_headers)

    @property
    def is_trained(self):
        return self.lrgraph

    def _set_default_predictor_header(self):

        if self.verbose:
            print('[Noun Extractor] use default predictors')

        dirname = '/'.join(os.path.abspath(__file__).replace('\\', '/').split('/')[:-2])
        predictor_header = ['{}/trained_models/noun_predictor_ver2'.format(dirname)]

        return predictor_header

    def _load_predictor(self, headers):

        if type(headers) == str:
            headers = [headers]
        
        pos, neg = set(), set()
        for header in headers:

            # load positive features such as Josa
            pos_path = '{}_pos'.format(header)
            with open(pos_path, encoding='utf-8') as f:
                pos.update({feature.strip() for feature in f})

            # load negative features such as ending (Eomi)
            neg_path = '{}_neg'.format(header)
            with open(neg_path, encoding='utf-8') as f:
                neg.update({feature.strip() for feature in f})

        # common features such as -은 (Josa/Eomi), -라고 (Josa/Eomi)
        common = pos.intersection(neg)

        # remove common features from pos and neg
        pos = {feature for feature in pos if not (feature in common)}
        neg = {feature for feature in neg if not (feature in common)}

        if self.verbose:
            print('[Noun Extractor] num features: pos={}, neg={}, common={}'.format(
                len(pos), len(neg), len(common)))

        self._pos_features = pos
        self._neg_features = neg
        self._common_features = common

    def train_extract(self, sentences, minimum_noun_score=0.3,
        min_count=1, min_eojeol_count=1):

        self.train(sentences, min_eojeol_count)

        return self.extract(minimum_noun_score, min_count)

    def train(self, sentences, min_eojeol_count=1):

        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        eojeol_counter = EojeolCounter(sentences, min_eojeol_count,
            max_length=self.l_max_length + self.r_max_length)
        self._num_of_eojeols = eojeol_counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')
        self.lrgraph = eojeol_counter.to_lrgraph(
            self.l_max_length, self.r_max_length)

        if self.verbose:
            print('[Noun Extractor] has been trained.')
    
    def extract(self, minimum_noun_score=0.3, min_count=1):

        # base prediction
        noun_candidates = self._noun_candidates_from_positive_features()
        prediction_scores = self._batch_prediction_order_by_word_length(
            noun_candidates, minimum_noun_score)

        # E = N*J+ or N*Posi+
        candidates = {l:sum(rdict.values()) for l,rdict in
            self.lrgraph._lr.items() if len(l) >= 4}
        compounds = self.extract_compounds(
            candidates, prediction_scores, minimum_noun_score)

        # combine single nouns and compounds
        nouns = {noun:score for noun, score in prediction_scores.items()
            if score[0] >= minimum_noun_score}
        nouns.update(compounds)

        # frequency filtering
        nouns = {noun:score for noun, score in nouns.items()
            if score[1] >= min_count}

        nouns = self._post_processing(nouns, prediction_scores, compounds)

        if self.verbose:
            print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format(
                len(nouns), len(compounds), min_count), flush=True)

            coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                / self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True)

        if self.verbose:
            print('[Noun Extractor] flushing ... ', flush=True, end='')

        self._nouns = nouns
        self.lrgraph.reset_lrgraph()
        if self.verbose:
            print('done')

        nouns_ = {noun:NounScore(score[1], score[0]) for noun, score in nouns.items()}
        return nouns_

    def _get_nonempty_features(self, word, features):
        return [r for r, _ in features if (
            ( (r in self._pos_features) and (not self._exist_longer_pos(word, r)) ) or
            ( (r in self._neg_features) and (not self._exist_longer_neg(word, r)) ) )]

    def _exist_longer_pos(self, word, r):
        for e in range(len(word)-1, -1, -1):
            if (word[e:]+r) in self._pos_features:
                return True
        return False

    def _exist_longer_neg(self, word, r):
        for e in range(len(word)-1, -1, -1):
            if (word[e:]+r) in self._neg_features:
                return True
        return False

    def predict(self, word, minimum_noun_score=0.3, debug=False):

        # scoring
        features = self.lrgraph.get_r(word, -1)
        pos, common, neg, unk, end = self._predict(word, features)

        base = pos + neg
        score = 0 if base == 0 else (pos - neg) / base
        support = pos + end + common if score >= minimum_noun_score else neg + end + common

        # debug code
        if debug:
            print(pos, common, neg, unk, end)

        features_ = self._get_nonempty_features(word, features)
        if len(features_) > self.min_num_of_features:        
            return score, support
        else:
            # exception case
            sum_ = pos + common + neg + unk + end
            if sum_ == 0:
                return 0, support

            # exception. frequent nouns may have various positive R such as Josa
            if ((end > self.max_count_when_noun_is_eojeol) and (neg >= pos) ):
                return score, support

            if (common > 0 or pos > 0) and (end / sum_ >= 0.3) and (common >= neg):
                # 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)]  # '은' is common / mostly a single eojeol / '측은' is unknown
                # 아이엠텍 + [('은', 2), ('', 2)]
                support = pos + common + end
                return (support / sum_, support)

            # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에}
            first_chars = set()
            for r, _ in features:
                if not r:
                    continue
                if r in self._pos_features or r in self._common_features:
                    if not self._exist_longer_pos(word, r):
                        first_chars.add(r[0])
                if not (r in self._pos_features or r in self._common_features):
                    first_chars.add(r[0])

            if len(first_chars) >= 2:
                support = pos + common + end
                return (support / sum_, support)

            # Handling for post-processing in NounExtractor
            # Case 1.
            # 아이러브영주사과 -> 아이러브영주사 + [(과, 1)] (case where the minimum r-feature rule must be applied) : compound noun
            # since it appears as 아이러브영주사과 + [('', 1)], '아이러브영주사' is dropped from the candidates after post-processing
            # Case 2.
            # 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r-feature rule applied)
            # post-processed as "Noun + unknown R"
            return (0, support)

    def _predict(self, word, features):

        pos, common, neg, unk, end = 0, 0, 0, 0, 0

        for r, freq in features:
            if r == '':
                end += freq
                continue
            if self._exist_longer_pos(word, r): # ignore
                continue
            if self._exist_longer_neg(word, r): # negative -다고
                neg += freq
                continue
            if r in self._common_features:
                common += freq
            elif r in self._pos_features:            
                pos += freq
            elif r in self._neg_features:
                neg += freq
            else:
                unk += freq

        return pos, common, neg, unk, end

    def _noun_candidates_from_positive_features(self, condition=None):

        def satisfy(word, e):
            return word[:e] == condition

        # noun candidates from positive features such as Josa
        N_from_J = {}
        for r in self._pos_features:
            for l, c in self.lrgraph.get_l(r, -1):
                # candidates filtering for debugging
                # condition is first chars in L
                if not condition:
                    N_from_J[l] = N_from_J.get(l,0) + c
                    continue
                # for debugging
                if not satisfy(l, len(condition)):
                    continue
                N_from_J[l] = N_from_J.get(l,0) + c

        # sort by length of word
        N_from_J = sorted(N_from_J.items(), key=lambda x:-len(x[0]))

        return N_from_J

    def _batch_prediction_order_by_word_length(self,
        noun_candidates, minimum_noun_score=0.3):

        prediction_scores = {}

        n = len(noun_candidates)
        for i, (word, _) in enumerate(noun_candidates):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i+1) / n)
                print('\r  -- batch prediction {} % of {} words'.format(
                    percentage, n), flush=True, end='')

            # base prediction
            score, support = self.predict(word, minimum_noun_score)
            prediction_scores[word] = (score, support)

            # if their score is higher than minimum_noun_score,
            # remove eojeol pattern from lrgraph
            if score >= minimum_noun_score:
                for r, count in self.lrgraph.get_r(word, -1):
                    if r == '' or (r in self._pos_features):
                        self.lrgraph.remove_eojeol(word+r, count)
                        self._num_of_covered_eojeols += count
        if self.verbose:
            print('\r[Noun Extractor] batch prediction was completed for {} words'.format(
                n), flush=True)

        return prediction_scores

    def extract_compounds(self, candidates, prediction_scores, minimum_noun_score=0.3):

        noun_scores = {noun:len(noun) for noun, score in prediction_scores.items()
                       if score[0] > minimum_noun_score and len(noun) > 1}
        self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores)

        candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items()
            if (len(l) >= 4) and not (l in noun_scores)}

        n = len(candidates)
        compounds_scores = {}
        compounds_counts = {}
        compounds_components = {}

        for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.2f' % (100 * i / n)
                print('\r  -- check compound {} %'.format(percentage), flush=True, end='')

            # skip if candidate is substring of longer compound
            if candidates.get(word, 0) <= 0:
                continue

            tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
            compound_parts = self._parse_compound(tokens)

            if compound_parts:
                # store compound components
                noun = ''.join(compound_parts)
                compounds_components[noun] = compound_parts
                # cumulate count and store compound score
                compound_score = max((prediction_scores.get(t, (0,0))[0] for t in compound_parts))
                compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score)
                compounds_counts[noun] = compounds_counts.get(noun,0) + count
                # reduce frequency of substrings
                for e in range(2, len(word)):
                    subword = word[:e]
                    if not subword in candidates:
                        continue
                    candidates[subword] = candidates.get(subword, 0) - count
                # eojeol coverage
                self.lrgraph.remove_eojeol(word)
                self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format(
                len(compounds_scores)))

        compounds = {noun:(score, compounds_counts.get(noun,0))
             for noun, score in compounds_scores.items()}

        self._compounds_components = compounds_components

        return compounds

    def decompose_compound(self, word):

        tokens = self._compound_decomposer.tokenize(word, flatten=False)[0]
        compound_parts = self._parse_compound(tokens)

        return (word, ) if not compound_parts else compound_parts

    def _parse_compound(self, tokens):
        """Check Noun* or Noun*Josa"""

        # format: (word, begin, end, score, length)
        for token in tokens[:-1]:
            if token[3] <= 0:
                return None
        # Noun* + Josa
        if len(tokens) >= 3 and tokens[-1][0] in self._pos_features:
            return tuple(t[0] for t in tokens[:-1])
        # all tokens are noun
        if tokens[-1][3] > 0:
            return tuple(t[0] for t in tokens)
        # else, not compound
        return None

    def _post_processing(self, nouns, prediction_scores, compounds):
        # TODO
        # Not Implemented
        return nouns
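

# A minimal usage sketch, assuming the soynlp package (which bundles the
# noun_predictor_ver2 model files loaded above) is installed; `sentences` is any
# iterable of plain-text strings and the corpus below is a made-up placeholder.
from soynlp.noun import LRNounExtractor_v2

sentences = ['예시 문장 입니다', '뉴스 기사 본문 문장 입니다']  # hypothetical toy corpus
noun_extractor = LRNounExtractor_v2(verbose=False)
nouns = noun_extractor.train_extract(sentences)  # {noun: NounScore(frequency, score)}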