Code example #1
    def _train_predicator_extractor(self, sents,
        # predicator train
        min_predicator_frequency=1, min_eojeol_frequency=2,
        # Eomi extractor
        min_num_of_eomi_features=5, min_eomi_score=0.3, min_eomi_frequency=1):

        # prepare predicator_lrgraph
        predicator_lrgraph = LRGraph(self.noun_extractor.lrgraph._lr)
        noun_pos_features = {r for r in self.noun_extractor._pos_features}
        noun_pos_features.update({r for r in self.noun_extractor._common_features})

        # predicator extraction
        self.predicator_extractor = PredicatorExtractor(
            self.nouns,
            noun_pos_features,
            extract_eomi = self._extract_eomi,
            extract_stem = False,
            verbose = self._verbose
        )

        adjectives, verbs = self.predicator_extractor.train_extract(
            sents, min_eojeol_frequency, 100000, #filtering_checkpoint
            None, min_predicator_frequency, True, # filtering_checkpoint, lrgraph_reset
            min_num_of_eomi_features, min_eomi_score, min_eomi_frequency) # Eomi extractor

        return adjectives, verbs
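
For reference, below is a minimal standalone sketch of driving PredicatorExtractor with the same arguments the method above passes. The import path, `sents`, `nouns`, and `noun_pos_features` are assumptions: a corpus of space-separated sentences plus the outputs of a previously trained noun extractor.

from soynlp.predicator import PredicatorExtractor  # assumed import path

# `nouns` and `noun_pos_features` are assumed to come from a trained noun
# extractor, as in the method above; `sents` is an iterable of sentences.
predicator_extractor = PredicatorExtractor(
    nouns,
    noun_pos_features,
    extract_eomi=True,
    extract_stem=False,
    verbose=True
)

adjectives, verbs = predicator_extractor.train_extract(
    sents, 2, 100000,   # min_eojeol_frequency, filtering_checkpoint
    None, 1, True,      # same positional arguments as in the call above
    5, 0.3, 1)          # Eomi extractor settings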
Code example #2
File: _pos_extractor.py    Project: owlur/soynlp
    def _extract_nouns(self, sentences):

        noun_extractor = LRNounExtractor_v2(l_max_length=self.l_max_length,
                                            r_max_length=self.r_max_length,
                                            min_eojeol_count=2,
                                            min_num_of_features=2,
                                            max_count_when_noun_is_eojeol=15,
                                            extract_compound=False,
                                            logpath=self.logpath,
                                            extract_pos_feature=True,
                                            verbose=self.verbose)

        noun_extractor.train(sentences)

        nouns = noun_extractor.extract(
            reset_lrgraph=False,
            min_count=10,
            minimum_noun_score=0.4,
        )

        self._lrgraph = LRGraph({
            l: {r: v
                for r, v in rdict.items()}
            for l, rdict in noun_extractor.lrgraph._lr.items()
        })
        self._num_of_eojeols = noun_extractor._num_of_eojeols
        self._num_of_covered_eojeols = noun_extractor._num_of_covered_eojeols

        self.noun_extractor = noun_extractor

        if self.verbose:
            coverage = 100 * self._num_of_covered_eojeols / self._num_of_eojeols
            message = 'noun extraction was done. {:.2f} % eojeols are covered'.format(coverage)
            self._print(message, replace=True, newline=True)

        return nouns
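
The helper above wraps LRNounExtractor_v2; below is a minimal sketch of the same train/extract flow used standalone, assuming `sentences` is an iterable of space-separated Korean sentences.

from soynlp.noun import LRNounExtractor_v2

noun_extractor = LRNounExtractor_v2(extract_compound=False, verbose=True)
noun_extractor.train(sentences)
nouns = noun_extractor.extract(min_count=10, minimum_noun_score=0.4)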
Code example #3
File: _noun_ver1.py    Project: owlur/soynlp
    def train(self, sents, min_noun_frequency=5):
        wordset_l, wordset_r = self._scan_vocabulary(sents, min_noun_frequency)
        lrgraph = self._build_lrgraph(sents, wordset_l, wordset_r)
        self.lrgraph = LRGraph(lrgraph)
        self.words = wordset_l
Code example #4
File: _noun_ver1.py    Project: owlur/soynlp
import math
import sys
from collections import defaultdict, namedtuple

from soynlp.utils import LRGraph

# Assumed stand-in: the original module defines a NounScore namedtuple with
# the fields used by _to_NounScore below.
NounScore = namedtuple('NounScore', 'frequency score known_r_ratio')


class LRNounExtractor:
    def __init__(self,
                 max_left_length=10,
                 max_right_length=7,
                 predictor_fnames=None,
                 verbose=True,
                 min_num_of_features=1):

        self.coefficient = {}
        self.verbose = verbose
        self.max_left_length = max_left_length
        self.max_right_length = max_right_length
        self.lrgraph = None
        self.words = None
        self._substring_counter = {}
        self.min_num_of_features = min_num_of_features

        if not predictor_fnames:
            import os
            directory = '/'.join(
                os.path.abspath(__file__).replace('\\', '/').split('/')[:-2])
            predictor_fnames = [
                '%s/trained_models/noun_predictor_sejong' % directory
            ]
            if verbose:
                print('used default noun predictor; Sejong corpus predictor')

        for fname in predictor_fnames:
            if verbose:
                print('used %s' % fname.split('/')[-1])
            self._load_predictor(fname)
        if verbose:
            print('All %d r features were loaded' % len(self.coefficient))

    def _load_predictor(self, fname):
        try:
            if sys.version_info.major == 2:
                f = open(fname)
            else:
                f = open(fname, encoding='utf-8')
            try:
                for num_line, line in enumerate(f):
                    r, score = line.strip().split('\t')
                    score = float(score)
                    if r in self.coefficient:
                        self.coefficient[r] = max(self.coefficient[r], score)
                    else:
                        self.coefficient[r] = score
            except Exception as e:
                print('predictor parsing error line {} = {}'.format(
                    num_line + 1, line))
            finally:
                f.close()
        except Exception as e:
            print(e)

    def train_extract(self,
                      sents,
                      min_noun_score=0.5,
                      min_noun_frequency=5,
                      noun_candidates=None):

        self.train(sents, min_noun_frequency)
        return self.extract(min_noun_score, min_noun_frequency,
                            noun_candidates)

    def train(self, sents, min_noun_frequency=5):
        wordset_l, wordset_r = self._scan_vocabulary(sents, min_noun_frequency)
        lrgraph = self._build_lrgraph(sents, wordset_l, wordset_r)
        self.lrgraph = LRGraph(lrgraph)
        self.words = wordset_l

    def _scan_vocabulary(self, sents, min_frequency=5):
        """
        Parameters
        ----------
            sents: list-like iterable object which has string
            
        It computes subtoken frequency first. 
        After then, it builds lr-graph with sub-tokens appeared at least min count
        """

        # avoid ZeroDivisionError on small corpora
        _ckpt = max(1, len(sents) // 40)

        wordset_l = defaultdict(lambda: 0)
        wordset_r = defaultdict(lambda: 0)

        for i, sent in enumerate(sents):
            for token in sent.split(' '):
                if not token:
                    continue
                token_len = len(token)
                # use a separate index so the sentence counter `i` is not shadowed
                for e in range(1, min(self.max_left_length, token_len) + 1):
                    wordset_l[token[:e]] += 1
                for e in range(1, min(self.max_right_length, token_len)):
                    wordset_r[token[-e:]] += 1
            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                        100.0 * i / len(sents), '%')
                sys.stdout.write('\rscanning: %s%s (%.3f %s)' % args)

        self._substring_counter = {
            w: f
            for w, f in wordset_l.items() if f >= min_frequency
        }
        wordset_l = set(self._substring_counter.keys())
        wordset_r = {w for w, f in wordset_r.items() if f >= min_frequency}

        if self.verbose:
            print('\rscanning completed')
            print('(L,R) has (%d, %d) tokens' %
                  (len(wordset_l), len(wordset_r)))

        return wordset_l, wordset_r

    def _build_lrgraph(self, sents, wordset_l, wordset_r):
        _ckpt = max(1, len(sents) // 40)
        lrgraph = defaultdict(lambda: defaultdict(lambda: 0))

        for i, sent in enumerate(sents):
            for token in sent.split():
                if not token:
                    continue
                n = len(token)
                # split the token at every boundary without shadowing `i`
                for e in range(1, min(self.max_left_length, n) + 1):
                    l = token[:e]
                    r = token[e:]
                    if l not in wordset_l:
                        continue
                    if r and r not in wordset_r:
                        continue
                    lrgraph[l][r] += 1

            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                        100.0 * i / len(sents), '%')
                sys.stdout.write('\rbuilding lr-graph: %s%s (%.3f %s)' % args)
        if self.verbose:
            sys.stdout.write('\rbuilding lr-graph completed')
        lrgraph = {
            l: {r: f
                for r, f in rdict.items()}
            for l, rdict in lrgraph.items()
        }
        return lrgraph

    def extract(self,
                min_noun_score=0.5,
                min_noun_frequency=5,
                noun_candidates=None):
        if not noun_candidates:
            noun_candidates = self.words

        # prediction
        nouns = {}
        for word in sorted(noun_candidates, key=lambda w: len(w)):
            if len(word) <= 1:
                continue

            score = self.predict(word, min_noun_score, nouns)

            if score[0] < min_noun_score:
                continue
            nouns[word] = score

        # postprocessing
        nouns = self._postprocess(nouns, min_noun_score, min_noun_frequency)

        # summary information as NounScore
        nouns_ = self._to_NounScore(nouns)

        return nouns_

    def _get_r_features(self, word):
        features = self.lrgraph.get_r(word, -1)
        # keep only features whose r string is non-empty
        features = [feature for feature in features if feature[0]]
        return features

    def _get_subword_score(self, word, min_noun_score, nouns):
        subword_scores = {}
        for e in range(1, len(word)):
            subword = word[:e]
            suffix = word[e:]
            # Add word if compound
            if (subword in nouns) and (suffix in nouns):
                score1 = nouns[subword]
                score2 = nouns[suffix]
                subword_scores[subword] = max(score1, score2)
            elif (subword in nouns) and (self.coefficient.get(suffix, 0.0) >
                                         min_noun_score):
                subword_scores[subword] = (self.coefficient.get(suffix,
                                                                0.0), 0)
        if not subword_scores:
            return (0.0, 0)
        return sorted(subword_scores.items(), key=lambda x: -x[1][0])[0][1]

    def is_noun(self, word, min_noun_score=0.5):
        return self.predict(word, min_noun_score)[0] >= min_noun_score

    def predict(self, word, min_noun_score=0.5, nouns=None):
        """Returns (noun_score, known_r_ratio)
        """
        features = self._get_r_features(word)

        # If only one kind of R follows the word (e.g. 감사합니다 + 만),
        # the R-feature prediction is not reliable.
        if len(features) > self.min_num_of_features:
            score = self._predict(features, word)
        else:
            if nouns is None:
                nouns = {}
            score = self._get_subword_score(word, min_noun_score, nouns)

        return score

    def _predict(self, features, word):
        """Parameters
        ----------
            features: list of (r, frequency) tuples
                e.g. [('을', 35), ('는', 22), ...]
        """

        def exist_longer_r_feature(word, r):
            for e in range(len(word) - 1, -1, -1):
                suffix = word[e:] + r
                if suffix in self.coefficient:
                    return True
            return False

        score = 0
        norm = 0
        unknown = 0

        for r, freq in features:
            if r in self.coefficient:
                if not exist_longer_r_feature(word, r):
                    score += freq * self.coefficient[r]
                    norm += freq
            else:
                unknown += freq

        noun_score = 0 if norm == 0 else score / norm
        known_r_ratio = 0 if (norm + unknown) == 0 else norm / (norm + unknown)
        return (noun_score, known_r_ratio)

    def _postprocess(self, nouns, min_noun_score, min_noun_frequency):
        def is_Noun_Josa(l, r):
            return (l in nouns) and (self.coefficient.get(r, 0.0) >
                                     min_noun_score)

        def cohesion(word):
            base = self._substring_counter.get(word[0], 0)
            n = len(word)
            if not base or n <= 1:
                return 0
            return math.pow(
                self._substring_counter.get(word, 0) / base, 1 / (n - 1))

        def longer_has_larger_cohesion(word):
            return cohesion(word) >= cohesion(word[:-1])

        removals = set()
        for word in nouns:
            if word[-1] == '.' or word[-1] == ',':
                removals.add(word)
                continue
            n = len(word)
            if n <= 2 or longer_has_larger_cohesion(word):
                continue
            for e in range(2, len(word)):
                l = word[:e]
                r = word[e:]
                if is_Noun_Josa(l, r):
                    removals.add(word)
                    break
        nouns_ = {
            word: score
            for word, score in nouns.items() if word not in removals
        }
        return nouns_

    def _to_NounScore(self, nouns):
        noun_frequencies = {}
        for word in sorted(nouns, key=lambda x: -len(x)):
            r_count = self.lrgraph.get_r(word, -1)
            noun_frequencies[word] = sum(c for w, c in r_count)
            for r, count in r_count:
                self.lrgraph.remove_eojeol(word + r, count)
        self.lrgraph.reset_lrgraph()

        nouns_ = {}
        for word, score in nouns.items():
            nouns_[word] = NounScore(noun_frequencies[word], score[0],
                                     score[1])

        return nouns_
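
A minimal usage sketch of the class above. `sents` is assumed to be a list of space-separated Korean sentences, and the result fields follow the NounScore stand-in assumed at the top of the snippet.

noun_extractor = LRNounExtractor(verbose=True)
nouns = noun_extractor.train_extract(sents, min_noun_score=0.5, min_noun_frequency=5)

# show the ten most frequent extracted nouns
for word, score in sorted(nouns.items(), key=lambda x: -x[1].frequency)[:10]:
    print(word, score.frequency, score.score)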
Code example #5
File: _noun_ver1.py    Project: yooseonghwan/soynlp
    def train(self, sents, min_count=5):
        wordset_l, wordset_r = self._scan_vocabulary(sents)
        lrgraph = self._build_lrgraph(sents, wordset_l, wordset_r)
        self.lrgraph = LRGraph(lrgraph)
        self.words = wordset_l