Code example #1
    def _build_lrgraph(self, sents, min_count, pruning_min_count=2):
        lset = {}
        rset = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax) + 1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    lset[l] = lset.get(l, 0) + 1
                    rset[r] = rset.get(r, 0) + 1
            if n_sent % 1000 == 999:
                args = (n_sent + 1, len(lset), len(rset), get_process_memory())
                sys.stdout.write(
                    '\rscanning vocabulary ... %d sents #(l= %d, r= %d), mem= %.3f Gb'
                    % args)
            if n_sent % 500000 == 499999:
                lset = {
                    l: f
                    for l, f in lset.items() if f >= pruning_min_count
                }
                rset = {
                    r: f
                    for r, f in rset.items() if f >= pruning_min_count
                }
        lset = {l: f for l, f in lset.items() if f >= min_count}
        rset = {r: f for r, f in rset.items() if f >= min_count}

        n_sents = n_sent + 1  # number of sentences seen in the first pass

        lrgraph = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax) + 1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    if not (l in lset) or not (r in rset):
                        continue
                    rdict = lrgraph.get(l, {})
                    rdict[r] = rdict.get(r, 0) + 1
                    lrgraph[l] = rdict
            if n_sent % 1000 == 999:
                args = (100 * (n_sent + 1) / n_sents, '%', n_sent + 1, n_sents,
                        get_process_memory())
                sys.stdout.write(
                    '\rbuilding lrgraph ... (%.3f %s, %d in %d), mem= %.3f Gb'
                    % args)
        args = (len(lset), len(rset),
                sum((len(rdict)
                     for rdict in lrgraph.values())), get_process_memory())
        print(
            '\rlrgraph has been built. (#L= %d, #R= %d, #E=%d), mem= %.3f Gb' %
            args)
        return lrgraph, lset, rset
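As a side note, the nested-dict shape of the returned lrgraph can be illustrated with a small standalone sketch. This is not soynlp code; it merely replays the eojeol-splitting loop above on hypothetical toy data, with lmax assumed to be 10.

lmax = 10                                      # assumed maximum length of a left substring
lrgraph = {}
for eojeol in '아이오아이는 좋아요'.split():   # toy "sentence" of two eojeols
    for e in range(1, min(len(eojeol), lmax) + 1):
        l, r = eojeol[:e], eojeol[e:]          # left substring / remaining right substring
        rdict = lrgraph.setdefault(l, {})
        rdict[r] = rdict.get(r, 0) + 1

# every (L, R) split of every eojeol is counted, e.g.
# lrgraph['아이오아이'] == {'는': 1} and lrgraph['좋아'] == {'요': 1}
print(lrgraph)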
Code example #2
    def _build_graph(self, sents, wordset_l, wordset_r):
        self.wordset_l = wordset_l
        self.wordset_r = wordset_r
        self.wordset_r.add('')
        _ckpt = max(1, len(sents) // 40)  # guard against modulo-by-zero when len(sents) < 40
        lrgraph = defaultdict(lambda: defaultdict(lambda: 0))
        rlgraph = defaultdict(lambda: defaultdict(lambda: 0))
        
        for i, sent in enumerate(sents):
            for token in sent.split():
                if not token:
                    continue
                token_len = len(token)
                for e in range(1, min(self.left_max_length, token_len)+1):
                    l = token[:e]
                    r = token[e:]
                    if (not l in wordset_l) or (not r in wordset_r):
                        continue
                    lrgraph[l][r] += 1
                    rlgraph[r][l] += 1

            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i/_ckpt), '-' * (40 - int(i/_ckpt)), 100.0 * i / len(sents), '%', get_process_memory())
                sys.stdout.write('\rbuilding lr-graph: %s%s (%.3f %s), memory = %.3f Gb' % args)
        if self.verbose:                       
            sys.stdout.write('\rbuilding lr-graph completed. memory = %.3f Gb' % get_process_memory())
            
        lrgraph = {l:{r:f for r,f in rdict.items()} for l,rdict in lrgraph.items()}
        rlgraph = {r:{l:f for l,f in ldict.items()} for r, ldict in rlgraph.items()}
        return lrgraph, rlgraph
Code example #3
File: _noun_ver2.py Project: head1ton/soynlp
    def _check_covered_eojeols(self, nouns):

        self.lrgraph.reset_lrgraph()

        noun_candidates = self._noun_candidates_from_positive_features()

        n = len(noun_candidates)
        for i, (word, _) in enumerate(noun_candidates):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i+1) / n)
                print('\r[Noun Extractor] flushing ...  {} %'.format(
                    percentage), flush=True, end='')

            if not (word in nouns):
                continue

            for r, count in self.lrgraph.get_r(word, -1):
                if (r == '' or
                    (r in self._pos_features) or
                    (r in self._common_features)):
                    self.lrgraph.remove_eojeol(word+r, count)
                    self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format(
                '%.3f' % get_process_memory(), ' '*20), flush=True)
            coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                / self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True)
Code example #4
    def train(self, sents, num_for_pruning = 0, cumulate=True):
        def prune_extreme_case():
            self.L = defaultdict(lambda: 0, {w:f for w,f in self.L.items() if f >= self.min_count})
            self.R = defaultdict(lambda: 0, {w:f for w,f in self.R.items() if f >= self.min_count})
        def prune_extreme_case_a():
            self._aL = defaultdict(lambda: 0, {w:f for w,f in self._aL.items() if f > 1})
            self._aR = defaultdict(lambda: 0, {w:f for w,f in self._aR.items() if f > 1})

        if cumulate:
            self.L = defaultdict(int, self.L)
            self.R = defaultdict(int, self.R)
            self._aL = defaultdict(int, self._aL)
            self._aR = defaultdict(int, self._aR)
        else:
            self.L = defaultdict(int)
            self.R = defaultdict(int)
            self._aL = defaultdict(int)
            self._aR = defaultdict(int)

        for num_sent, sent in enumerate(sents):
            if sys.version_info.major == 2:
                words = map(unicode, sent.strip().split())
            else:
                words = sent.split()

            for word in words:
                if (not word) or (len(word) <= 1):
                    continue
                word_len = len(word)
                for i in range(1, min(self.left_max_length + 1, word_len)+1):
                    self.L[word[:i]] += 1
                for i in range(1, min(self.right_max_length + 1, word_len)):
                    self.R[word[-i:]] += 1

            if len(words) <= 1:
                continue
            for left_word, word, right_word in zip([words[-1]]+words[:-1], words, words[1:]+[words[0]]):
                self._aL['%s %s' % (word, right_word[0])] += 1
                self._aR['%s %s' % (left_word[-1], word)] += 1

                word_len = len(word)
                for i in range(1, min(self.right_max_length + 1, word_len)):
                    self._aL['%s %s' % (word[-i:], right_word[0])] += 1
                for i in range(1, min(self.left_max_length + 1, word_len)):
                    self._aR['%s %s' % (left_word[-1], word[:i])] += 1

            if (num_for_pruning > 0) and ( num_sent % num_for_pruning == 0):
                prune_extreme_case()
            if (self.verbose > 0) and ( num_sent % self.verbose == 0):
                sys.stdout.write('\rtraining ... (%d in %d sents) use memory %.3f Gb' % (num_sent, len(sents), get_process_memory()))

        prune_extreme_case()
        prune_extreme_case_a()
        if (self.verbose > 0):
            print('\rtraining was done. used memory %.3f Gb' % (get_process_memory()))
        self.L = dict(self.L)
        self.R = dict(self.R)
        self._aL = dict(self._aL)
        self._aR = dict(self._aR)
Code example #5
    def _scan_vocabulary(self, sents):
        """
        Parameters
        ----------
            sents: list-like iterable object which has string
            
        It computes subtoken frequency first. 
        After then, it builds lr-graph with sub-tokens appeared at least min count
        """

        _ckpt = max(1, len(sents) // 40)  # guard against modulo-by-zero when len(sents) < 40

        wordset_l = defaultdict(lambda: 0)
        wordset_r = defaultdict(lambda: 0)

        for i, sent in enumerate(sents):
            for token in sent.split(' '):
                if not token:
                    continue
                token_len = len(token)
                for e in range(1, min(self.max_left_length, token_len) + 1):
                    wordset_l[token[:e]] += 1
                for e in range(1, min(self.max_right_length, token_len)):
                    wordset_r[token[-e:]] += 1
            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                        100.0 * i / len(sents), '%', get_process_memory())
                sys.stdout.write('\rscanning: %s%s (%.3f %s) %.3f Gb' % args)

        wordset_l = {
            w
            for w, f in wordset_l.items() if f >= self.min_frequency
        }
        wordset_r = {
            w
            for w, f in wordset_r.items() if f >= self.min_frequency
        }
        if self.verbose:
            print('\rscanning completed')
            print('(L,R) has (%d, %d) tokens. memory = %.3f Gb' %
                  (len(wordset_l), len(wordset_r), get_process_memory()))

        return wordset_l, wordset_r
Code example #6
File: _tokenizer_builder.py Project: yongduek/soynlp
    def _build_graph(self, sents, wordset_l, wordset_r):
        self.wordset_l = wordset_l
        self.wordset_r = wordset_r
        self.wordset_r.add('')
        _ckpt = max(1, len(sents) // 40)  # guard against modulo-by-zero when len(sents) < 40
        lrgraph = defaultdict(lambda: defaultdict(lambda: 0))
        rlgraph = defaultdict(lambda: defaultdict(lambda: 0))

        for i, sent in enumerate(sents):
            for token in sent.split():
                if not token:
                    continue
                token_len = len(token)
                for e in range(1, min(self.left_max_length, token_len) + 1):
                    l = token[:e]
                    r = token[e:]
                    if (not l in wordset_l) or (not r in wordset_r):
                        continue
                    lrgraph[l][r] += 1
                    rlgraph[r][l] += 1

            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                        100.0 * i / len(sents), '%', get_process_memory())
                sys.stdout.write(
                    '\rbuilding lr-graph: %s%s (%.3f %s), memory = %.3f Gb' %
                    args)
        if self.verbose:
            sys.stdout.write(
                '\rbuilding lr-graph completed. memory = %.3f Gb' %
                get_process_memory())

        lrgraph = {
            l: {r: f
                for r, f in rdict.items()}
            for l, rdict in lrgraph.items()
        }
        rlgraph = {
            r: {l: f
                for l, f in ldict.items()}
            for r, ldict in rlgraph.items()
        }
        return lrgraph, rlgraph
Code example #7
    def _train_with_lrgraph(self, lrgraph, num_of_eojeols=-1):
        self.lrgraph = lrgraph
        self._num_of_covered_eojeols = 0

        if num_of_eojeols == -1:
            num_of_eojeols = lrgraph.to_EojeolCounter()._count_sum
        self._num_of_eojeols = num_of_eojeols

        if self.verbose:
            print('[Noun Extractor] has been trained. #eojeols={}, mem={} Gb'.
                  format(num_of_eojeols, '%.3f' % get_process_memory()))
Code example #8
File: _noun_ver2.py Project: tobby2002/soynlp
    def extract(self, minimum_noun_score=0.3, min_count=1, reset_lrgraph=True):

        # reset covered eojeol count
        self._num_of_covered_eojeols = 0

        # base prediction
        noun_candidates = self._noun_candidates_from_positive_features()
        prediction_scores = self._batch_prediction_order_by_word_length(
            noun_candidates, minimum_noun_score)

        # E = N*J+ or N*Posi+
        if self.extract_compound:
            candidates = {l:sum(rdict.values()) for l,rdict in
                self.lrgraph._lr.items() if len(l) >= 4}
            compounds = self.extract_compounds(
                candidates, prediction_scores, minimum_noun_score)
        else:
            compounds = {}

        # combine single nouns and compounds
        nouns = {noun:score for noun, score in prediction_scores.items()
            if score[0] >= minimum_noun_score}
        nouns.update(compounds)

        # frequency filtering
        nouns = {noun:score for noun, score in nouns.items()
            if score[1] >= min_count}

        nouns = self._post_processing(nouns, prediction_scores, compounds)

        if self.verbose:
            print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format(
                len(nouns), len(compounds), min_count), flush=True)

            coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                / self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True)

        if self.verbose:
            print('[Noun Extractor] flushing ... ', flush=True, end='')

        self._nouns = nouns
        if reset_lrgraph:
            # when extracting predicates, do not reset lrgraph.
            # the remained lrgraph is predicate (root - ending) graph
            self.lrgraph.reset_lrgraph()
        if self.verbose:
            print('done. mem={} Gb'.format('%.3f'%get_process_memory()))

        nouns_ = {noun:NounScore(score[1], score[0]) for noun, score in nouns.items()}
        return nouns_
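For context, a rough end-to-end usage sketch of the two methods shown from this file (train and extract). The wrapping class is assumed to be soynlp's LRNounExtractor_v2 (inferred from the _noun_ver2.py file name), and the corpus below is a hypothetical placeholder; in practice it would be a large list of sentences.

from soynlp.noun import LRNounExtractor_v2    # assumed class name

sentences = ['아이오아이는 너무 좋아요', '아이오아이 노래 추천해주세요']   # toy corpus
noun_extractor = LRNounExtractor_v2(verbose=False)
noun_extractor.train(sentences)               # builds the eojeol counter and LR-graph
nouns = noun_extractor.extract(minimum_noun_score=0.3, min_count=1)
# nouns maps each extracted noun to NounScore(frequency, score)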
Code example #9
File: _pmi.py Project: ywkim/soynlp
def pmi(x, min_pmi=0, alpha=0.0001, verbose=False):
    # convert x to probability matrix & marginal probability
    px = (x.sum(axis=1) / x.sum()).reshape(-1)
    py = (x.sum(axis=0) / x.sum()).reshape(-1)
    pxy = x / x.sum()

    # transform px and py to diagonal matrix
    # using scipy.sparse.diags
    px_diag = diags(px.tolist()[0])
    py_diag = diags((py).tolist()[0])

    # pmi_alpha (x,y) = p(x,y) / ( p(x) x (p(y) + alpha) )
    px_diag.data[0] = np.asarray(
        [0 if v == 0 else 1 / v for v in px_diag.data[0]])
    py_diag.data[0] = np.asarray(
        [0 if v == 0 else 1 / (v + alpha) for v in py_diag.data[0]])

    exp_pmi = px_diag.dot(pxy).dot(py_diag)

    # PPMI using threshold
    min_exp_pmi = 1 if min_pmi == 0 else np.exp(min_pmi)

    # because exp_pmi is sparse matrix and type of exp_pmi.data is numpy.ndarray
    indices = np.where(exp_pmi.data > min_exp_pmi)[0]

    pmi_dok = dok_matrix(exp_pmi.shape)

    # prepare data (rows, cols, data)
    rows, cols = exp_pmi.nonzero()
    data = exp_pmi.data

    # enumerate function for printing status
    for _n_idx, idx in enumerate(indices):
        # print current status
        if verbose and _n_idx % 10000 == 0:
            print('\rcomputing pmi {:.3} %  mem={} Gb    '.format(
                100 * _n_idx / indices.shape[0],
                '%.3f' % get_process_memory()),
                  flush=True,
                  end='')
        # apply logarithm
        pmi_dok[rows[idx], cols[idx]] = np.log(data[idx])
    if verbose:
        print('\rcomputing pmi was done{}'.format(' ' * 30), flush=True)

    return pmi_dok
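A hedged usage sketch of the function above on toy co-occurrence counts (hypothetical values): pmi turns a sparse count matrix into a positive-PMI weighted matrix, leaving cells whose exponentiated PMI does not exceed the threshold at zero. It assumes the imports used by pmi (numpy, scipy.sparse.diags, dok_matrix) are available in the same module.

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[10., 0., 2.],
                         [ 0., 5., 1.],
                         [ 2., 1., 8.]]))      # toy (word, context) count matrix
ppmi = pmi(X, min_pmi=0, alpha=0.0001)
print(ppmi.toarray())                          # only cells with PMI > min_pmi get a log value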
Code example #10
File: _predicator.py Project: ysseo91/soynlp
    def _train_with_eojeol_counter(self, eojeol_counter, min_eojeol_frequency=2):
        eojeol_counter._counter = {
            eojeol:count for eojeol, count in eojeol_counter._counter.items()
            if count >= min_eojeol_frequency}

        eojeol_counter._set_count_sum()
        self._num_of_eojeols = len(eojeol_counter)
        self._num_of_covered_eojeols = 0
        self._count_of_eojeols = eojeol_counter._count_sum
        self._count_of_covered_eojeols = 0

        self.eojeol_counter = eojeol_counter

        if self.verbose:
            mem = '%.3f' % get_process_memory()
            message = '#eojeols={}, mem={} Gb'.format(self._num_of_eojeols, mem)
            self._print(message, replace=True, newline=True)
Code example #11
File: _pmi.py Project: tobby2002/soynlp
def pmi(x, min_pmi=0, alpha=0.0001, verbose=False):
    # convert x to probability matrix & marginal probability 
    px = (x.sum(axis=1) / x.sum()).reshape(-1)
    py = (x.sum(axis=0) / x.sum()).reshape(-1)
    pxy = x / x.sum()
    
    # transform px and py to diagonal matrix
    # using scipy.sparse.diags
    px_diag = diags(px.tolist()[0])
    py_diag = diags((py).tolist()[0])
    
    # pmi_alpha (x,y) = p(x,y) / ( p(x) x (p(y) + alpha) )
    px_diag.data[0] = np.asarray([0 if v == 0 else 1/v for v in px_diag.data[0]])
    py_diag.data[0] = np.asarray([0 if v == 0 else 1/(v + alpha) for v in py_diag.data[0]])
    
    exp_pmi = px_diag.dot(pxy).dot(py_diag)
    
    # PPMI using threshold
    min_exp_pmi = 1 if min_pmi == 0 else np.exp(min_pmi)

    # because exp_pmi is sparse matrix and type of exp_pmi.data is numpy.ndarray
    indices = np.where(exp_pmi.data > min_exp_pmi)[0]

    pmi_dok = dok_matrix(exp_pmi.shape)

    # prepare data (rows, cols, data)
    rows, cols = exp_pmi.nonzero()
    data = exp_pmi.data

    # enumerate function for printing status
    for _n_idx, idx in enumerate(indices):
        # print current status        
        if verbose and _n_idx % 10000 == 0:
            print('\rcomputing pmi {:.3} %  mem={} Gb    '.format(
                100 * _n_idx / indices.shape[0], '%.3f' % get_process_memory())
                  , flush=True, end='')
        # apply logarithm
        pmi_dok[rows[idx], cols[idx]] = np.log(data[idx])
    if verbose:
        print('\rcomputing pmi was done{}'.format(' '*30), flush=True)

    return pmi_dok
Code example #12
File: _noun_ver2.py Project: tobby2002/soynlp
    def train(self, sentences, min_eojeol_count=1):

        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        eojeol_counter = EojeolCounter(sentences, min_eojeol_count,
            max_length=self.l_max_length + self.r_max_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose)
        self._num_of_eojeols = eojeol_counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')
        self.lrgraph = eojeol_counter.to_lrgraph(
            self.l_max_length, self.r_max_length)

        if self.verbose:
            print('[Noun Extractor] has been trained. mem={} Gb'.format(
                '%.3f'%get_process_memory()))
Code example #13
File: _noun_ver2.py Project: keep-steady/soynlp
    def _check_covered_eojeols(self, nouns):

        self.lrgraph.reset_lrgraph()

        noun_candidates = self._noun_candidates_from_positive_features()

        n = len(noun_candidates)
        for i, word in enumerate(sorted(noun_candidates,
                                        key=lambda x: -len(x))):

            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i + 1) / n)
                print(
                    '\r[Noun Extractor] flushing ...  {} %'.format(percentage),
                    flush=True,
                    end='')

            if not (word in nouns):
                continue

            if len(word) > 1:
                for r, count in self.lrgraph.get_r(word, -1):
                    # remove all eojeols that including word at left-side.
                    # we have to assume that pos, neg features are incomplete
                    self.lrgraph.remove_eojeol(word + r, count)
                    self._num_of_covered_eojeols += count
            else:
                # a single-syllable noun is an exception; remove only noun + positive feature
                for r, count in self.lrgraph.get_r(word, -1):
                    if (r == '' or (r in self._pos_features)
                            or (r in self._common_features)):
                        self.lrgraph.remove_eojeol(word + r, count)
                        self._num_of_covered_eojeols += count

        if self.verbose:
            print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format(
                '%.3f' % get_process_memory(), ' ' * 20),
                  flush=True)
            coverage = '%.2f' % (100 * self._num_of_covered_eojeols /
                                 self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
                  flush=True)
Code example #14
File: _noun_ver2.py Project: head1ton/soynlp
    def train(self, sentences):

        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        eojeol_counter = EojeolCounter(sentences, self.min_eojeol_count,
            max_length=self.l_max_length + self.r_max_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose)

        self._num_of_eojeols = eojeol_counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')

        self.lrgraph = eojeol_counter.to_lrgraph(
            self.l_max_length, self.r_max_length)

        if self.verbose:
            print('[Noun Extractor] has been trained. mem={} Gb'.format(
                '%.3f'%get_process_memory()))
Code example #15
    def _scan_vocabulary(self, sents):
        """
        Parameters
        ----------
            sents: list-like iterable object which has string
            
        It computes subtoken frequency first. 
        After then, it builds lr-graph with sub-tokens appeared at least min count
        """
        
        _ckpt = max(1, len(sents) // 40)  # guard against modulo-by-zero when len(sents) < 40
        
        wordset_l = defaultdict(lambda: 0)
        wordset_r = defaultdict(lambda: 0)
        
        for i, sent in enumerate(sents):
            for token in sent.split(' '):
                if not token:
                    continue
                token_len = len(token)
                for e in range(1, min(self.left_max_length, token_len)+1):
                    wordset_l[token[:e]] += 1
                for e in range(1, min(self.right_max_length, token_len)):
                    wordset_r[token[-e:]] += 1
            if self.verbose and (i % _ckpt == 0):
                args = ('#' * int(i/_ckpt), '-' * (40 - int(i/_ckpt)), 100.0 * i / len(sents), '%', get_process_memory())
                sys.stdout.write('\rscanning: %s%s (%.3f %s) %.3f Gb' % args)
            
        wordset_l = {w for w,f in wordset_l.items() if f >= self.min_count}
        wordset_r = {w for w,f in wordset_r.items() if f >= self.min_count}
        if self.verbose:
            print('\rscanning completed')
            print('(L,R) has (%d, %d) tokens. memory = %.3f Gb' % (len(wordset_l), len(wordset_r), get_process_memory()))

        return wordset_l, wordset_r
Code example #16
File: _word_context.py Project: tobby2002/soynlp
def sent_to_word_context_matrix(sents, windows=3, min_tf=10,
        tokenizer=lambda x:x.split(), verbose=True):

    # counting word frequency, first
    word_counter = defaultdict(int)
    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rcounting word frequency from {} sents, mem={} Gb'.format(
                i_sent, '%.3f' % get_process_memory()), flush=True, end='')
        words = tokenizer(sent)
        for word in words:
            word_counter[word] += 1
    if verbose:
        print('\rcounting word frequency from {} sents was done. mem={} Gb'.format(
            i_sent, '%.3f' % get_process_memory()), flush=True, end='')
    
    # filtering with min_tf    
    vocabulary = {word for word, count in word_counter.items() if count >= min_tf}
    vocabulary = {word:idx for idx, word in enumerate(sorted(vocabulary, key=lambda w:-word_counter[w]))}
    idx2vocab = [word for word, _ in sorted(vocabulary.items(), key=lambda w:w[1])]
    del word_counter

    # scanning (word, context) pairs
    base2contexts = defaultdict(lambda: defaultdict(int))

    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rscanning (word, context) pairs from {} sents, mem={} Gb'.format(
                i_sent, '%.3f' % get_process_memory()), flush=True, end='')

        words = tokenizer(sent)
        if not words:
            continue

        n = len(words)

        for i, base in enumerate(words):
            if not (base in vocabulary):
                continue

            # left_contexts
            for context in words[max(0, i-windows):i]:
                if not (context in vocabulary):
                    continue
                base2contexts[base][context] += 1

            # right_contexts
            for context in words[min(i+1, n):min(i+windows, n)+1]:
                if not (context in vocabulary):
                    continue
                base2contexts[base][context] += 1

    if verbose:
        print('\rscanning (word, context) pairs from {} sents was done. mem={} Gb'.format(
            i_sent + 1, '%.3f' % get_process_memory()), flush=True, end='')

    # Encoding dict to vectors
    rows = []
    cols = []
    data = []
    for base, contexts in base2contexts.items():
        base_idx = vocabulary[base]
        for context, cooccurrence in contexts.items():
            context_idx = vocabulary[context]
            rows.append(base_idx)
            cols.append(context_idx)
            data.append(cooccurrence)
    x = csr_matrix((data, (rows, cols)))

    if verbose:
        print('\r(word, context) matrix was constructed. shape = {}{}'.format(
            x.shape, ' '*20))

    return x, idx2vocab
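A rough usage sketch on a hypothetical toy corpus (window and min_tf values chosen only for illustration); together with the pmi function shown earlier, this yields a PPMI-weighted word-context matrix. It assumes the function above and its imports (defaultdict, csr_matrix) are in scope.

sents = ['a b c a b', 'b c d a', 'a d a b'] * 20      # toy corpus
X, idx2vocab = sent_to_word_context_matrix(
    sents, windows=2, min_tf=10, verbose=False)
print(X.shape)         # square co-occurrence count matrix over the kept vocabulary
print(idx2vocab)       # vocabulary sorted by descending frequency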
Code example #17
def _print_status(message, i_sent, new_line=False):
    print('\r{} from {} sents, mem={} Gb'.format(message, i_sent, '%.3f' %
                                                 get_process_memory()),
          flush=True,
          end='\n' if new_line else '')
Code example #18
File: _pmi.py Project: zeroday0619/soynlp
def pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0.0, beta=1.0, verbose=False):
    """
    :param X: scipy.sparse.csr_matrix
        (word, contexts) sparse matrix
    :param py: numpy.ndarray
        (1, word) shape, probability of context words.
    :param min_pmi: float
        Minimum value of pmi. all the values that smaller than min_pmi
        are reset to zero.
        Default is zero.
    :param alpha: float
        Smoothing factor. pmi(x,y; alpha) = p_xy /(p_x * (p_y + alpha))
        Default is 0.0
    :param beta: float
        Smoothing factor. pmi(x,y) = log ( Pxy / (Px x Py^beta) )
        Default is 1.0
    :param verbose: Boolean
        If True, verbose mode on

    Returns
    ----------
    pmi : scipy.sparse.dok_matrix
        (word, contexts) pmi value sparse matrix
    px : numpy.ndarray
        Probability of rows (items)
    py : numpy.ndarray
        Probability of columns (features)

    Usage
    -----
        >>> pmi, px, py = pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0, beta=1.0)
    """

    assert 0 < beta <= 1

    # convert x to probability matrix & marginal probability 
    px = (X.sum(axis=1) / X.sum()).reshape(-1)
    if py is None:
        py = (X.sum(axis=0) / X.sum()).reshape(-1)
    pxy = X / X.sum()

    assert py.shape[0] == pxy.shape[1]

    if beta < 1:
        py = py ** beta
        py /= py.sum()

    # transform px and py to diagonal matrix
    # using scipy.sparse.diags
    px_diag = diags(px.tolist()[0])
    py_diag = diags((py).tolist()[0])
    
    # pmi_alpha (x,y) = p(x,y) / ( p(x) x (p(y) + alpha) )
    px_diag.data[0] = np.asarray([0 if v == 0 else 1/v for v in px_diag.data[0]])
    py_diag.data[0] = np.asarray([0 if v == 0 else 1/(v + alpha) for v in py_diag.data[0]])
    
    exp_pmi = px_diag.dot(pxy).dot(py_diag)
    
    # PPMI using threshold
    min_exp_pmi = 1 if min_pmi == 0 else np.exp(min_pmi)

    # because exp_pmi is sparse matrix and type of exp_pmi.data is numpy.ndarray
    indices = np.where(exp_pmi.data > min_exp_pmi)[0]

    pmi_dok = dok_matrix(exp_pmi.shape)

    # prepare data (rows, cols, data)
    rows, cols = exp_pmi.nonzero()
    data = exp_pmi.data

    # enumerate function for printing status
    for _n_idx, idx in enumerate(indices):
        # print current status        
        if verbose and _n_idx % 10000 == 0:
            print('\rcomputing pmi {:.3} %  mem={} Gb    '.format(
                100 * _n_idx / indices.shape[0], '%.3f' % get_process_memory())
                  , flush=True, end='')
        # apply logarithm
        pmi_dok[rows[idx], cols[idx]] = np.log(data[idx])
    if verbose:
        print('\rcomputing pmi was done{}'.format(' '*30), flush=True)

    return pmi_dok, px, py
Code example #19
File: _predicator.py Project: songys/soynlp
    def _train_with_sentences(self,
                              sentences,
                              min_eojeol_count=2,
                              filtering_checkpoint=100000):

        check = filtering_checkpoint > 0

        if self.verbose:
            message = 'counting eojeols'
            self._print(message, replace=False, newline=False)

        # Eojeol counting
        counter = {}

        def contains_noun(eojeol, n):
            for e in range(2, n + 1):
                if eojeol[:e] in self.nouns:
                    return True
            return False

        for i_sent, sent in enumerate(sentences):

            if check and i_sent > 0 and i_sent % filtering_checkpoint == 0:
                counter = {
                    eojeol: count
                    for eojeol, count in counter.items()
                    if count >= min_eojeol_count
                }

            if self.verbose and i_sent % 100000 == 99999:
                message = 'n eojeol = {} from {} sents. mem={} Gb{}'.format(
                    len(counter), i_sent + 1, '%.3f' % get_process_memory(),
                    ' ' * 20)
                self._print(message, replace=True, newline=False)

            for eojeol in sent.split():

                n = len(eojeol)

                if n <= 1 or contains_noun(eojeol, n):
                    continue

                counter[eojeol] = counter.get(eojeol, 0) + 1

        if self.verbose:
            message = 'counting eojeols was done. {} eojeols, mem={} Gb{}'.format(
                len(counter), '%.3f' % get_process_memory(), ' ' * 20)
            self._print(message, replace=True, newline=True)

        counter = {
            eojeol: count
            for eojeol, count in counter.items() if count >= min_eojeol_count
        }

        self._num_of_eojeols = sum(counter.values())
        self._num_of_covered_eojeols = 0

        if self.verbose:
            message = 'complete eojeol counter -> lr graph'
            self._print(message, replace=False, newline=True)

        self.lrgraph = EojeolCounter()._to_lrgraph(counter,
                                                   l_max_length=10,
                                                   r_max_length=9)

        if self.verbose:
            message = 'has been trained. mem={} Gb'.format(
                '%.3f' % get_process_memory())
            self._print(message, replace=False, newline=True)
Code example #20
File: _word_context.py Project: tobby2002/soynlp
def sent_to_word_context_matrix(sents,
                                windows=3,
                                min_tf=10,
                                tokenizer=lambda x: x.split(),
                                verbose=True):

    # counting word frequency, first
    word_counter = defaultdict(int)
    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rcounting word frequency from {} sents, mem={} Gb'.format(
                i_sent, '%.3f' % get_process_memory()),
                  flush=True,
                  end='')
        words = tokenizer(sent)
        for word in words:
            word_counter[word] += 1
    if verbose:
        print('\rcounting word frequency from {} sents was done. mem={} Gb'.
              format(i_sent, '%.3f' % get_process_memory()),
              flush=True,
              end='')

    # filtering with min_tf
    vocabulary = {
        word
        for word, count in word_counter.items() if count >= min_tf
    }
    vocabulary = {
        word: idx
        for idx, word in enumerate(
            sorted(vocabulary, key=lambda w: -word_counter[w]))
    }
    idx2vocab = [
        word for word, _ in sorted(vocabulary.items(), key=lambda w: w[1])
    ]
    del word_counter

    # scanning (word, context) pairs
    base2contexts = defaultdict(lambda: defaultdict(int))

    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rscanning (word, context) pairs from {} sents, mem={} Gb'.
                  format(i_sent, '%.3f' % get_process_memory()),
                  flush=True,
                  end='')

        words = tokenizer(sent)
        if not words:
            continue

        n = len(words)

        for i, base in enumerate(words):
            if not (base in vocabulary):
                continue

            # left_contexts
            for context in words[max(0, i - windows):i]:
                if not (context in vocabulary):
                    continue
                base2contexts[base][context] += 1

            # right_contexts
            for context in words[min(i + 1, n):min(i + windows, n) + 1]:
                if not (context in vocabulary):
                    continue
                base2contexts[base][context] += 1

    if verbose:
        print(
            '\rscanning (word, context) pairs from {} sents was done. mem={} Gb'
            .format(i_sent + 1, '%.3f' % get_process_memory()),
            flush=True,
            end='')

    # Encoding dict to vectors
    rows = []
    cols = []
    data = []
    for base, contexts in base2contexts.items():
        base_idx = vocabulary[base]
        for context, cooccurrence in contexts.items():
            context_idx = vocabulary[context]
            rows.append(base_idx)
            cols.append(context_idx)
            data.append(cooccurrence)
    x = csr_matrix((data, (rows, cols)))

    if verbose:
        print('\r(word, context) matrix was constructed. shape = {}{}'.format(
            x.shape, ' ' * 20))

    return x, idx2vocab
Code example #21
    def extract(self, minimum_noun_score=0.3, min_count=1, reset_lrgraph=True):

        # reset covered eojeol count
        self._num_of_covered_eojeols = 0

        # base prediction
        noun_candidates = self._noun_candidates_from_positive_features()
        prediction_scores = self._batch_prediction_order_by_word_length(
            noun_candidates, minimum_noun_score)

        # E = N*J+ or N*Posi+
        if self.extract_compound:
            candidates = {
                l: sum(rdict.values())
                for l, rdict in self.lrgraph._lr.items() if len(l) >= 4
            }
            compounds = self.extract_compounds(candidates, prediction_scores,
                                               minimum_noun_score)
        else:
            compounds = {}

        # combine single nouns and compounds
        nouns = {
            noun: score
            for noun, score in prediction_scores.items()
            if score[0] >= minimum_noun_score
        }
        nouns.update(compounds)

        # frequency filtering
        nouns = {
            noun: score
            for noun, score in nouns.items() if score[1] >= min_count
        }

        nouns = self._post_processing(nouns, prediction_scores, compounds)

        if self.verbose:
            print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.
                  format(len(nouns), len(compounds), min_count),
                  flush=True)

            coverage = '%.2f' % (100 * self._num_of_covered_eojeols /
                                 self._num_of_eojeols)
            print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
                  flush=True)

        if self.verbose:
            print('[Noun Extractor] flushing ... ', flush=True, end='')

        self._nouns = nouns
        if reset_lrgraph:
            # when extracting predicates, do not reset lrgraph.
            # the remained lrgraph is predicate (root - ending) graph
            self.lrgraph.reset_lrgraph()
        if self.verbose:
            print('done. mem={} Gb'.format('%.3f' % get_process_memory()))

        nouns_ = {
            noun: NounScore(score[1], score[0])
            for noun, score in nouns.items()
        }
        return nouns_