def pmi_test(corpus_path):
    """Smoke-test the soynlp PMI pipeline on a corpus and print sample pairs.

    Steps performed:
      1. Load ``corpus_path`` as a ``DoublespaceLineCorpus`` iterating over
         sentences and print the sentence count.
      2. Train a ``WordExtractor`` and build an ``LTokenizer`` from the first
         element of every cohesion score tuple.
      3. Build a word-context matrix with that tokenizer
         (``windows=3``, ``min_tf=10``).
      4. Compute PMI and print the pairs ranked 100..109 by descending value.

    NOTE(review): this variant expects ``pmi`` to return a mapping of
    ``(row, col) -> value`` (it calls ``.items()`` on the result) and to
    accept a ``verbose`` keyword -- confirm against the installed soynlp.
    NOTE(review): a second ``pmi_test`` is defined later in this file; when
    both are present the later definition replaces this one at import time.

    Args:
        corpus_path: Path handed to ``DoublespaceLineCorpus``.

    Returns:
        None. Results are written to stdout.
    """
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    # Only the first component of each score tuple is fed to the L-tokenizer.
    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    # Distinct names are used for the loop variables so that neither the
    # imported ``pmi`` function nor the matrix ``x`` gets rebound.
    ranked = sorted(pmi_dok.items(), key=lambda item: -item[1])
    for pair, value in ranked[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, value))

    print('computed PMI')
def pmi_test(corpus_path):
    """Smoke-test the soynlp PMI pipeline on a corpus and print sample pairs.

    Steps performed:
      1. Load ``corpus_path`` as a ``DoublespaceLineCorpus`` iterating over
         sentences and print the sentence count.
      2. Train a ``WordExtractor`` and build an ``LTokenizer`` from the first
         element of every cohesion score tuple.
      3. Build a word-context matrix with that tokenizer
         (``windows=3``, ``min_tf=10``).
      4. Compute the PMI matrix and print the stored entries ranked from the
         150th to the 101st largest value, in ascending order.

    NOTE(review): ``pmi`` is expected to return a 3-tuple whose first item
    is a scipy sparse matrix; only that first item is used here -- confirm
    against the installed soynlp.

    Args:
        corpus_path: Path handed to ``DoublespaceLineCorpus``.

    Returns:
        None. Results are written to stdout.
    """
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    # Only the first component of each score tuple is fed to the L-tokenizer.
    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    # The two extra return values are not needed; discard them instead of
    # rebinding the co-occurrence matrix ``x``.
    x_pmi, _, _ = pmi(x, min_pmi=0, alpha=0.0001)

    # Take indices and values from one COO view so that position k in
    # rows/cols/data always refers to the same stored entry. Pairing
    # ``nonzero()`` with ``.data`` misaligns as soon as the matrix holds an
    # explicitly stored zero, because ``nonzero()`` filters those out.
    coo = x_pmi.tocoo()
    rows, cols, data = coo.row, coo.col, coo.data
    print('row shape = {}'.format(rows.shape))
    print('col shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    # argsort is ascending, so this slice covers ranks 150..101 from the top.
    for indpt in data.argsort()[-150:-100]:
        pair = (idx2vocab[rows[indpt]], idx2vocab[cols[indpt]])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))

    print('computed pmi')