Example #1
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # default: lambda x: x.split()
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    # rename the loop variable so it does not shadow the imported pmi function
    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))
    print('computed PMI')
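For reference, a toy sketch of the PMI computation itself on a made-up co-occurrence matrix; soynlp's pmi works on the sparse matrix directly and additionally applies the alpha smoothing used above:

import numpy as np
from scipy.sparse import csr_matrix

# Made-up 3 x 3 word-context co-occurrence counts.
X = csr_matrix(np.array([
    [10., 0., 2.],
    [0., 5., 1.],
    [2., 1., 8.]]))

total = X.sum()
p_xy = X.toarray() / total                       # joint p(x, y)
p_x = np.asarray(X.sum(axis=1)).ravel() / total  # row marginal p(x)
p_y = np.asarray(X.sum(axis=0)).ravel() / total  # column marginal p(y)

with np.errstate(divide='ignore'):
    pmi_matrix = np.log(p_xy / np.outer(p_x, p_y))
pmi_matrix[pmi_matrix < 0] = 0                   # keep positive PMI only (min_pmi=0)
print(np.round(pmi_matrix, 3))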
Example #2
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor
    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(newsnoun_scores,
                      key=lambda x: -newsnoun_scores[x].score *
                      newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word,
                                             newsnoun_scores[word].score))
    print('noun extractor test is done\n\n')
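A follow-up sketch: filtering the extracted nouns by score and frequency. The thresholds here are arbitrary examples, not soynlp defaults:

def filter_nouns(noun_scores, min_score=0.3, min_frequency=10):
    # noun_scores maps word -> NounScore(frequency, score),
    # as returned by train_extract() above.
    return {word: ns for word, ns in noun_scores.items()
            if ns.score >= min_score and ns.frequency >= min_frequency}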
Example #3
def word_extractor_test(corpus_path):
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    word_scores = word_extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    topwords = sorted(word_scores,
                      key=lambda x: -word_scores[x].cohesion_forward *
                      word_scores[x].leftside_frequency)[:20]
    for word in topwords:
        print('word = {}, cohesion = {}'.format(
            word, word_scores[word].cohesion_forward))
    print('word extractor test is done\n\n')
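The ranking above uses the forward cohesion score. A toy re-implementation of that score, the geometric mean of character-transition probabilities that soynlp uses, with a made-up substring frequency table:

def cohesion_forward(word, counts):
    # cohesion(c_1..c_n) = (P(c_1..c_n) / P(c_1)) ** (1 / (n - 1))
    # counts: substring -> corpus frequency (toy values below)
    n = len(word)
    if n < 2 or counts.get(word[0], 0) == 0:
        return 0.0
    return (counts.get(word, 0) / counts[word[0]]) ** (1 / (n - 1))

counts = {'아': 100, '아이': 60, '아이오': 55, '아이오아': 50, '아이오아이': 50}
print(cohesion_forward('아이오아이', counts))  # substrings nearly always extend -> high cohesion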
Example #4
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # default: lambda x: x.split()
        dynamic_weight=False,
        verbose=True)

    x_pmi, px, py = pmi(x, min_pmi=0, alpha=0.0001)  # returns (pmi, px, py); avoid clobbering x

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row  shape = {}'.format(rows.shape))
    print('col  shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))
    print('computed pmi')
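Building on x_pmi and idx2vocab from this example, a hypothetical helper (not part of soynlp) that lists the strongest PMI contexts of a single word:

def most_similar_contexts(word, x_pmi, idx2vocab, topk=5):
    # x_pmi is the sparse PMI matrix computed above; rank the nonzero
    # entries of the word's row by descending PMI value.
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
    if word not in vocab2idx:
        return []
    row = x_pmi.getrow(vocab2idx[word]).tocoo()
    pairs = sorted(zip(row.col, row.data), key=lambda t: -t[1])[:topk]
    return [(idx2vocab[j], value) for j, value in pairs]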
Example #5
import os
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

from gensim.test.utils import common_texts, get_tmpfile

if __name__ == '__main__':
    # Load Data
    file_name = 'wikiPlotText_m.txt'

    # Morphological analyzer (tokenizer) using Korean cohesion scores
    corpus = DoublespaceLineCorpus(file_name, iter_sent=True)
    word_extractor = WordExtractor()  # the corpus goes to train(), not the constructor
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    tokenizer = LTokenizer(scores=scores)

    # fh is a project-specific helper module (not shown in this snippet);
    # it returns a dict like {'games':games, 'corpus':corpus, 'titles':titles}
    games_data = fh.getGamesData(fh.getStoragePath() + file_name, tokenizer=tokenizer)
    tokenized_contents = games_data['corpus_words']

    # Vectorizing
    # sg=1(skip-gram), 0(CBOW)
    model_path = 'models/word2vec_ko.model'
    if os.path.isfile(model_path):
        word2vec_model = Word2Vec.load(model_path)
    else:
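        # The original example is cut off here. Hypothetical completion: train
        # on the tokenized sentences and cache the model, assuming gensim's
        # current Word2Vec API; all hyperparameters are illustrative
        # placeholders, not values from the original.
        word2vec_model = Word2Vec(
            sentences=tokenized_contents,
            vector_size=100,   # called 'size' in gensim < 4.0
            window=5,
            min_count=5,
            sg=1)              # skip-gram, per the comment above
        word2vec_model.save(model_path)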
Example #6
from soynlp.noun import NewsNounExtractor
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"

corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n'%(n_sent, sent, '' ))

we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())
'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)
print(' '.join(nouns.keys()))
'''
#top = sorted(nouns.items(), key=lambda x:-x[1].frequency)[:1]
#print(top)
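A final sketch, one common ranking recipe from the soynlp tutorials: combine forward cohesion with right branching entropy. It assumes scores maps each word to a soynlp Scores namedtuple, as in the examples above:

import math

def rank_words(scores, topk=20):
    # score(w) = cohesion_forward(w) * exp(right_branching_entropy(w))
    def key(item):
        _, s = item
        return s.cohesion_forward * math.exp(s.right_branching_entropy)
    return sorted(scores.items(), key=key, reverse=True)[:topk]

for word, s in rank_words(scores):
    print(word, round(s.cohesion_forward, 3))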