Example #1
from sklearn.decomposition import TruncatedSVD
from soynlp.vectorizer import sent_to_word_contexts_matrix
from soynlp.word import pmi


def latent_semantic_analysis(corpus_fname, output_fname):
    make_save_path(output_fname)  # project-local helper (assumed to create the output directory)
    corpus = [
        sent.replace('\n', '').strip()
        for sent in open(corpus_fname, 'r').readlines()
    ]
    # construct the word-context co-occurrence matrix
    # with dynamic_weight=True, co-occurrence weights decay as [1, (w-1)/w, (w-2)/w, ..., 1/w]
    input_matrix, idx2vocab = sent_to_word_contexts_matrix(corpus,
                                                           windows=3,
                                                           min_tf=10,
                                                           dynamic_weight=True,
                                                           verbose=True)
    # compute truncated SVD
    cooc_svd = TruncatedSVD(n_components=100)
    cooc_vecs = cooc_svd.fit_transform(input_matrix)
    with open(output_fname + "-cooc.vecs", 'w') as f1:
        for word, vec in zip(idx2vocab, cooc_vecs):
            str_vec = [str(el) for el in vec]
            f1.writelines(word + ' ' + ' '.join(str_vec) + "\n")
    # shifted PPMI with shift k=0 (equivalent to plain PPMI)
    # pmi(word, context)
    # px: probability of rows (words)
    # py: probability of columns (context features)
    pmi_matrix, _, _ = pmi(input_matrix, min_pmi=0, alpha=0)
    # compute truncated SVD on the PPMI matrix
    pmi_svd = TruncatedSVD(n_components=100)
    pmi_vecs = pmi_svd.fit_transform(pmi_matrix)
    with open(output_fname + "-pmi.vecs", 'w') as f2:
        for word, vec in zip(idx2vocab, pmi_vecs):
            str_vec = [str(el) for el in vec]
            f2.writelines(word + ' ' + ' '.join(str_vec) + "\n")
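The function above saves one word per line followed by its 100-dimensional vector. A minimal usage sketch, assuming a hypothetical corpus file and output prefix (neither path is part of the original example):

import numpy as np

# hypothetical paths, for illustration only
latent_semantic_analysis('data/tokenized_corpus.txt', 'data/lsa/vectors')

# read the saved PPMI-SVD vectors back into a {word: vector} dict
word_vecs = {}
with open('data/lsa/vectors-pmi.vecs', 'r') as f:
    for line in f:
        tokens = line.strip().split()
        word_vecs[tokens[0]] = np.array([float(v) for v in tokens[1:]])
print('loaded {} word vectors'.format(len(word_vecs)))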
Example #2
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    # NOTE: this assumes an older soynlp release where pmi() returned a dict-like
    # dok matrix; recent versions return a (pmi_matrix, px, py) tuple (see Example #4)
    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    # print the pairs ranked 101st-110th by PMI
    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))
    print('computed PMI')
Example #3
    def train(self, sents):
        # construct word - context matrix
        self.x, self.idx2vocab = sent_to_word_contexts_matrix(
            sents,
            windows=self.windows,
            min_tf=self.min_tf,
            tokenizer=self.tokenizer,
            verbose=self.verbose)
        self.vocab2idx = {
            vocab: idx
            for idx, vocab in enumerate(self.idx2vocab)
        }

        # compute pmi
        self.pmi_ = pmi(self.x,
                        min_pmi=self.min_pmi,
                        alpha=self.alpha,
                        verbose=self.verbose)

        return self
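This train() method assumes the surrounding class already stores the matrix-construction and PMI parameters as attributes. A minimal sketch of such a wrapper class; the class name and default values are assumptions, not the original code:

from soynlp.vectorizer import sent_to_word_contexts_matrix
from soynlp.word import pmi

class PMIWordContext:
    # illustrative defaults; the original class definition is not shown above
    def __init__(self, windows=3, min_tf=10, tokenizer=lambda x: x.split(),
                 min_pmi=0, alpha=0.0, verbose=True):
        self.windows = windows
        self.min_tf = min_tf
        self.tokenizer = tokenizer
        self.min_pmi = min_pmi
        self.alpha = alpha
        self.verbose = verbose

    # the train(self, sents) method from Example #3 goes here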
Example #4
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, px, py = pmi(x, min_pmi=0, alpha=0.0001)  # px, py: row/column probabilities

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row  shape = {}'.format(rows.shape))
    print('col  shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    # inspect the 101st-150th largest PMI values (argsort is ascending)
    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))
    print('computed pmi')
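Besides dumping a sorted slice of pairs, the PPMI of one specific word pair can be read directly from the sparse matrix. A small sketch using the x_pmi and idx2vocab objects computed above; the helper itself is an illustrative assumption:

# sketch only: x_pmi and idx2vocab are the objects computed in pmi_test above
vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}

def pair_pmi(word1, word2):
    i, j = vocab2idx.get(word1), vocab2idx.get(word2)
    if i is None or j is None:
        return 0.0  # out-of-vocabulary pair
    return x_pmi[i, j]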
Example #5

    # join all keys of the loaded JSON object into one whitespace-separated string
    json_data = json.load(json_file)

    for key in json_data.keys():
        words = words + key + " "

    # print(words)
words = words.split()
#############################################

corpus_path = file_name
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=10,  # number of context words to consider on each side
    min_tf=0,  # minimum term frequency (0 keeps every word)
    tokenizer=lambda x: x.split(),  # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

print(x.shape)

glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=10, no_threads=4, verbose=True)

dictionary = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

# save the model
glove.save('./model/gloves/glove_스포츠_doc.model')
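Once saved, the model can be reloaded and queried for nearest neighbours with glove_python's Glove.load and most_similar. A short usage sketch; the query word is only an assumption for illustration:

from glove import Glove

glove_model = Glove.load('./model/gloves/glove_스포츠_doc.model')
# '축구' ("soccer") is only an example query word
for word, similarity in glove_model.most_similar('축구', number=5):
    print(word, similarity)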