from sklearn.decomposition import TruncatedSVD
from soynlp.vectorizer import sent_to_word_contexts_matrix
from soynlp.word import pmi


def latent_semantic_analysis(corpus_fname, output_fname):
    make_save_path(output_fname)  # project-local helper that creates the output directory
    corpus = [sent.replace('\n', '').strip() for sent in open(corpus_fname, 'r').readlines()]
    # construct co-occurrence matrix (= word-context matrix)
    # dynamic_weight=True: co-occurrence weights = [1, (w-1)/w, (w-2)/w, ..., 1/w]
    input_matrix, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        dynamic_weight=True,
        verbose=True)
    # compute truncated SVD of the raw co-occurrence matrix
    cooc_svd = TruncatedSVD(n_components=100)
    cooc_vecs = cooc_svd.fit_transform(input_matrix)
    with open(output_fname + "-cooc.vecs", 'w') as f1:
        for word, vec in zip(idx2vocab, cooc_vecs):
            str_vec = [str(el) for el in vec]
            f1.write(word + ' ' + ' '.join(str_vec) + "\n")
    # shifted PPMI with k=0 (equivalent to plain PPMI)
    # pmi(word, contexts)
    # px: probability of rows (words)
    # py: probability of columns (contexts)
    pmi_matrix, _, _ = pmi(input_matrix, min_pmi=0, alpha=0)
    # compute truncated SVD of the PPMI matrix
    pmi_svd = TruncatedSVD(n_components=100)
    pmi_vecs = pmi_svd.fit_transform(pmi_matrix)
    with open(output_fname + "-pmi.vecs", 'w') as f2:
        for word, vec in zip(idx2vocab, pmi_vecs):
            str_vec = [str(el) for el in vec]
            f2.write(word + ' ' + ' '.join(str_vec) + "\n")
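# --- Not part of the original script: a minimal sketch for reading the *.vecs
# --- files written above and comparing two words by cosine similarity. The file
# --- format is assumed from the writer loop: "<word> <v1> <v2> ... <v100>" per line.
import numpy as np


def load_vecs(vecs_fname):
    vecs = {}
    with open(vecs_fname, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            vecs[tokens[0]] = np.array([float(el) for el in tokens[1:]])
    return vecs


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)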
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained LTokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split()
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    # print the pairs ranked roughly 100th-110th by PMI
    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda item: -item[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))
    print('computed PMI')
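# --- Sketch (assumption: the dict-like return of pmi() used above, keyed by
# --- (row, col) index pairs). Looks up the PMI of one specific word pair
# --- instead of printing a ranked slice; not part of the original snippet.
def pair_pmi_from_dok(pmi_dok, idx2vocab, word1, word2):
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
    key = (vocab2idx[word1], vocab2idx[word2])
    return pmi_dok.get(key, 0.0)  # 0.0 when the pair never co-occurs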
def train(self, sents):
    # construct word-context matrix; keyword arguments make the mapping to
    # sent_to_word_contexts_matrix's parameters explicit
    self.x, self.idx2vocab = sent_to_word_contexts_matrix(
        sents,
        windows=self.windows,
        min_tf=self.min_tf,
        tokenizer=self.tokenizer,
        verbose=self.verbose)
    self.vocab2idx = {vocab: idx for idx, vocab in enumerate(self.idx2vocab)}
    # compute PMI on the word-context matrix
    self.pmi_ = pmi(
        self.x,
        min_pmi=self.min_pmi,
        alpha=self.alpha,
        verbose=self.verbose)
    return self
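# --- Sketch of the constructor that train() above implies; the attribute names
# --- come from train(), but the defaults are assumptions mirroring the
# --- standalone calls in this section, not the original class definition.
def __init__(self, windows=3, min_tf=10, tokenizer=lambda s: s.split(),
             min_pmi=0, alpha=0.0001, verbose=True):
    self.windows = windows
    self.min_tf = min_tf
    self.tokenizer = tokenizer
    self.min_pmi = min_pmi
    self.alpha = alpha
    self.verbose = verbose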
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained LTokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split()
        dynamic_weight=False,
        verbose=True)

    # this variant of pmi() also returns the row and column probabilities
    x_pmi, px, py = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data
    print('row shape = {}'.format(rows.shape))
    print('col shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    # print the pairs ranked roughly 100th-150th by PMI
    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))
    print('computed pmi')
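# --- Sketch (assumption: x_pmi above is a scipy.sparse matrix, as implied by
# --- the .nonzero() and .data accesses). Reads the PMI of a single word pair
# --- directly; not part of the original snippet.
def pair_pmi_from_matrix(x_pmi, idx2vocab, word1, word2):
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
    return x_pmi[vocab2idx[word1], vocab2idx[word2]]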
import json

from soynlp import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix
from glove import Glove

# json_file and file_name are assumed to be defined earlier (the file is opened
# outside this snippet)
json_data = json.load(json_file)
for key in json_data.keys():
    words = words + key + " "
# print(words)
words = words.split()

#############################################

corpus_path = file_name
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=10,                     # number of words before/after to use as context
    min_tf=0,                       # minimum term frequency (0 keeps every word)
    tokenizer=lambda x: x.split(),  # (default) lambda x: x.split()
    dynamic_weight=True,
    verbose=True)
print(x.shape)

glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=10, no_threads=4, verbose=True)
dictionary = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

# save the model
glove.save('./model/gloves/glove_스포츠_doc.model')
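# --- Usage sketch (assumption: the same glove_python package used above, which
# --- provides Glove.load and most_similar). The query word is a placeholder.
loaded_glove = Glove.load('./model/gloves/glove_스포츠_doc.model')
print(loaded_glove.most_similar('축구', number=10))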