def fit_word_vectors(self, corpus_path, holdout_path=None):
    # logger
    corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
    log_file = os.path.join(
        'exp_results', 'log_{}_{}.txt'.format(corpus_name, self.get_mid()))
    logging.basicConfig(filename=log_file,
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.INFO)

    corpus = TextCorpus(corpus_path, tokenizer=str.split, token_filters=[])
    # corpus = MyTextCorpus(corpus_path, tokenizer=str.split,
    #                       token_filters=[], min_count=self.min_count)
    #                       # character_filters=[lambda x: x],
    id2word = corpus.dictionary
    # https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/corpora/dictionary.py

    # build the vocabulary from the raw corpus, ordered by decreasing frequency
    self.ind2word, self.vocabulary = get_vocabulary(
        LineCorpus(corpus_path), self.min_count, sort_by_frequency=True)
    if self.max_vocab_size is not None:
        self.ind2word = self.ind2word[:self.max_vocab_size]
        self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}

    # overwrite the gensim dictionary with our own vocabulary
    id2word.token2id = self.vocabulary
    id2word.id2token = self.ind2word
    id2word.dfs = {}  # document frequencies are not needed here
    print('vocabulary size: {}'.format(len(self.vocabulary)))

    if holdout_path is not None:
        holdout_corpus = TextCorpus(holdout_path, tokenizer=str.split,
                                    token_filters=[])
        perplexity_logger = PerplexityMetric(corpus=holdout_corpus,
                                             logger='shell')
        callbacks = [perplexity_logger]
    else:
        callbacks = None

    self.model = LdaModel(corpus, num_topics=self.num_topics,
                          alpha=self.alpha, eta=self.eta, passes=self.passes,
                          id2word=id2word, random_state=self.random_state,
                          callbacks=callbacks)
    # self.model = LdaMulticore(corpus, num_topics=self.num_topics,
    #                           alpha=self.alpha, eta=self.eta,
    #                           passes=self.passes, id2word=id2word,
    #                           random_state=self.random_state, workers=2)

    # self.vocabulary = self.model.id2word.token2id
    # self.ind2word = self.model.id2word.id2token

    # topic-word weights (lambda), shape: num_topics x vocab_size
    topic_word_dist = self.model.state.get_lambda()
    topic_word_dist = np.log(topic_word_dist)
    col_sum = topic_word_dist.sum(axis=0)
    # normalize each word's column across topics, then transpose
    self.word_vectors = topic_word_dist / col_sum
    self.word_vectors = self.word_vectors.transpose()  # word * topic
    self.init_sims(replace=True)
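
# Illustrative sketch (not part of the pipeline): the final block above turns the
# LDA topic-word weights lambda (num_topics x vocab_size) into one vector per word
# by taking logs, normalizing each word's column across topics, and transposing.
# A dense NumPy restatement of that reshaping, for reference only:
def _lda_lambda_to_word_vectors_demo(lam):
    import numpy as np
    log_lam = np.log(lam)                       # element-wise log of the weights
    word_vecs = log_lam / log_lam.sum(axis=0)   # normalize each column (one word)
    return word_vecs.T                          # rows: words, columns: topics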
def eval_log_anal(m):
    # relies on the module-level corpus_path, run_config, words_in_order and logger
    google_anal = 'data/evaluations/google_analogies.txt'
    logger.warning('# ========= Google Analogies =========')
    restrict_vocab = 300000
    corpus = LineCorpus(corpus_path)

    global words_in_order
    if words_in_order is None:
        words_in_order, vocab = get_vocabulary(
            corpus, min_count=run_config['min_count'], sort_by_frequency=True)
    print('restrict_vocab = {}'.format(restrict_vocab))

    analogies_score, sections, oov_ratio = evaluate_word_analogies(
        m, m.get_name(), google_anal, words_in_order=words_in_order,
        restrict_vocab=restrict_vocab, case_insensitive=True,
        dummy4unknown=False)

    # sections whose names contain 'gram' are syntactic, the rest are semantic
    semantic_correct, semantic_incorrect = 0, 0
    syntactic_correct, syntactic_incorrect = 0, 0
    for sec in sections:
        if 'Total' in sec['section']:
            continue
        if 'gram' in sec['section']:
            syntactic_correct += len(sec['correct'])
            syntactic_incorrect += len(sec['incorrect'])
        else:
            semantic_correct += len(sec['correct'])
            semantic_incorrect += len(sec['incorrect'])
    semantic_score = semantic_correct / (semantic_correct + semantic_incorrect)
    syntactic_score = syntactic_correct / (syntactic_correct + syntactic_incorrect)
    print('semantic #{}'.format(semantic_correct + semantic_incorrect))
    print('syntactic #{}'.format(syntactic_correct + syntactic_incorrect))

    logger.warning(
        '!model, analogies_score, semantic_score, syntactic_score, oov_ratio')
    logger.warning('!{},{:.4f},{:.4f},{:.4f},{:.4f}'.format(
        m.get_name(), analogies_score, semantic_score, syntactic_score,
        oov_ratio))

    result = {}
    result['Analogies'] = '{},{:.4f},{:.4f},{:.4f},{:.4f}'.format(
        m.get_name(), analogies_score, semantic_score, syntactic_score,
        oov_ratio)
    return result
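
# Illustrative sketch (not part of the pipeline): the loop above splits the Google
# analogy results by section name -- sections containing 'gram' (e.g.
# 'gram1-adjective-to-adverb') count as syntactic, the others (e.g.
# 'capital-common-countries') as semantic. The section dicts are assumed to carry
# 'section', 'correct' and 'incorrect' keys, exactly as used above.
def _split_analogy_sections_demo(sections):
    semantic = [s for s in sections
                if 'Total' not in s['section'] and 'gram' not in s['section']]
    syntactic = [s for s in sections
                 if 'Total' not in s['section'] and 'gram' in s['section']]
    return semantic, syntactic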
def fit_word_vectors(self, corpus_path):
    corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
    save_com_path = '{}_{}_mc{}_w{}_com.npz'.format(
        self.get_name(), corpus_name, self.min_count, self.window_size)
    save_com_path = os.path.join(MODEL_PATH, save_com_path)
    save_ind2word_path = '{}_{}_mc{}_w{}_ind2word.bin'.format(
        self.get_name(), corpus_name, self.min_count, self.window_size)
    save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)

    try:
        # reuse a cached co-occurrence matrix and vocabulary if available
        cooccurence_matrix = sp.load_npz(save_com_path)
        with open(save_ind2word_path, 'rb') as fin:
            self.ind2word = pickle.load(fin)
        self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}
        print('loaded existing cooccurence_matrix and vocab')
        print('vocabulary size: {}'.format(len(self.vocabulary)))
    except Exception as e:
        docs = LineCorpus(corpus_path)
        self.ind2word, self.vocabulary = get_vocabulary(
            docs, self.min_count, sort_by_frequency=True)

        # remove stopwords
        self.ind2word = [w for w in self.ind2word if w not in self.stop_words]
        self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}
        print('vocabulary size: {}'.format(len(self.vocabulary)))

        cooccurence_matrix = self._count_cooccurence(docs)
        sp.save_npz(save_com_path, cooccurence_matrix)
        with open(save_ind2word_path, 'wb') as fout:
            pickle.dump(self.ind2word, fout)

    if self.max_features:
        # discard all but the k columns reflecting the most common open-class words
        k = self.max_features
        # the vocabulary is ordered by decreasing frequency
        cooccurence_matrix = cooccurence_matrix[:, :k]
        # reserved features
        self.reserved_features = self.ind2word[:k]

    # normalize: convert counts to word-pair correlations
    t_sum = cooccurence_matrix.sum()
    row_sum = cooccurence_matrix.sum(axis=1)
    col_sum = cooccurence_matrix.sum(axis=0)

    cooccurence_matrix = cooccurence_matrix.tocoo()
    multi_rsum_csum_value = np.multiply(
        col_sum.take(cooccurence_matrix.col),
        row_sum.take(cooccurence_matrix.row)).A.squeeze()
    assert (multi_rsum_csum_value >= 0).all()  # check overflow
    multi_rsum_csum = sp.coo_matrix(
        (multi_rsum_csum_value,
         (cooccurence_matrix.row, cooccurence_matrix.col)))

    # numerator of the correlation: T * n_ij - r_i * c_j
    deno = t_sum * cooccurence_matrix.tocsr() - multi_rsum_csum.tocsr()

    # denominator factors: sqrt(r_i * (T - r_i)) and sqrt(c_j * (T - c_j))
    row_d = np.multiply(np.sqrt(row_sum), np.sqrt(t_sum - row_sum))
    col_d = np.multiply(np.sqrt(col_sum), np.sqrt(t_sum - col_sum))
    assert (row_d >= 0).all()  # check overflow
    assert (col_d >= 0).all()  # check overflow

    col_d_target_value = col_d.take(cooccurence_matrix.col).A.squeeze()
    col_d_target = sp.coo_matrix(
        (col_d_target_value,
         (cooccurence_matrix.row, cooccurence_matrix.col)))
    col_d_target.data = 1 / col_d_target.data

    row_d_target_value = row_d.take(cooccurence_matrix.row).A.squeeze()
    row_d_target = sp.coo_matrix(
        (row_d_target_value,
         (cooccurence_matrix.row, cooccurence_matrix.col)))
    row_d_target.data = 1 / row_d_target.data

    cooccurence_matrix = deno.multiply(col_d_target.tocsr()).multiply(
        row_d_target.tocsr())

    # set negative correlations to 0
    cooccurence_matrix[cooccurence_matrix < 0] = 0
    # take square roots
    cooccurence_matrix = np.sqrt(cooccurence_matrix)

    # apply svd
    if self.svd_dim:
        # TODO: remove less frequent rows to speed up the SVD
        cooccurence_matrix = cooccurence_matrix.asfptype()
        svd = TruncatedSVD(self.svd_dim, algorithm='arpack')
        cooccurence_matrix = svd.fit_transform(
            cooccurence_matrix)  # vocab_len * vector_dim
        self.svd = svd

    self.word_vectors = cooccurence_matrix
    self.init_sims()
    return self
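
# Illustrative sketch (not part of the pipeline): the normalization above converts
# raw co-occurrence counts n_ij into the pairwise correlation
# (T*n_ij - r_i*c_j) / sqrt(r_i*(T - r_i) * c_j*(T - c_j)), where r_i / c_j are
# row / column totals and T is the grand total, then clamps negatives to zero and
# takes square roots. A dense NumPy version on a toy matrix, for checking the
# sparse bookkeeping above against:
def _cooccurrence_correlation_dense_demo():
    import numpy as np
    counts = np.array([[10., 2., 0.],
                       [2., 5., 1.],
                       [0., 1., 3.]])
    T = counts.sum()
    r = counts.sum(axis=1, keepdims=True)   # row totals r_i
    c = counts.sum(axis=0, keepdims=True)   # column totals c_j
    corr = (T * counts - r * c) / np.sqrt(r * (T - r) * c * (T - c))
    corr[corr < 0] = 0                      # discard negative correlations
    return np.sqrt(corr)                    # square-root transform, as above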
def fit_word_vectors(self, corpus_path):
    # count cooccurence
    corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
    save_com_path = '{}_{}_mc{}_com.npz'.format(
        self.get_name(), corpus_name, self.min_count)
    save_com_path = os.path.join(MODEL_PATH, save_com_path)
    save_ind2word_path = '{}_{}_mc{}_ind2word.bin'.format(
        self.get_name(), corpus_name, self.min_count)
    save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)

    try:
        # reuse a cached co-occurrence matrix and vocabulary if available
        cooccurence_matrix = sp.load_npz(save_com_path)
        with open(save_ind2word_path, 'rb') as fin:
            self.ind2word = pickle.load(fin)
        self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}
        print('loaded existing cooccurence_matrix and vocab')
        print('vocabulary size: {}'.format(len(self.vocabulary)))
    except Exception as e:
        docs = LineCorpus(corpus_path)
        # filter rare words according to self.min_count
        self.ind2word, self.vocabulary = get_vocabulary(docs, self.min_count)
        print('vocabulary size: {}'.format(len(self.vocabulary)))

        cooccurence_matrix = self._count_cooccurence(docs)
        sp.save_npz(save_com_path, cooccurence_matrix)
        with open(save_ind2word_path, 'wb') as fout:
            pickle.dump(self.ind2word, fout)

    if self.max_features:
        # keep the top k columns with the highest variance
        # compute variance as E[X^2] - (E[X])^2 (or np.var?)
        squared_of_mean = np.square(cooccurence_matrix.mean(0))
        assert (squared_of_mean >= 0).all()

        cooccurence_matrix.data = np.square(cooccurence_matrix.data)
        assert (cooccurence_matrix.data >= 0).all()
        mean_of_squared = cooccurence_matrix.mean(0)

        variance = (mean_of_squared - squared_of_mean).A
        variance = np.squeeze(variance, axis=0)

        # restore the original counts
        cooccurence_matrix.data = np.sqrt(cooccurence_matrix.data)

        # keep the top k columns
        k = self.max_features
        topk_ind = np.sort(np.argsort(-variance)[:k])
        cooccurence_matrix = cooccurence_matrix[:, topk_ind]

        # reserved features: columns [0, vlen) are left contexts,
        # columns [vlen, 2*vlen) are right contexts
        vlen = len(self.ind2word)
        reserved_features = [(self.ind2word[i], 'l')
                             for i in topk_ind if i < vlen]
        reserved_features.extend([(self.ind2word[i - vlen], 'r')
                                  for i in topk_ind if i >= vlen])
        self.reserved_features = reserved_features

    # normalize
    # cooccurence_matrix = normalize(cooccurence_matrix, norm='l2', axis=1, copy=True)

    self.word_vectors = cooccurence_matrix.tocsr()
    self.init_sims(replace=True)
    return self
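
# Illustrative sketch (not part of the pipeline): the `max_features` branch above
# keeps the k context columns with the largest variance, computed sparsely as
# E[X^2] - (E[X])^2 so the matrix never has to be densified. A dense NumPy
# equivalent of that selection on a toy matrix:
def _topk_variance_columns_demo(k=2):
    import numpy as np
    counts = np.array([[4., 0., 1., 7.],
                       [0., 2., 1., 0.],
                       [5., 0., 1., 9.]])
    variance = np.square(counts).mean(axis=0) - np.square(counts.mean(axis=0))
    topk_ind = np.sort(np.argsort(-variance)[:k])  # keep original column order
    return counts[:, topk_ind], topk_ind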
def fit_word_vectors(self, corpus_path):
    docs = LineCorpus(corpus_path)
    self.ind2word, self.vocabulary = get_vocabulary(docs, self.min_count)
    print('vocabulary size: {}'.format(len(self.vocabulary)))

    if self.count_normalization is None:
        self.vectorizer = CountVectorizer(vocabulary=self.vocabulary,
                                          tokenizer=str.split)
    elif self.count_normalization == 'entropy':
        self.vectorizer = CountVectorizer(vocabulary=self.vocabulary,
                                          tokenizer=str.split)
    elif self.count_normalization == 'tfidf':
        self.vectorizer = TfidfVectorizer(vocabulary=self.vocabulary,
                                          tokenizer=str.split,
                                          sublinear_tf=True, use_idf=True)

    # document-term matrix -> term-document matrix
    dtm = self.vectorizer.fit_transform(docs)
    tdm = dtm.T.tocsr()
    tdm = tdm.asfptype()

    if self.count_normalization == 'entropy':
        # apply entropy normalization
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print('apply entropy normalization')

        corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
        save_tdm_path = '{}_mc{}_tdm.npz'.format(corpus_name, self.min_count)
        save_tdm_path = os.path.join(MODEL_PATH, save_tdm_path)
        save_ind2word_path = '{}_mc{}_ind2word.bin'.format(corpus_name,
                                                           self.min_count)
        save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)
        try:
            # reuse a cached normalized term-document matrix if available
            tdm = sp.load_npz(save_tdm_path)
            with open(save_ind2word_path, 'rb') as fin:
                self.ind2word = pickle.load(fin)
            self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}
            print('loaded existing normalized tdm and vocab')
        except Exception as e:
            # compute the row (term) entropies in chunks to limit memory use
            vlen = tdm.shape[0]
            H = np.zeros((vlen, 1))  # row entropy
            step = 2000
            for i in range(0, vlen, step):
                start, end = i, i + step
                end = end if end < vlen else vlen
                H[start:end, 0] = word_entropy(tdm[start:end, ])
                print('finished computing entropy of {}/{} rows'.format(end, vlen))
            # log-entropy weighting: log(count + 1) scaled by 1 / row entropy
            tdm.data = np.log(tdm.data + 1)
            tdm = tdm.multiply(1 / H)
            sp.save_npz(save_tdm_path, tdm)
            with open(save_ind2word_path, 'wb') as fout:
                pickle.dump(self.ind2word, fout)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print('start performing svd')
    svd = TruncatedSVD(self.vector_dim, algorithm='arpack')
    tdm_svd = svd.fit_transform(tdm)  # vocab_len * vector_dim (U * sigma)
    # tdm_svd = Normalizer(copy=False).fit_transform(tdm_svd)

    # svd.components_: vector_dim * doc_len (i.e. the transpose of T)
    self.svd = svd
    self.word_vectors = tdm_svd
    self.init_sims(replace=True)
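
# Illustrative sketch (not part of the pipeline): the 'entropy' branch above applies
# a log-entropy style weighting, dividing log(count + 1) by the entropy H_i of each
# term's row, which the `word_entropy` helper is assumed to compute. The dense demo
# below shows one common definition of that row entropy,
# H_i = -sum_j p_ij * log(p_ij) with p_ij = n_ij / sum_j n_ij; it is an assumption
# for illustration, not a copy of the repo's helper.
def _row_entropy_dense_demo(counts):
    import numpy as np
    counts = np.asarray(counts, dtype=float)
    p = counts / counts.sum(axis=1, keepdims=True)       # per-row distributions
    with np.errstate(divide='ignore', invalid='ignore'):
        plogp = np.where(p > 0, p * np.log(p), 0.0)       # treat 0 * log(0) as 0
    return -plogp.sum(axis=1)                             # one entropy per term row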