Example #1
    def fit_word_vectors(self, corpus_path, holdout_path=None):
        # logger
        corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
        log_file = os.path.join(
            'exp_results', 'log_{}_{}.txt'.format(corpus_name, self.get_mid()))
        logging.basicConfig(filename=log_file,
                            format="%(asctime)s:%(levelname)s:%(message)s",
                            level=logging.INFO)

        corpus = TextCorpus(corpus_path, tokenizer=str.split, token_filters=[])
        # corpus = MyTextCorpus(corpus_path, tokenizer=str.split,
        #     token_filters=[], min_count=self.min_count) #character_filters=[lambda x:x],
        id2word = corpus.dictionary  #https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/corpora/dictionary.py

        self.ind2word, self.vocabulary = get_vocabulary(
            LineCorpus(corpus_path), self.min_count, sort_by_frequency=True)
        if self.max_vocab_size is not None:
            self.ind2word = self.ind2word[:self.max_vocab_size]
            self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}

        id2word.token2id = self.vocabulary
        id2word.id2token = self.ind2word
        id2word.dfs = {}  # document frequencies are not needed here
        print('vocabulary size: {}'.format(len(self.vocabulary)))

        if holdout_path is not None:
            holdout_corpus = TextCorpus(holdout_path,
                                        tokenizer=str.split,
                                        token_filters=[])
            perplexity_logger = PerplexityMetric(corpus=holdout_corpus,
                                                 logger='shell')
            callbacks = [perplexity_logger]
        else:
            callbacks = None

        self.model = LdaModel(corpus,
                              num_topics=self.num_topics,
                              alpha=self.alpha,
                              eta=self.eta,
                              passes=self.passes,
                              id2word=id2word,
                              random_state=self.random_state,
                              callbacks=callbacks)

        # self.model = LdaMulticore(corpus, num_topics=self.num_topics,
        #     alpha=self.alpha, eta=self.eta, passes=self.passes,
        #     id2word=id2word, random_state=self.random_state, workers=2)

        # self.vocabulary = self.model.id2word.token2id
        # self.ind2word =  self.model.id2word.id2token

        # lambda holds the (unnormalized) topic-word weights: num_topics x vocab_size
        topic_word_dist = self.model.state.get_lambda()
        topic_word_dist = np.log(topic_word_dist)
        col_sum = topic_word_dist.sum(axis=0)

        # scale each word's column of log weights by its column sum, then
        # transpose so rows index words and columns index topics
        self.word_vectors = topic_word_dist / col_sum
        self.word_vectors = self.word_vectors.transpose()  # word * topic

        self.init_sims(replace=True)
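
The class that owns this method is not included in the excerpt. A minimal driver sketch, assuming a hypothetical LdaVectorizer wrapper that exposes the constructor parameters used above (num_topics, alpha, eta, passes, min_count, max_vocab_size, random_state); the file paths and values are placeholders:

# Hypothetical driver for the method above; the class name, constructor
# arguments, and file paths are assumptions, not part of the original source.
model = LdaVectorizer(num_topics=100, alpha='auto', eta=None, passes=5,
                      min_count=5, max_vocab_size=50000, random_state=42)
model.fit_word_vectors('data/corpus.txt', holdout_path='data/holdout.txt')
print(model.word_vectors.shape)  # (vocabulary size, num_topics) after the transpose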
Example #2
def eval_log_anal(m):
    google_anal = 'data/evaluations/google_analogies.txt'
    logger.warning('# ========= Google Analogies =========')

    restrict_vocab = 300000
    # corpus_path and run_config are presumably module-level globals in the
    # original script; only words_in_order is declared global here
    corpus = LineCorpus(corpus_path)
    global words_in_order
    if words_in_order is None:
        words_in_order, vocab = get_vocabulary(
            corpus, min_count=run_config['min_count'], sort_by_frequency=True)

    print('restrict_vocab = {}'.format(restrict_vocab))
    analogies_score, sections, oov_ratio = evaluate_word_analogies(
        m,
        m.get_name(),
        google_anal,
        words_in_order=words_in_order,
        restrict_vocab=restrict_vocab,
        case_insensitive=True,
        dummy4unknown=False)

    semantic_correct, semantic_incorrect = 0, 0
    syntactic_correct, syntactic_incorrect = 0, 0
    for sec in sections:
        if 'Total' in sec['section']:
            continue

        if 'gram' in sec['section']:
            syntactic_correct += len(sec['correct'])
            syntactic_incorrect += len(sec['incorrect'])
        else:
            semantic_correct += len(sec['correct'])
            semantic_incorrect += len(sec['incorrect'])
    semantic_score = semantic_correct / (semantic_correct + semantic_incorrect)
    syntactic_score = syntactic_correct / (syntactic_correct +
                                           syntactic_incorrect)
    print('semantic #{}'.format(semantic_correct + semantic_incorrect))
    print('syntactic #{}'.format(syntactic_correct + syntactic_incorrect))

    logger.warning(
        '!model, analogies_score, semantic_score, syntactic_score, oov_ratio')
    logger.warning('!{},{:.4f},{:.4f},{:.4f},{:.4f}'.format(
        m.get_name(), analogies_score, semantic_score, syntactic_score,
        oov_ratio))

    result = {}
    result['Analogies'] = '{},{:.4f},{:.4f},{:.4f},{:.4f}'.format(
        m.get_name(), analogies_score, semantic_score, syntactic_score,
        oov_ratio)
    return result
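
The semantic/syntactic split above keys off the section names in the Google analogy file: every syntactic category starts with 'gram' (for example 'gram1-adjective-to-adverb'), everything else counts as semantic, and the 'Total' section is skipped. A toy illustration of that aggregation over hand-made section dicts (the structure mirrors what the loop above consumes; the word tuples and counts are made up):

# Toy sections list in the same shape as the one consumed by the loop above;
# the word tuples are illustrative only.
sections = [
    {'section': 'capital-common-countries',
     'correct': [('athens', 'greece', 'oslo', 'norway')], 'incorrect': []},
    {'section': 'gram1-adjective-to-adverb',
     'correct': [], 'incorrect': [('calm', 'calmly', 'slow', 'slowly')]},
    {'section': 'Total accuracy', 'correct': [], 'incorrect': []},
]
sem = [s for s in sections
       if 'Total' not in s['section'] and 'gram' not in s['section']]
semantic_correct = sum(len(s['correct']) for s in sem)
semantic_total = sum(len(s['correct']) + len(s['incorrect']) for s in sem)
print(semantic_correct / semantic_total)  # 1.0 for this toy input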
Example #3
    def fit_word_vectors(self, corpus_path):

        corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
        save_com_path = '{}_{}_mc{}_w{}_com.npz'.format(
            self.get_name(), corpus_name, self.min_count, self.window_size)
        save_com_path = os.path.join(MODEL_PATH, save_com_path)
        save_ind2word_path = '{}_{}_mc{}_w{}_ind2word.bin'.format(
            self.get_name(), corpus_name, self.min_count, self.window_size)
        save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)

        try:
            cooccurence_matrix = sp.load_npz(save_com_path)
            with open(save_ind2word_path, 'rb') as fin:
                self.ind2word = pickle.load(fin)
                self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}

            print('loaded existing co-occurrence matrix and vocab')
            print('vocabulary size: {}'.format(len(self.vocabulary)))

        except Exception as e:

            docs = LineCorpus(corpus_path)
            self.ind2word, self.vocabulary = get_vocabulary(
                docs, self.min_count, sort_by_frequency=True)
            #remove stopwords:
            self.ind2word = [
                w for w in self.ind2word if w not in self.stop_words
            ]
            self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}
            print('vocabulary size: {}'.format(len(self.vocabulary)))

            cooccurence_matrix = self._count_cooccurence(docs)
            sp.save_npz(save_com_path, cooccurence_matrix)
            with open(save_ind2word_path, 'wb') as fout:
                pickle.dump(self.ind2word, fout)

        if self.max_features:  # discard all but the k columns for the most common open-class words
            k = self.max_features
            # the vocabulary is already sorted by decreasing frequency
            cooccurence_matrix = cooccurence_matrix[:, :k]
            # features that were kept
            self.reserved_features = self.ind2word[:k]

        #normalize
        ##convert counts to word pair correlations
        t_sum = cooccurence_matrix.sum()
        row_sum = cooccurence_matrix.sum(axis=1)
        col_sum = cooccurence_matrix.sum(axis=0)

        cooccurence_matrix = cooccurence_matrix.tocoo()

        multi_rsum_csum_value = np.multiply(
            col_sum.take(cooccurence_matrix.col),
            row_sum.take(cooccurence_matrix.row)).A.squeeze()
        assert (multi_rsum_csum_value >= 0).all()  #check overflow
        multi_rsum_csum = sp.coo_matrix(
            (multi_rsum_csum_value, (cooccurence_matrix.row,
                                     cooccurence_matrix.col)))

        deno = t_sum * cooccurence_matrix.tocsr() - multi_rsum_csum.tocsr()

        row_d = np.multiply(np.sqrt(row_sum), np.sqrt((t_sum - row_sum)))
        col_d = np.multiply(np.sqrt(col_sum), np.sqrt((t_sum - col_sum)))
        assert (row_d >= 0).all()  #check overflow
        assert (col_d >= 0).all()  #check overflow

        col_d_target_value = col_d.take(cooccurence_matrix.col).A.squeeze()
        col_d_target = sp.coo_matrix(
            (col_d_target_value, (cooccurence_matrix.row,
                                  cooccurence_matrix.col)))
        col_d_target.data = 1 / col_d_target.data

        row_d_target_value = row_d.take(cooccurence_matrix.row).A.squeeze()
        row_d_target = sp.coo_matrix(
            (row_d_target_value, (cooccurence_matrix.row,
                                  cooccurence_matrix.col)))
        row_d_target.data = 1 / row_d_target.data

        cooccurence_matrix = deno.multiply(col_d_target.tocsr()).multiply(
            row_d_target.tocsr())

        ##set negative values to 0
        cooccurence_matrix[cooccurence_matrix < 0] = 0

        ##take square roots
        cooccurence_matrix = np.sqrt(cooccurence_matrix)

        # apply SVD
        if self.svd_dim:
            # TODO: drop less frequent rows to speed up the SVD
            cooccurence_matrix = cooccurence_matrix.asfptype()
            svd = TruncatedSVD(self.svd_dim, algorithm='arpack')
            cooccurence_matrix = svd.fit_transform(
                cooccurence_matrix)  # vocab_len * vector_dim
            self.svd = svd

        self.word_vectors = cooccurence_matrix
        self.init_sims()

        return self
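
The sparse COO/CSR juggling above turns raw co-occurrence counts into correlation-style values, COALS-fashion: each cell (i, j) becomes (T*n_ij - r_i*c_j) / sqrt(r_i*(T - r_i)*c_j*(T - c_j)), where r_i and c_j are the row and column sums and T is the grand total; negative values are then clamped to zero and square roots are taken. A dense NumPy sketch of the same arithmetic on a toy matrix (easier to read than the sparse version, but not memory-safe for a real vocabulary):

import numpy as np

# Toy co-occurrence counts; rows and columns both index words.
n = np.array([[10., 2., 0.],
              [ 2., 8., 1.],
              [ 0., 1., 5.]])
T = n.sum()
r = n.sum(axis=1, keepdims=True)   # row sums
c = n.sum(axis=0, keepdims=True)   # column sums

corr = (T * n - r * c) / np.sqrt(r * (T - r) * c * (T - c))
corr[corr < 0] = 0                 # clamp negative correlations, as above
vectors = np.sqrt(corr)            # same square-root step as above
print(vectors.round(3))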
Example #4
    def fit_word_vectors(self, corpus_path):

        #count cooccurence
        corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
        save_com_path = '{}_{}_mc{}_com.npz'.format(self.get_name(), corpus_name, self.min_count)
        save_com_path = os.path.join(MODEL_PATH, save_com_path)
        save_ind2word_path = '{}_{}_mc{}_ind2word.bin'.format(self.get_name(), corpus_name, self.min_count)
        save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)

        try:
            cooccurence_matrix = sp.load_npz(save_com_path)
            with open(save_ind2word_path, 'rb') as fin:
                self.ind2word = pickle.load(fin)
                self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}

            print('loaded existing co-occurrence matrix and vocab')
            print('vocabulary size: {}'.format(len(self.vocabulary)))

        except Exception as e:
            docs = LineCorpus(corpus_path)
            # filter rare words according to self.min_count
            self.ind2word, self.vocabulary = get_vocabulary(docs, self.min_count)
            print('vocabulary size: {}'.format(len(self.vocabulary)))

            cooccurence_matrix = self._count_cooccurence(docs)
            sp.save_npz(save_com_path, cooccurence_matrix)
            with open(save_ind2word_path, 'wb') as fout:
                pickle.dump(self.ind2word, fout)


        if self.max_features: # keep the top k columns with the highest variance
            # compute variance as E[X^2] - (E[X])^2, which equals the population
            # variance np.var would give, but works on the sparse matrix directly
            squared_of_mean = np.square(cooccurence_matrix.mean(0))
            assert (squared_of_mean >= 0).all()

            cooccurence_matrix.data = np.square(cooccurence_matrix.data)
            assert (cooccurence_matrix.data >= 0).all()
            mean_of_squared = cooccurence_matrix.mean(0)

            variance = (mean_of_squared - squared_of_mean).A
            variance = np.squeeze(variance, axis=0)
            
            cooccurence_matrix.data = np.sqrt(cooccurence_matrix.data)

            # conserve top k cols
            k = self.max_features
            topk_ind = np.sort(np.argsort(-variance)[:k])
            cooccurence_matrix = cooccurence_matrix[:, topk_ind]

            # reserved features
            vlen = len(self.ind2word)
            reserved_features = [(self.ind2word[i],'l') for i in topk_ind if i < vlen]
            reserved_features.extend([(self.ind2word[i-vlen],'r') for i in topk_ind if i >= vlen])
            self.reserved_features = reserved_features

        #normalize
        # cooccurence_matrix = normalize(cooccurence_matrix, norm='l2', axis=1, copy=True)

        self.word_vectors = cooccurence_matrix.tocsr()
        self.init_sims(replace=True)

        return self
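
The square/sqrt round trip on cooccurence_matrix.data above computes per-column variance of a sparse matrix without densifying it, using Var[X] = E[X^2] - (E[X])^2; that identity is exactly the population variance np.var would return, which answers the question left in the comment. A small sketch checking the two against each other on a toy matrix (the names are local to the sketch, and it uses .power(2) instead of squaring .data in place):

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1., 0., 3.],
                            [0., 2., 3.],
                            [4., 0., 0.]]))

squared_of_mean = np.square(X.mean(axis=0))   # (E[X])^2, dense 1 x n_cols
mean_of_squared = X.power(2).mean(axis=0)     # E[X^2], stays sparse until the mean
variance = np.asarray(mean_of_squared - squared_of_mean).ravel()

# identical to the dense population variance
assert np.allclose(variance, np.var(X.toarray(), axis=0))

k = 2
topk_ind = np.sort(np.argsort(-variance)[:k])  # keep the k highest-variance columns
print(topk_ind, X[:, topk_ind].toarray())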
Example #5
    def fit_word_vectors(self, corpus_path):
        docs = LineCorpus(corpus_path)
        self.ind2word, self.vocabulary = get_vocabulary(docs, self.min_count)
        print('vocabulary size: {}'.format(len(self.vocabulary)))

        if self.count_normalization is None:
            self.vectorizer = CountVectorizer(vocabulary=self.vocabulary, tokenizer=str.split)
        elif self.count_normalization == 'entropy':
            # raw counts here; the entropy weighting is applied to the
            # term-document matrix further down
            self.vectorizer = CountVectorizer(vocabulary=self.vocabulary, tokenizer=str.split)
        elif self.count_normalization == 'tfidf':
            self.vectorizer = TfidfVectorizer(vocabulary=self.vocabulary, tokenizer=str.split,
                                              sublinear_tf=True, use_idf=True)
        
        dtm = self.vectorizer.fit_transform(docs)

        tdm = dtm.T.tocsr()
        tdm = tdm.asfptype()

        if self.count_normalization == 'entropy':
            #apply entropy normalization
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('apply entropy normalization')

            corpus_name = os.path.splitext(os.path.basename(corpus_path))[0]
            save_tdm_path = '{}_mc{}_tdm.npz'.format(corpus_name, self.min_count)
            save_tdm_path = os.path.join(MODEL_PATH, save_tdm_path)
            save_ind2word_path = '{}_mc{}_ind2word.bin'.format(corpus_name, self.min_count)
            save_ind2word_path = os.path.join(MODEL_PATH, save_ind2word_path)

            try:
                tdm = sp.load_npz(save_tdm_path)
                with open(save_ind2word_path, 'rb') as fin:
                    self.ind2word = pickle.load(fin)
                    self.vocabulary = {w: i for i, w in enumerate(self.ind2word)}

                print('loaded existing normalized tdm and vocab')

            except Exception as e:
                vlen = tdm.shape[0]
                H = np.zeros((vlen,1)) # row entropy
                step = 2000
                for i in range(0, vlen, step):
                    start, end = i, min(i + step, vlen)
                    H[start:end, 0] = word_entropy(tdm[start:end, :])

                    if i % 2000 == 0:
                        print('finished computing entropy for {}/{} rows'.format(i, vlen))

                tdm.data = np.log(tdm.data+1)
                tdm = tdm.multiply(1/H)

                sp.save_npz(save_tdm_path, tdm)
                with open(save_ind2word_path, 'wb') as fout:
                    pickle.dump(self.ind2word, fout)

        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print('start performing svd')
        svd = TruncatedSVD(self.vector_dim, algorithm='arpack')
        tdm_svd = svd.fit_transform(tdm)  # vocab_len * vector_dim (U * sigma)
        # tdm_svd = Normalizer(copy=False).fit_transform(tdm_svd)

        self.svd = svd  # svd.components_ is vector_dim * doc_len (the transpose of T)
        self.word_vectors = tdm_svd

        self.init_sims(replace=True)
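
word_entropy is referenced but not defined in this excerpt; the surrounding code divides log(count + 1) by the per-row value it returns, which suggests a plain Shannon entropy of each word's distribution over documents. A possible implementation under that assumption (not the author's actual helper):

import numpy as np
import scipy.sparse as sp

def word_entropy(tdm_block):
    # Assumed re-implementation of the undefined helper: Shannon entropy of
    # each row's distribution over documents, with a small floor so the later
    # division 1/H never hits zero. Not the author's actual code.
    tdm_block = sp.csr_matrix(tdm_block, dtype=np.float64)
    row_sums = np.asarray(tdm_block.sum(axis=1)).ravel()
    entropies = np.zeros(tdm_block.shape[0])
    for i in range(tdm_block.shape[0]):
        row = tdm_block.getrow(i)
        if row.nnz == 0:
            continue
        p = row.data / row_sums[i]
        entropies[i] = -(p * np.log(p)).sum()
    return np.maximum(entropies, 1e-12)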