Example #1
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = []

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except:
                embeddings = m.rho  # Vocab_size x E
            neighbors = []
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
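Most of the examples on this page call a nearest_neighbors(word, embeddings, vocab) helper that is never shown here (in the ETM code it lives in a separate utils module). Below is a minimal sketch of what such a helper typically does, ranking the rows of rho by cosine similarity against the query word's row; the exact name, argument order, list-style vocab, and the default of 20 neighbors are assumptions, not the verified original:

import torch

def nearest_neighbors(word, embeddings, vocab, num_neighbors=20):
    # Hypothetical helper: cosine similarity between one word's embedding
    # and every row of the (vocab_size x E) embedding matrix.
    index = vocab.index(word)             # raises ValueError for out-of-vocabulary words
    emb = embeddings.detach().cpu()       # vocab_size x E
    query = emb[index].unsqueeze(0)       # 1 x E
    sims = torch.nn.functional.cosine_similarity(emb, query, dim=1)  # vocab_size
    top = sims.argsort(descending=True)[1:num_neighbors + 1]         # skip the query word itself
    return [vocab[i] for i in top.tolist()]

With a list-style vocab, nearest_neighbors('computer', m.rho.weight, vocab) would then return the 20 vocabulary words closest to 'computer' in the embedding space.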
Example #2
def visualize(model,
              num_topics=num_topics,
              num_words=num_words,
              vocab=idx2word,
              show_emb=True,
              tokenizer=tokenizer,
              bert_model=bert):
    """
    This is a cool visualisation function.
    It takes the model trained so far and shows the discovered topics and embeddings!
    """
    model.eval()  # set the net in evaluation mode
    # set a few words to query
    queries = [
        'insurance', 'weather', 'particles', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics using monte carlo (sampling from the posterior I guess)
    with torch.no_grad():  # no gradient computation - makes the forward pass lighter
        print('-' * 20)
        print('Visualize topics...')
        topics_words = []
        gammas = model.get_beta()  # topics distributions
        for k in range(num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('-' * 20)
            print('Visualize word embeddings by using output embedding matrix')

            # extract the embeddings from the model!
            try:
                embeddings = model.rho.weight  # Vocab_size x E
            except:
                embeddings = model.rho  # Vocab_size x E

            for word in queries:
                # extracting Bert representation of the word
                inputs = tokenizer(word, return_tensors="pt")
                outputs = bert_model(**inputs).last_hidden_state[0]
                outputs.requires_grad = False
                if outputs.size()[0] > 1:  # aggregate
                    outputs = torch.sum(outputs, dim=0)
                nns = utils.nearest_neighbors(q=outputs,
                                              embeddings=embeddings,
                                              vocab=list(vocab.values()))
                print('word: {} .. neighbors: {}'.format(
                    word, nns))  # utility function
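Example #2 passes a precomputed query vector (q=outputs, a summed BERT hidden state) instead of a word, so its utils.nearest_neighbors has to rank rho's rows against an arbitrary embedding. A rough sketch of such a variant, reusing the cosine ranking from the sketch after Example #1; the body is an assumption and only makes sense when the query vector and rho have the same dimensionality:

import torch

def nearest_neighbors(q, embeddings, vocab, num_neighbors=20):
    # Hypothetical variant: the query is a vector from an external encoder,
    # not a row of the model's own embedding matrix.
    emb = embeddings.detach().cpu()                # vocab_size x E
    q = q.detach().cpu().flatten().unsqueeze(0)    # 1 x E (must match E above)
    sims = torch.nn.functional.cosine_similarity(emb, q, dim=1)
    top = sims.argsort(descending=True)[:num_neighbors]
    return [vocab[i] for i in top.tolist()]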
Example #3
    def visualize(self, args, vocabulary, show_emb=False):
        Path.cwd().joinpath("results").mkdir(parents=True, exist_ok=True)
        self.eval()
        model_path = str(Path.home().joinpath(
            "Projects", "Personal", "balobi_nini", 'models',
            'embeddings_one_gram_fast_tweets_only'))
        model_gensim = FT_gensim.load(model_path)

        # need to update this ..
        queries = [
            'felix', 'covid', 'pprd', '100jours', 'beni', 'adf', 'muyembe',
            'fally'
        ]

        ## visualize topics using monte carlo
        results_file_name = "topic_results_{}_{}.txt".format(
            args.batch_size, args.epochs)
        results_file_name = Path.cwd().joinpath("results", results_file_name)
        with torch.no_grad():
            print('#' * 100)
            print('Visualize topics...')
            topics_words = []
            gammas = self.get_beta()
            for k in range(args.num_topics):
                gamma = gammas[k]
                top_words = list(
                    gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
                topic_words = [vocabulary[a].strip() for a in top_words]
                topics_words.append(' '.join(topic_words))
                with open(results_file_name, "a") as results_file:
                    results_file.write('Topic {}: {}\n'.format(k, topic_words))
            with open(results_file_name, "a") as results_file:
                results_file.write(
                    10 * '#' +
                    '\n')  # But this could have been done as a function

            if show_emb:
                ## visualize word embeddings by using V to get nearest neighbors
                print('#' * 100)
                print(
                    'Visualize word embeddings by using output embedding matrix'
                )
                try:
                    embeddings = self.rho.weight  # Vocab_size x E
                except:
                    embeddings = self.rho  # Vocab_size x E
                neighbors = []
                for word in queries:
                    print('word: {} .. neighbors: {}'.format(
                        word, nearest_neighbors(model_gensim, word)))
                print('#' * 100)
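Note that Example #3's nearest_neighbors(model_gensim, word) has a different signature again: it queries a pretrained gensim FastText model rather than the model's own rho matrix. A rough sketch of what that helper might look like, assuming FT_gensim is gensim's FastText class and that plain word strings are wanted back; the body and neighbor count are guesses, not the repository's actual code:

def nearest_neighbors(model_gensim, word, num_neighbors=20):
    # Hypothetical helper: delegate the similarity query to gensim.
    # FastText composes vectors from character n-grams, so query words
    # missing from the training corpus still get an embedding.
    return [w for w, _score in model_gensim.wv.most_similar(word, topn=num_neighbors)]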
Example #4
def visualize(m, show_emb=True):
    # Visualize the model.
    # How topics are extracted from documents using the variables in this code.
    # The goal of the whole paper is to learn self.rho and beta; re-read the code alongside the paper.
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = [
        'andrew', 'computer', 'sports', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()  # m.get_beta() returns each topic's distribution over the vocabulary (one vector per topic).
        # gammas is e.g. 5 x 3072: 5 rows for the 5 topics, 3072 columns for the vocabulary size.
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(
                gamma.cpu().numpy().argsort()[-args.num_words +
                                              1:][::-1])  # sort and keep the highest-probability words
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            ## show the context neighbors of each query word; the embeddings come from CBOW-style word2vec training on contexts
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            # Where do the vectors: (3072, 300) and query: (300,) in the output come from? The tuple means the 3072-word vocabulary is mapped into 300-dimensional word vectors.
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except:
                embeddings = m.rho  # Vocab_size x E
            # embeddings holds the word vectors, shape 3072 x 300
            neighbors = []
            for word in queries:
                print('word: {}    neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
Example #5
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    #queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
    #            'intelligence', 'money', 'politics', 'health', 'people', 'family']
    queries = [
        "sentence", "punishment", "guilt", "murder", "vote", "woman", "man",
        "innocent", "London", "crime", "female", "slave", "chattle",
        "foreigner", "foreign", "theft", "robbery", "rape", "thievery",
        "larceny", "burglary", "assault", "hanging", "prison", "convict"
    ]
    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)

        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except:
                embeddings = m.rho  # Vocab_size x E
            neighbors = []
            for word in queries:
                try:
                    print('word: {} .. neighbors: {}'.format(
                        word, nearest_neighbors(word, embeddings, vocab)))
                except ValueError:
                    print(word + " not in vocabulary. Skipping...")
                    continue
            print('#' * 100)
Example #6
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = [
        'medical', 'computer', 'sports', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))
            with open(
                    '/mnt/nas6/users/wangweixuan/ETM/results/topic_words_de.txt',
                    'a') as f:
                f.write(str(k))
                f.write(str(topic_words))
                f.write('\n')

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except:
                embeddings = m.rho  # Vocab_size x E
            neighbors = []
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
Example #7
    def visualize(self, vocab, show_emb=True):
        if not os.path.exists('./results'):
            os.makedirs('./results')
        self.eval()

        # Examples of English queries
        # queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
        #            'intelligence', 'money', 'politics', 'health', 'people', 'family']

        queries = ['محمد']

        # visualize topics using monte carlo
        with torch.no_grad():
            print('#' * 100)
            print('Visualize topics...')
            topics_words = []
            gammas = self.get_beta()
            for k in range(self.config_dict['model_params']['num_topics']):
                gamma = gammas[k]
                top_words = list(
                    gamma.cpu().numpy().argsort()
                    [-self.config_dict['evaluation_params']['num_words'] +
                     1:][::-1])
                topic_words = [vocab[a] for a in top_words]
                topics_words.append(' '.join(topic_words))
                print('Topic {}: {}'.format(k, topic_words))

            if show_emb:
                # visualize word embeddings by using V to get nearest neighbors
                print('#' * 100)
                print(
                    'Visualize word embeddings by using output embedding matrix'
                )
                try:
                    embeddings = self.rho.weight  # Vocab_size x E
                except:
                    embeddings = self.rho  # Vocab_size x E
                neighbors = []
                for word in queries:
                    print('word: {} .. neighbors: {}'.format(
                        word, nearest_neighbors(word, embeddings, vocab)))
                print('#' * 100)
Example #8
    def visualize(self, args, vocabulary):
        """Visualizes topics and embeddings and word usage evolution.
        """
        self.eval()
        with torch.no_grad():
            alpha = self.mu_q_alpha
            beta = self.get_beta(alpha)
            print('beta: ', beta.size())
            print('\n')
            print('#' * 100)
            print('Visualize topics...')
            times = [0, 10, 40]
            topics_words = []
            for k in range(args.num_topics):
                for t in times:
                    gamma = beta[k, t, :]
                    top_words = list(
                        gamma.cpu().numpy().argsort()[-args.num_words +
                                                      1:][::-1])
                    topic_words = [vocabulary[a] for a in top_words]
                    topics_words.append(' '.join(topic_words))
                    print('Topic {} .. Time: {} ===> {}'.format(
                        k, t, topic_words))

            print('\n')
            print('Visualize word embeddings ...')
            queries = [
                'economic', 'assembly', 'security', 'management', 'debt',
                'rights', 'africa'
            ]
            try:
                embeddings = self.rho.weight  # Vocab_size x E
            except Exception:
                embeddings = self.rho  # Vocab_size x E
            neighbors = []
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word,
                    nearest_neighbors(word, embeddings, vocabulary,
                                      args.num_words)))
            print('#' * 100)
Example #9
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    # queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
    #             'intelligence', 'money', 'politics', 'health', 'people', 'family']

    # queries = ['biology', 'gene', 'calling', 'cancer', 'experiment']

    queries = ['biology']

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except:
                embeddings = m.rho  # Vocab_size x E
            neighbors = []
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
Example #10
def visualize(epoch, writer):
    """Visualizes topics and embeddings and word usage evolution.
    """
    model.eval()
    with torch.no_grad():
        alpha = model.mu_q_alpha
        beta = model.get_beta(alpha)
        print('beta: ', beta.size())
        print('\n')
        print('#' * 100)
        print('Visualize topics...')
        times = [0, 10, 40]
        topics_words = []
        for k in range(args.num_topics):
            for t in times:
                gamma = beta[k, t, :]
                top_words = list(
                    gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
                topic_words = [vocab[a] for a in top_words]
                topics_words.append(' '.join(topic_words))
                print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words))

        print('\n')
        print('Visualize word embeddings ...')
        queries = [
            'economy', 'vietnam', 'islam', 'climate', 'debt', 'electricity',
            'africa'
        ]
        try:
            embeddings = model.rho.weight  # Vocab_size x E
        except:
            embeddings = model.rho  # Vocab_size x E
        neighbors = []
        for word in queries:
            print('word: {} .. neighbors: {}'.format(
                word, nearest_neighbors(word, embeddings, vocab,
                                        args.num_words)))
        print('#' * 100)
Example #11
    def train_knn_gcn(self, args):
        features, nfeats, labels, nclasses, train_mask, val_mask, test_mask = load_data(
            args)
        val_accuracies = []
        test_accuracies = []

        Adj = torch.from_numpy(
            nearest_neighbors(features, args.k, args.knn_metric)).cuda()
        Adj = normalize(Adj, args.normalization, args.sparse)

        if torch.cuda.is_available():
            features = features.cuda()

        if args.half_val_as_train:
            val_mask, train_mask = self.half_val_as_train(val_mask, train_mask)

        for trial in range(args.ntrials):
            val_accu, test_accu, best_model = self.train_classification_gcn(
                Adj, features, nfeats, labels, nclasses, train_mask, val_mask,
                test_mask, args)
            val_accuracies.append(val_accu.item())
            test_accuracies.append(test_accu.item())

        self.print_results(val_accuracies, test_accuracies)
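In Example #11, nearest_neighbors(features, args.k, args.knn_metric) is an entirely different routine: it builds a k-nearest-neighbor affinity matrix over node features for a graph network, not a word-similarity query. A minimal sketch of such a routine using scikit-learn; the symmetrization step and the float32 cast are assumptions chosen so the result works with the torch.from_numpy(...).cuda() call above:

import numpy as np
from sklearn.neighbors import kneighbors_graph

def nearest_neighbors(features, k, knn_metric):
    # Hypothetical helper: dense 0/1 kNN adjacency built from node features.
    if hasattr(features, "cpu"):                  # accept torch tensors as well as numpy arrays
        features = features.cpu().numpy()
    adj = kneighbors_graph(features, k, metric=knn_metric).toarray()
    adj = np.maximum(adj, adj.T)                  # symmetrize the directed kNN graph
    return adj.astype(np.float32)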
Example #12
                print('batch: {}/{}'.format(idx, len(indices)))
        thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt
        print('\nThe 10 most used topics are {}'.format(
            thetaWeightedAvg.argsort()[::-1][:10]))

        ## show topics
        beta = model.get_beta()
        topic_indices = list(np.random.choice(args.num_topics,
                                              10))  # 10 random topics
        print('\n')
        for k in range(args.num_topics):  #topic_indices:
            gamma = beta[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words +
                                                           1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            print('Topic {}: {}'.format(k, topic_words))

        if args.train_embeddings:
            ## show etm embeddings
            try:
                rho_etm = model.rho.weight.cpu()
            except:
                rho_etm = model.rho.cpu()
            queries = []
            print('\n')
            print('ETM embeddings...')
            for word in queries:
                print('word: {} .. etm neighbors: {}'.format(
                    word, nearest_neighbors(word, rho_etm, vocab)))
            print('\n')
Example #13
        thetaWeightedAvg.argsort()[::-1][:10]))

    # Now we show the topics
    # A nice visualisation is always welcome
    beta = etm_model.get_beta()
    topic_indices = list(np.random.choice(num_topics, 10))  # 10 random topics
    print('\n')
    for k in range(num_topics):  # topic_indices:
        gamma = beta[k]
        top_words = list(gamma.cpu().numpy().argsort()[-num_words + 1:][::-1])
        topic_words = [idx2word[a] for a in top_words]
        print('Topic {}: {}'.format(k, topic_words))

    # Why not, also showing a few embeddings
    if train_embeddings:
        # get embeddings from the model
        try:
            rho_etm = etm_model.rho.weight.cpu()
        except:
            rho_etm = etm_model.rho.cpu()
        queries = [
            'andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love',
            'intelligence', 'money', 'politics', 'health', 'people', 'family'
        ]
        print('\n')
        print('ETM embeddings...')
        for word in queries:
            print('word: {} .. etm neighbors: {}'.format(
                word, utils.nearest_neighbors(word, rho_etm, idx2word)))
        print('\n')
Example #14
            topic_words = [vocab[a] for a in top_words]
            print('Topic {}: {}'.format(k, topic_words))

        if args.train_embeddings:
            ## show etm embeddings 
            try:
                rho_etm = model.rho.weight.cpu()
            except:
                rho_etm = model.rho.cpu()
            # queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love',
            #                'intelligence', 'money', 'politics', 'health', 'people', 'family']
            queries = []
            print('\n')
            print('ETM embeddings...')
            for word in queries:
                print('word: {} .. etm neighbors: {}'.format(word, nearest_neighbors(word, rho_etm, vocab)))
            print('\n')

            indices = torch.tensor(range(args.num_docs_test))
            indices = torch.split(indices, 1)
            topics = []
            for idx, ind in enumerate(indices):
                print(idx, ind)
                data_batch = data.get_batch(test_tokens, test_counts, ind, args.vocab_size, device)
                topic = predict(data_batch, args.num_topics, model)
                topics.append(topic)
            print(topics)

            #keys_list = ["Υγεία", "Ψυχαγωγία", "Υγεία", "Κόσμος", "Υγεία", "Υγεία", "Επιχείρηση", "Ελλάδα", "Ψυχαγωγία",
            #             "Ελλάδα", "Τεχνολογία", "Σπορ"]