def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    # * if it should filter AFTER the LSI

    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")

    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) * 2,
                                   id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) // 10,
                                   id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200],
               vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(),
               cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
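The TODO above mentions saving the plot via the project's mpl_tools helpers, which are not shown here. A minimal sketch that saves the same heatmap with plain matplotlib (the function name and output path are illustrative placeholders) could look like this:

import matplotlib.pyplot as plt

def save_topic_heatmap(lsamodel, out_path="lsi_topics.png"):  # out_path is a hypothetical placeholder
    # Plot the first 100 topics x 200 terms of the topic-term matrix and write it to disk
    # instead of (or in addition to) showing it interactively.
    topics = lsamodel.get_topics()[:100, :200]
    fig, ax = plt.subplots()
    im = ax.imshow(topics, vmin=topics.min(), vmax=topics.max(), cmap="coolwarm")
    fig.colorbar(im, ax=ax)
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(fig)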
Example #2
class GensimLatentSemanticAnalysis(GensimProjectionsWordEmbeddingLearner):
    """
    Class that implements the Abstract Class GensimProjectionsWordEmbeddingLearner
    Class that implements latent semantic analysis using Gensim
    """
    def __init__(self,
                 reference: str = None,
                 auto_save: bool = True,
                 **kwargs):
        super().__init__(reference, auto_save, ".model", **kwargs)

    def fit_model(self, corpus: List):
        """
        This method creates the model, using Gensim Latent Semantic Analysis.
        The model isn't then returned, but gets stored in the 'model' class attribute.
        """
        dictionary = Dictionary(corpus)
        word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
        self.model = LsiModel(word_docs_matrix,
                              id2word=dictionary,
                              **self.additional_parameters)

    def load_model(self):
        return LsiModel.load(self.reference)

    def get_vector_size(self) -> int:
        return len(self.model.get_topics())

    def __str__(self):
        return "GensimLatentSemanticAnalysis"

    def __repr__(self):
        return "< GensimLatentSemanticAnalysis : model = " + str(
            self.model) + " >"
Example #3
def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    dct = Dictionary(sentences)
    # Corpus as dictionary ids lists, in memory
    # Can be transformed in an iterable as done with the others if needed
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)
    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
Example #4
def train_lsa(docs: Iterable, outputFolder: str):
    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    corpus = log_entropy_norm(corpus)
    print("Starting training...")
    lsa = LsiModel(corpus=corpus, id2word=id2word, num_topics=300)
    path = outputFolder + "/lsa.model"
    lsa.save(outputFolder + "/lsa.bin")
    matrix = np.transpose(lsa.get_topics())
    with open(path, "wt", encoding='utf-8') as f:
        f.write("{} {}\n".format(np.size(matrix, 0), np.size(matrix, 1)))
        for idx in range(np.size(matrix, 0)):
            f.write(id2word[idx] + " " + " ".join([str(x) for x in matrix[idx]]) + "\n")
    print("Model saved to ", path)
Example #5
    def __init__(self, embedding_dictionary_file, word_to_index_file,
                 docs_tokens, doc_len, word_len, iters):
        self.time = 0.
        if embedding_dictionary_file is not None and word_to_index_file is not None:
            super(LSAEmbedding,
                  self).get_from_files(embedding_dictionary_file,
                                       word_to_index_file, doc_len, self)
        else:
            self.time = time()
            word_dictionary = Dictionary(docs_tokens)
            word_to_index = word_dictionary.token2id
            docs_term_matrix = [
                word_dictionary.doc2bow(tokens) for tokens in docs_tokens
            ]
            tfidfmodel = TfidfModel(docs_term_matrix, id2word=word_dictionary)
            corpus = [tfidfmodel[doc] for doc in docs_term_matrix]
            lsamodel = LsiModel(corpus,
                                num_topics=word_len,
                                id2word=word_dictionary,
                                power_iters=iters)
            self.time = time() - self.time

            embedding_matrix = lsamodel.get_topics().transpose()
            embedding_dictionary = {}
            embedding_dim = None
            for word, i in word_to_index.items():
                embedding_dictionary[word] = embedding_matrix[i]
                if embedding_dim is None:
                    embedding_dim = len(embedding_matrix[i])

            # print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
            # one_hot = OneHot(docs_tokens, max_doc_len=self.doc_len)
            # word_to_index = one_hot.get_word_indexes()
            super(LSAEmbedding,
                  self).get_from_data(embedding_dictionary, embedding_dim,
                                      word_to_index, doc_len, self)

        self.name = 'lsa'
        self.iters = iters
Example #6
    def __init__(self, docs_tokens, emb_dim, iters):
        self.time = 0.

        self.time = time()

        word_dictionary = Dictionary(docs_tokens)
        word_to_index = word_dictionary.token2id
        docs_term_matrix = [word_dictionary.doc2bow(tokens) for tokens in docs_tokens]
        tfidfmodel = TfidfModel(docs_term_matrix, id2word=word_dictionary)
        corpus = [tfidfmodel[doc] for doc in docs_term_matrix]
        lsamodel = LsiModel(corpus, num_topics=emb_dim, id2word=word_dictionary, power_iters=iters)
        self.time = time() - self.time

        embedding_matrix = lsamodel.get_topics().transpose()
        embedding_dictionary = {}
        embedding_dim = None
        for word, i in word_to_index.items():
            embedding_dictionary[word] = embedding_matrix[i]
            if embedding_dim is None:
                embedding_dim = len(embedding_matrix[i])

        super(EmbeddingModel, self).get_from_data(embedding_dictionary, embedding_dim, word_to_index, self)

        self.name = 'lsa'
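Examples #5 and #6 both turn the transposed LSI topic-term matrix into per-word embedding vectors. A self-contained sketch of that idea on a toy corpus (the tokens and dimensions are illustrative only):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

docs_tokens = [["cat", "dog", "pet"], ["dog", "bone", "pet"], ["cat", "milk"]]
word_dictionary = Dictionary(docs_tokens)
bow = [word_dictionary.doc2bow(tokens) for tokens in docs_tokens]
tfidf = TfidfModel(bow, id2word=word_dictionary)
lsa = LsiModel([tfidf[doc] for doc in bow], num_topics=2, id2word=word_dictionary)

# Rows of the transposed topic matrix are word vectors in the LSI space.
embedding_matrix = lsa.get_topics().transpose()
word_to_index = word_dictionary.token2id
cat_vec = embedding_matrix[word_to_index["cat"]]
dog_vec = embedding_matrix[word_to_index["dog"]]
cosine = np.dot(cat_vec, dog_vec) / (np.linalg.norm(cat_vec) * np.linalg.norm(dog_vec) + 1e-12)
print(cosine)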
Example #7
    labels_topics[label] = topic_with_id

with open("./data/20180101_0815/labels_ids.dump", "wb") as f:
    pickle.dump(labels_ids, f)
with open("./data/20180101_0815/labels_corpus.dump", "wb") as f:
    pickle.dump(labels_corpus, f)
with open("./data/20180101_0815/labels_topics.dump", "wb") as f:
    pickle.dump(labels_topics, f)

labels_topic_vec = {}
for label, many_topic in labels_topics.items():
    if label not in labels_topic_vec.keys():
        labels_topic_vec[label] = []
    topic_vec_list = []
    for topic_id, weight in many_topic:
        w_vector = lsi_model.get_topics()[topic_id] * weight
        topic_vec_list.append(w_vector)
    labels_topic_vec[label] = np.average(topic_vec_list, axis=0)

with open("./data/20180101_0815/labels_topic_vec.dump", "wb") as f:
    pickle.dump(labels_topic_vec, f)

labels_words_freq = {}
for label, vec in labels_topic_vec.items():
    if label not in labels_words_freq.keys():
        labels_words_freq[label] = {}
    for id_, val in enumerate(vec):
        labels_words_freq[label][dict_2d[id_]] = abs(val)

with open("./data/20180101_0815/labels_words_freq.dump", "wb") as f:
    pickle.dump(labels_words_freq, f)
Example #8
class AnnStream:
    def __init__(self, data, k: int, n_cluster: int, reduction_method: str,
                 dims: int, loadings: np.ndarray, use_for_pca: np.ndarray,
                 mu: np.ndarray, sigma: np.ndarray, ann_metric: str,
                 ann_efc: int, ann_ef: int, ann_m: int, nthreads: int,
                 ann_parallel: bool, rand_state: int, do_kmeans_fit: bool,
                 disable_scaling: bool, ann_idx):
        self.data = data
        self.k = k
        if self.k >= self.data.shape[0]:
            self.k = self.data.shape[0] - 1
        self.nClusters = max(n_cluster, 2)
        self.dims = dims
        self.loadings = loadings
        if self.dims is None and self.loadings is None:
            raise ValueError(
                "ERROR: Provide a value for at least one of 'dims' or 'loadings'"
            )
        self.annMetric = ann_metric
        self.annEfc = ann_efc
        self.annEf = ann_ef
        self.annM = ann_m
        self.nthreads = nthreads
        if ann_parallel:
            self.annThreads = self.nthreads
        else:
            self.annThreads = 1
        self.randState = rand_state
        self.batchSize = self._handle_batch_size()
        self.method = reduction_method
        self.nCells, self.nFeats = self.data.shape
        self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells)
        disable_reduction = False
        if self.dims < 1:
            disable_reduction = True
        with threadpool_limits(limits=self.nthreads):
            if self.method == 'pca':
                self.mu, self.sigma = mu, sigma
                if self.loadings is None or len(self.loadings) == 0:
                    if len(use_for_pca) != self.nCells:
                        raise ValueError(
                            "ERROR: `use_for_pca` does not have the same length as the number of cells (nCells)"
                        )
                    if disable_reduction is False:
                        self._fit_pca(disable_scaling, use_for_pca)
                else:
                    # Even though dims may already have been adjusted to match the loadings before AnnStream was
                    # called, it can still be overwritten by _handle_batch_size, so we hard-set it here.
                    self.dims = self.loadings.shape[1]
                    # it is okay for dimensions to be larger than batch size here because we will not fit the PCA
                if disable_scaling:
                    if disable_reduction:
                        self.reducer = lambda x: x
                    else:
                        self.reducer = lambda x: x.dot(self.loadings)
                else:
                    if disable_reduction:
                        self.reducer = lambda x: self.transform_z(x)
                    else:
                        self.reducer = lambda x: self.transform_z(x).dot(
                            self.loadings)
            elif self.method == 'lsi':
                if self.loadings is None or len(self.loadings) == 0:
                    if disable_reduction is False:
                        self._fit_lsi()
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            elif self.method == 'custom':
                if self.loadings is None or len(self.loadings) == 0:
                    logger.warning(
                        "No loadings provided for manual dimension reduction")
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            else:
                raise ValueError(
                    f"ERROR: Unknown reduction method: {self.method}")
            if ann_idx is None:
                self.annIdx = self._fit_ann()
            else:
                self.annIdx = ann_idx
                self.annIdx.set_ef(self.annEf)
                self.annIdx.set_num_threads(1)
            self.kmeans = self._fit_kmeans(do_kmeans_fit)

    def _handle_batch_size(self):
        if self.dims > self.data.shape[0]:
            self.dims = self.data.shape[0]
        batch_size = self.data.chunksize[0]  # assuming all chunks are the same size
        if self.dims >= batch_size:
            self.dims = batch_size - 1  # -1 because we will do PCA +1
            logger.info(
                f"Number of PCA/LSI components reduced to batch size of {batch_size}"
            )
        if self.nClusters > batch_size:
            self.nClusters = batch_size
            logger.info(
                f"Cluster number reduced to batch size of {batch_size}")
        return batch_size

    def iter_blocks(self, msg: str = '') -> np.ndarray:
        for i in tqdm(self.data.blocks, desc=msg,
                      total=self.data.numblocks[0]):
            yield controlled_compute(i, self.nthreads)

    def transform_z(self, a: np.ndarray) -> np.ndarray:
        return (a - self.mu) / self.sigma

    def transform_ann(self,
                      a: np.ndarray,
                      k: int = None,
                      self_indices: np.ndarray = None) -> tuple:
        if k is None:
            k = self.k
        # Adding +1 to k because first neighbour will be the query itself
        if self_indices is None:
            i, d = self.annIdx.knn_query(a, k=k)
            return i, d
        else:
            i, d = self.annIdx.knn_query(a, k=k + 1)
            return fix_knn_query(i, d, self_indices)

    def _fit_pca(self, disable_scaling, use_for_pca) -> None:
        from sklearn.decomposition import IncrementalPCA
        # We fit one more PC dimension than specified and then ignore the last PC.
        self._pca = IncrementalPCA(n_components=self.dims + 1,
                                   batch_size=self.batchSize)
        do_sample_subset = False if use_for_pca.sum() == self.nCells else True
        s, e = 0, 0
        # We store the first block of values in end_reservoir. If we end up with fewer than dims+1 cells left to
        # fit, those cells can be added to end_reservoir for the final fit; if there are no such cells, the
        # reservoir is fitted on its own after the rest of the cells. The first batch itself may contain fewer
        # than dims+1 cells, in which case we keep adding cells to the carry_over pile until it is big enough.
        end_reservoir = []
        # carry_over stores cells that cannot yet be added to end_reservoir or used to fit the PCA directly.
        carry_over = []
        for i in self.iter_blocks(msg='Fitting PCA'):
            if do_sample_subset:
                e = s + i.shape[0]
                i = i[use_for_pca[s:e]]
                s = e
            if disable_scaling is False:
                i = self.transform_z(i)
            if len(carry_over) > 0:
                i = np.vstack((carry_over, i))
                carry_over = []
            if len(i) < (self.dims + 1):
                carry_over = i
                continue
            if len(end_reservoir) == 0:
                end_reservoir = i
                continue
            try:
                self._pca.partial_fit(i, check_input=False)
            except LinAlgError:
                # Add a retry counter to make sure memory consumption doesn't escalate
                carry_over = i
        if len(carry_over) > 0:
            i = np.vstack((end_reservoir, carry_over))
        else:
            i = end_reservoir
        try:
            self._pca.partial_fit(i, check_input=False)
        except LinAlgError:
            logger.warning(
                f"{i.shape[0]} samples were not used in PCA fitting due to LinAlgError")
        self.loadings = self._pca.components_[:-1, :].T

    def _fit_lsi(self) -> None:
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

        self._lsiModel = LsiModel(
            Dense2Corpus(
                controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims,
            chunksize=self.data.chunksize[0],
            id2word={x: x for x in range(self.data.shape[1])},
            extra_samples=0)
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        self.loadings = self._lsiModel.get_topics().T

    def _fit_ann(self):
        import hnswlib

        dims = self.dims
        if dims < 1:
            dims = self.data.shape[1]
        ann_idx = hnswlib.Index(space=self.annMetric, dim=dims)
        ann_idx.init_index(max_elements=self.nCells,
                           ef_construction=self.annEfc,
                           M=self.annM,
                           random_seed=self.randState)
        ann_idx.set_ef(self.annEf)
        ann_idx.set_num_threads(self.annThreads)
        for i in self.iter_blocks(msg='Fitting ANN'):
            ann_idx.add_items(self.reducer(i))
        return ann_idx

    def _fit_kmeans(self, do_ann_fit):
        from sklearn.cluster import MiniBatchKMeans

        if do_ann_fit is False:
            return None
        kmeans = MiniBatchKMeans(n_clusters=self.nClusters,
                                 random_state=self.randState,
                                 batch_size=self.batchSize)
        with threadpool_limits(limits=self.nthreads):
            for i in self.iter_blocks(msg='Fitting kmeans'):
                kmeans.partial_fit(self.reducer(i))
        temp = []
        for i in self.iter_blocks(msg='Estimating seed partitions'):
            temp.extend(kmeans.predict(self.reducer(i)))
        self.clusterLabels = np.array(temp)
        return kmeans
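_fit_lsi above streams dense blocks into gensim through Dense2Corpus and add_documents. A standalone sketch of that incremental pattern on random dense blocks (the block sizes and dimensions are arbitrary):

import numpy as np
from gensim.matutils import Dense2Corpus
from gensim.models import LsiModel

n_features, n_topics = 50, 10
blocks = [np.random.rand(200, n_features) for _ in range(3)]  # three (cells x features) blocks

# Initialise the model on the first block; Dense2Corpus treats columns as documents, hence the transpose.
lsi = LsiModel(Dense2Corpus(blocks[0].T),
               num_topics=n_topics,
               chunksize=200,
               id2word={i: str(i) for i in range(n_features)})
# Fold the remaining blocks in incrementally.
for block in blocks[1:]:
    lsi.add_documents(Dense2Corpus(block.T))

loadings = lsi.get_topics().T  # (n_features x n_topics) projection, as used for self.loadings above
print(loadings.shape)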
Example #9
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # creates Spark Session
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()

    # tweets folder address on HDFS server -  ignore files with .tmp extensions (Flume active files).
    inputpath = "hdfs://hdfs input path"

    spark.conf.set("spark.sql.shuffle.partitions", 1)

    # get the raw tweets from HDFS
    raw_tweets = spark.read.format("json").option(
        "inferSchema", "true").option("mode", "dropMalformed").load(inputpath)

    # get the tweet text from the raw data: the text is lower-cased, re-tweets (duplicates) are removed, and finally an index is added for each tweet
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())

    # Create a tokenizer that filters away tokens shorter than 4 characters and gets rid of symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")

    # Tokenize tweets
    tokenized_tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

    # remove stopwords
    cleaned_tweets = remover.transform(tokenized_tweets)

    # create a vocabulary of words that appear in at least two different tweets, with a maximum vocab size of 20000.
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # create a Latent Dirichlet Allocation model and run it on our data with 25 iterations and 5 topics
    lda = LDA(k=5, maxIter=25)
    # fit the model on data
    ldaModel = lda.fit(wordVectors)
    # create topics based on LDA
    lda_topics = ldaModel.describeTopics()
    # show LDA topics

    # ______________________________________________________________________________________________________________
    # LSA
    clean_tweets_list = []
    tweet_list = []
    # for creating the document term matrix for the LSIModel as input
    # this is needed as LSI needs tuples of (vocabulary_index, frequency) form
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # reading the SparseVector of 'features' column (hence the 0 index) and zipping them to a list
        # idx = vocabulary_index, val=frequency of that word in that tweet
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # converting the frequency from float to integer
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])

    # calling the LSIModel and passing the number of topics as 5
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)
    # show LSA topics

    # ______________________________________________________________________________________________________________
    # #Comparison

    # get the weights and indices of words from LDA topics in format of List[list[]]
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]

    # get the LSA topic-word matrix as a numpy array of shape 5 x vocabularySize.
    # each element is the weight of the corresponding word in that specific topic.
    lsa_weightsMatrix = lsa_model.get_topics()

    # function to calculate the similarity between an LSA topic and an LDA topic.
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        total = 0
        for index, weight in zip(lda_index, lda_weight):
            total = total + np.abs(lsa_t[index] * weight)
        return total

    # run the similarity function on 25 possibilities (5 LSA * 5 LDA)
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i], (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # Print the similarity table
    # each row is an LDA topic and each column is an LSA topic.
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("|      |  LSA 1  |  LSA 2  |  LSA 3  |  LSA 4  |  LSA 5  |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print(
                '|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1
    # print the similarity table
    similarity_print(similarity)

    # ______________________________________________________________________________________________________________
    # Final result Table
    # Manually found the following Topics to be similar
    # (LSA1 - LDA1)
    # (LSA5 - LDA2)
    # rest are alone
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])
    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic,
                   lsa_topic) in enumerate(zip(lda_wordIndices,
                                               lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]

    results = []
    names = []
    # Creating word dictionary for LDA2 and LSA5
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in (lsa_topics_bow[4]):
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)

    # Creating word dictionary for LDA1 and LSA1
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in (lsa_topics_bow[0]):
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)
    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3        ")
    results.append(lda_topics_bow[3])
    names.append("LDA4        ")
    results.append(lda_topics_bow[4])
    names.append("LDA5        ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2        ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3        ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4        ")
    #printing the topics and related words
    print(" ")
    print("Topics Table")
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    print(
        "|    Topic     |  Significant Words                                                    |"
    )
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    for name, r in zip(names, results):
        print('| {} |  {} |'.format(name, r))
        print(
            "|------------------------------------------------------------------------------------------|"
        )

    print(" ")
    print(" ")
Example #10
#-*- coding: utf-8 -*-
import pickle

from gensim.corpora import Dictionary
from gensim.models import LsiModel

with open("../data/corpus_test.pkl", "rb") as f:
    corpus = pickle.load(f)

corpus_dictionary = Dictionary(corpus)
corpus = [corpus_dictionary.doc2bow(text) for text in corpus]

CORPUS = corpus
ID2WORD = corpus_dictionary
NUM_TOPICS = 200

lsi = LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=ID2WORD)

topic_word_matrix = lsi.get_topics()
sigma_matrix = lsi.projection.s  # the sigma matrix of singular values from the SVD; for convenience it is kept as a plain k-dimensional vector
# note: gensim's LSI does not directly provide a document-topic matrix.

new_doc = ["영화/Noun", "재미/Noun"]  # a new document
new_doc_bow = corpus_dictionary.doc2bow(new_doc)  # bag-of-words representation of the new document
vec_lsi = lsi[new_doc_bow]  # project the new document into the LSI space
print(vec_lsi)
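The comment above notes that gensim's LSI does not hand back a document-topic matrix directly. One way to build it, assuming the corpus fits in memory, is to run the corpus through the fitted model and densify the result:

from gensim.matutils import corpus2dense

# Project every training document into LSI space and stack the projections
# into a (num_docs x num_topics) matrix.
doc_topic_matrix = corpus2dense(lsi[corpus], num_terms=NUM_TOPICS).T
print(doc_topic_matrix.shape)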