Example #1
def run_on_time_period(start, stop):
    # create lists to hold data
    start_date = start
    stop_date = stop
    date_list = []
    raw_docs = []

    # run the data getter
    while start_date <= stop_date:
        timestamp = start_date.replace(tzinfo=timezone.utc).timestamp()
        doc = get_data(timestamp)
        raw_docs.append(doc)
        real_date = start_date - timedelta(days=1)
        date_list.append(real_date.date())
        start_date += timedelta(days=1)

    # make list of docs without name

    for i in range(len(raw_docs)):
        for name in nicknames:
            if name in raw_docs[i]:
                raw_docs[i] = raw_docs[i].replace(name, '')

    final_docs = preprocess(raw_docs)
    dict, doc_term_matrix = create_corpus(final_docs)
    # lsi_models, coherence_values = get_coherence_values(dict, doc_term_matrix, final_docs, 10, 1, 2)
    lsi_model = LsiModel(doc_term_matrix, num_topics=10, id2word=dict)
    counter = 1
    print(lsi_model.print_topics(num_topics=5, num_words=5))
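The commented-out get_coherence_values call above points at choosing num_topics by coherence; a minimal sketch of that idea using gensim's CoherenceModel (assuming the dictionary, doc_term_matrix and final_docs built above; the helper name is hypothetical) might look like:

from gensim.models import CoherenceModel, LsiModel

def pick_num_topics(dictionary, doc_term_matrix, texts, start=2, stop=10):
    # train one LSI model per candidate topic count and keep the best c_v coherence
    best_model, best_score = None, float('-inf')
    for k in range(start, stop + 1):
        candidate = LsiModel(doc_term_matrix, num_topics=k, id2word=dictionary)
        score = CoherenceModel(model=candidate, texts=texts,
                               dictionary=dictionary, coherence='c_v').get_coherence()
        if score > best_score:
            best_model, best_score = candidate, score
    return best_model, best_score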
Example #2
def build_lsi(docs):
    '''
    Build an LSI model from scratch.

    docs: the documents from which topics are to be extracted.
    '''
    logging.info('There are {} documents'.format(docs.count()))
    # copy the iterator
    # build the dictionary
    logging.info('Building the dictionary...')
    dictionary = Dict.build_dict(docs)
    corpus = [i for i in get_corpus(dictionary)]  # freeze all the corpus
    logging.info('Number of documents in corpus: {}'.format(len(corpus)))
    logging.info('Construction Completed.')

    # build the tfidf model
    logging.info('Building the tfidf model...')
    tfidf_model = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf_model[corpus]

    logging.info('Construction Completed.')

    # build the lsi model
    logging.info('Building the LSI model...')
    lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi_model[corpus_tfidf]
    logging.info('Construction Complete.')

    lsi_model.show_topics()
    return
Example #3
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix,
                        num_topics=number_of_topics,
                        id2word=dictionary)
    return (lsamodel.print_topics(num_topics=number_of_topics,
                                  num_words=words))
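prepare_corpus is not shown in this example; as a rough, hypothetical illustration it could be a thin wrapper around gensim's Dictionary and doc2bow, after which the function above is called directly:

from gensim import corpora

def prepare_corpus(doc_clean):
    # doc_clean is a list of token lists, one per document
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

docs = [["cats", "purr", "often"], ["dogs", "bark", "loudly"]]
print(create_gensim_lsa_model(docs, number_of_topics=2, words=3))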
    def train(self, data):
        """
        Fit LSA model to the data, set document topic vectors and calculate distances.

        :param data: Data to fit model on
        """

        if self.word_dict is None:
            print(
                "Dictionary must be assigned to model before training. This function call does nothing"
            )
            return
        if self.model is None:
            self.model = LsiModel(num_topics=self.vector_length,
                                  id2word=self.word_dict)

        self.name = '%s_%strain' % (self.name, data.name)
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))

        try:
            self.model = LsiModel.load(str(self.path / '.model'))
        except:
            self.path.mkdir(parents=True, exist_ok=True)

            print("Training model...", end='')
            time.sleep(0.1)

            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.add_documents(datastream)

            self.model.save(str(self.path / '.model'))
Example #6
    def _fit_lsi(self, lsi_skip_first, lsi_params) -> None:
        import warnings

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            from gensim.models import LsiModel
            from gensim.matutils import Dense2Corpus

        for i in ["corpus", "num_topics", "id2word", "chunksize", "dtype"]:
            if i in lsi_params:
                del lsi_params[i]
                logger.warning(
                    f"Provided parameter, {i}, for LSI model will not be used")
        self._lsiModel = LsiModel(
            corpus=Dense2Corpus(
                controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims + 1,  # +1 because first dim will be discarded
            chunksize=self.data.chunksize[0],
            id2word={x: x
                     for x in range(self.data.shape[1])},
            **lsi_params,
        )
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        if lsi_skip_first:
            self.loadings = self._lsiModel.get_topics().T[:, 1:]
        else:
            self.loadings = self._lsiModel.get_topics().T
Example #7
def train_LSIModel(tokens, num_top):
    # reuters_text = open("test2.txt", "r")
    dct = corpora.Dictionary(tokens)
    document_matrix = [dct.doc2bow(article) for article in tokens]
    model = LsiModel(document_matrix, num_topics=num_top, id2word=dct)
    model.save("test2.LSIModel")
    return model
Example #8
def create_gensim_lsa_model(doc_clean,number_of_topics,words):

    prepare_corpus = project2.initialize_terms_and_postings()
    dictionary,doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel
Example #9
def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    if lsa_training:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        # generate LSA model
        lsi_model = LsiModel(doc_term_matrix,
                             num_topics=number_of_topics,
                             id2word=dictionary)  # train model
        #coherence_value = CoherenceModel(model=lsi_model, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()
        #print("Coherence value : ",coherence_value)
        print('Saving lsi_model...')
        lsi_model.save(lsi_model_path)
        print('lsi_model saved!')
        corpus_lsi = lsi_model[doc_term_matrix]
        with open(corupus_lsi_path, 'wb') as handle:
            pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Corpus_lsi saved.')

    else:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model Loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]

    return lsi_model, corpus_lsi, dictionary
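A hedged usage sketch for the function above (prepare_corpus, lsi_model_path and corupus_lsi_path are assumed to be defined elsewhere in the same project):

tokenized_docs = [["solar", "panels", "energy"], ["wind", "turbines", "energy"]]
lsi_model, corpus_lsi, dictionary = create_gensim_lsa_model(tokenized_docs,
                                                            number_of_topics=2,
                                                            lsa_training=True)
for topic in lsi_model.print_topics(num_topics=2, num_words=3):
    print(topic)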
def train(n_topics=num_topics):

    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)
    # save the dictionary
    with open(os.path.join(folder_path_objects,
                           'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # save corpuses
    with open(os.path.join(folder_path_objects,
                           'corpus_binary'), 'wb') as f:
        pickle.dump(corpus_binary, f)

    # create models
    print(f'{time.ctime()} Start training LSA (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics
    )

    # save models to disk
    os.makedirs(folder_path_models, exist_ok=True)

    lsi_bin.save('./models/lsi_bin_filtered')
Example #11
def create_lsi_model(project,
                     corpus,
                     id2word,
                     name,
                     use_level=True,
                     force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=project.num_topics,
        )

        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
Example #12
class LsiVec(TopicVec):
    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        # if self.p_corpus == 'onehot':
        #     model_name = 'lsi_one_hot.model'
        # else:
        #     model_name = 'lsi_tfidf.model'
        model_name = 'lsi.model'
        self.model = LsiModel(corpus,
                              id2word=self.dictionary,
                              num_topics=self.vec_num)
        self.model.save(os.path.join(self.out_dir, model_name))

    def __get_model(self):
        model_name = 'lsi.model'
        if os.path.exists(os.path.join(self.out_dir, model_name)):
            self.model = LsiModel.load(os.path.join(self.out_dir, model_name))
        else:
            raise FileNotFoundError('"{}" file not found!'.format(model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
Example #13
 def trainModel(self):
     if self.toweight:
         self.model = LsiModel(self.tfidf[self.corpus], num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
     else:
         self.model = LsiModel(self.corpus, num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.corpus])
Example #14
def train_models():
    models = dict()
    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except:
            ln.info("Training mSDA...")

            prototype_ids = [id_ for id_, freq in sorted(dictionary.dfs.items(), key=lambda kv: kv[1], reverse=True)[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims, prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims, id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
def get_topic(text):
    np.random.seed(100)
    nlp = spacy.load('en')
    my_stop_words = [
        u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get'
    ]
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    doc = nlp(text)
    article = []
    texts = []
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            # we add the lematized version of the word
            article.append(w.lemma_)
    texts.append(article)
    # getting bigrams out of words using gensim
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    # Creating corpus with our words
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]
    # Applying LDA and LSI models
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]
    topics = []
    for i in ldatopics:
        topics.append(i[0])
    tags = nltk.pos_tag(topics)
    # removing verbs as generally nouns are topics
    lfinaltopics = [
        word for word, pos in tags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'JJ' and pos != 'RB'
    ]
    ldafinaltopics = list(set(lfinaltopics))
    lstopics = []
    for i in lsitopics:
        for j in i:
            lstopics.append(j)
    ltags = nltk.pos_tag(lstopics)
    lsifinaltopics = [
        word for word, pos in ltags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'RB' and pos != 'JJ'
    ]

    # Intersection of results from both models
    finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics))
    final_topics = []
    for i in finaltopics:
        if len(i) >= 2:
            final_topics.append(i)
    return final_topics
Example #16
def build_lsi_model(dictionary, corpus, should_rebuild):
    lsi = list()

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading LSI Model backup...')
            lsi_file = utils.get_file_path(cfg.LSI_BACKUP)
            print('LSI file = {}'.format(lsi_file))

            lsi = LsiModel.load(lsi_file)

        except Exception as exc:
            utils.print_exception_details('Building LSI Model', exc)

    else:
        print('Building LSI Model...')
        one_pass = cfg.NUM_PASSES <= 1  # use the one-pass algorithm only when a single pass is configured
        lsi = LsiModel(corpus,
                       id2word=dictionary,
                       num_topics=cfg.NUM_TOPICS,
                       onepass=one_pass)
        print('Done!')
        # Save Model Structures
        LSI_FILE = utils.get_file_path(cfg.LSI_BACKUP)
        lsi.save(LSI_FILE)

    return lsi
Example #17
    def train_model(self, num_topics):
        corpus = self.get_corpus()
        model = LsiModel(corpus, num_topics=num_topics)
        tmp_fname = self.path + self.model_type + "_model"
        model.save(tmp_fname)

        return model
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
Example #19
class LatentSemanticIndexing():
    """
    This class implements Latent Semantic Indexing using the gensim library.
    """
    def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000):

        self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics)
        self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics)
        self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics)
        self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics)

        self.embedding = embedding
        self.corpus = corpus
        self.num_topics = num_topics

        if os.path.exists(self.lsi_model_path):

            print("LSI {} model already trained, loading from disk.".format(embedding))
            self.model = LsiModel.load(self.lsi_model_path)

        else:

            # Make a index to word dictionary.
            temp = corpus.dictionary[0]  # This is only to "load" the dictionary.
            id2word = corpus.dictionary.id2token

            print("Training LSI model.")
            self.model = LsiModel(
                corpus=list(corpus.get_corpus()),
                id2word=id2word,
                chunksize=chunksize,
                num_topics=num_topics
            )
            print("Saving LSI model.")
            self.model.save(self.lsi_model_path)

        self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path)

        if os.path.exists(self.sim_matrix_path):
            print("Similarities matrix {} model already trained, loading from disk.".format(embedding))
            self.index = similarities.Similarity.load(self.sim_matrix_path)
        else:
            print("Creating similarities index.")
            Path(self.sim_matrix_temp_path).touch(exist_ok=True)
            self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics)
            self.index.save(self.sim_matrix_path)

    def search(self, query):

        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)

        if self.embedding == "bow":
            lsi_query = self.model[vec_query]
        elif self.embedding == "tfidf":
            lsi_query = self.model[self.corpus.tfidf_model[vec_query]]

        sims = self.index[lsi_query]
        sims = sorted(zip(self.corpus.doc_ids, sims), key=lambda item: -item[1])
        return sims
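A rough usage sketch of the class above; the corpus object is assumed to expose dictionary, get_corpus(), doc_ids and (for the tfidf embedding) tfidf_model, as the code relies on:

lsi_index = LatentSemanticIndexing(corpus, embedding="bow", num_topics=500)
for doc_id, score in lsi_index.search("latent semantic indexing")[:10]:
    print(doc_id, float(score))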
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    # * if it should filter AFTER the LSI

    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warn("VISR12 say it works best with binary!")

    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) * 2,
                                   id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) //
                                   10,
                                   id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200],
               vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(),
               cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
Example #21
def lsi(clean_docs, model_name, topics):

    from gensim import corpora
    # turn all data into a dictionary mapping of normalized words and their integer ids
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LSI model
    from gensim.models import LsiModel
    num_topics = topics  # find this number of topics in the data

    lsimodel = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')
    topics = lsimodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #23
def lsi(dataframe, num_topics=300):
    """Returns an LSI model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'

    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        lsi_model = LsiModel(tfidf_corpus,
                             id2word=dictionary,
                             num_topics=num_topics)
        lsi_model.save(filename)
    else:
        lsi_model = LsiModel.load(filename)

    return lsi_model
Example #24
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = []
    for i in range(len(corpus)):
        lsi_corpus.append(lsi_model[corpus[i]])

    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
Example #25
class LSI():
    @timed
    def init_lsi(self, **kwargs):
        # handle onepass=False
        model_sparse = self.sparse

        class __sparse():
            def __iter__(self):
                for a in model_sparse:
                    yield [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]

        self.lsi = LsiModel(__sparse(), **kwargs)
        self.lsi.save(self.path + 'lsi.pkl')

    def load_lsi(self):
        self.lsi = LsiModel.load(self.path + 'lsi.pkl')

    def load_dense(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).load()

    def sparse_to_dense(self, sparse):
        dense = self.lsi[sparse]
        dense = sparse2full(dense, self.lsi.num_topics)
        dense = array('f', dense)
        return dense

    @timed
    def init_dense(self, storage=None, workers=None):
        _workers = workers or self.params.get(
            'dense__workers') or self.params.get('workers', 1)
        _storage = storage or self.params.get(
            'dense__storage') or self.params.get('storage', 'disk')
        if _workers > 1:
            self._init_dense_mp(workers=_workers, storage=_storage)
        else:
            self._init_dense_sp(storage=_storage)

    def _init_dense_sp(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).new()
        for a in self.sparse:
            sparse = [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]
            dense = self.sparse_to_dense(sparse)
            self.dense.append(dense)
        self.dense.save()

    def _init_dense_mp(self, workers, storage):
        chunksize = self.params.get('dense__chunksize', 10)
        s = sorbet(self.path + 'dense').new()
        id_iter = range(len(self.meta))
        id_iter = tqdm(id_iter, 'dense', len(self.meta))
        with mp.Pool(workers, init_dense_worker, [
                self.path,
        ]) as pool:
            dense = pool.imap(dense_worker, id_iter, chunksize)
            for d in dense:
                s.append(d)
        self.dense = s.save()
Example #26
def train(text_corpus_file, dict_file):
    """train lsi model from text corpus"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
def save_lsi_model(corpus_tfidf, dictionary):
    # apply transformation to whole corpus
    print("lsi model")
    lsi = LsiModel(corpus_tfidf, id2word=dictionary,
                   num_topics=3000)  # initialize LSI transformation
    tmp_fname = get_tmpfile("lsi.model")
    print("saving tmp file")
    lsi.save(tmp_fname)
    return tmp_fname
Example #29
 def __gen_model(self, corpus):
     # if self.p_corpus == 'onehot':
     #     model_name = 'lsi_one_hot.model'
     # else:
     #     model_name = 'lsi_tfidf.model'
     model_name = 'lsi.model'
     self.model = LsiModel(corpus,
                           id2word=self.dictionary,
                           num_topics=self.vec_num)
     self.model.save(os.path.join(self.out_dir, model_name))
 def fit_model(self, corpus: List):
     """
     This method creates the model, using Gensim Latent Semantic Analysis.
     The model isn't then returned, but gets stored in the 'model' class attribute.
     """
     dictionary = Dictionary(corpus)
     word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
     self.model = LsiModel(word_docs_matrix,
                           id2word=dictionary,
                           **self.additional_parameters)
Example #31
def lsi(all_tokens_lists):
    dictionary = corpora.Dictionary(all_tokens_lists)
    corpus = [dictionary.doc2bow(text) for text in all_tokens_lists]
    tfidf = models.TfidfModel(corpus, smartirs='ntc')
    tfidf_model = tfidf[corpus]
    lsi_model = LsiModel(corpus=tfidf_model,
                         id2word=dictionary,
                         num_topics=7,
                         decay=0.5)
    pprint(lsi_model.print_topics(-1, 10))
Example #32
def create_lsi(num_topic, dictionary):
    corpus, dic = generate_corpus(dictionary)
    print("__________________________Create LSI_________________________")
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dic)
    topics = lsimodel.print_topics(num_topic)  # Showing only the top 5 topics
    # see list of topics
    for topic in topics:
        print(topic)

    return lsimodel
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)

    return lsiModel
Example #34
def latent_semantic_indexing(corpus, num_topics, id2word):
    ''' LATENT SEMANTIC INDEXING
    # Advantage of LSI: ranks topics by itself. Outputs topics in a ranked order.
    # Requires a num_topics parameter (200 by default) to determine the number of latent dimensions after the SVD.
    '''
    print('Latent Semantic Indexing')
    lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)
    lsi_model.show_topics(num_topics=num_topics)
    lsi_topic = lsi_model.show_topics(formatted=False)
    return lsi_model
    def topicsLSI(self, num_topics=10, num_words=10):
        # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, distributed=False, onepass=True, power_iters=2, extra_samples=100)
        lsi = LsiModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word)

        # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
        # Return num_topics most significant topics (return all by default).
        # For each topic, show num_words most significant words (10 words by default).
        # The topics are returned as a list – a list of strings if formatted is True, or a list of (weight, word) 2-tuples if False.
        # If log is True, also output this result to log.

        return lsi.show_topics(num_words=num_words, formatted=False)
def lsi_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lsi):  # use the LSI model to get the topic distribution
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
    f_keyword = open(cluster_keyword_lsi, 'w+',encoding='utf-8')
    for topic in lsi.print_topics(20, 20):
        print(topic[0])
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    return lsi
Example #37
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus) # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print "save dictionary and tfidf model"
    """    
Example #38
class LSITransformation:
    def __init__(self, input_space_vectors_map):
        self.input_space_vectors = input_space_vectors_map.values()
        self.transform()

    def transform(self):
        self.space = LSISpace(self.input_space_vectors)
        #TODO Handle Saner Reduction
        self.reduced_space = 15

        input_BOWs = [self.space.doc2bow(vector) for vector in self.input_space_vectors]
        self.lsi_model = LsiModel(corpus=input_BOWs, num_topics=self.reduced_space, id2word=self.space.id2Word())
        return self.lsi_model

    def dissimilarity_score(self, tokens, other_tokens):
        bows = self.space.doc2bow(tokens)
        other_bows = self.space.doc2bow(other_tokens)

        vector = self.infer_and_vectorize(bows)
        other_vector = self.infer_and_vectorize(other_bows)
        similarity = CosineSimilarity().calculate(vector, other_vector)
        return 1 - similarity

    def infer_and_vectorize(self, bows):
        transformed_bow = defaultdict(float)
        transformed_bow.update(dict(self.lsi_model[bows]))
        return [transformed_bow[dimension] for dimension in range(0, self.reduced_space)]

    def print_transformation(self):
        topics = self.lsi_model.show_topics(num_words=self.space.length(), formatted=False)
        for topic in topics:
            print([(round(value, 4), token) for value, token in topic])
Example #39
    def transform(self):
        self.space = LSISpace(self.input_space_vectors)
        #TODO Handle Saner Reduction
        self.reduced_space = 15

        input_BOWs = [self.space.doc2bow(vector) for vector in self.input_space_vectors]
        self.lsi_model = LsiModel(corpus=input_BOWs, num_topics=self.reduced_space, id2word=self.space.id2Word())
        return self.lsi_model
Example #40
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
	def build(self):
		### need to find out a way to pick the proper number of the cluster - may be based on the number of POST 
		self.lsi_model = LsiModel(self.corpus_tfidf, id2word = self.dictionary, num_topics=3)
		self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
		##self.topics = self.lsi_model.print_topics(num_topics=5, num_words=4)
		#print "topics difference"
		#print self.lsi_model.print_topic(2, topn=4)
		self.topics = self.lsi_model.show_topics(num_topics=5, num_words=4, log=False, formatted=False)
Example #42
def load_corpus():
    dictionary = corpora.Dictionary.load(os.path.join(HERE, "sogou.dict"))
    tfidf_model = tfidfmodel.TfidfModel.load(os.path.join(HERE, "sogou.model"))
    lsi_model = LsiModel.load(os.path.join(HERE, "sogou.lsi"))
    try:
        sg_class = joblib.load(os.path.join(HERE, "sgdc_clf.pkl"))
    except:
        sg_class = None
    return dictionary, tfidf_model, lsi_model, sg_class
Example #43
    def fit(self, raw_documents, y=None):
        self.analyzer_func = self.build_analyzer()

        self.model = LsiModel.load(self.model_fn)

        if os.path.exists(self.model_fn + '.tfidf'):
            self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')

        return self
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=project.num_topics,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
Example #45
  def algorithm_lsi(self, category_id, objs, goldstandards):
    numTopics = self.calculate_k_using_firstnames(objs)
    print "Using k = "+str(numTopics)

    texts = []
    for obj in objs:
      texts.append(get_categorizedproduct_content(obj))

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts] # bag of words

    print "Create models"
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=numTopics)
    corpus_lsi = lsi_model[corpus]
    print "Done creating models"

    results = []
    labels = []
    cont = 0
    for probabilities, obj in zip(corpus_lsi, objs):
      if probabilities:
        max_prop = max(probabilities, key=lambda item:item[1])[0]
      else:
        max_prop = "WARNING "+str(texts[cont])
      labels.append(max_prop)
      results.append(str(max_prop)+" # "+obj['name'].encode('utf8'))
      cont += 1
    results.sort()
    for r in results:
      print(r)

    topic_id = 0
    for topic in lsi_model.show_topics(num_words=5):
        print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
        topic_id+=1

    if numTopics > 1:
      self.calculate_metrics(category_id, objs, labels, goldstandards)
    else:
      print "number of clusters equals or lower than 1, ignoring metric"
Example #46
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #47
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # # Now run LSI on TDIDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #48
def export_model(model_file, out_file):
    """Saves the model. The output will be utf-8 encoded."""
    #    model = model_mapping[model_type].load(model_file)
    model = LsiModel.load(model_file)
    with FileWriter(out_file, "w").open() as out:
        out.write(u"{0}\t{1}\n".format(model.numTerms, model.numTopics))
        for term in xrange(model.numTerms):
            word = model.id2word.id2token[term].decode("utf-8")
            while len(word) > 0 and not word[-1].isalnum():
                word = word[0:-1]
            out.write(u"{0}\n".format(word))
            out.write(
                u"{0}\n".format(u"\t".join(str(f) for f in numpy.asarray(model.projection.u.T[:, term]).flatten()))
            )
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
Example #50
logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)  # Log Entropy weights frequencies of all document features in the corpus

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
bow_document = dictionary.doc2bow(tokenize_func(document))  # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
logent_document = logent_transformation[[bow_document]]  # converts a single document to log entropy representation. document must be in the same vector space as corpus.

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus], id2word=dictionary)  # builds and stores the corpus by iterating over documents of bow_corpus as transformed to log entropy representation. Will also take many hours with Wikipedia corpus.
logent_corpus = MmCorpus("logent_corpus.mm")

lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)  # creates LSI transformation model from log entropy corpus representation. Takes several hours with Wikipedia corpus.

lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary, num_topics=400)  # Performs same operation as above, but with implicit chaining

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")


### Similarities (the best part)
from gensim.similarities import Similarity

documents = ["A bear walked in the dark forest.",
             "Tall trees have many more leaves than short bushes.",
             "A starship may someday travel across vast reaches of space to other stars.",
             "Difference is the concept of how two or more entities are not the same."]
from gensim.matutils import cossim
from gensim.models import LsiModel

logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model")
parser.add_argument("-d", "--data", help="path to training.tsv")
args = parser.parse_args()

# Load model
# Note - model contains dictionary that intentionally omits stopwords
model = LsiModel.load(args.model, mmap='r')

# Load 'training' data
training_data = open(args.data)
training_data.readline()  # advance past header line

correct = 0
total = 0

for line in training_data:
    elements = line.split("\t")
    question_id = elements.pop(0)
    correct_answer = elements.pop(1)

    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split()) for element in elements]
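    # A hedged sketch of how the imported cossim could finish this loop: score each
    # answer against the question in LSI space and count exact matches. The answer
    # letters "A"-"D" and their ordering are assumptions, not taken from the data file.
    question_lsi = model[doc_vectors[0]]             # first vector is the question
    answer_lsis = [model[v] for v in doc_vectors[1:]]
    scores = [cossim(question_lsi, ans) for ans in answer_lsis]
    predicted = "ABCD"[scores.index(max(scores))]

    total += 1
    if predicted == correct_answer:
        correct += 1

print("accuracy:", correct / total if total else 0.0)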
Example #52
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)

	# word2vec model
	class MyCorpus():
		def __iter__(self):
			for d in corpus.get_texts():
				yield [w for w in d if w in corpus.dictionary.token2id]
	if not os.path.exists(f_w2v):
		w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
		w2v.save_word2vec_format(f_w2v, binary=True)

	# LANGUAGE MODELS
	lm_cache = models.Cache(window=50)
	lm_lsa = models.LSA(f_lsa, f_dict, tfidf=f_tfidf, window=50)
	lm_w2v = models.Word2Vec(f_w2v, window=50)
logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
Example #54
tfidf_model = TfidfModel(corpus)
tfidf_model.save("wiki_en_tfidf.model")

# lsi_model = LsiModel(corpus)

# topic_id = 0
# for topic in lsi_model.show_topics():
#    topic_id+=1
#    print "TOPIC (LSI) " + str(topic_id) + " : " + topic

# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
# corpus_lsi_2 = lsi_model_2[corpus]
print "Done creating models"

lsi_model_2.save("wiki_en_model.lsi")

# lsi_model_2 .print_topics(5)

"""
topic_id = 0
for topic in lsi_model_2.show_topics():
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    #group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id] > 0.5]
    group_topic = [doc for doc in corpus_lsi_2]
    print str(group_topic)
    topic_id+=1
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

corpus = MyCorpus(test_data_dir) # create a dictionary
for vector in corpus: # convert each document to a bag-of-word vector
    print(vector)

topics = 200
num_clusters = 4

print "Create models"
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]

print "Done creating models"


#lsi_model_2 .print_topics(5)

topic_id = 0
for topic in lsi_model.show_topics(num_words=5):
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    topic_id+=1


#for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print "Doc " + str(doc)
Example #56
corpus_filename = 'deerwester.mm'
if not os.path.isfile(corpus_filename):
    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(review) for review in abstract_vectors]
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)



#  vamos a utilizar Latent semantic indexing para tratar categorizar los abstracts

print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation, 5 topicos
    #
    lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # numero predefinido de topicos
def print_topic(lsi, topicno, topn=7):
    """
        Return a single topic as a formatted string. See `show_topic()` for parameters.

        >>> lsimodel.print_topic(topicno, topn)
        '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'

        """
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])
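show_topic is referenced by the docstring above but not included here; a minimal sketch that simply delegates to gensim's LsiModel.show_topic (which returns (word, weight) pairs, exactly what the join above unpacks) could be:

def show_topic(lsi, topicno, topn=7):
    # gensim's LsiModel.show_topic returns [(word, weight), ...] for a single topic
    return lsi.show_topic(topicno, topn=topn)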
tfidf[bows[0]]


# In[19]:

dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]])


# Notice how "you" didn't get as much weight as "enjoy"  
# Let's look at some other tweets  

# In[9]:

from gensim.models import LsiModel
lsi = LsiModel.load('../../data/lsi100')
len(lsi.id2word)


# This is starting to look a lot like a set of vectors that we could use as features  
# But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?

# In[ ]:

len(vocab)


# 100k dimensions isn't a good idea  
# Even for a massively parallel deep learning project this would be big  
# Like the cat/dog picture classification on 256x256 images  
# What about PCA (Principal Component Analysis) like is used on images?  
            # Convert a document (a list of words) into bag-of-words format = list of `(token_id, token_count)` 2-tuples.
            # The allow_update parameter controls whether the dictionary is updated or kept read-only.

corpus = MyCorpus(test_data_dir)  # create a dictionary

for vector in corpus:  # print each document converted to a bag-of-words vector
    print(vector)
    break

print("Creating models")
tfidf_model = TfidfModel(corpus)  # builds the locally/globally weighted TF-IDF matrix, mapping simple counts into TF-IDF space
# tfidf = TfidfModel(corpus)
# print(tfidf[some_doc])  # apply the model to a document
# tfidf.save('/tmp/foo.tfidf_model')  # save the model

lsi_model = LsiModel(corpus)
# LSA (latent semantic analysis), also known as LSI (latent semantic indexing),
# is an indexing and retrieval method. Like the traditional vector space model, it represents
# terms and documents as vectors and judges their relatedness through vector relations (e.g. angles);
# the difference is that LSA maps terms and documents into a latent semantic space.
# Synonymy and polysemy degrade the retrieval precision of the traditional vector space model;
# the goal of LSA is to recover the true, latent meaning of terms in documents and queries, addressing that problem.

topic_id = 0
for topic in lsi_model.show_topics():
    topic_id+=1
    print ("TOPIC (LSI) " + str(topic_id) + " : ", topic)

print('#'*50)
print(lsi_model.num_topics)
for i in range(0, lsi_model.num_topics-1):
    if lsi_model.print_topic(i):
class LSA(object):
	def __init__(self, stopwords, ignorechars):
		#self.stopwords = stopwords
		self.ignorechars = ignorechars
		self.wdict = {} 
		self.dcount = 0
	def createStopwords(self, stopword_path):
		with open(stopword_path, 'r') as file1:
			temp = file1.read()
			self.stopwords = temp.split()

	def parse_dic_bow(self, seg_post):
		self.posts = [post for post in seg_post.values()]
		logger.info("BOW process... ")
		print "original post:"
		logger.debug("original post:")
		logger.debug(self.posts)
		#print self.posts
		self.mergeLineForOnePost = [" ".join(post) for post in self.posts] #change to ['\xe9\xa3\x9f\xe8\xa8\x98 \xe8\xa7\x92\xe9\xa0\xad',' efffe wedw'] 
		#print self.mergeLineForOnePost
		#self.texts = [[word for word in post.split()] for post in self.mergeLineForOnePost] #change to [['human', 'interface', 'computer'],['survey', 'user']]
		## covert UTF to ASCII
		self.texts = [[word.encode('utf8') for word in post.split()] for post in self.mergeLineForOnePost] #change to [['human', 'interface', 'computer'],['survey', 'user']]
		print "self.mergeLineForOnePost: "
	
		self.dictionary = gensim.corpora.Dictionary(self.texts)


		self.postIdList = [str(postId) for postId in seg_post.keys()]
		logger.debug("original dic and list:")
		logger.debug(self.dictionary, len(self.dictionary), self.postIdList)
		print "original dic and list:"
		print self.dictionary, self.postIdList

		### preprocess - remove the once-word, stopwords, other shits 
		stop_ids = [self.dictionary.token2id[stopword] for stopword in self.stopwords if stopword in self.dictionary.token2id]
		once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.iteritems() if docfreq == 1]
		### remove once_id sometime cause invalid shape of LSA (TOO LESS words to cluster)
		
		#self.dictionary.filter_tokens(once_ids)
		self.dictionary.filter_tokens(stop_ids)
		logger.info("removed once-words and stopwords......")
		logger.debug(self.dictionary, len(self.dictionary))
		print "removed once-words and stopwords......"
		print self.dictionary
		self.dictionary.compactify()
		self.new_vec = [self.dictionary.doc2bow(post) for post in self.texts]
		#self.new_vec = self.dictionary.doc2bow(post for post in self.coverts)
	def store(self):
		logger.info("store process starts")
		self.dictionary.save(testDictionary)
		self.dictionary.save_as_text(testDictionaryString)
		corpora.MmCorpus.serialize(testBOWCorpus, self.new_vec) # store to disk, for later use
		#corpus = corpora.MmCorpus(testBOWCorpus) # comes from the store 
		#dictionary = corpora.Dictionary.load(testDictionary) # comes from the store
	def TFIDF(self):
		logger.info("TFIDF process starts")
		self.tfidf = TfidfModel(self.new_vec)
		self.corpus_tfidf = self.tfidf[self.new_vec]
	def printInfo(self):
		print 'show Dic: '
		print self.dictionary
		print 'show BOW: '
		for bow in self.new_vec: 
			print bow
		print 'show corpus_tfidf model: '
		print self.tfidf
		print "show corpus_tfidf: "
		for i in self.corpus_tfidf:
			print i
		print "show LSA assignment of each post: "
		#self.num = len(self.corpus_lsi)
		#for doc, i in zip(self.corpus_lsi, range(self.num)): # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
		for doc, postId in zip(self.corpus_lsi,self.postIdList):
			templist = [] 
			print 'post: {0}'.format(postId)
			print doc
			#print "breakdown"
			#for each in doc:
			#	templist.append(abs(each[1]))
			#print "templist: "
			#print templist
			theLarge = nlargest(1, doc, key=lambda e:abs(e[1])) ## 1 means find the largest one
			if theLarge:
				print "the largest one with absoule value: ", theLarge[0][0]
			else:
				print "cannot find it!!!!"
		print "LSA Topics : "
		print self.topics
		print "Break down : "
		for i in self.topics:
			print i
			print type(i)
	def build(self):
		### need to find out a way to pick the proper number of the cluster - may be based on the number of POST 
		self.lsi_model = LsiModel(self.corpus_tfidf, id2word = self.dictionary, num_topics=3)
		self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
		##self.topics = self.lsi_model.print_topics(num_topics=5, num_words=4)
		#print "topics difference"
		#print self.lsi_model.print_topic(2, topn=4)
		self.topics = self.lsi_model.show_topics(num_topics=5, num_words=4, log=False, formatted=False)
		#print "tuple!@!"
		#print ss 
	def repaserForOutput(self): 
	### post_assignment = {post_id:topic} Ex. {"p1":"t1"}
	### topic_assignment = {topic_id:[keywords]} Ex. {"t1":["秘密", "飛行器", "新華", "任務"]
		#print "start to extact info for post_assignment"
		self.post_assignment = {}
		self.topic_assignment = {}
		for doc, postId in zip(self.corpus_lsi,self.postIdList): #self.postIdList // ['p2', 'p3', 'p1', 'p6', 'p7', 'p4', 'p5', 'p8']
			theTopic = nlargest(1, doc, key=lambda e:abs(e[1]))
			if theTopic:
				self.post_assignment[postId] = theTopic[0][0]
			else: 
				self.post_assignment[postId] = "NB"
			#self.post_assignment[postId] = theTopic[0]
		self.num = len(self.topics)
		for topic, num in zip(self.topics, range(self.num)):
			topicWords = []
			for each in topic:
				#covert from string to unicode
				topicWords.append(each[1].decode('utf8'))
				#topicWords.append(each[1])
			## just exact the first topic content, for example, use "秘密" in ["秘密", "飛行器", "新華", "任務"]
			#self.topic_assignment[str(num)] = topicWords[0]
			self.topic_assignment[str(num)] = topicWords
		#matchObj = re.match( r'(.*) are(\.*)', line)
		#rerurn(self.post_assignment,self.topic_assignment)
		return (self.post_assignment,self.topic_assignment)
	def create_result(self,seg_post):
		logger.info('LSA main process starts.....')
		self.createStopwords(stopword_path)
		self.parse_dic_bow(seg_post)
		self.TFIDF()
		self.build()
		self.store()
	def get_result(self):
		self.printInfo()
		return (self.repaserForOutput())
Example #60
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 800)
pd.set_option('precision', 2)
get_ipython().magic(u'precision 4')
get_ipython().magic(u'pprint')


# In[3]:

from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR


# In[6]:

lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi100'))
lsi2 = LsiModel.load(os.path.join(DATA_PATH, 'lsi2'))


# In[7]:

with gzip.open(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'), 'rb') as f:
    topics = pd.DataFrame.from_csv(f, encoding='utf8')
topics = topics.fillna(0)


# In[8]:

dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')