Example #1
def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    if lsa_training:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        # generate LSA model
        lsi_model = LsiModel(doc_term_matrix,
                             num_topics=number_of_topics,
                             id2word=dictionary)  # train model
        #coherence_value = CoherenceModel(model=lsi_model, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()
        #print("Coherence value : ",coherence_value)
        print('Saving lsi_model...')
        lsi_model.save(lsi_model_path)
        print('lsi_model saved!')
        corpus_lsi = lsi_model[doc_term_matrix]
        with open(corpus_lsi_path, 'wb') as handle:
            pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Corpus_lsi saved.')

    else:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model Loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]

    return lsi_model, corpus_lsi, dictionary
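A minimal usage sketch for the helper above (hedged: prepare_corpus and the lsi_model_path / corpus_lsi_path globals are assumed to be defined elsewhere in the module):

# Hypothetical call: train on a tiny tokenized corpus and print the topics.
doc_clean = [['graph', 'trees', 'minors'], ['graph', 'survey', 'minors']]
lsi_model, corpus_lsi, dictionary = create_gensim_lsa_model(doc_clean, number_of_topics=2)
for topic in lsi_model.print_topics(num_topics=2, num_words=3):
    print(topic)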
Example #2
def train_lsa(is_tfidf, num_topics):
    # Create corpus
    print('Create corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # Set training parameters.
    chunksize = 20000

    start = time.time()
    temp = dictionary[0]  # accessing one id forces the dictionary to build id2token
    id2word = dictionary.id2token
    print('Start LSI training')

    lsi_model = LsiModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        chunksize=chunksize,
    )
    lsi_model.show_topics()

    ir_method = 'tfidf' if is_tfidf else 'bow'

    lsi_model.save('saved_models/lsi_model_%s_%s' % (ir_method, num_topics))
    print('LSA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            vocab = Dictionary(corpus.get_texts())
            end = time.perf_counter()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("Creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
Example #4
def lsi(dataframe, num_topics=300):
    """Returns an LSI model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'

    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        lsi_model = LsiModel(tfidf_corpus,
                             id2word=dictionary,
                             num_topics=num_topics)
        lsi_model.save(filename)
    else:
        lsi_model = LsiModel.load(filename)

    return lsi_model
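A possible follow-up, assuming the dictionary_corpus helper from the same module; it projects a new document into the cached LSI space:

# Hypothetical usage sketch; df is a DataFrame of documents.
model = lsi(df)                        # trained and cached on first call, loaded afterwards
dictionary = dictionary_corpus(df)     # same helper the function above relies on
bow_vec = dictionary.doc2bow("latent semantic indexing example".split())
print(model[bow_vec])                  # [(topic_id, weight), ...]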
Example #5
class LsiVec(TopicVec):
    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        # if self.p_corpus == 'onehot':
        #     model_name = 'lsi_one_hot.model'
        # else:
        #     model_name = 'lsi_tfidf.model'
        model_name = 'lsi.model'
        self.model = LsiModel(corpus,
                              id2word=self.dictionary,
                              num_topics=self.vec_num)
        self.model.save(os.path.join(self.out_dir, model_name))

    def __get_model(self):
        model_name = 'lsi.model'
        if os.path.exists(os.path.join(self.out_dir, model_name)):
            self.model = LsiModel.load(os.path.join(self.out_dir, model_name))
        else:
            raise FileNotFoundError('"{}" file not found!'.format(model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
Example #6
class LatentSemanticIndexing():
    """
    This class implements Latent Semantic Indexing using the gensim library.
    """
    def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000):

        self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics)
        self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics)
        self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics)
        self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics)

        self.embedding = embedding
        self.corpus = corpus
        self.num_topics = num_topics

        if os.path.exists(self.lsi_model_path):

            print("LSI {} model already trained, loading from disk.".format(embedding))
            self.model = LsiModel.load(self.lsi_model_path)

        else:

            # Make an index-to-word dictionary.
            temp = corpus.dictionary[0]  # This is only to "load" the dictionary.
            id2word = corpus.dictionary.id2token

            print("Training LSI model.")
            self.model = LsiModel(
                corpus=list(corpus.get_corpus()),
                id2word=id2word,
                chunksize=chunksize,
                num_topics=num_topics
            )
            print("Saving LSI model.")
            self.model.save(self.lsi_model_path)

        self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path)

        if os.path.exists(self.sim_matrix_path):
            print("Similarities matrix {} model already trained, loading from disk.".format(embedding))
            self.index = similarities.Similarity.load(self.sim_matrix_path)
        else:
            print("Creating similarities index.")
            Path(self.sim_matrix_temp_path).touch(exist_ok=True)
            self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics)
            self.index.save(self.sim_matrix_path)

    def search(self, query):

        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)

        if self.embedding == "bow":
            lsi_query = self.model[vec_query]
        elif self.embedding == "tfidf":
            lsi_query = self.model[self.corpus.tfidf_model[vec_query]]

        sims = self.index[lsi_query]
        sims = sorted(zip(self.corpus.doc_ids, sims), key=lambda item: -item[1])
        return sims
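A hedged usage sketch for this class, assuming a corpus object that exposes dictionary, get_corpus(), tfidf_model and doc_ids as the constructor and search() expect:

# Hypothetical ranking of documents for a free-text query.
lsi_index = LatentSemanticIndexing(corpus, embedding="tfidf", num_topics=500)
for doc_id, score in lsi_index.search("latent semantic indexing")[:10]:
    print(doc_id, score)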
Example #7
def train_models():
    models = dict()
    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except Exception:
            ln.info("Training mSDA...")

            prototype_ids = [id_ for id_, freq in sorted(dictionary.dfs.items(), key=lambda item: item[1], reverse=True)[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims, prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except Exception:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims, id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
Example #8
    def train_model(self, num_topics):
        corpus = self.get_corpus()
        model = LsiModel(corpus, num_topics=num_topics)
        tmp_fname = self.path + self.model_type + "_model"
        model.save(tmp_fname)

        return model
def train(n_topics=num_topics):

    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)
    # save the dictionary
    with open(os.path.join(folder_path_objects,
                           'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # save corpuses
    with open(os.path.join(folder_path_objects,
                           'corpus_binary'), 'wb') as f:
        pickle.dump(corpus_binary, f)

    # create models
    print(f'{time.ctime()} Start training LSA (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics
    )

    # save models to disk
    os.makedirs(folder_path_models, exist_ok=True)

    lsi_bin.save('./models/lsi_bin_filtered')
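To reuse the artifacts that train() writes to disk, a sketch along these lines should work (paths as in the snippet above):

import os
import pickle
from gensim.models import LsiModel

# Hypothetical reload of the saved dictionary and model.
with open(os.path.join(folder_path_objects, 'dictionary_lsi_bow'), 'rb') as f:
    dictionary = pickle.load(f)
lsi_bin = LsiModel.load('./models/lsi_bin_filtered')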
Example #10
def lsi(clean_docs, model_name, topics):

    from gensim import corpora
    # turn all data into a dictionary mapping of normalized words to their integer ids
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LSI model
    from gensim.models import LsiModel
    num_topics = topics  # find this number of topics in the data

    lsimodel = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')
    topics = lsimodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
Example #11
def create_lsi_model(project,
                     corpus,
                     id2word,
                     name,
                     use_level=True,
                     force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=project.num_topics,
        )

        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
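A usage sketch, assuming a project object carrying full_path, num_topics and level attributes as the function expects (none of these are shown here):

# Hypothetical call: build (or load) the model and remember where it lives.
model, model_fname = create_lsi_model(project, corpus, id2word, name='lsi-', force=False)
print('model stored at', model_fname)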
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    article_dict = {}
    for i, text in enumerate(wiki.get_texts(meta=True)):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #13
def build_lsi_model(dictionary, corpus, should_rebuild):
    lsi = None

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading LSI Model backup...')
            lsi_file = utils.get_file_path(cfg.LSI_BACKUP)
            print('LSI file = {}'.format(lsi_file))

            lsi = LsiModel.load(lsi_file)

        except Exception as exc:
            utils.print_exception_details('Building LSI Model', exc)

    else:
        print('Building LSI Model...')
        one_pass = cfg.NUM_PASSES <= 1  # use the one-pass algorithm only when a single pass is configured
        lsi = LsiModel(corpus,
                       id2word=dictionary,
                       num_topics=cfg.NUM_TOPICS,
                       onepass=one_pass)
        print('Done!')
        # Save Model Structures
        LSI_FILE = utils.get_file_path(cfg.LSI_BACKUP)
        lsi.save(LSI_FILE)

    return lsi
Example #14
def train_LSIModel(tokens, num_top):
    # reuters_text = open("test2.txt", "r")
    dct = corpora.Dictionary(tokens)
    document_matrix = [dct.doc2bow(article) for article in tokens]
    model = LsiModel(document_matrix, num_topics=num_top, id2word=dct)
    model.save("test2.LSIModel")
    return model
def save_lsi_model(corpus_tfidf, dictionary):
    # apply transformation to whole corpus
    print("lsi model")
    lsi = LsiModel(corpus_tfidf, id2word=dictionary,
                   num_topics=3000)  # initialize LSI transformation
    tmp_fname = get_tmpfile("lsi.model")
    print("saving tmp file")
    lsi.save(tmp_fname)
    return tmp_fname
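Round-tripping the temporary file returned above is straightforward; a sketch:

from gensim.models import LsiModel

# Hypothetical reload of the model saved by save_lsi_model().
fname = save_lsi_model(corpus_tfidf, dictionary)
lsi = LsiModel.load(fname)
print(lsi.num_topics)  # 3000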
Example #16
class LSI():
    @timed
    def init_lsi(self, **kwargs):
        # handle onepass=False
        model_sparse = self.sparse

        class __sparse():
            def __iter__(self):
                for a in model_sparse:
                    yield [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]

        self.lsi = LsiModel(__sparse(), **kwargs)
        self.lsi.save(self.path + 'lsi.pkl')

    def load_lsi(self):
        self.lsi = LsiModel.load(self.path + 'lsi.pkl')

    def load_dense(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).load()

    def sparse_to_dense(self, sparse):
        dense = self.lsi[sparse]
        dense = sparse2full(dense, self.lsi.num_topics)
        dense = array('f', dense)
        return dense

    @timed
    def init_dense(self, storage=None, workers=None):
        _workers = workers or self.params.get(
            'dense__workers') or self.params.get('workers', 1)
        _storage = storage or self.params.get(
            'dense__storage') or self.params.get('storage', 'disk')
        if _workers > 1:
            self._init_dense_mp(workers=_workers, storage=_storage)
        else:
            self._init_dense_sp(storage=_storage)

    def _init_dense_sp(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).new()
        for a in self.sparse:
            sparse = [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]
            dense = self.sparse_to_dense(sparse)
            self.dense.append(dense)
        self.dense.save()

    def _init_dense_mp(self, workers, storage):
        chunksize = self.params.get('dense__chunksize', 10)
        s = sorbet(self.path + 'dense').new()
        id_iter = range(len(self.meta))
        id_iter = tqdm(id_iter, 'dense', len(self.meta))
        with mp.Pool(workers, init_dense_worker, [
                self.path,
        ]) as pool:
            dense = pool.imap(dense_worker, id_iter, chunksize)
            for d in dense:
                s.append(d)
        self.dense = s.save()
Example #17
def train(text_corpus_file, dict_file):
    """Train an LSI model from a text corpus."""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)  # model_file is assumed to be defined at module level
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
Example #18
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)

    return lsiModel
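getLsiModel() only needs an object exposing a .vectors sparse matrix with documents in rows (which is what documents_columns=False means). A hedged stand-in built with scikit-learn's TfidfVectorizer, purely for illustration:

import os
from types import SimpleNamespace
from sklearn.feature_extraction.text import TfidfVectorizer

os.makedirs('.cache', exist_ok=True)  # the function saves into .cache/
texts = ["graph minors survey", "graph trees minors"]
vectors = TfidfVectorizer().fit_transform(texts)  # shape (n_docs, n_terms): documents in rows
lsiModel = getLsiModel(SimpleNamespace(vectors=vectors))  # hypothetical tfidfModel stand-in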
def train_LSI(corpus, name, num_topics=500):
    tic = time.perf_counter()

    LSI_model = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)  # 'dictionary' is assumed to be defined at module level

    toc = time.perf_counter()
    print(f"Trained LSI {name} in {toc - tic:0.4f} seconds")  # ~4min

    LSI_model.save(f'./LSI_{name}_model_{num_topics}.mm')

    return LSI_model
Example #21
    def train(self, dataset):
        corpus, dictionary = self._prepare(dataset)
        dictionary.save('../models.nosync/lsa/dict')

        print('starting LSA')
        temp = dictionary[0]  # accessing one id forces the dictionary to build id2token
        model = LsiModel(corpus=corpus,
                         id2word=dictionary.id2token,
                         num_topics=self.c.lsa_topics)
        path = '../models.nosync/lsa/model'
        model.save(path)
        return model, corpus
Example #22
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus) # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print "save dictionary and tfidf model"
    """    
Example #23
    def lsimodel(self, corpus_t=None, topic=200, save=False, savename=None):
        """
        :param corpus_t: the (possibly tf-idf-transformed) corpus to train on
        :param topic: number of topics
        :param save: whether to save the trained model
        :param savename: file name to save the model under
        :return: the trained LsiModel
        """
        print('using Lsimodel...')
        lsimodel = LsiModel(corpus=corpus_t, id2word=self.word_dict, num_topics=topic)
        if save:
            print('Saving LSI model to file: {}'.format(savename))
            lsimodel.save(savename)
        return lsimodel
Example #24
def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    dct = Dictionary(sentences)
    # Corpus as lists of dictionary ids, in memory.
    # Can be turned into a streamed iterable, as done with the others, if needed.
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)
    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
Example #25
def main(Tweet=None):
    qs = Tweet.objects.filter(is_strict__gte=13)
    tweets = np.array(qs.values_list('pk', 'text', 'user__screen_name', 'user__is_bot'))
    tweets = pd.DataFrame(np.array(tweets), columns='pk text user is_bot'.split())
    tweets = tweets.set_index('pk', drop=True)
    tweets['tokens'] = tweets.text.apply(casual_tokenize)

    vocab = Dictionary(tweets.tokens)
    tfidf = TfidfModel(dictionary=vocab, id2word=vocab)
    bows = pd.Series(vocab.doc2bow(toks) for toks in tweets.tokens)
    lsi = LsiModel(tfidf[bows], num_topics=80, id2word=vocab, extra_samples=100, power_iters=2)
    lsi.save('/home/hobs/src/hackor/twote/data/lsi{}x{}x{}.saved'.format(len(tweets), lsi.num_topics, lsi.num_terms))
    topics = lsi[tfidf[bows]]
    topics = pd.DataFrame([dict(d) for d in topics], index=tweets.index, columns=range(80))
def train(corpus, dictionary):
    print("Training model ...")
    print("Number of topics:", ARGS.num_topics)

    if ARGS.model_type == "LSI":
        print(corpus)
        print(ARGS.num_topics)
        model = LsiModel(corpus, id2word=dictionary, num_topics=ARGS.num_topics)
        model.save(ARGS.save_dir + "/models/"+ARGS.model_type+"_"+ARGS.corpus_type+".mm")

    elif ARGS.model_type == "LDA": 
        model = LdaModel(corpus, id2word=dictionary, num_topics=ARGS.num_topics)
        model.save(ARGS.save_dir + "/models/"+ARGS.model_type+"_"+ARGS.corpus_type+".mm")

    return model 
Example #28
def train_and_save_gensim_model(model_type_str,
                                corpus,
                                dct,
                                file_name='model_300.model',
                                num_topics=None):
    if model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus,
                         alpha='auto',
                         num_topics=num_topics,
                         id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    model.save(file_name)
    return model
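For example, a 300-topic LSI model could be trained and persisted like this (corpus and dct as prepared elsewhere; the file name is illustrative):

# Hypothetical call selecting the LSI branch.
model = train_and_save_gensim_model("lsi", corpus, dct,
                                    file_name="lsi_300.model", num_topics=300)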
def train_lsa(docs: Iterable, outputFolder: str):
    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    corpus = log_entropy_norm(corpus)
    print("Starting training...")
    lsa = LsiModel(corpus=corpus, id2word=id2word, num_topics=300)
    path = outputFolder + "/lsa.model"
    lsa.save(outputFolder + "/lsa.bin")
    matrix = np.transpose(lsa.get_topics())
    with open(path, "wt", encoding='utf-8') as f:
        f.write("{} {}\n".format(np.size(matrix, 0), np.size(matrix, 1)))
        for idx in range(np.size(matrix, 0)):
            f.write(id2word[idx] + " " + " ".join([str(x) for x in matrix[idx]]) + "\n")
    print("Model saved to ", path)
Example #30
    def train(self):
        print("Reading serializations...")
        sr = SerializationReader(self.series)
        documents, doc2idx, idx2doc = sr.read()

        print("Building dictionary...")
        dictionary = Dictionary(documents)
        corpus = [dictionary.doc2bow(doc) for doc in documents]

        print("Building model...")
        lsi = LsiModel(corpus, id2word=dictionary, num_topics=self.dimensions)

        print("Building index...")
        index = MatrixSimilarity(lsi[corpus])

        print("Saving...")
        dictionary.save(self.dictionary)
        lsi.save(self.lsi)
        index.save(self.index)
Example #32
def save_model(docs, file_path):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = Dictionary(docs)

    # dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # model = TfidfModel(corpus)  # fit model
    # corpus = model[corpus]

    CHUNKSIZE = 500  # unused below; leftover from an LDA configuration
    passes = 10      # unused below; leftover from an LDA configuration
    temp = dictionary[0]  # accessing one id forces the dictionary to build id2token

    NUM_TOPICS = 15
    model = LsiModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)

    model.save(file_path)
    def set_model(self, lang: str, data_version: int,
                  dictionary_version: float, model_version: str,
                  param_name: str, param_version: int, model_file_path: str,
                  language_processed_data: list):
        logging.info("---- Create LSI model")
        tf_idf = TfidfModel(self.essentials.corpus)
        tf_idf_corpus = tf_idf[self.essentials.corpus]
        model = LsiModel(tf_idf_corpus,
                         id2word=self.essentials.dictionary,
                         num_topics=self.number_of_topics)
        model.save(model_file_path)
        self.model = model
        logging.info("---- LSI model is created")
        metrics = self.get_model_evaluation_metrics(language_processed_data)
        parameters = self.get_model_parameters()
        self.write_model_evaluation_metrics(lang, data_version,
                                            dictionary_version, model_version,
                                            param_name, param_version, metrics,
                                            parameters)
        return
Example #35
def train_lsi(corpus, dictionary, num_topics, corpus_type):
    """
    Train the LSI model given the dataset for a given amount of topics.
    """
    #train model and save for later use
    model_filename = 'lsi_' + str(corpus_type) + '_num_topics=' + str(
        num_topics) + '.model'
    model_path = './tmp/' + model_filename

    if not os.path.exists(model_path):
        print(('Starting training {} lsi for num_topics = {}').format(
            corpus_type, num_topics))
        lsi = LsiModel(corpus=corpus,
                       id2word=dictionary,
                       num_topics=num_topics,
                       onepass=False)
        lsi.save(model_path)

    else:
        print(('{} LSI for num_topics = {} is already created, loading now...'
               ).format(corpus_type, num_topics))
        lsi = LsiModel.load(model_path)

    #construct BOW index for trained lsi model, save for later use
    index_filename = 'index_' + str(corpus_type) + '_num_topics=' + str(
        num_topics) + '.mm.index'
    index_path = './tmp/' + index_filename

    if not os.path.exists(index_path):
        print(('Starting construction {} index for num_topics = {}').format(
            corpus_type, num_topics))
        index = similarities.MatrixSimilarity(lsi[corpus])
        index.save(index_path)
    else:
        print((
            'index for {} corpus with num_topics = {} is already created, loading now...'
        ).format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity.load(index_path)

    return lsi, index
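Once train_lsi() has returned, the index can score new queries against the corpus; a hedged sketch:

# Hypothetical query: rank documents by LSI similarity to a bag of words.
lsi, index = train_lsi(corpus, dictionary, num_topics=200, corpus_type='bow')
query_bow = dictionary.doc2bow(['graph', 'minors'])
sims = sorted(enumerate(index[lsi[query_bow]]), key=lambda item: -item[1])
print(sims[:10])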
Example #37
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #38
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)

	# word2vec model
	class MyCorpus():
		def __iter__(self):
			for d in corpus.get_texts():
				yield [w for w in d if w in corpus.dictionary.token2id]
	if not os.path.exists(f_w2v):
		w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
		w2v.save_word2vec_format(f_w2v, binary=True)

	# LANGUAGE MODELS
	lm_cache = models.Cache(window=50)
	lm_lsa = models.LSA(f_lsa, f_dict, tfidf=f_tfidf, window=50)
	lm_w2v = models.Word2Vec(f_w2v, window=50)
logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids,
                        name='tweet') for i in tweetids],
             index=tweetids)


# In[29]:

lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2


# In[30]:

lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))


# In[16]:

lsi2.show_topics()


# In[23]:

# for topic in lsi.show_topics():
#     print(topic)

lsi.show_topic(0, 100)
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
Example #42
# topic_id = 0
# for topic in lsi_model.show_topics():
#    topic_id+=1
#    print "TOPIC (LSI) " + str(topic_id) + " : " + topic

# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
# corpus_lsi_2 = lsi_model_2[corpus]
print "Done creating models"

lsi_model_2.save("wiki_en_model.lsi")

# lsi_model_2 .print_topics(5)

"""
topic_id = 0
for topic in lsi_model_2.show_topics():
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    #group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id] > 0.5]
    group_topic = [doc for doc in corpus_lsi_2]
    print str(group_topic)
    topic_id+=1
"""


print "Docs Processed " + str(lsi_model_2.docs_processed)
Example #43
def load_model(wordid_txt_file, tfidf_txt_file, model_file):
    id2word = Dictionary.load_from_text(wordid_txt_file)
    mm = MmCorpus(tfidf_txt_file)
    lsi = LsiModel(corpus=mm, id2word=id2word, num_topics=400)
    lsi.save(model_file)
    return lsi
Example #44
logent_document = logent_transformation[[bow_document]]  # converts a single document to log entropy representation. document must be in the same vector space as corpus.

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
# MmCorpus cannot be constructed directly from an iterable; serialize the transformed
# stream to disk, then load it back. Will also take many hours with the Wikipedia corpus.
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus])
logent_corpus = MmCorpus("logent_corpus.mm")

lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)  # creates an LSI transformation model from the log entropy corpus representation. Takes several hours with the Wikipedia corpus.

lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary, num_topics=400)  # performs the same operation as above, but with implicit chaining

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")


### Similarities (the best part)
from gensim.similarities import Similarity

documents = ["A bear walked in the dark forest.",
             "Tall trees have many more leaves than short bushes.",
             "A starship may someday travel across vast reaches of space to other stars.",
             "Difference is the concept of how two or more entities are not the same."]
# A corpus can be anything, as long as iterating over it produces a representation of the corpus documents as vectors.
corpus = (dictionary.doc2bow(tokenize_func(document)) for document in documents)

index = Similarity(corpus=lsi_transformation[logent_transformation[corpus]], num_features=400, output_prefix="shard")

print "Index corpus:"
Example #45
    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(review) for review in abstract_vectors]
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)



# we will use Latent Semantic Indexing to try to categorize the abstracts

print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation with 5 topics
    lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # predefined number of topics
def print_topic(lsi, topicno, topn=7):
    """
        Return a single topic as a formatted string. See `show_topic()` for parameters.

        >>> lsimodel.print_topic(topicno, topn)
        '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'

        """
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])
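A hedged example call, assuming the companion show_topic(lsi, topicno, topn) helper this function relies on is defined alongside it:

# Hypothetical usage: format the strongest terms of topic 0.
print(print_topic(lsi, 0, topn=5))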