Example #1
    def load(lsi_path=None, id2word_path=None, index_path=None):
        """
        If specified, attempts to load gensim LsiModel from `lsi_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        lsi_path: str
            File-path designating where self.model should be saved.
        id2word_path: str
            File-path designating where self.dictionary should be saved.
        """
        if lsi_path is not None:
            from gensim.models import LsiModel
            if not os.path.exists(lsi_path):
                raise IOError(
                    'The provided file path to the LsiModel was not found. '
                    'Please ensure that the argument is the correct path.')
            return LsiModel.load(lsi_path)
        if id2word_path is not None:
            from gensim.corpora.dictionary import Dictionary
            if not os.path.exists(id2word_path):
                raise IOError(
                    'The provided file path to the Dictionary was not found. '
                    'Please ensure that the argument is the correct path.')
            return Dictionary.load(id2word_path)
        if index_path is not None:
            from gensim.similarities import MatrixSimilarity
            if not os.path.exists(index_path):
                raise IOError(
                    'The provided file path to the MatrixSimilarity index was not found. '
                    'Please ensure that the argument is the correct path.')
            return MatrixSimilarity.load(index_path)
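A minimal usage sketch for the loader above; the file names are hypothetical, and since the arguments are checked in order with an early return, each call should pass exactly one path:

model = load(lsi_path='saved/model.lsi')            # -> LsiModel
dictionary = load(id2word_path='saved/model.dict')  # -> Dictionary
index = load(index_path='saved/model.index')        # -> MatrixSimilarity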
Example #2
def create_lsi_model(project,
                     corpus,
                     id2word,
                     name,
                     use_level=True,
                     force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=project.num_topics,
        )

        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
Example #3
def train_models():
    models = dict()
    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except:
            ln.info("Training mSDA...")

            prototype_ids = [id_ for id_, freq in sorted(dictionary.dfs.items(), key=lambda (k, v): v, reverse=True)[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims, prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims, id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
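Note that `lambda (k, v): v` above relies on Python 2 tuple-parameter unpacking, which is a syntax error on Python 3; a sketch of the equivalent prototype selection for Python 3:

# Python 3 equivalent of the prototype_ids line above
prototype_ids = [id_ for id_, freq in sorted(dictionary.dfs.items(),
                                             key=lambda kv: kv[1],
                                             reverse=True)[:dims]]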
Example #4
    def __init__(self, dictionary_path, corpus_path, tfidf_path,
                 corpus_tfidf_path, tfidf_index_sim_path, lsi_path,
                 lsi_index_path, stopwords_path, tweet_corpus_path):

        self.dictionary = gensim.corpora.Dictionary.load(dictionary_path)
        self.corpus = MmCorpusMeta(corpus_path,
                                   id2word=self.dictionary,
                                   metadata=True)
        self.tweet_corpus = MmCorpusMeta(tweet_corpus_path,
                                         id2word=self.dictionary,
                                         metadata=True)
        self.tfidf = TfidfModel.load(tfidf_path)
        self.corpus_tfidf = gensim.utils.unpickle(corpus_tfidf_path)
        self.tfidf_index = gensim.similarities.MatrixSimilarity.load(
            tfidf_index_sim_path)
        self.lsi = LsiModel.load(lsi_path)
        self.lsi_index = gensim.similarities.MatrixSimilarity.load(
            lsi_index_path)
        with open(stopwords_path) as f:
            self.stopwords = json.load(f)

        self.tfidf_tweets = self.tfidf[self.tweet_corpus]
        self.lsi_tweets = self.lsi[self.tfidf_tweets]
        self.sim_tweets = gensim.similarities.MatrixSimilarity(self.lsi_tweets)
        print("loaded")
Example #5
def load_and_cluster():
    corpus_tfidf, dictionary, titles = train_tfidf_model()
    # fname = save_lsi_model(corpus_tfidf,dictionary)
    print("fname")
    fname = "/var/folders/ft/jlv83lxd58zb3v6bjqtzlr0c0000gn/T/lsi.model"
    print(fname)
    lsi = LsiModel.load(fname)
    print("lsi corpus")
    corpus_lsi = lsi[
        corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    X = [[row[1] for row in document] for document in corpus_lsi]
    print("Kmeans")
    print([row for row in X if len(row) != 3000])
    np_array = np.array([np.array(row) for row in X if len(row) == 3000])
    print(type(np_array))
    print(type(np_array[0]))
    print(np.unique([len(row) for row in np_array]))
    kmeans_clustering(np_array, 1000)
    numeric_labels = []
    with open("pickle_model.pkl", 'rb') as file:
        kmeans_model = pickle.load(file)
        cluster_centers = kmeans_model.cluster_centers_
        numeric_labels = kmeans_model.labels_
    #, model.predict(X), model.labels_
    print(np.unique(numeric_labels))
    for i in range(1000):
        if i % 100 == 0:
            print(i)
            for j, label in enumerate(numeric_labels):
                if label == i:
                    print(titles[j])
Example #6
    def train(self, data):
        """
        Fit LSA model to the data, set document topic vectors and calculate distances.

        :param data: Data to fit model on
        """

        if self.word_dict is None:
            print(
                "Dictionary must be assigned to the model before training; this call does nothing."
            )
            return
        if self.model is None:
            self.model = LsiModel(num_topics=self.vector_length,
                                  id2word=self.word_dict)

        self.name = '%s_%strain' % (self.name, data.name)
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))

        try:
            self.model = LsiModel.load(str(self.path / '.model'))
        except:
            self.path.mkdir(parents=True, exist_ok=True)

            print("Training model...", end='')
            time.sleep(0.1)

            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.add_documents(datastream)

            self.model.save(str(self.path / '.model'))
Example #7
    def __init__(self, docs, num_topics=500, chunksize=20000, no_below=50, no_above=0.5,
                 tfidf=True, model_path="./lsi_data"):
        # Set training parameters.
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.no_below = no_below
        self.no_above = no_above
        self.tfidf = tfidf
        self.model_path = model_path

        if not os.path.exists(model_path):
            os.makedirs(model_path)
        index_path = './data.index'
        if os.path.exists(index_path):
            assert os.path.exists("./corpus_bow") and os.path.exists(os.path.join("./corpus_tfidf")),\
                "Corpus file missing! Please rebuild index."
            with open(index_path, "rb") as reader:
                index = pkl.load(reader)
                self.index = index["index"]
                self.index2docid = index["index2docid"]
            with open("./corpus_bow", "rb") as reader:
                self.corpus_bow = pkl.load(reader)
            with open("./corpus_tfidf", "rb") as reader:
                self.corpus_tfidf = pkl.load(reader)
            if os.path.exists(os.path.join(self.model_path, "lsi.model")):
                self.model = LsiModel.load(os.path.join(self.model_path, "lsi.model"))
            else:
                self.model = self.train()
        else:
            self.rebuild_index(docs, index_path)
Example #8
def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    if lsa_training:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        # generate LSA model
        lsi_model = LsiModel(doc_term_matrix,
                             num_topics=number_of_topics,
                             id2word=dictionary)  # train model
        #coherence_value = CoherenceModel(model=lsi_model, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()
        #print("Coherence value : ",coherence_value)
        print('Saving lsi_model...')
        lsi_model.save(lsi_model_path)
        print('lsi_model saved!')
        corpus_lsi = lsi_model[doc_term_matrix]
        with open(corpus_lsi_path, 'wb') as handle:
            pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Corpus_lsi saved.')

    else:

        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model Loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]

    return lsi_model, corpus_lsi, dictionary
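A hedged usage sketch; `doc_clean` and the module-level save paths are assumed to be defined elsewhere in the project:

# first run: train, save the model, and pickle the transformed corpus
lsi_model, corpus_lsi, dictionary = create_gensim_lsa_model(doc_clean, 10, lsa_training=True)
# later runs: rebuild the corpus and reload the cached model
lsi_model, corpus_lsi, dictionary = create_gensim_lsa_model(doc_clean, 10, lsa_training=False)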
Example #9
def lsi(dataframe, num_topics=300):
    """Returns an LSI model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'

    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        lsi_model = LsiModel(tfidf_corpus,
                             id2word=dictionary,
                             num_topics=num_topics)
        lsi_model.save(filename)
    else:
        lsi_model = LsiModel.load(filename)

    return lsi_model
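A usage sketch for the cached loader above, assuming `df` is a DataFrame of documents as described in the docstring:

model = lsi(df)  # trains and caches on the first call, reloads the cache afterwards
for topic_id, terms in model.show_topics(num_topics=5, formatted=False):
    print(topic_id, terms[:3])  # three strongest (word, weight) pairs per topic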
Example #10
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
Example #12
def train_tsne(training_size=2000,
               metric='cosine',
               n_components=3,
               perplexity=100,
               angle=.12):
    # adjust this downward to see if it affects accuracy
    np = pd.np

    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    gc.collect()  # reclaim RAM released above

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)

    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    # >>> sum(~mask)
    # 99
    # >>> tweets.loc[mask.argmin()]
    # isbot                 0.17
    # strict                  13
    # user      b'CrisParanoid:'
    # text         b'#sad again'
    # Name: 571, dtype: object

    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]

    test_size = 1.0 - training_size if training_size < 1 else float(
        len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = X.loc[Xindex], X.loc[Xindex_test], y.loc[
        yindex], y.loc[yindex_test]

    # labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    tsne = TSNE(metric='precomputed',
                n_components=n_components,
                angle=angle,
                perplexity=perplexity)
    tsne = tsne.fit(positive_distances(X.values, metric=metric))

    return tsne, X, Xtest, y, ytest
Example #13
    def get_model(self, num_topics):
        tmp_fname = self.path + self.model_type + "_model"

        if os.path.exists(tmp_fname):
            return LsiModel.load(tmp_fname)

        else:
            print("Training model.")
            return self.train_model(num_topics)
Example #14
    def fit(self, raw_documents, y=None):
        self.analyzer_func = self.build_analyzer()

        self.model = LsiModel.load(self.model_fn)

        if os.path.exists(self.model_fn + '.tfidf'):
            self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')

        return self
Example #15
def load_corpus():
    dictionary = corpora.Dictionary.load(os.path.join(HERE, "sogou.dict"))
    tfidf_model = tfidfmodel.TfidfModel.load(os.path.join(HERE, "sogou.model"))
    lsi_model = LsiModel.load(os.path.join(HERE, "sogou.lsi"))
    try:
        sg_class = joblib.load(os.path.join(HERE, "sgdc_clf.pkl"))
    except:
        sg_class = None
    return dictionary, tfidf_model, lsi_model, sg_class
Example #17
 def __init__(self, model, namespace2idx):
     if isinstance(namespace2idx, str):
         idx2namespace, namespace2idx = utils.read_vocab(namespace2idx)
     if isinstance(model, str):
         from gensim.models import LsiModel
         model = LsiModel.load(model)
     self.vocab = pd.Series(namespace2idx).sort_values()
     self.weights = pd.DataFrame(model.projection.u, index=self.vocab.index)
     super(LSI, self).__init__(model)
Example #18
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)

    return lsiModel
Example #19
def _load_model(type, fname='../../model/'):
    try:
        if type == 'lsi':
            return LsiModel.load(fname)
        elif type == 'lda':
            return LdaModel.load(fname)
        elif type == 'mallet':
            return LdaMallet.load(fname)
    except:
        return None
Example #20
def load_topic_model(vec_method, model_path, index_path, dict_path,
                     corpus_path):
    if vec_method == 'LDA':
        model = LdaModel.load(model_path)
    elif vec_method == 'LSI':
        model = LsiModel.load(model_path)
    index = similarities.MatrixSimilarity.load(index_path)
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    # vec_lda = lda[corpus]
    return model, index, dictionary, corpus
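A hedged sketch of querying the objects returned above (all paths and the query text are placeholders):

model, index, dictionary, corpus = load_topic_model(
    'LSI', 'model.lsi', 'model.index', 'model.dict', 'corpus.mm')
query_bow = dictionary.doc2bow('some query text'.split())
sims = index[model[query_bow]]  # cosine similarity against every indexed document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])  # five closest documents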
Example #21
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #23
def export_model(model_file, out_file):
    """Saves the model. The output will be utf-8 encoded."""
    #    model = model_mapping[model_type].load(model_file)
    model = LsiModel.load(model_file)
    with FileWriter(out_file, "w").open() as out:
        out.write(u"{0}\t{1}\n".format(model.numTerms, model.numTopics))
        for term in xrange(model.numTerms):
            word = model.id2word.id2token[term].decode("utf-8")
            while len(word) > 0 and not word[-1].isalnum():
                word = word[0:-1]
            out.write(u"{0}\n".format(word))
            out.write(
                u"{0}\n".format(u"\t".join(str(f) for f in numpy.asarray(model.projection.u.T[:, term]).flatten()))
            )
Example #24
def _load_model(model_type, fname):
    logger.info(f'{model_type} type of {fname} is loading..')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/mallet_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} type of {fname} could not be loaded.',
                       exc_info=ex)
        return None
Example #25
def export_model(model_file, out_file):
    """Saves the model. The output will be utf-8 encoded."""
    #    model = model_mapping[model_type].load(model_file)
    model = LsiModel.load(model_file)
    with FileWriter(out_file, 'w').open() as out:
        out.write(u"{0}\t{1}\n".format(model.numTerms, model.numTopics))
        for term in xrange(model.numTerms):
            word = model.id2word.id2token[term].decode("utf-8")
            while len(word) > 0 and not word[-1].isalnum():
                word = word[0:-1]
            out.write(u"{0}\n".format(word))
            out.write(u"{0}\n".format(u"\t".join(
                str(f)
                for f in numpy.asarray(model.projection.u.T[:,
                                                            term]).flatten())))
Example #26
def get_lsi_model(doc_term_matrix, id2word, fname):
    if fname is not None:
        try:
            return LsiModel.load(fname)
        except:
            pass

    lsi_model = LsiModel(corpus=doc_term_matrix,
                         id2word=id2word,
                         num_topics=params['num_topics'],
                         chunksize=params['chunksize'])

    _save_model(lsi_model, fname)

    return lsi_model
Example #27
 def _load_model(self, param_id, nb_topics):
     """
     Load an LDA model.
     """
     if self.lsi:
         model_dir = join(LSI_PATH, self.version, self.corpus_type)
         model_file = f'{self.dataset}_LSImodel_{nb_topics}'
         model_path = join(model_dir, model_file)
         model = LsiModel.load(model_path)
     else:
         model_dir = join(self.directory, self.corpus_type, param_id)
         model_file = f'{self.dataset}_LDAmodel_{param_id}_{nb_topics}_{self.epochs}'
         model_path = join(model_dir, model_file)
         model = LdaModel.load(model_path)
     self.logg(f'Loading model from {model_path}')
     return model
Example #28
    def __init__(self, series, dictionary, lsi, index, sim_opt, rank_opt):
        super().__init__()

        self.norm = LookupNormalization()

        self.dictionary: Dictionary = Dictionary.load(dictionary)
        self.lsi: LsiModel = LsiModel.load(lsi)
        self.index: MatrixSimilarity = MatrixSimilarity.load(index)

        sr = SerializationReader(series)
        self.documents, self.doc2idx, self.idx2doc = sr.read()

        sim_class = globals()[self.SIM_OPTS[sim_opt]["cls"]]
        self.sim_strategy: SimilarityStrategy = sim_class(self.SIM_OPTS[sim_opt]["constant"])

        rank_class = globals()[self.RANK_OPTS[rank_opt]]
        self.rank_strategy: RankingStrategy = rank_class()
Example #29
def stacking(text, results, infos):
    integrated = copy.deepcopy(results[0])

    d2v = TfidfModel.load('./modules/models/tfidf.model')
    dct = Dictionary.load('./modules/models/dic.model')
    lsi = LsiModel.load('./modules/models/lsi.model')

    # generate feature vector
    text = u' '.join(unicode(text))
    word_list = text.split()
    corpus = dct.doc2bow(word_list)
    sent_feature = [item[1] for item in lsi[d2v[corpus]]]

    x = list()
    x += sent_feature
    for info in infos:
        for result in results:
            x.append(len(unicode(result['%s' % info]))/10.0)
            try:
                pos = result['%s_p' % info][0]
            except IndexError:
                pos = 0
            try:
                x.append(pos/float(len(unicode(text))))
            except ZeroDivisionError:
                x.append(0)
            x.append(result['%s_confidence' % info])

    # predict every type of info.
    for info in infos:
        probs = list()
        for i in range(len(results)):
            model = joblib.load('./modules/models/integrator_%s%s.model' % (info, i))
            # print model.predict_proba(x)
            probs.append(model.predict_proba([x])[0][1])

        y = probs.index(max(probs))
        integrated[info] = results[y][info]
        integrated['%s_p' % info] = results[y]['%s_p' % info]

        conf = 1.0
        for result in results:
            conf *= (1 - result['%s_confidence' % info])
        integrated['%s_confidence' % info] = 1 - conf

    return integrated
Example #30
def train_lda(training_size=2000, metric='cosine'):
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]
    # labels3 = labels3[mask]

    test_size = 1.0 - training_size if training_size < 1 else float(
        len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = X.loc[Xindex], X.loc[Xindex_test], y.loc[
        yindex], y.loc[yindex_test]
    labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    lda = LDA('lsqr', 'auto', n_components=3)
    print(cross_val_score(lda, Xtest, labels_test, cv=7))

    lda = LDA('lsqr', 'auto', n_components=3)
    lda = lda.fit(X.values, labels.values)
    y_lda = lda.predict(Xtest)
    print(mean_squared_error(y_lda, ytest))

    df_test = pd.DataFrame(lda.predict(Xtest),
                           index=Xtest.index,
                           columns=['predict'])
    df_test['truth'] = labels_test
    return lda, df_test
Example #31
 def __init__(self, name, model, components=None):
     if name == "lsa":
         self.vsm = LsiModel.load(model)
         self.vocab = self.vsm.id2word.token2id
         self.vector_size = self.vsm.num_topics
     elif name == "w2v":
         self.vsm = keyedvectors.KeyedVectors.load_word2vec_format(model, binary=True, unicode_errors='ignore')
         self.vocab = self.vsm.vocab
         self.vector_size = self.vsm.syn0.shape[1]
         # https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/keyedvectors.py
     elif name == "pickle":
         vsm_obj = pickle.load(open(model, "rb"))
         self.vsm = vsm_obj["vsm"]
         self.vocab = vsm_obj["map"]
         self.vector_size = self.vsm.shape[1]
     try:
         self.components = load(components)
     except (IOError, AttributeError):
         self.components = 1
Example #32
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=project.num_topics,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
Example #33
    def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000):

        self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics)
        self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics)
        self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics)
        self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics)

        self.embedding = embedding
        self.corpus = corpus
        self.num_topics = num_topics

        if os.path.exists(self.lsi_model_path):

            print("LSI {} model already trained, loading from disk.".format(embedding))
            self.model = LsiModel.load(self.lsi_model_path)

        else:

            # Make a index to word dictionary.
            temp = corpus.dictionary[0]  # This is only to "load" the dictionary.
            id2word = corpus.dictionary.id2token

            print("Training LSI model.")
            self.model = LsiModel(
                corpus=list(corpus.get_corpus()),
                id2word=id2word,
                chunksize=chunksize,
                num_topics=num_topics
            )
            print("Saving LSI model.")
            self.model.save(self.lsi_model_path)

        self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path)

        if os.path.exists(self.sim_matrix_path):
            print("Similarities matrix {} model already trained, loading from disk.".format(embedding))
            self.index = similarities.Similarity.load(self.sim_matrix_path)
        else:
            print("Creating similarities index.")
            Path(self.sim_matrix_temp_path).touch(exist_ok=True)
            self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics)
            self.index.save(self.sim_matrix_path)
Example #34
 def load(cls, save_dir='./'):
     """
     Load a SimSearch object and it's underlying KeySearch from the 
     specified directory. Returns both objects.
     """
     
     # First create and load the underlying KeySearch.
     ksearch = KeySearch.load(save_dir)
     
     # Create a SimSearch object.
     ssearch = SimSearch(ksearch)
     
     # Load the LSI index.
     ssearch.index = similarities.MatrixSimilarity.load(save_dir + 'index.mm')
     
     # Load the LSI model.
     ssearch.lsi = LsiModel.load(save_dir + 'lsi.model')
     
     return (ksearch, ssearch)
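A brief usage sketch for the loader above; './models/' is a placeholder, and the trailing slash matters because `save_dir` is concatenated directly onto the file names:

ksearch, ssearch = SimSearch.load(save_dir='./models/')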
     
Example #35
def train_models():
    models = dict()
    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except:
            ln.info("Training mSDA...")

            prototype_ids = [
                id_ for id_, freq in sorted(dictionary.dfs.items(),
                                            key=lambda (k, v): v,
                                            reverse=True)[:dims]
            ]
            msda = mSDA(0.5,
                        5,
                        len(dictionary),
                        dims,
                        prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(),
                           num_topics=dims,
                           id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
Example #36
def train_lsi(corpus, dictionary, num_topics, corpus_type):
    """
    Train the LSI model given the dataset for a given amount of topics.
    """
    #train model and save for later use
    model_filename = 'lsi_' + str(corpus_type) + '_num_topics=' + str(
        num_topics) + '.model'
    model_path = './tmp/' + model_filename

    if not os.path.exists(model_path):
        print(('Starting training {} lsi for num_topics = {}').format(
            corpus_type, num_topics))
        lsi = LsiModel(corpus=corpus,
                       id2word=dictionary,
                       num_topics=num_topics,
                       onepass=False)
        lsi.save(model_path)

    else:
        print(('{} Lsi for num_topics = {} is already created, loading now...'
               ).format(corpus_type, num_topics))
        lsi = LsiModel.load(model_path)

    #construct BOW index for trained lsi model, save for later use
    index_filename = 'index_' + str(corpus_type) + '_num_topics=' + str(
        num_topics) + '.mm.index'
    index_path = './tmp/' + index_filename

    if not os.path.exists(index_path):
        print(('Starting construction {} index for num_topics = {}').format(
            corpus_type, num_topics))
        index = similarities.MatrixSimilarity(lsi[corpus])
        index.save(index_path)
    else:
        print((
            'index for {} corpus with num_topics = {} is already created, loading now...'
        ).format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity.load(index_path)

    return lsi, index
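A hedged usage sketch; `corpus`, `dictionary`, and the query tokens are placeholders:

lsi, index = train_lsi(corpus, dictionary, num_topics=100, corpus_type='bow')
query_vec = lsi[dictionary.doc2bow(['sample', 'query'])]  # fold the query into LSI space
sims = index[query_vec]  # cosine similarities against every indexed document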
Example #37
def load(model_file):
    """load the lsi model into memory"""
    lsi = LsiModel.load(model_file)
    return lsi
Example #38
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 800)
pd.set_option('precision', 2)
get_ipython().magic(u'precision 4')
get_ipython().magic(u'pprint')


# In[3]:

from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR


# In[6]:

lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi100'))
lsi2 = LsiModel.load(os.path.join(DATA_PATH, 'lsi2'))


# In[7]:

with gzip.open(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'), 'rb') as f:
    topics = pd.DataFrame.from_csv(f, encoding='utf8')
topics = topics.fillna(0)


# In[8]:

dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
Example #39
from gensim.matutils import cossim
from gensim.models import LsiModel

logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model")
parser.add_argument("-d", "--data", help="path to training.tsv")
args = parser.parse_args()

# Load model
# Note - model contains dictionary that intentionally omits stopwords
model = LsiModel.load(args.model, mmap='r')

# Load 'training' data
training_data = open(args.data)
training_data.readline()  # advance past header line

correct = 0
total = 0

for line in training_data:
    elements = line.split("\t")
    question_id = elements.pop(0)
    correct_answer = elements.pop(1)

    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split()) for element in elements]
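    # The listing stops after the bag-of-words step. A hedged sketch of how the
    # loop might continue, using the cossim imported above to pick the answer
    # closest to the question (the training.tsv column layout is an assumption):
    lsi_vectors = [model[vec] for vec in doc_vectors]  # fold into LSI space
    scores = [cossim(lsi_vectors[0], answer) for answer in lsi_vectors[1:]]
    guess = 'ABCD'[scores.index(max(scores))]  # doc_vectors[0] is the question
    total += 1
    if guess == correct_answer:
        correct += 1

print('accuracy: {:.3f}'.format(correct / float(total)))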
Example #40
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)



#  use Latent Semantic Indexing to try to categorize the abstracts

print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation with 5 topics
    lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # predefined number of topics
def print_topic(lsi, topicno, topn=7):
    """
        Return a single topic as a formatted string. See `show_topic()` for parameters.

        >>> lsimodel.print_topic(topicno, topn)
        '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'

        """
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])


def show_topic(lsi, topicno, topn=7):
    """
tfidf[bows[0]]


# In[19]:

dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]])


# Notice how "you" didn't get as much weight as "enjoy"  
# Let's look at some other tweets  

# In[9]:

from gensim.models import LsiModel
lsi = LsiModel.load('../../data/lsi100')
len(lsi.id2word)


# This is starting to look a lot like a set of vectors that we could use as features  
# But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?

# In[ ]:

len(vocab)


# 100k dimensions isn't a good idea
# Even for a massively parallel deep learning project this would be big
# Like the cat/dog picture classification on 256x256 images
# What about PCA (Principal Component Analysis) as is used on images?
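LSA fills exactly that PCA-like role: a hedged sketch of folding the ~100k-dimensional bag-of-words vectors down to the model's 100 topic dimensions (`lsi`, `tfidf`, and `bows` are assumed from the cells above):

from gensim.matutils import corpus2dense
# each document becomes a dense row of 100 latent features
topic_vectors = corpus2dense(lsi[tfidf[bows]], num_terms=lsi.num_topics).T
topic_vectors.shape  # (n_documents, 100)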