Example No. 1
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    lda.partial_fit(X)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
Example No. 2
def main():
    X = np.array([[0, 1, 0, 2, 2, 0], [1, 0, 1, 1, 3, 3]])

    olda = OnlineLDA(n_topics=2)
    olda.partial_fit(X)
    print(olda.lambda_)

    lda = LatentDirichletAllocation(n_topics=2, total_samples=2)
    lda.partial_fit(X)
    print(lda.components_)
Example No. 3
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = np.random.randint(4, size=(10, n_col))
    X_2 = np.random.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
Example No. 4
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    lda.partial_fit(X)
    assert_raises_regexp(ValueError, r"^The provided data has",
                         lda.partial_fit, X_2)
Example No. 5
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = np.random.randint(4, size=(10, n_col))
    X_2 = np.random.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has",
                         lda.partial_fit, X_2)
Example No. 6
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_components = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_components=n_components,
                                    random_state=rng)
    lda.partial_fit(X)
    with pytest.raises(ValueError, match=r"^The provided data has"):
        lda.partial_fit(X_2)
Example No. 7
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=-1, learning_offset=5.,
                                    total_samples=30, random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example No. 8
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in range(2):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
Example No. 9
def test_lda_partial_fit():
    # Test LDA online learning (`partial_fit` method)
    # (same as test_lda_batch)
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                    total_samples=100, random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example No. 10
def test_lda_partial_fit():
    # Test LDA online learning (`partial_fit` method)
    # (same as test_lda_batch)
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                    total_samples=100, random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example No. 11
def topic_modeling(gen, n_components):
    """
    Takes a cursor generator and the number of components for LDA, and returns the topics.
    """
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                       stop_words='english',
                                       token_pattern="\\b[a-z][a-z]+\\b",
                                       lowercase=True,
                                       max_df=0.6)

    count_vectorizer.fit(gen)
    lda = LatentDirichletAllocation(n_components)
    for _ in range(10):
        for file in gen:
            vec_file = count_vectorizer.transform([file])
            lda.partial_fit(vec_file)
    return display_topics(lda, count_vectorizer.get_feature_names(), 10)
Example No. 12
def iter_epochs(n_word_types, docs, n_topics, seed):
    D = len(docs)
    V = n_word_types
    docs = [list(Counter(doc).items()) for doc in docs]
    X = lil_matrix((D, V), dtype=np.int)
    for d, doc in enumerate(docs):
        for v, c in doc:
            X[d, v] = c
    X = X.tocsr()
    model = LatentDirichletAllocation(n_topics=n_topics,
                                      learning_method='online',
                                      random_state=seed)
    while True:
        start_time_s = time.time()
        model.partial_fit(X)
        processing_time_s = time.time() - start_time_s
        phikv = model.components_
        yield dict(topic_word_distribution=phikv,
                   processing_time_s=processing_time_s)
Example No. 13
def main():
    X = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 2, 1]])

    olda = OnlineLDA(n_topics=2, tau0=80)
    olda.partial_fit(X)
    print(olda.lambda_)

    lda = LatentDirichletAllocation(n_topics=2,
                                    total_samples=2,
                                    learning_offset=80,
                                    learning_decay=0.8,
                                    mean_change_tol=0.00001,
                                    max_iter=10000)
    lda.fit(X)
    print(lda.perplexity(X))

    lda = LatentDirichletAllocation(n_topics=2,
                                    total_samples=2,
                                    learning_offset=80,
                                    learning_decay=0.8,
                                    mean_change_tol=0.00001,
                                    max_iter=10000)
    lda.partial_fit(X)
    print(lda.perplexity(X))
Example No. 14
class ScikitLda(object):

    def __init__(self, corpus=None, lda=None, n_topics=10,
                 max_iter=5, learning_method='online', learning_offset=50.,
                 **kwargs):
        if lda is None:
            self.lda = LatentDirichletAllocation(
                n_topics=n_topics, max_iter=max_iter,
                learning_method=learning_method,
                learning_offset=learning_offset, **kwargs)
        else:
            self.lda = lda

        self._corpus = corpus
        self._weights = None

    def fit(self):
        self.lda.fit(self.corpus.sparse_matrix())

    def partial_fit(self, corpus):
        self.lda.partial_fit(corpus.sparse_matrix())
        self._weights = None

    @property
    def topics(self):
        return self.lda.components_

    @property
    def n_topics(self):
        return self.lda.n_topics

    @property
    def corpus(self):
        return self._corpus

    @property
    def weights(self):
        if self._weights is None:
            self._weights = self.partial_weights(self.corpus)
        return self._weights

    def partial_weights(self, corpus):
        weights = self.transform(corpus)
        return (weights.T / weights.sum(axis=1)).T

    def transform(self, corpus):
        return self.lda.transform(corpus.sparse_matrix())

    def topic_words(self, n_words=10):
        topicWords = []
        topicWeightedWords = []

        for topic_idx, topic in enumerate(self.topics):
            weightedWordIdx = topic.argsort()[::-1]
            wordsInTopic = [self.corpus.word(i)
                            for i in weightedWordIdx[:n_words]]

            weights = topic / topic.sum()
            topicWeights = [(weights[i], self.corpus.word(i))
                            for i in weightedWordIdx[:n_words]]

            topicWords.append(wordsInTopic)
            topicWeightedWords.append(topicWeights)

        return (topicWords, topicWeightedWords)

    def save(self, filename):
        joblib.dump(self.lda, filename)

    @classmethod
    def load(cls, filename, corpus=None):
        lda = joblib.load(filename)
        return cls(lda=lda, corpus=corpus)
Example No. 15
def main():
    from neomodel import config, db

    from model.graph import connection_url
    from model.graph.spotify.track import Track
    from model.graph.spotify.playlist import Playlist

    from sklearn.neural_network import MLPClassifier
    from sklearn.linear_model import Perceptron, SGDClassifier
    from sklearn.multioutput import MultiOutputClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.preprocessing import normalize
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.datasets import make_multilabel_classification

    from joblib import dump, load
    from tqdm import tqdm
    import numpy as np
    from math import log10
    import os

    config.DATABASE_URL = connection_url()

    db.set_connection(connection_url())

    stopval = len(Track.nodes)
    print(stopval)

    print('Playlists', len(Playlist.nodes))
    playlists = Playlist.get_all()
    num_playlists = len(playlists)
    playlists = {node.uri: ind for ind, node in enumerate(playlists)}

    def get_minibatches(stopval, count=0, interval=20):
        while count < stopval:
            to_analyze: List[Track] = Track.get_songs_in_playlists(
                interval, count)

            X = [a.get_song_features(as_list=True) for a in to_analyze]
            y = [[playlists[x.uri] for x in Track.get_playlists(a.spotify_id)]
                 for a in to_analyze]
            print(count, interval)
            yield np.array(list(
                map(lambda x: list(map(abs, x)),
                    X))), MultiLabelBinarizer().fit_transform(y)
            count += len(to_analyze)

    lda = LatentDirichletAllocation(n_components=num_playlists)
    startval = 0
    if len(os.listdir('trained_models')) > 0:
        # resume from the most recent checkpoint and parse its epoch number
        latest = max(os.listdir('trained_models'))
        lda = load(os.path.join('trained_models', latest))
        startval = int(latest.split('.')[0])

    interval = 20
    # get_minibatches(stopval, count=startval * interval)
    for i, val in enumerate(
            tqdm(get_minibatches(stopval, count=startval * interval))):
        i += startval + 1
        X, y = val
        lda.partial_fit(X)
        dump(
            lda,
            os.path.join('trained_models',
                         f'{str(i).zfill(int(log10(stopval) + 1))}.joblib'))
        os.remove(
            os.path.join(
                'trained_models',
                f'{str(i - 1).zfill(int(log10(stopval) + 1))}.joblib'))
Example No. 16
def fast_lda_topics(X,
                    n_components: int = 10,
                    batch_size=128,
                    max_iter=100,
                    doc_topic_prior=None,
                    topic_word_prior=None,
                    learning_decay=.7,
                    learning_offset=10.,
                    total_samples=1e6,
                    max_doc_update_iter=100,
                    n_jobs=2,
                    random_state=1) -> LatentDirichletAllocation:
    r""" Latent dirichlet allocation using online variational Bayes method.
  In each EM update, use mini-batch of training data to update the
  ``components_`` variable incrementally.
  The learning rate is controlled by the ``learning_decay`` and
  the ``learning_offset`` parameters.

  Arguments:
    n_components : int, optional (default=10)
        Number of topics.
    doc_topic_prior : float, optional (default=None)
        Prior of document topic distribution `theta`. If the value is None,
        defaults to `1 / n_components`.
    topic_word_prior : float, optional (default=None)
        Prior of topic word distribution `beta`. If the value is None, defaults
        to `1 / n_components`.
    learning_decay : float, optional (default=0.7)
        A parameter that controls the learning rate in the online learning
        method. The value should be set between (0.5, 1.0] to guarantee
        asymptotic convergence. When the value is 0.0 and batch_size is
        ``n_samples``, the update method is the same as batch learning.
        In the literature, this is called kappa.
    learning_offset : float, optional (default=10.)
        A (positive) parameter that downweights early iterations in online
        learning.  It should be greater than 1.0.
    max_iter : integer, optional (default=100)
        The maximum number of iterations.
    batch_size : int, optional (default=128)
        Number of documents to use in each EM iteration. Only used in online
        learning.
    total_samples : int, optional (default=1e6)
        Total number of documents. Only used in the :meth:`partial_fit` method.
    mean_change_tol : float, optional (default=1e-3)
        Stopping tolerance for updating document topic distribution in E-step.
    max_doc_update_iter : int (default=100)
        Max number of iterations for updating document topic distribution in
        the E-step.
    n_jobs : int or None, optional (default=None)
        The number of jobs to use in the E-step.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    random_state : int, RandomState instance, default=1
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.
  """
    lda = LatentDirichletAllocation(
        n_components=n_components,
        batch_size=batch_size,
        max_iter=max_iter,
        doc_topic_prior=doc_topic_prior,
        topic_word_prior=topic_word_prior,
        learning_method='online',
        learning_decay=learning_decay,
        learning_offset=learning_offset,
        total_samples=total_samples,
        max_doc_update_iter=max_doc_update_iter,
        n_jobs=n_jobs,
        verbose=False,
        random_state=random_state,
    )
    prog = tqdm(desc="Perp(None)", total=max_iter)
    if isinstance(X, (tf.Tensor, tf.SparseTensor)):
        X = X.numpy()
    if isinstance(X, (np.ndarray, sparse.spmatrix)):
        for it in range(max_iter):
            lda.partial_fit(X)
            prog.update(1)
    elif isinstance(X, DatasetV2):
        for it, x in enumerate(
                X.repeat(-1).shuffle(100) if hasattr(X, 'repeat') else X):
            if it >= max_iter:
                break
            if isinstance(x, (tuple, list)):
                x = x[0]
            lda.partial_fit(x.numpy())
            if it % 10 == 0:
                perp = lda.perplexity(x)
                prog.desc = f"Perp({perp:.2f})"
            prog.update(1)
    prog.close()
    return lda
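
The docstring above explains how the online update is driven by ``learning_decay`` and ``learning_offset`` and applied through ``partial_fit``. A minimal standalone sketch of that same pattern follows; the toy count matrix and parameter values are illustrative assumptions, not taken from the example above:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# toy document-term counts: 100 "documents" over a 50-word vocabulary
rng = np.random.RandomState(0)
X_counts = rng.randint(0, 5, size=(100, 50))

lda = LatentDirichletAllocation(n_components=5, learning_decay=0.7,
                                learning_offset=10., total_samples=100,
                                random_state=0)
for _ in range(20):              # repeated online mini-batch passes over the data
    lda.partial_fit(X_counts)

print(lda.components_.shape)     # (5, 50): unnormalized topic-word weights
print(lda.perplexity(X_counts))  # rough convergence check on the training counts

Each call to partial_fit performs one online EM update, so looping over the corpus (or over mini-batches of it) plays the role that max_iter plays in fast_lda_topics above.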
Example No. 17
class HarmonizeClassRasters:
    def __init__(self,
                 class_code2vocab,
                 class_errmat,
                 vocab_creation="union",
                 **kwargs):
        # class_code2vocab: list of dict, class_code_within_each_raster:common_class_code_or_name_across_rasters
        # class_errmat: list of pandas DataFrame.
        # vocab_creation: "union" or "combination", options to create vocabulary from the input class labels of two or more rasters.
        #
        # kwargs: keyword arguments for the LDA model,
        # LatentDirichletAllocation[http://scikit-learn.org/dev/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation]
        # if scikit-learn package; LDA[https://lda.readthedocs.io/en/stable/]
        # if lda package
        self._vocab_union = 1
        self._vocab_combination = 2

        self.class_code2vocab = [pd.Series(cv) for cv in class_code2vocab]
        errmat_in_vocab = []
        if class_errmat is None:
            errmat_in_vocab = [
                pd.DataFrame(np.eye(len(set(cv.values()))), set(cv.values()),
                             set(cv.values())) for cv in class_code2vocab
            ]
        else:
            for cv, old_em in zip(class_code2vocab, class_errmat):
                em = old_em.copy()
                em.index = pd.Series(cv).reindex(em.index).values
                em.columns = pd.Series(cv).reindex(em.columns).values
                errmat_in_vocab.append(em)
        self.class_errmat = errmat_in_vocab
        self.kwargs = kwargs

        # generate all the unique classes (words) as the vocabulary
        if vocab_creation == "union":
            self.vocab_creation = self._vocab_union
            self.vocab = list(
                set(
                    list(
                        itertools.chain(
                            *[c2v.values() for c2v in class_code2vocab]))))
            self._dw = pd.Series(np.zeros(len(self.vocab)),
                                 index=pd.Index(self.vocab))
            self._m2t_prob = None
        elif vocab_creation == "combination":
            self.vocab_creation = self._vocab_combination
            self.vocab = list(
                itertools.product(
                    *[set(c2v.values()) for c2v in class_code2vocab]))
            index = pd.MultiIndex.from_tuples(self.vocab)
            self._dw = pd.Series(np.zeros(len(self.vocab)), index=index)
            if class_errmat is None:
                m2t_prob = None  # pd.DataFrame(np.eye(len(self.vocab)), index=index, columns=index)
            else:
                m2t_prob = pd.DataFrame(np.zeros(
                    (len(self.vocab), len(self.vocab))),
                                        index=index,
                                        columns=index)
                em_col_comb = list(
                    itertools.product(
                        *[em.columns.values for em in errmat_in_vocab]))
                for idx in m2t_prob.index.values:
                    m2t_list = [
                        errmat_in_vocab[i].loc[val, :] /
                        errmat_in_vocab[i].loc[val, :].sum()
                        for i, val in enumerate(idx)
                    ]
                    m2t_list = list(zip(*itertools.product(*m2t_list)))
                    m2t = m2t_list[0]
                    for val in m2t_list[1:]:
                        m2t = np.multiply(m2t, val)
                    m2t_prob.loc[idx, em_col_comb] = m2t
            self._m2t_prob = m2t_prob
        else:
            raise RuntimeError("Unknown option for vocabulary creation")

        self.lda = LatentDirichletAllocation(**kwargs)

    def _translateArray(self, img, code2vocab):
        # img: 2D array
        # code2vocab: pandas series to translate class codes (indexes of the
        # series) to vocabulary codes (values of the series).
        out = img.copy()
        for idx, v in code2vocab.items():
            if idx != v:
                out[img == idx] = v
        return out

    def genDocWordFromArray(self, multiband_img, use_errmat=True, N_factor=1):
        self._dw[:] = 0
        img_list = []
        for ib in range(multiband_img.shape[2]):
            img = multiband_img[:, :, ib]
            img_list.append(
                self._translateArray(img, self.class_code2vocab[ib]))

        if self.vocab_creation == self._vocab_union:
            for ib, words in enumerate(img_list):
                uw, uc = np.unique(words, return_counts=True)
                uw_mask = np.ones_like(uw, dtype=np.bool)
                for v in set(uw) - set(self.vocab):
                    uw_mask = np.logical_and(uw_mask, uw != v)

                uw = uw[uw_mask]
                uc = uc[uw_mask]

                n_words = np.sum(uc)
                if use_errmat:
                    # Do adjustment of word counts according to error matrix
                    em = self.class_errmat[ib]
                    em_row = em.loc[uw, :].values
                    tmp = em_row / np.tile(
                        np.sum(em_row, axis=1)[:, np.newaxis],
                        (1, em_row.shape[1]))
                    # Calculate the proportion of vocabulary words in this image
                    # Later N_factor multiplication gives word counts that
                    # create a document of this designated number of words.
                    # This can be used to have all the documents of the same
                    # lengths/word counts in the LDA training.
                    uc = np.matmul(uc, tmp)
                    uw = em.columns
                # Calculate the proportion of vocabulary words in this image
                # Later N_factor multiplication gives word counts that create a
                # document of this designated number of words.  This can be
                # used to have all the documents of the same lengths/word
                # counts in the LDA training.
                self._dw.loc[uw] += uc / n_words
        elif self.vocab_creation == self._vocab_combination:
            uw, uc = np.unique(np.asarray(
                list(zip(*[img.flatten() for img in img_list]))),
                               axis=0,
                               return_counts=True)
            uw_mask = np.ones(uw.shape[0], dtype=np.bool)
            for v in set([tuple(val)
                          for val in uw.tolist()]) - set(self.vocab):
                uw_mask = np.logical_and(
                    uw_mask, np.all(uw != np.tile(v, (uw.shape[0], 1)),
                                    axis=1))
            uw = uw[uw_mask, :]
            uc = uc[uw_mask]
            n_words = np.sum(uc)
            uw = [tuple(val) for val in uw]
            if use_errmat and (self._m2t_prob is not None):
                uc = np.matmul(uc, self._m2t_prob.loc[uw, :])
                uw = self._m2t_prob.columns.values
            # Calculate the proportion of vocabulary words in this image
            # Later N_factor multiplication gives word counts that
            # create a document of this designated number of words.
            # This can be used to have all the documents of the same
            # lengths/word counts in the LDA training.
            self._dw.loc[uw] = uc / n_words
        else:
            raise RuntimeError("Unknown option for vocabulary creation.")
        return self._dw.values.copy() * N_factor

    def fitTopicModel(self, X, partial=True):
        if partial:
            self.lda.partial_fit(X)
        else:
            self.lda.fit(X)

    def getTopicWordDist(self):
        return self.lda.components_

    def estDocTopicDist(self, X):
        return self.lda.transform(X)

    def estHarmonized(self,
                      mb_img,
                      img_mask,
                      N_factor=1,
                      class_nodata=0,
                      prob_nodata=0):
        # img_mask: valid being 1 and invalid (not to be processed) being 0.
        win_ysize, win_xsize, nbands = mb_img.shape

        pixel_mask = img_mask.ravel()

        pixel_word = np.array(
            [mb_img[:, :, ib].ravel() for ib in range(nbands)]).T

        pixel_prob = np.zeros(
            (len(pixel_mask), self.lda.n_components)) + prob_nodata
        pixel_class = np.zeros(len(pixel_mask)) + class_nodata

        if np.sum(pixel_mask) > 0:
            pixel_prob[pixel_mask, :] = self.estDocTopicDist(
                np.array([
                    self.genDocWordFromArray(pw[np.newaxis, np.newaxis, :],
                                             use_errmat=True,
                                             N_factor=N_factor)
                    for pw in pixel_word[pixel_mask, :]
                ]))
            pixel_class[pixel_mask] = np.argmax(pixel_prob[pixel_mask, :],
                                                axis=1) + 1

        class_img = pixel_class.reshape(win_ysize, win_xsize)
        prob_img = np.dstack([
            pixel_prob[:, ib].reshape(win_ysize, win_xsize)
            for ib in range(nbands)
        ])

        return class_img, prob_img
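
The constructor comments above describe the expected inputs: one code-to-vocabulary mapping and one error matrix per raster, plus keyword arguments that are passed straight through to LatentDirichletAllocation. Below is a minimal sketch of what those arguments might look like, assuming the HarmonizeClassRasters class above (with its numpy/pandas/itertools imports) is in scope; the class codes, labels, and identity error matrices are illustrative assumptions:

import numpy as np
import pandas as pd

# per-raster mapping from raster-specific class codes to a shared vocabulary
class_code2vocab = [
    {1: 'forest', 2: 'water', 3: 'urban'},      # codes used by raster A
    {10: 'forest', 20: 'water', 30: 'urban'},   # codes used by raster B
]
# per-raster error matrices, indexed and columned by each raster's own class codes
class_errmat = [
    pd.DataFrame(np.eye(3), index=[1, 2, 3], columns=[1, 2, 3]),
    pd.DataFrame(np.eye(3), index=[10, 20, 30], columns=[10, 20, 30]),
]
harmonizer = HarmonizeClassRasters(class_code2vocab, class_errmat,
                                   vocab_creation="union",
                                   n_components=5, learning_method='online')
# documents built with genDocWordFromArray can then be passed to
# harmonizer.fitTopicModel(X, partial=True), which calls lda.partial_fit(X)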
Example No. 18
def LDA(review_data, df, n_features = 10000, length = 10, n_top_words = 25, max_df = 0.01, min_df = 0.00001, n_components = 30, 
		max_features = None, min_samples_split = None, max_depth = None, min_samples_leaf = None, myCsvRow = None):

	print "Start tf_vectorizer"
	tf_vectorizer = CountVectorizer(max_df = max_df, min_df=min_df,
									max_features=n_features,
									stop_words='english',
									token_pattern = r"(?u)\b[A-Za-z0-9]{3,}\b")
	tf = tf_vectorizer.fit_transform(review_data)
	tf = tf[np.array(tf.sum(1)).flatten() > length,:]
	tf_feature_names = tf_vectorizer.get_feature_names()

	test = tf[tf.shape[0]-100000:]
	tf = tf[:tf.shape[0]-100000]
	file1_name = 'TF_Vectorizer_' + 'Topic'+str(n_components) + '_Feature' + str(n_features)  + '_length' + str(length) + 'max_df'+str(max_df) + 'min_df' + str(min_df) + '.pkl'
	joblib.dump(tf_vectorizer, file1_name)	
	print "Finished tf_vectorizer"

	print "start LDA"
	lda = LatentDirichletAllocation(n_components=n_components,
									learning_method='online', verbose = 1, learning_decay=0.5, batch_size = 4096,
									learning_offset=64, total_samples = tf.shape[0],
									random_state=0, n_jobs=8)
	
	last_bound = 1000000
	for it in range(8):
		for i, ll in enumerate(chunks(range(tf.shape[0]), 100000)):
			lda.partial_fit(tf[ll])
		bound = lda.perplexity(test)
		print "preplexity:",bound
		if last_bound and last_bound - bound < 0.1:
			break
		last_bound = bound
	print_top_words(lda, tf_feature_names, n_top_words)


	file2_name = 'LDA_' + 'Topic'+str(n_components) + '_Feature' + str(n_features)  + '_length' + str(length) + 'max_df'+str(max_df) + 'min_df' + str(min_df) + '.pkl'
	joblib.dump(lda, file2_name)
	print "Finished LDA"

######################################################################### Machine Learning part ############################################################
	target_name = ['BugsCrashes','Experience','Hardware','Pricing']

	data = pd.concat([pd.DataFrame(lda.transform(tf_vectorizer.transform(df.Body.tolist()))),df.BugsCrashes,df.Experience, df.Hardware, df.Pricing], 1)

	X = lda.transform(tf_vectorizer.transform(df.Body.tolist()))
	y = df[['BugsCrashes','Experience', 'Hardware', 'Pricing']]
	y = np.array(y)

	full_rf_pred = np.empty((0,4))
	full_y_test = np.empty((0,4))
	k_fold = KFold(data.shape[0], n_folds=10, shuffle=True, random_state=40)
	for fold in k_fold:
		train_idx = fold[0] 
		test_idx = fold[1]
		X_train, y_train = X[train_idx,:], y[train_idx,:]
		X_test, y_test = X[test_idx, :], y[test_idx, :]

		rf = RandomForestClassifier(n_jobs = 8, random_state = 10, n_estimators = 300, max_features = max_features, min_samples_split = min_samples_split, 
									max_depth = max_depth, min_samples_leaf = min_samples_leaf).fit(X_train, y_train)
		rf_pred = rf.predict(X_test)

		full_rf_pred = np.append(full_rf_pred,rf_pred, axis = 0)
		full_y_test = np.append(full_y_test,y_test, axis = 0)
	print '############rf#############\n',classification_report(full_y_test, full_rf_pred, target_names = target_name, digits = 3) 

	with open('classification_report.csv', 'a') as csvfile:
		csvfile.write('\n')
		csvfile.write(myCsvRow)
		csvfile.write('\n')

	report = classification_report(full_y_test, full_rf_pred, target_names = target_name)
	classification_report_csv(report)
Example No. 19
corpus = open('E:\\dataset2.csv').read()
docs = corpus.split('\n')

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(docs)

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2,
                                max_iter=200,
                                learning_offset=4.0,
                                learning_method='online')

step = matrix_X.shape[0] / 10
step = int(step)

index = 0
for i in range(10):
    if i == 9:
        lda.partial_fit(matrix_X[index:])
    else:
        lda.partial_fit(matrix_X[index:index + step])
    index = index + step
    print('\niteration ', i)
    print(lda.components_)
Example No. 20
    print("Handling %s LDA" % file_name)
    Data = json.load(open(file_name))
    handle_doc_num = 0
    Tmp_Total_Metrix = []
    for Doc_obj in Data:
        Tmp_metrix = [0] * len(Vocab_cut)
        handle_doc_num += 1
        Doc_Content = Doc_obj['main_content'].split(" ")
        for term in Doc_Content:
            if (term in Vocab_to_index):
                Tmp_metrix[Vocab_to_index[term]] += 1

        Tmp_Total_Metrix.append(Tmp_metrix)

        if (handle_doc_num >= 1024):
            clf.partial_fit(Tmp_Total_Metrix)
            Tmp_Total_Metrix = []
            handle_doc_num = 0

data_dir.append('test.json')
for i in range(3):
    print("Predicting %s By LDA Model" % data_class[i])
    file_name = data_dir[i]
    Data = json.load(open(file_name))
    output_data = []
    for Doc_obj in Data:
        Tmp_metrix = [0] * len(Vocab_cut)
        Doc_Content = Doc_obj['main_content'].split(" ")
        for term in Doc_Content:
            if (term in Vocab_to_index):
                Tmp_metrix[Vocab_to_index[term]] += 1
Example No. 21
        test_scores = []  # size: (max_iter / valid_iter) * (n_splits)
        train_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)
        test_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)

        for i in range(int(max_iter / valid_iter)):
            train_s = []
            test_s = []
            train_p = []
            test_p = []

            print '\ntraining ', i * valid_iter + 1, '-th iteration'

            for train_index, test_index in splited_index:
                train_data, test_data = dataset[train_index], dataset[
                    test_index]
                lda_model.partial_fit(train_data)

                train_s.append(lda_model.score(train_data))
                test_s.append(lda_model.score(test_data))

                train_p.append(lda_model.perplexity(train_data))
                test_p.append(lda_model.perplexity(test_data))

            train_scores.append(train_s)
            test_scores.append(test_s)
            train_perplexities.append(train_p)
            test_perplexities.append(test_p)

            print "train_scores: ", train_scores[
                i], " test_scores: ", test_scores[
                    i], " train_perplexities: ", train_perplexities[
Example No. 22
        test_scores = []        # size: (max_iter / valid_iter) * (n_splits)
        train_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)
        test_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)


        for i in range(int(max_iter / valid_iter)):
            train_s = []
            test_s = []
            train_p = []
            test_p = []

            print '\ntraining ', i * valid_iter + 1, '-th iteration'

            for train_index, test_index in splited_index:
                train_data, test_data = dataset[train_index], dataset[test_index]
                lda_model.partial_fit(train_data)

                train_s.append(lda_model.score(train_data))
                test_s.append(lda_model.score(test_data))

                train_p.append(lda_model.perplexity(train_data))
                test_p.append(lda_model.perplexity(test_data))

            train_scores.append(train_s)
            test_scores.append(test_s)
            train_perplexities.append(train_p)
            test_perplexities.append(test_p)

            print "train_scores: ", train_scores[i], " test_scores: ", test_scores[i], " train_perplexities: ", train_perplexities[i], " test_perplexities: ", test_perplexities[i]

Example No. 23
N_CLASSES = np.unique(y_train)

scores_train = []
scores_test = []

# EPOCH
epoch = 0
while epoch < N_EPOCHS:
    print('epoch: ', epoch)
    # SHUFFLING
    random_perm = np.random.permutation(X_train.shape[0])
    mini_batch_index = 0
    while True:
        # MINI-BATCH
        indices = random_perm[mini_batch_index:mini_batch_index + N_BATCH]
        clf.partial_fit(X_train[indices], y_train[indices], classes=N_CLASSES)
        mini_batch_index += N_BATCH

        if mini_batch_index >= N_TRAIN_SAMPLES:
            break

    # SCORE TRAIN
    scores_train.append(clf.score(X_train, y_train))

    # SCORE TEST
    scores_test.append(clf.score(X_test, y_test))

    epoch += 1
plt.figure()
plt.plot(scores_train, color='b', alpha=0.8, label='Train')
plt.plot(scores_test, color='r', alpha=0.8, label='Test')