Example #1
def test_nmf_underflow():
    # Regression test for an underflow issue in _beta_divergence
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 10, 2, 2
    X = np.abs(rng.randn(n_samples, n_features)) * 10
    W = np.abs(rng.randn(n_samples, n_components)) * 10
    H = np.abs(rng.randn(n_components, n_features))

    X[0, 0] = 0
    ref = nmf._beta_divergence(X, W, H, beta=1.0)
    X[0, 0] = 1e-323
    res = nmf._beta_divergence(X, W, H, beta=1.0)
    assert_almost_equal(res, ref)
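The injected value 1e-323 is a denormal double; if the ratio X / (W @ H) underflows to zero before the log is taken, the generalized KL term x * log(x / wh) becomes -inf. A minimal sketch of that failure mode, assuming IEEE-754 doubles (the value wh = 7.5 is an arbitrary stand-in for the reconstruction at that entry, not taken from the test):

import numpy as np

x = np.float64(1e-323)    # denormal entry, as injected into X above
wh = np.float64(7.5)      # arbitrary stand-in for the reconstructed value (W @ H)[0, 0]
ratio = x / wh            # underflows to exactly 0.0
print(np.log(ratio))      # -inf (with a runtime warning)
print(x * np.log(ratio))  # -inf would propagate into the beta=1 divergence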
Example #2
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert previous_loss > loss
                previous_loss = loss
Example #3
def test_beta_divergence():
    # Compare _beta_divergence with the reference _beta_divergence_dense
    n_samples = 20
    n_features = 10
    n_components = 5
    beta_losses = [0., 0.5, 1., 1.5, 2.]

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.clip(X, 0, None, out=X)
    X_csr = sp.csr_matrix(X)
    W, H = nmf._initialize_nmf(X, n_components, init='random', random_state=42)

    for beta in beta_losses:
        ref = _beta_divergence_dense(X, W, H, beta)
        loss = nmf._beta_divergence(X, W, H, beta)
        loss_csr = nmf._beta_divergence(X_csr, W, H, beta)

        assert_almost_equal(ref, loss, decimal=7)
        assert_almost_equal(ref, loss_csr, decimal=7)
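The dense reference `_beta_divergence_dense` that this test compares against is not shown on this page. A naive sketch of such a reference, following the standard beta-divergence definition (the name `beta_divergence_dense_ref` and the handling of zero entries of X are assumptions, not necessarily the exact helper used by the test):

import numpy as np

def beta_divergence_dense_ref(X, W, H, beta):
    """Naive dense beta-divergence between X and np.dot(W, H)."""
    WH = np.dot(W, H)
    if beta == 2:                      # Frobenius / squared Euclidean norm
        return 0.5 * np.sum((X - WH) ** 2)

    # only the non-zero entries of X contribute to the X-dependent terms
    X_nz = X[X != 0]
    WH_nz = WH[X != 0]

    if beta == 1:                      # generalized Kullback-Leibler
        return np.sum(X_nz * np.log(X_nz / WH_nz)) - X.sum() + WH.sum()
    if beta == 0:                      # Itakura-Saito
        div = X_nz / WH_nz
        return np.sum(div) - np.sum(np.log(div)) - X.size
    # generic beta
    res = np.sum(X_nz ** beta)
    res += (beta - 1) * np.sum(WH ** beta)
    res -= beta * np.sum(X_nz * WH_nz ** (beta - 1))
    return res / (beta * (beta - 1))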
Example #4
def test_nmf_decreasing(solver):
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.0

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X,
                                 n_components,
                                 init="random",
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5):
        if solver != "mu" and beta_loss != 2:
            # not implemented
            continue
        W, H = W0.copy(), H0.copy()
        previous_loss = None
        for _ in range(30):
            # one more iteration starting from the previous results
            W, H, _ = non_negative_factorization(
                X,
                W,
                H,
                beta_loss=beta_loss,
                init="custom",
                n_components=n_components,
                max_iter=1,
                alpha_W=alpha,
                solver=solver,
                tol=tol,
                l1_ratio=l1_ratio,
                verbose=0,
                random_state=0,
                update_H=True,
            )

            loss = (
                nmf._beta_divergence(X, W, H, beta_loss)
                + alpha * l1_ratio * n_features * W.sum()
                + alpha * l1_ratio * n_samples * H.sum()
                + alpha * (1 - l1_ratio) * n_features * (W**2).sum()
                + alpha * (1 - l1_ratio) * n_samples * (H**2).sum()
            )
            if previous_loss is not None:
                assert previous_loss > loss
            previous_loss = loss
Example #5
    def calc_rec_error(self, df, date_range):
        """
        Calculate reconstruction error. For the data of one trading day, take previous day's NMF
        model, apply transform method to the data, and calculate reconstruction error
        Parameters:
            df: Pandas DataFrame
                Data for a particular date range for the output of read_raw_data() method in modules.tweet_data
            date_range: DateTimeIndex
                DateTimeIndex of dates which will serve as range for fitting the data

        Returns: list, list
                List of reconstruction errors for the fitted models
                List of reconstruction errors for the transformed data
        """

        model_err = []
        new_err = []

        for i in range(len(date_range) - 1):
            str_date = str(date_range[i + 1].date())
            prev_str_date = str(date_range[i].date())
            print("Working on : ", str_date, end="\r")

            # Take portion of df in the range of a trading day
            sub_df = df[date_range[i]:(date_range[i + 1] -
                                       dt.timedelta(seconds=1))].tweet
            # Tokenize with Spacy NLP pipe. Disable tagger, parser and ner for faster calculation
            sub_df = [
                self.twitter_tokenizer(text)
                for text in nlp.pipe(sub_df,
                                     disable=["tagger", "parser", "ner"])
            ]
            # Use previous day's tfidf model to transform data to tfidf format used to fit NMF model
            tfidf_vecs = self.tfidf_dict[prev_str_date].transform(sub_df)
            # Calculate reconstruction error using method from
            # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/decomposition/_nmf.py
            new_rec_err = _beta_divergence(
                tfidf_vecs,
                self.nmf_dict[prev_str_date].transform(tfidf_vecs),
                self.nmf_dict[prev_str_date].components_,
                'frobenius',
                square_root=True)
            # Reconstruction error from original model
            rec_err = self.nmf_dict[str_date].reconstruction_err_

            model_err.append(rec_err)
            new_err.append(new_rec_err)

        print("\nFinished")

        return model_err, new_err
Example #6
def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
              n_components, random_state):
    W = W0.copy()
    H = H0.copy()

    clf = clf_type(**clf_params)
    st = time()
    W = clf.fit_transform(X, W=W, H=H)
    end = time()
    H = clf.components_

    this_loss = _beta_divergence(X, W, H, 2.0, True)
    duration = end - st
    return this_loss, duration
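A hypothetical call of bench_one, assuming the surrounding benchmark script imports time and _beta_divergence as used in the function body; the data setup and parameter values below are illustrative only:

import numpy as np
from sklearn.decomposition import NMF, _nmf as nmf

rng = np.random.RandomState(0)
X = np.abs(rng.randn(40, 30))
n_components = 5
W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=0)

# init="custom" lets fit_transform accept the precomputed W and H
loss, duration = bench_one(
    "nmf-cd", X, W0, H0, X.shape, NMF,
    dict(n_components=n_components, solver="cd", init="custom",
         tol=1e-4, max_iter=200),
    init="custom", n_components=n_components, random_state=0)
print(loss, duration)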
Example #7
"""
==============================
Beta-divergence loss functions
==============================

A plot that compares the various Beta-divergence loss functions supported by
the Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition._nmf import _beta_divergence

print(__doc__)

x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)

colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
    for i, xi in enumerate(x):
        y[i] = _beta_divergence(1, xi, 1, beta)
    name = "beta = %1.1f" % beta
    plt.plot(x, y, label=name, color=colors[j])

plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])
plt.show()
Example #8
def k_fold(run_id, k_folds):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K
    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    if stat.method == "LD":
        if stat.max_iter == 200:
            stat.max_iter = 10
        if stat.max_iter > 100:
            stat.max_iter = 90

    n_samples = stat.max_iter

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=snowball_stemmer(),
                                       stop_words=stoplist)

    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=snowball_stemmer(),
                                       stop_words=stoplist)

    abstracts, docsizes, ids = proc_docs(docs, stoplist, stat.fulltext)

    doc_ids = ids
    random.shuffle(doc_ids)

    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer

    for k in range(k_folds):
        train_set = [i for i, x in enumerate(doc_ids) if i % k_folds != k]
        test_set = [i for i, x in enumerate(doc_ids) if i % k_folds == k]

        X_train = tfidf[train_set, ]
        X_test = tfidf[test_set, ]

        if stat.method == "NM":
            model = NMF(n_components=K,
                        random_state=1,
                        alpha=alpha,
                        l1_ratio=.1,
                        verbose=False,
                        init='nndsvd',
                        max_iter=n_samples).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(X_test,
                                         w_test,
                                         model.components_,
                                         'frobenius',
                                         square_root=True)

        else:
            # fit on the training fold, then evaluate on the held-out fold
            model = LDA(n_components=K,
                        doc_topic_prior=stat.alpha,
                        max_iter=stat.max_iter,
                        n_jobs=6).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(X_test,
                                         w_test,
                                         model.components_,
                                         'frobenius',
                                         square_root=True)
        kf, created = KFold.objects.get_or_create(model=stat, K=k)
        kf.error = rec_error
        kf.save()

    return
Example #9
def contrastive_NMF(v, w_init, h_init, h_tilde, delta=0, mu=0, beta=0, n_iter=100, nr_src=2):
    """

    Parameters
    ----------
    v: [array of shape (F, N)] magnitude spectrogram of the mixture
    w_init: [array of shape (F, K)] initialization of the dictionary w
    h_init: [array of shape (K, N)] initialization of the activations h
    h_tilde: [array of shape (K1, N)] activations of the source to enhance 
    delta: [float > 0] weight of the contrastive term
    mu: [float > 0] weight of the l1 regularizer on h
    beta: [float > 0] weight of the l1 regularizer on w
    n_iter: [int > 0] number of NMF iterations
    nr_src: [int > 0] number of sources in the mixture


    Returns
    -------
    the dictionary w and the corresponding activations h resulting from the 
    factorization of v and a list containing the total cost at each iteration.
    """

    flr = 1e-9
    cost = []

    # initial values
    x = v.copy()
    w = w_init.copy()
    h = h_init.copy()

    # avoid too small values
    x[x <= flr] = flr
    w[w <= flr] = flr
    h[h <= flr] = flr

    # normalize h_tilde
    hn_tilde = np.sqrt(np.sum(h_tilde ** 2, axis=1))
    h_tilde = h_tilde / hn_tilde[:, None]

    # normalize h and rescale w
    hn = np.sqrt(np.sum(h ** 2, axis=1))
    h = h / hn[:, None]
    w = w * hn[None, :]

    # NMF iterations
    for i in range(n_iter):

        # update H
        WH = np.maximum(w @ h, flr)
        h, contrast = update_h(w, h, x, h_tilde, WH, delta, mu, flr, nr_src)
        h[h <= flr] = flr
        
        # normalize h and rescale w
        hn = np.sqrt(np.sum(h ** 2, axis=1))
        h = h / hn[:, None]
        w = w * hn[None, :]

        # update W
        WH = np.maximum(w @ h, flr)
        w = update_w(w, h, x, WH, beta, flr)
        w[w <= 0] = flr

        # keep track of the cost
        cost.append(
            _beta_divergence(x, w, h, 'kullback-leibler', square_root=True)
            - delta * contrast
            + mu * np.linalg.norm(h)
            + beta * np.linalg.norm(w)
        )

    return w, h, cost
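A hypothetical usage sketch with random arrays standing in for a real magnitude spectrogram, assuming update_h and update_w are defined in the same module as contrastive_NMF above; the shapes and weights are illustrative only:

import numpy as np

rng = np.random.RandomState(0)
F, N, K, K1 = 64, 200, 8, 4
V = np.abs(rng.randn(F, N))            # stand-in magnitude spectrogram
W0 = np.abs(rng.randn(F, K))           # dictionary initialization
H0 = np.abs(rng.randn(K, N))           # activation initialization
H_tilde = np.abs(rng.randn(K1, N))     # activations of the source to enhance

W, H, cost = contrastive_NMF(V, W0, H0, H_tilde,
                             delta=0.1, mu=0.01, beta=0.01, n_iter=50)
print(cost[-1])                        # total cost after the last iteration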