# The root of the problem may be that the dictionary is biased towards high expression
# (because the loss function in SMAF wasn't told to specifically care about lowly
# expressed genes). So we might change the loss function to be sensitive to the
# average loss for each gene, possibly normalized somehow so that we care the same
# about each gene individually. This may also help with the poor performance for
# each gene across tissues. In addition, we might weight Phi by a prior on (inverse)
# expression. Note that applying this weight without making the dictionary sensitive
# to low expression did not improve the results.
import numpy as np
from scipy.stats import entropy

# smaf, recover_system_knownBasis, and compare_results are assumed to be defined
# elsewhere in this codebase; xa (genes x samples), measurements, sparsity, SNR,
# Results, ds, MAX_BASIS, ERROR_THRESH, MIN_BASIS, and MIN_FIT likewise come from
# the surrounding script.

# Dictionary size: 1.5x the number of samples, capped at 150.
k = min(int(xa.shape[1] * 1.5), 150)
UW = (np.random.random((xa.shape[0], k)),
      np.random.random((k, xa.shape[1])))
ua, va = smaf(xa, k, 5, 0.0005, maxItr=10, use_chol=True, activity_lower=0.,
              module_lower=xa.shape[0] / 10, UW=UW, donorm=True, mode=1, mink=3.)
x2a, phi, y, w, d, psi = recover_system_knownBasis(
    xa, measurements, sparsity, Psi=ua, snr=SNR, use_ridge=False,
    f="smaf_train_measurement")
Results['SMAF (training)'] = compare_results(xa, x2a)
np.save("SMAF_gene_actual_train_100", xa)
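# --- Sketch (assumption, not from the original script) -------------------------
# One minimal way to act on the ideas in the comments above: rescale genes before
# SMAF so the factorization loss weights every gene equally, and scale Phi's
# columns by an inverse-expression prior. Every name introduced here (row_norm,
# xa_scaled, ua_rescaled, w_gene, phi_weighted) is hypothetical; whether these
# changes actually help would need to be tested.
row_norm = np.linalg.norm(xa, axis=1, keepdims=True) + 1e-6
xa_scaled = xa / row_norm             # each gene now contributes equally to the loss
ua_s, va_s = smaf(xa_scaled, k, 5, 0.0005, maxItr=10, use_chol=True,
                  activity_lower=0., module_lower=xa.shape[0] / 10,
                  UW=UW, donorm=True, mode=1, mink=3.)
ua_rescaled = ua_s * row_norm         # map the dictionary back to the raw scale

# Inverse-expression prior on Phi (phi is measurements x genes, as returned above):
w_gene = 1.0 / (xa.mean(1) + 1e-6)    # up-weight lowly expressed genes
w_gene *= len(w_gene) / w_gene.sum()  # normalize so the weights average to 1
phi_weighted = phi * w_gene[np.newaxis, :]
# --------------------------------------------------------------------------------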
# Effective module size (perplexity of each dictionary column of ua) and module
# usage (perplexity of each sample's weight vector, i.e. each column of va).
module_size_nmf = np.array([np.exp(entropy(abs(x))) for x in ua.T])
usage_nmf = np.array([np.exp(entropy(abs(x))) for x in va.T])
ua_nmf = ua
w_nmf = va

# Re-fit SMAF with a larger dictionary, relaxing the sparsity penalty lda2 until
# the fit is acceptable or lda2 bottoms out.
k = min(int(xa.shape[1] * 1.5), ds * 4)
k = min(k, MAX_BASIS)
UW = (np.random.random((xa.shape[0], k)),
      np.random.random((k, xa.shape[1])))
lda2 = ERROR_THRESH
while True:
    U, W = smaf(xa, k, 10, lda2, maxItr=10, use_chol=True, activity_lower=4.,
                module_lower=400, UW=UW, donorm=True, mode=1, mink=5)
    nz = np.nonzero(U.sum(0))[0]  # keep only modules that are actually used
    U = U[:, nz]
    W = W[nz]
    xh_smaf = U.dot(W)
    # Fraction of variance explained by the factorization.
    fit_smaf = 1 - np.linalg.norm(xa - xh_smaf)**2 / np.linalg.norm(xa)**2
    if (len(nz) > MIN_BASIS) and (fit_smaf > MIN_FIT):
        break
    elif lda2 < ERROR_THRESH / 16:
        break
    else:
        # The original excerpt ends at this `else:`; halving lda2 is an assumed
        # continuation, consistent with the lda2 < ERROR_THRESH / 16 stop check.
        lda2 /= 2.
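# --- Sketch (assumption, not from the original script) -------------------------
# Why exp(entropy(.)) above measures an "effective count": for a vector with m
# equal nonzero entries, scipy.stats.entropy normalizes it to a uniform
# distribution over m outcomes, so exp(entropy) returns exactly m.
demo = np.zeros(100)
demo[:10] = 1.0
print(np.exp(entropy(demo)))  # -> 10.0: ten equally weighted entries
# --------------------------------------------------------------------------------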