def unitTests():
    """
    Regression test for ZIFA and block ZIFA: fit both on seeded simulated
    data and assert that the last row of the projection and selected fitted
    parameters still equal the recorded reference values.
    """

    def check_shared_params(fitted):
        # ZIFA and block ZIFA are expected to agree on these fitted parameters.
        assert np.allclose(fitted['A'][0, :], [0.66884415, -0.17173555])
        assert np.allclose(fitted['decay_coef'], 0.10458794970222711)
        assert np.allclose(fitted['sigmas'][0], 0.30219903)

    random.seed(35)
    np.random.seed(32)

    # First configuration.
    n, d, k = 200, 20, 2
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = ZIFA.fitModel(Y, k)
    last_row = Zhat[-1, :]
    assert np.allclose(last_row, [1.50067515, 0.04742477])
    check_shared_params(params)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    last_row = Zhat[-1, :]
    # this is slightly different (though highly correlated) because ZIFA runs
    # one extra half-step of EM
    assert np.allclose(last_row, [1.49712162, 0.05823952])
    check_shared_params(params)

    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    last_row = Zhat[-1, :]
    assert np.allclose(last_row, [9.84455438e-01, 4.50924335e-02])

    # Second configuration: different n, d and k.
    n, d, k = 50, 60, 3
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    last_row = Zhat[-1, :]
    assert np.allclose(last_row, [-1.69609638, -0.5475882, 0.08008015])

    # A fresh draw from the generator (RNG state carries on) for plain ZIFA.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert np.allclose(Zhat[-1, :], [-0.63075905, -0.77361427, -0.11544281])

    print('Tests passed!')
def train_ZIFA(input_data, feature_names, sample_names, outfile, use_block=False):
    """
    Fit a 2-dimensional ZIFA (or block ZIFA) model and write the results to CSV.

    Parameters
    ----------
    input_data : sequence indexed by view; each entry is a sequence of
        per-group arrays. Groups are stacked row-wise (samples) and views are
        then joined column-wise (features).
    feature_names : sequence of per-view arrays of feature names; concatenated
        to label the rows of the per-feature outputs.
    sample_names : sequence of arrays of sample names; concatenated to label
        the rows of the latent output.
    outfile : str
        Prefix for the output paths; "_ZIFA_Z.csv", "_ZIFA_A.csv",
        "_ZIFA_mus.csv" and "_ZIFA_sigmas.csv" are appended to it.
    use_block : bool
        When True, fit with block_ZIFA (p0_thresh=0.95) instead of ZIFA.

    Returns
    -------
    None. The function's output is the four CSV files.
    """
    from ZIFA import ZIFA

    p0_thresh = 0.95

    # Stack groups (rows) within each view, then join views side by side.
    # The original iterated an undefined name `data` here; the argument is
    # `input_data`.
    X = np.hstack([np.vstack(view) for view in input_data])

    # Drop samples for which every feature is NaN.
    keep_sample = (~np.isnan(X)).sum(axis=1) > 0
    sample_names = np.concatenate(sample_names)[keep_sample]
    X = X[keep_sample, :]

    all_features = np.concatenate(feature_names)

    if not use_block:
        Z, model_params = ZIFA.fitModel(X, K=2)
        kept_features = all_features
    else:
        from ZIFA import block_ZIFA
        Z, model_params = block_ZIFA.fitModel(X, K=2, p0_thresh=p0_thresh)
        # Keep only the names of features whose fraction of zeros is within
        # the threshold. The mask must index the concatenated name vector
        # (one entry per column of X), not the per-view list.
        # NOTE(review): assumes block_ZIFA drops exactly these columns and
        # uses the same zero test -- confirm against the library.
        kept_features = all_features[(X == 0).mean(axis=0) <= p0_thresh]

    pd.DataFrame(Z, index=sample_names).to_csv(outfile + "_ZIFA_Z.csv")
    per_feature_outputs = (
        ('A', "_ZIFA_A.csv"),
        ('mus', "_ZIFA_mus.csv"),
        ('sigmas', "_ZIFA_sigmas.csv"),
    )
    for key, suffix in per_feature_outputs:
        pd.DataFrame(model_params[key], index=kept_features).to_csv(outfile + suffix)
def main():
    """
    Command-line entry point: read a CSV file, fit ZIFA (or block ZIFA when
    -b/--block is given) on the transposed table, and save the latent
    coordinates as comma-separated text.
    """
    parser = ArgumentParser(description="Fit a ZIFA model on the data.")
    parser.add_argument(
        '-b', '--block',
        action='store_true',
        default=False,
        help="Whether the block algorithm should be used.")
    parser.add_argument(
        '-d', '--dimensions',
        type=int,
        default=2,
        help="The number of dimensions [2].")
    parser.add_argument('input_file', type=str, help="The input CSV file.")
    parser.add_argument('output_file', type=str, help="The output CSV file.")
    args = parser.parse_args()

    table = read_csv(args.input_file)
    # The file is expected to carry an unnamed leading column; it is dropped
    # before the values are used.
    del table['Unnamed: 0']

    # Rows and columns are swapped before fitting.
    Y = np.array(table).T

    fitter = block_ZIFA if args.block else ZIFA
    Z, model_params = fitter.fitModel(Y, args.dimensions)

    np.savetxt(args.output_file, Z, delimiter=',')
def testAlgorithm():
    """
    Simulate clustered data, fit ZIFA and FactorAnalysis to it, and show three
    side-by-side scatter plots: the true latent positions, the ZIFA estimate
    and the FactorAnalysis estimate, coloured by cluster id.
    """
    random.seed(30)
    np.random.seed(32)

    n, d, k = 200, 20, 2
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(set(ids))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    def scatter_by_cluster(coords):
        # One scatter call per cluster so each gets its own colour.
        for label in cluster_ids:
            members = ids == label
            scatter(coords[members, 0], coords[members, 1],
                    color=colors[label - 1], s=4)

    figure(figsize=[15, 5])

    subplot(131)
    scatter_by_cluster(Z)
    title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
    xlim([-4, 4])
    ylim([-4, 4])

    subplot(132)
    scatter_by_cluster(Zhat)
    xlim([-4, 4])
    ylim([-4, 4])
    title('ZIFA Estimated Latent Positions')

    subplot(133)
    scatter_by_cluster(factor_analysis_Zhat)
    xlim([-4, 4])
    ylim([-4, 4])
    title('Factor Analysis Estimated Latent Positions')

    show()
def testAlgorithm():
    """
    Compare ZIMM, KMeans and AgglomerativeClustering on simulated data.

    Simulates clustered data, clusters it with the three methods, prints the
    fraction of misclassified samples for each, and draws a 3x2 grid of
    scatter plots (true latent positions, then each labelling shown on the
    ZIFA and FactorAnalysis embeddings). The figure is saved to
    'example_output.png' and then shown.

    Written so it runs on both Python 2 and Python 3: print is called in
    function form with a single argument and range replaces xrange.
    """
    random.seed(30)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, true_ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    zimm_ids, params = ZIMM.fitModel(Y, n_clusters)
    kmeans_ids = KMeans(n_clusters).fit_predict(Y)
    hc_ids = AgglomerativeClustering(n_clusters).fit_predict(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    factor_analysis_Zhat = FactorAnalysis(n_components=k).fit_transform(Y)

    # calc_error returns a (labels, errors) pair; the returned labels replace
    # the raw ones for all plotting below.
    zimm_ids, zimm_errors = calc_error(true_ids, zimm_ids)
    kmeans_ids, kmeans_errors = calc_error(true_ids, kmeans_ids)
    hc_ids, hc_errors = calc_error(true_ids, hc_ids)

    print('Fraction misclassified by ZIMM: %f' % np.mean(zimm_errors))
    print('Fraction misclassified by Kmeans: %f' % np.mean(kmeans_errors))
    print('Fraction misclassified by AgglomerativeClustering: %f' % np.mean(hc_errors))

    # (subplot position, coordinates, labels used for colouring, title)
    panels = [
        (321, Z, true_ids,
         'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()),
        (322, Zhat, zimm_ids,
         'ZIMM labels,\nZIFA Estimated Latent Positions'),
        (323, Zhat, kmeans_ids,
         'K-means, with ZIFA'),
        (324, Zhat, hc_ids,
         'Agglomerative Clustering, with ZIFA'),
        (325, factor_analysis_Zhat, kmeans_ids,
         'K-means,\nwith Classic Factor Analysis'),
        (326, factor_analysis_Zhat, hc_ids,
         'Agglomerative Clustering,\nwith Classic Factor Analysis'),
    ]

    figure(figsize=[10, 13])
    for position, coords, labels, heading in panels:
        subplot(position)
        for cluster in range(n_clusters):
            members = labels == cluster
            scatter(coords[members, 0], coords[members, 1],
                    color=colors[cluster - 1], s=12)
        xlim([-4, 4])
        ylim([-4, 4])
        title(heading)

    tight_layout()
    savefig('example_output.png')
    show()
genes_of_interest.append(df_trans.columns.values[i]) genes = genes_of_interest subset_df = df_clean[df_clean.index.isin(genes)] subset_df.to_csv(os.path.join(os.path.dirname(sys.argv[1]), "_DFresult.txt"), sep="\t") variance = subset_df.var(axis=0) #variance in columns if dim_red_method == 'ZIFA': f = lambda x: np.log(1 + x) logDF = subset_df.applymap(f) # DF_final.applymap(f) transposed_ZIFA = logDF.transpose() Z_trans, MP_trans = ZIFA.fitModel(transposed_ZIFA.as_matrix(), 2) X = [] Y = [] for i in Z_trans: X.append(i[0]) Y.append(i[1]) df1 = pd.DataFrame({ 'tSNEx': X, 'tSNEy': Y, 'variance': variance, 'classif': classification_vector.as_matrix() }) if dim_red_method == 'TSNE': pca = PCA(n_components=15)
Run ZIFA algorithm on input file, print the result; this is intended to test the method. Implementation of the result was done in ipython notebook. ZIFA_test1.py [input_file] ''' import pandas as pd import numpy as np import sys from ZIFA import ZIFA #from ZIFA import block_ZIFA print('script started') input_file = sys.argv[1] df = pd.DataFrame.from_csv((input_file),sep="\t") f = lambda x: np.log(1+x) df1 = df.applymap(f) print(df1) print('completed read in DF') #Z, model_params = block_ZIFA.fitModel(df1.as_matrix(), 2) Z, model_params = ZIFA.fitModel(df1.as_matrix(), 2) print('ZIFA finished') print(Z) #print(Z[:0]) #print(Z[:1])
def unitTests():
    """
    Regression test for ZIFA and block ZIFA: fit both on seeded simulated
    data and assert, within an absolute tolerance, that the projections and
    fitted parameters equal the recorded reference values and that the input
    matrix is left unmodified by every fit.
    """
    print(
        "\n\n\n****Running unit tests!\nIMPORTANT: These unit tests pass with:\n\
    Python version 2.7.10 (your version: %s)\n\
    numpy 1.13.1 (your version: %s)\n\
    scipy 0.18.1 (your version: %s)\n\
    sklearn 0.16.1 (your version: %s)" %
        (platform.python_version(), np.__version__, scipy.__version__, sklearn.__version__))
    print(
        "Different versions of Python or those packages may yield slightly different results and fail to pass the asserts unless you increase the absolute_tolerance parameter, set below."
    )
    print(
        "If your configuration yields significantly different results, please contact [email protected].\n\n"
    )

    absolute_tolerance = 1e-8

    def close(actual, expected):
        return np.allclose(actual, expected, atol=absolute_tolerance)

    def close_up_to_sign(actual, expected):
        # for Z and A, we compare the absolute values of the parameters
        # because some package versions appear to flip the sign (which is
        # fine and will not affect results)
        return close(np.abs(actual), np.abs(expected))

    random.seed(35)
    np.random.seed(32)

    # First configuration.
    n, d, k = 200, 20, 2
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)

    Zhat, params = ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)
    assert close_up_to_sign(Zhat[-1, :], [1.50067515, 0.04742477])
    assert close_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert close(params['decay_coef'], 0.10458794970222711)
    assert close(params['sigmas'][0], 0.30219903)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)
    # this is slightly different (though highly correlated) because ZIFA runs
    # one extra half-step of EM
    assert close_up_to_sign(Zhat[-1, :], [1.49712162, 0.05823952])
    assert close_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert close(params['decay_coef'], 0.10458794970222711)
    assert close(params['sigmas'][0], 0.30219903)

    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert close_up_to_sign(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])

    # Second configuration: different n, d and k.
    n, d, k = 50, 60, 3
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert close_up_to_sign(Zhat[-1, :], [-1.69609638, -0.5475882, 0.08008015])

    # A fresh draw from the generator (RNG state carries on) for plain ZIFA.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert close_up_to_sign(Zhat[-1, :], [-0.63075905, -0.77361427, -0.11544281])
    assert np.allclose(Y, old_Y)

    print('Tests passed with absolute tolerance %2.3e!' % absolute_tolerance)
def fit_transform(self, data):
    """
    Fit ZIFA on ``data - 1`` with ``self.k`` latent dimensions, keep the
    fitted parameters on ``self.model`` and return the embedding.
    """
    # NOTE(review): the reason for subtracting 1 is not visible from this
    # method -- confirm against the caller that prepares `data`.
    shifted = data - 1
    latent, fitted = ZIFA.fitModel(shifted, self.k)
    self.model = fitted
    return latent
parser.add_argument("-o", "--output", dest="output", type=str, required=True) parser.add_argument("-g", "--genes", dest="genes", type=str, default=None) parser.add_argument("-d", "--dim", dest="dim", type=int, default=2) parser.add_argument("-s", "--seed", dest="seed", type=int, default=None) parser.add_argument("--clean", dest="clean", type=str, default=None) cmd_args = parser.parse_args() # Read data cb.message.info("Reading data...") x = cb.data.ExprDataSet.read_dataset(cmd_args.input).normalize() if cmd_args.clean: x = utils.clean_dataset(x, cmd_args.clean) if cmd_args.genes is not None: x = x[:, x.uns[cmd_args.genes]].exprs # Run ZIFA if cmd_args.seed is not None: np.random.seed(cmd_args.seed) start_time = time.time() x = np.log1p(x) if spsp.issparse(x): x = x.toarray() z, _ = ZIFA.fitModel(x, cmd_args.dim) elapsed_time = time.time() - start_time # Save result cb.data.write_hybrid_path(z, "%s//latent" % cmd_args.output) cb.data.write_hybrid_path(elapsed_time, "%s//time" % cmd_args.output) cb.message.info("Done!")
Run ZIFA algorithm on input file, print the result; this is intended to test the method. Implementation of the result was done in ipython notebook. ZIFA_test1.py [input_file] ''' import pandas as pd import numpy as np import sys from ZIFA import ZIFA #from ZIFA import block_ZIFA print('script started') input_file = sys.argv[1] df = pd.DataFrame.from_csv((input_file), sep="\t") f = lambda x: np.log(1 + x) df1 = df.applymap(f) print(df1) print('completed read in DF') #Z, model_params = block_ZIFA.fitModel(df1.as_matrix(), 2) Z, model_params = ZIFA.fitModel(df1.as_matrix(), 2) print('ZIFA finished') print(Z) #print(Z[:0]) #print(Z[:1])
for i in indices_of_interest: genes_of_interest.append(df_trans.columns.values[i]) genes = genes_of_interest subset_df = df_clean[df_clean.index.isin(genes)] subset_df.to_csv(os.path.join(os.path.dirname(sys.argv[1]),"_DFresult.txt"),sep="\t") variance = subset_df.var(axis=0) #variance in columns if dim_red_method == 'ZIFA': f = lambda x: np.log(1+x) logDF = subset_df.applymap(f) # DF_final.applymap(f) transposed_ZIFA = logDF.transpose() Z_trans, MP_trans = ZIFA.fitModel(transposed_ZIFA.as_matrix(),2) X=[] Y=[] for i in Z_trans: X.append(i[0]) Y.append(i[1]) df1 = pd.DataFrame({'tSNEx': X, 'tSNEy':Y, 'variance':variance,'classif':classification_vector.as_matrix()}) if dim_red_method == 'TSNE': pca = PCA(n_components=15) pcaF = pca.fit(df_clean) #THIS WILL NOT TAKE A SUBSET OF THE GENES X= (pca.components_).transpose() #X = subset_df.transpose() n_samples, n_features = X.shape[0],X.shape[1]