def unitTests():
    """
    Regression-test ZIFA and block ZIFA against hard-coded reference values.

    Seeds both ``random`` and ``np.random`` once, generates simulated data,
    fits the models under several configurations and asserts (with
    ``np.allclose`` at its default tolerances) that the last row of the
    estimated latent positions and selected fitted parameters equal the
    stored constants.

    NOTE: the generators are seeded only once, at the top.  The second and
    third data sets are drawn from whatever state the generators are in at
    that point, so the statements below must stay in this order for the
    reference values to hold.

    Raises:
        AssertionError: if any fitted value differs from its reference.
    """
    random.seed(35)
    np.random.seed(32)
    # First configuration: 200 samples, 20 observed dimensions, 2 latent dimensions.
    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    # Plain ZIFA on the first data set.
    Zhat, params = ZIFA.fitModel(Y, k)
    assert np.allclose(Zhat[-1, :], [1.50067515, 0.04742477])
    assert np.allclose(params['A'][0, :], [0.66884415, -0.17173555])
    assert np.allclose(params['decay_coef'], 0.10458794970222711)
    assert np.allclose(params['sigmas'][0], 0.30219903)
    # Block ZIFA with its default block count on the same data.
    Zhat, params = block_ZIFA.fitModel(Y, k)
    assert np.allclose(
        Zhat[-1, :], [1.49712162, 0.05823952]
    )  # this is slightly different (though highly correlated) because ZIFA runs one extra half-step of EM
    assert np.allclose(params['A'][0, :], [0.66884415, -0.17173555])
    assert np.allclose(params['decay_coef'], 0.10458794970222711)
    assert np.allclose(params['sigmas'][0], 0.30219903)
    # Block ZIFA with an explicit three-block split; only the projection is checked.
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])
    # Second configuration: more observed dimensions (60) than samples (50), 3 latent dimensions.
    n = 50
    d = 60
    k = 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Zhat[-1, :], [-1.69609638, -0.5475882, 0.08008015])
    # A fresh data set is drawn (no reseeding) before the final plain-ZIFA check.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert np.allclose(Zhat[-1, :], [-0.63075905, -0.77361427, -0.11544281])
    print('Tests passed!')
def testAlgorithm():
    """
    Plot true, block-ZIFA and factor-analysis latent positions side by side.

    Simulated data are generated from fixed seeds, block ZIFA and sklearn's
    FactorAnalysis are fitted to the observed matrix, and a three-panel
    scatter figure (one colour per cluster id) is shown with plt.show().
    """
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)
    n, d, k = 200, 20, 2
    sigma, n_clusters, decay_coef = .3, 3, .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k)
    factor_analysis_Zhat = FactorAnalysis(n_components=k).fit_transform(Y)

    palette = ['red', 'blue', 'green']
    ordered_clusters = sorted(set(ids))
    axis_range = [-4, 4]
    # (subplot position, 2-D coordinates to scatter, panel title)
    panels = [
        (131, Z, 'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()),
        (132, Zhat, 'ZIFA Estimated Latent Positions'),
        (133, factor_analysis_Zhat, 'Factor Analysis Estimated Latent Positions'),
    ]

    plt.figure(figsize=[15, 5])
    for position, coords, heading in panels:
        plt.subplot(position)
        for cluster in ordered_clusters:
            members = ids == cluster
            # colours are looked up with cluster - 1, exactly as the ids are offset in the palette
            plt.scatter(coords[members, 0], coords[members, 1],
                        color=palette[cluster - 1], s=4)
        plt.title(heading)
        plt.xlim(axis_range)
        plt.ylim(axis_range)
    plt.show()
def train_ZIFA(input_data, feature_names, sample_names, outfile, use_block=False):
    """Fit a two-factor ZIFA model and write its outputs as CSV files.

    Four files are written, all prefixed with ``outfile``:
    ``_ZIFA_Z.csv`` (latent positions, indexed by sample name) and
    ``_ZIFA_A.csv``, ``_ZIFA_mus.csv``, ``_ZIFA_sigmas.csv`` (the
    ``'A'``, ``'mus'`` and ``'sigmas'`` entries of the fitted parameters,
    indexed by feature name).

    Args:
        input_data: sequence of views; each view is a sequence of arrays that
            are stacked row-wise (``np.vstack``), and the stacked views are
            then joined column-wise (``np.hstack``).
        feature_names: sequence of per-view name arrays; concatenated to label
            the columns of the joined matrix.
        sample_names: sequence of name arrays; concatenated to label the rows
            of the joined matrix.
        outfile: path prefix for the CSV files.
        use_block: fit with ``block_ZIFA`` (``p0_thresh=0.95``) instead of
            plain ``ZIFA``.
    """
    from ZIFA import ZIFA

    # Stack the arrays of each view vertically, then join the views side by side.
    X = np.hstack([np.vstack(view) for view in input_data])

    # Drop samples whose every entry is NaN.
    keep_sample = (~np.isnan(X)).sum(axis=1) > 0
    sample_names = np.concatenate(sample_names)[keep_sample]
    X = X[keep_sample, :]

    # One flat label per column of X.
    feature_index = np.concatenate(feature_names)

    if not use_block:
        Z, model_params = ZIFA.fitModel(X, K=2)
    else:
        from ZIFA import block_ZIFA
        p0_thresh = 0.95
        Z, model_params = block_ZIFA.fitModel(X, K=2, p0_thresh=p0_thresh)
        # Keep only labels of columns whose zero fraction is within the
        # threshold, so they line up with the returned parameters.  The mask
        # is applied to the flat label array (one entry per column); mean()
        # avoids integer division on Python 2.
        # NOTE(review): presumably mirrors block_ZIFA's internal column
        # filter - confirm against the installed ZIFA version.
        feature_index = feature_index[(X == 0).mean(axis=0) <= p0_thresh]

    pd.DataFrame(Z, index=sample_names).to_csv(outfile + "_ZIFA_Z.csv")
    for key in ('A', 'mus', 'sigmas'):
        pd.DataFrame(model_params[key], index=feature_index).to_csv(
            outfile + "_ZIFA_" + key + ".csv")
def ziMean(adata, groupby, organism="mmusculus"):
    """Compute dropout-adjusted mean expression per gene and group.

    A filtered copy of ``adata`` is fitted with block ZIFA (two latent
    dimensions, ``singleSigma=True``) to obtain its ``decay_coef``.  For each
    group defined by ``adata.obs[groupby]`` and each gene, the mean of the
    non-zero values is used to compute ``exp(-decay_coef * mean**2)``, the
    number of expressing cells is scaled up by ``1 / (1 - that value)``
    (capped at the group size), and a new mean is taken over that many
    copies of the non-zero mean padded with zeros.

    The aggregated result is stored in ``adata.uns["gene_call"]`` and the
    same ``adata`` object is returned; only the copy is filtered.
    """
    import ZIFA.block_ZIFA as zf
    import scanpy as sc

    filtered = filter_genes(adata.copy(), organism=organism)
    sc.pp.filter_genes(filtered, min_counts=1)
    _, params = zf.fitModel(filtered.X, 2, singleSigma=True)
    dc = params["decay_coef"]
    df = get_adata_df(filtered)

    def ZImean(countmatrix):
        """Return {gene: adjusted mean} for one group, using the fitted decay coefficient.

        Can be used with groupby.aggregate.
        """
        import numpy as np

        gene_call = {}
        for gene, expression in countmatrix.iterrows():
            total_cells = len(expression)
            detected = expression[expression > 0]
            expressing_cells = len(detected)
            if expressing_cells <= 0:
                # gene not observed in this group
                gene_call[gene] = 0
                continue
            non_zero_mean = np.mean(detected)
            prob = np.exp(-dc * non_zero_mean**2)
            # scale up the expressing-cell count, never beyond the group size
            with_dropout = min(
                int(np.round(expressing_cells / (1 - prob), decimals=0)),
                total_cells)
            new_counts = [non_zero_mean] * with_dropout + [0] * (
                total_cells - with_dropout)
            gene_call[gene] = np.mean(new_counts)
        return gene_call

    gene_call = df.groupby(filtered.obs[groupby], axis=1).aggregate(ZImean)
    adata.uns.update({"gene_call": gene_call})
    return adata
def testAlgorithm():
    """
    Show a three-panel comparison of latent positions on simulated data.

    Panels, left to right: the true latent positions, the block-ZIFA
    estimate, and the sklearn FactorAnalysis estimate.  Points are coloured
    by cluster id and every panel uses the same [-4, 4] axis range.
    """
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)
    n, d, k = 200, 20, 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k)
    fa_model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = fa_model.fit_transform(Y)

    colour_for = ['red', 'blue', 'green']
    distinct_ids = sorted(set(ids))

    def draw_panel(slot, points, caption):
        # One scatter call per cluster so each cluster gets its own colour.
        plt.subplot(slot)
        for label in distinct_ids:
            in_cluster = ids == label
            plt.scatter(points[in_cluster, 0], points[in_cluster, 1], color=colour_for[label - 1], s=4)
        plt.title(caption)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])

    plt.figure(figsize=[15, 5])
    draw_panel(131, Z, 'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
    draw_panel(132, Zhat, 'ZIFA Estimated Latent Positions')
    draw_panel(133, factor_analysis_Zhat, 'Factor Analysis Estimated Latent Positions')
    plt.show()
def main():
    """Command-line entry point: fit ZIFA to a CSV file and save the latent positions.

    The input CSV is read, its 'Unnamed: 0' column is removed, and the
    remaining table is transposed before fitting.  ``--block`` selects
    block_ZIFA over ZIFA and ``--dimensions`` sets the number of latent
    dimensions.  The latent positions are written comma-separated to the
    output file.
    """
    parser = ArgumentParser(description="Fit a ZIFA model on the data.")
    argument_specs = [
        (('-b', '--block'),
         dict(action='store_true', default=False,
              help="Whether the block algorithm should be used.")),
        (('-d', '--dimensions'),
         dict(type=int, default=2, help="The number of dimensions [2].")),
        (('input_file',), dict(type=str, help="The input CSV file.")),
        (('output_file',), dict(type=str, help="The output CSV file.")),
    ]
    for names, options in argument_specs:
        parser.add_argument(*names, **options)
    args = parser.parse_args()

    df = read_csv(args.input_file)
    del df['Unnamed: 0']
    # fitModel receives the transposed table
    Y = np.array(df).T

    fitter = block_ZIFA if args.block else ZIFA
    Z, model_params = fitter.fitModel(Y, args.dimensions)
    np.savetxt(args.output_file, Z, delimiter=',')
valid_set_index.append(index) print valid_set_index #valid_set_index=np.random.choice(train_data.shape[0],size=s,replace=False) train_set_index = [ x for x in range(train_data.shape[0]) if x not in valid_set_index ] valid_data = train_data[valid_set_index, :] #valid_valid=train_valid[valid_set_index,:] #train_data=train_data[train_set_index,:] #train_valid=train_valid[train_set_index,:] Y = valid_data valid_Y = labeled_label[valid_set_index] print 'before shape: ', Y.shape #Y=Y[:, np.sum(Y >1e-6, axis=0)/float(Y.shape[0])>0.9]#keep genes that are expressed in 90% samples print 'after shape: ', Y.shape code, model_params = block_ZIFA.fitModel(Y, 100) print code.shape #print Z else: #if args.n_component==0: # pca=PCA() #else: print 'fitting data:' + args.fit if args.use_nmf == 1: #transform_data = if args.validation_cell_types > 0: transform_data = output_dict['test_X_TPMgn0'] #print 'vct!' else: #print 'not vct'
## set the Pancreatic folder as the working directory
from datetime import datetime
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import numpy

# Load the matrix (first line of the CSV is skipped as a header row).
Y = numpy.loadtxt("Results/forZifa.csv", delimiter=",", skiprows=1)

# Time a four-dimensional block-ZIFA fit.
startTime = datetime.now()
Z4, model_params = block_ZIFA.fitModel(Y, 4)
# Call form of print: identical output on Python 2, and valid on Python 3.
print(datetime.now() - startTime)

numpy.savetxt("Results/Z4.csv", Z4, delimiter=",")
X = Y
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

# Example: reduce the yan data set with block ZIFA, cluster the result with
# k-means and score the clustering against the provided labels.
# yan/yan.csv is read without a header row and transposed before fitting
# (presumably so that rows are samples and columns are genes - confirm
# against the file layout).
# If you do not want to install pandas, you can also use np.loadtxt: https://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html
X = np.array(pd.read_csv('yan/yan.csv', header=None)).transpose()
label = np.array(pd.read_csv('yan/yan_label.csv')).ravel()

Z, model_params = block_ZIFA.fitModel(X, 5)

# The number of clusters is taken to be the largest label value.
n_label_clusters = label.max()
predicted = KMeans(n_clusters=n_label_clusters).fit(Z).labels_

print('NMI value is %f \n' % nmi(predicted.flatten(), label.flatten()))
print('ARI value is %f \n' % ari(predicted.flatten(), label.flatten()))
print('HOM value is %f \n' % metrics.homogeneity_score(predicted, label))
print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, predicted))
def unitTests():
    """
    Test ZIFA and block ZIFA under a variety of conditions to make sure projected dimensions and parameters don't change.

    Prints the package versions the reference values were produced with next
    to the versions currently installed, then seeds ``random`` and
    ``np.random`` once, fits the models to simulated data and compares the
    results with hard-coded constants using ``np.allclose`` with
    ``atol=absolute_tolerance``.  After every fit it also asserts that the
    input matrix Y is unchanged.

    NOTE: the generators are seeded only once.  Later data sets are drawn
    from whatever state the generators are in at that point, so the order of
    the statements below must be preserved for the reference values to hold.

    Raises:
        AssertionError: if a fitted value differs from its reference or a fit
            modified its input.
    """
    print(
        "\n\n\n****Running unit tests!\nIMPORTANT: These unit tests pass with:\n\
    Python version 2.7.10 (your version: %s)\n\
    numpy 1.13.1 (your version: %s)\n\
    scipy 0.18.1 (your version: %s)\n\
    sklearn 0.16.1 (your version: %s)" %
        (platform.python_version(), np.__version__, scipy.__version__,
         sklearn.__version__))
    print(
        "Different versions of Python or those packages may yield slightly different results and fail to pass the asserts unless you increase the absolute_tolerance parameter, set below."
    )
    print(
        "If your configuration yields significantly different results, please contact [email protected].\n\n"
    )
    # Absolute tolerance passed to every np.allclose comparison against a reference value.
    absolute_tolerance = 1e-8
    random.seed(35)
    np.random.seed(32)
    # First configuration: 200 samples, 20 observed dimensions, 2 latent dimensions.
    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    # Snapshot of the input, used to verify that fitting leaves Y untouched.
    old_Y = deepcopy(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)
    # for Z and A, we compare the absolute values of the parameters because some package versions appear to flip the sign (which is fine and will not affect results)
    assert np.allclose(np.abs(Zhat[-1, :]),
                       np.abs([1.50067515, 0.04742477]),
                       atol=absolute_tolerance)
    assert np.allclose(np.abs(params['A'][0, :]),
                       np.abs([0.66884415, -0.17173555]),
                       atol=absolute_tolerance)
    assert np.allclose(params['decay_coef'],
                       0.10458794970222711,
                       atol=absolute_tolerance)
    assert np.allclose(params['sigmas'][0],
                       0.30219903,
                       atol=absolute_tolerance)
    # Block ZIFA with its default block count on the same data.
    Zhat, params = block_ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)
    assert np.allclose(
        np.abs(Zhat[-1, :]),
        np.abs([1.49712162, 0.05823952]),
        atol=absolute_tolerance
    )  # this is slightly different (though highly correlated) because ZIFA runs one extra half-step of EM
    assert np.allclose(np.abs(params['A'][0, :]),
                       np.abs([0.66884415, -0.17173555]),
                       atol=absolute_tolerance)
    assert np.allclose(params['decay_coef'],
                       0.10458794970222711,
                       atol=absolute_tolerance)
    assert np.allclose(params['sigmas'][0],
                       0.30219903,
                       atol=absolute_tolerance)
    # Block ZIFA with an explicit three-block split; only the projection is checked.
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert np.allclose(np.abs(Zhat[-1, :]),
                       np.abs([9.84455438e-01, 4.50924335e-02]),
                       atol=absolute_tolerance)
    # Second configuration: more observed dimensions (60) than samples (50), 3 latent dimensions.
    n = 50
    d = 60
    k = 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert np.allclose(np.abs(Zhat[-1, :]),
                       np.abs([-1.69609638, -0.5475882, 0.08008015]),
                       atol=absolute_tolerance)
    # A fresh data set is drawn (no reseeding) before the final plain-ZIFA check.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert np.allclose(np.abs(Zhat[-1, :]),
                       np.abs([-0.63075905, -0.77361427, -0.11544281]),
                       atol=absolute_tolerance)
    assert np.allclose(Y, old_Y)
    print('Tests passed with absolute tolerance %2.3e!' % absolute_tolerance)
def __init__(self, matrix, K, barcodes=None):
    """Fit block ZIFA with K latent dimensions to ``matrix`` and cluster the result.

    The DR base initialiser is run first with ``matrix`` and ``barcodes``.
    The latent positions are stored in ``self.results``, the fitted
    parameters in ``self.model_params``, and a ``Cluster`` built from the
    results (cast to double, ``autoplot=False``) in ``self.clu``.
    """
    DR.__init__(self, matrix=matrix, barcodes=barcodes)  # inherits from DR object
    self.name = "ZIFA"
    fit_output = block_ZIFA.fitModel(matrix, K)
    self.results, self.model_params = fit_output
    as_double = self.results.astype("double")
    self.clu = Cluster(as_double, autoplot=False)
print('Fraction of zeros: %2.3f; decay coef: %2.3f' % ((Y == 0).mean(), decay_coef)) return X, Y, Z.transpose(), cluster_ids random.seed(35) np.random.seed(32) n = 200 d = 20 k = 2 sigma = .3 n_clusters = 3 decay_coef = .1 X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef) Zhat, params = block_ZIFA.fitModel(Y, k) colors = ['red', 'blue', 'green'] cluster_ids = sorted(list(set(ids))) model = FactorAnalysis(n_components = k) factor_analysis_Zhat = model.fit_transform(Y) figure(figsize = [15, 5]) subplot(131) for id in cluster_ids: scatter(Z[ids == id, 0], Z[ids == id, 1], color = colors[id - 1], s = 4) title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()) xlim([-4, 4]) ylim([-4, 4]) subplot(132) for id in cluster_ids: scatter(Zhat[ids == id, 0], Zhat[ids == id, 1], color = colors[id - 1], s = 4) xlim([-4, 4])
def run(self, data):
    """Embed ``data`` with block ZIFA after a log1p transform.

    Sparse input is converted to a dense array first.  The transformed
    matrix is transposed before fitting with ``self.k`` latent dimensions,
    and the embedding is transposed back before it is returned.

    Returns:
        A pair ``([embedding_transposed], 0)``.
        NOTE(review): the meaning of the trailing 0 is not visible here -
        confirm against the caller.
    """
    dense = data.toarray() if sparse.issparse(data) else data
    log_counts = np.log1p(dense)
    embedding, _ = block_ZIFA.fitModel(log_counts.T, self.k)
    return [embedding.T], 0
def runZIFA():
    """Run block ZIFA on a tab-separated matrix and write the result as JSON.

    Command-line arguments (read from ``sys.argv``):
        1: path of the input file.  The first line is a header whose first
           field is dropped and whose remaining fields are the cell names.
           In every following line the first field is dropped and the rest
           are parsed as floats; values below 1e-7 are set to 0.  The matrix
           is transposed before fitting.
        2: output folder.  ``log.zifa.txt`` receives everything printed
           while fitting and ``output.json`` receives the result.

    On success ``output.json`` holds one list per fitted component
    (``PC1`` .. up to ``PC5``) plus ``text`` with the cell names.  If fitting
    raises, ``output.json`` instead holds ``displayed_error`` and
    ``original_error`` (the captured log) and the exception is re-raised.
    """
    random.seed(42)
    np.random.seed(42)
    # Single-argument print calls behave the same on Python 2 and 3.
    print('Number of arguments: %d arguments.' % len(sys.argv))
    print('Argument List: %s' % str(sys.argv))
    inputfilename = sys.argv[1]
    outputfolder = sys.argv[2]
    log_path = outputfolder + "/log.zifa.txt"
    json_path = outputfolder + "/output.json"

    with open(inputfilename, 'r') as infile:
        lines = infile.readlines()

    # Header: drop the leading field, keep the cell names.
    cell_names = lines.pop(0).rstrip('\r\n').split('\t')
    cell_names.pop(0)

    input_alldimensions = []
    for line in lines:
        fields = line.rstrip('\r\n').split('\t')[1:]  # first field is dropped
        values = [float(field) for field in fields]
        # Anything below 1e-7 is clamped to zero.
        input_alldimensions.append(
            [float(0) if value < 0.0000001 else value for value in values])
    alldim = np.asarray(input_alldimensions).transpose()

    max_components = 5
    try:
        with open(log_path, 'w') as log_file:
            previous_stdout = sys.stdout
            sys.stdout = log_file
            try:
                Zhat, _ = block_ZIFA.fitModel(
                    alldim, min(max_components, len(cell_names)))
            finally:
                # Always undo the redirection; otherwise a failed fit leaves
                # sys.stdout pointing at the closed log file.
                sys.stdout = previous_stdout
    except Exception as err:
        with open(log_path, 'r') as log_file:
            captured_log = log_file.read()
        # err[0] only works on Python 2 and only when args is non-empty.
        errorMsg = str(err.args[0]) if err.args else str(err)
        output_json = {}
        if errorMsg.startswith("Your input matrix contains no zeros"):
            output_json['displayed_error'] = "Zifa is not converging. This can be due to an input matrix which contains no zeros. Zifa input should be log read counts. You can try another filtering/normalization but this may not solve the issue for this dataset."
        else:
            output_json['displayed_error'] = errorMsg
        output_json['original_error'] = captured_log
        with open(json_path, 'w') as outfile:
            json.dump(output_json, outfile)
        raise

    # Collect up to five components column by column, plus the cell names.
    components = [[] for _ in range(max_components)]
    texts = []
    for i, row in enumerate(Zhat):
        for j in range(min(len(row), max_components)):
            components[j].append(row[j])
        texts.append(cell_names[i])

    # Components that received no values are left out of the output.
    output_json = {}
    for j, values in enumerate(components):
        if values:
            output_json['PC%d' % (j + 1)] = values
    output_json['text'] = texts

    with open(json_path, 'w') as outfile:
        json.dump(output_json, outfile)
import numpy as np
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import pandas as pd

# Example of running block ZIFA on real data stored in a file called input.table.
# Expected layout: genes are columns, samples are rows, and the numbers are
# separated by single spaces.
# If you do not want to install pandas, you can also use np.loadtxt: https://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html
table = np.array(pd.read_csv('input.table', sep=' '))

# Reduce to five latent dimensions and save them with two decimals.
Z, model_params = block_ZIFA.fitModel(table, 5)
np.savetxt('output.ZIFA.table', Z, fmt='%.2f')
np.random.seed(32)

# Load expression data and transpose it, so the file's columns become rows.
rnaseq_file = os.path.join('data', input_file)
rnaseq_df = pd.read_table(rnaseq_file, index_col=0)
rnaseq_df = rnaseq_df.T
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and exists in every pandas version.
rnaseq_exp = rnaseq_df.values

# Perform uMAP dimension reduction on expression data
if method == "umap":
    embedding = umap.UMAP(n_neighbors=10, min_dist=0.1,
                          metric='correlation').fit_transform(rnaseq_exp)
    umap_out = pd.DataFrame(embedding, columns=['1', '2'])
    umap_out.index = rnaseq_df.index
    umap_out.index.name = 'id'
    umap_out_file = os.path.join('../features',
                                 input_file + '_rnaseq_umap_features.tsv')
    umap_out.to_csv(umap_out_file, sep='\t')

# Perform ZIFA dimension reduction on expression data
elif method == "ZIFA":
    k = 2
    Zhat, params = block_ZIFA.fitModel(rnaseq_exp, k)
    zifa_out = pd.DataFrame(Zhat, columns=['1', '2'])
    zifa_out.index = rnaseq_df.index
    zifa_out.index.name = 'id'
    zifa_out_file = os.path.join('../features',
                                 input_file + '_rnaseq_ZIFA_features.tsv')
    zifa_out.to_csv(zifa_out_file, sep='\t')
def fit_transform(self, data):
    """Fit block ZIFA to ``data`` with ``self.k`` latent dimensions.

    The fitted parameters are kept on ``self.model``; the latent positions
    are returned.
    """
    latent_positions, fitted_params = block_ZIFA.fitModel(data, self.k)
    self.model = fitted_params
    return latent_positions
def apply(self):
    """Fit block ZIFA to ``self.matrix`` and store the latent positions in ``self.results``.

    Uses ``self.n_components`` latent dimensions and ``self.n_blocks``
    blocks; the fitted parameters are discarded.
    """
    fit_output = block_ZIFA.fitModel(
        self.matrix,
        self.n_components,
        n_blocks=self.n_blocks,
    )
    latent_positions, _ = fit_output
    self.results = latent_positions