Ejemplo n.º 1
0
def unitTests():
    """
    Regression-check ZIFA and block ZIFA against stored reference values.

    The random seeds are fixed, data is simulated, and the last row of the
    estimated latent positions (plus a few fitted parameters) is compared
    with hard-coded numbers so that any change in the projections is caught.
    """
    random.seed(35)
    np.random.seed(32)

    n, d, k = 200, 20, 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    # The two fits share the same expected parameters; only the last latent
    # row is slightly different (though highly correlated) because ZIFA runs
    # one extra half-step of EM.
    single_block_cases = (
        (ZIFA.fitModel, [1.50067515, 0.04742477]),
        (block_ZIFA.fitModel, [1.49712162, 0.05823952]),
    )
    for fit, expected_last_row in single_block_cases:
        Zhat, params = fit(Y, k)
        assert np.allclose(Zhat[-1, :], expected_last_row)
        assert np.allclose(params['A'][0, :], [0.66884415, -0.17173555])
        assert np.allclose(params['decay_coef'], 0.10458794970222711)
        assert np.allclose(params['sigmas'][0], 0.30219903)

    # Same data, now split into three blocks.
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])

    # Second scenario: fewer samples than features, three latent dimensions.
    n, d, k = 50, 60, 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Zhat[-1, :], [-1.69609638, -0.5475882, 0.08008015])

    # A fresh draw of simulated data is used for the final plain-ZIFA check.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert np.allclose(Zhat[-1, :], [-0.63075905, -0.77361427, -0.11544281])

    print('Tests passed!')
Ejemplo n.º 2
0
def train_ZIFA(input_data,
               feature_names,
               sample_names,
               outfile,
               use_block=False):
    """Fit a two-dimensional ZIFA model and write the results to CSV files.

    Args:
        input_data: sequence of views; each view is a sequence of per-group
            arrays that are stacked along the sample axis, and the views are
            then joined along the feature axis.
        feature_names: sequence of per-view arrays of feature names; they are
            concatenated to label the rows of the parameter tables.
        sample_names: sequence of arrays of sample names; they are
            concatenated to label the rows of the latent-position table.
        outfile: path prefix for the output files ``<outfile>_ZIFA_Z.csv``,
            ``_ZIFA_A.csv``, ``_ZIFA_mus.csv`` and ``_ZIFA_sigmas.csv``.
        use_block: when true, fit with ``block_ZIFA`` (``p0_thresh=0.95``)
            instead of plain ``ZIFA``.
    """
    from ZIFA import ZIFA

    # Stack groups within each view (samples), then join views (features).
    X = np.hstack([np.vstack(view) for view in input_data])

    # Drop samples that have no observed (non-NaN) value at all.
    keep_sample = (~np.isnan(X)).sum(axis=1) > 0
    sample_names = np.concatenate(sample_names)[keep_sample]
    X = X[keep_sample, :]

    flat_feature_names = np.concatenate(feature_names)

    if use_block:
        from ZIFA import block_ZIFA
        p0_thresh = 0.95
        Z, model_params = block_ZIFA.fitModel(X, K=2, p0_thresh=p0_thresh)
        # Keep only the names of features whose fraction of zeros does not
        # exceed the threshold, so they line up with the returned parameters.
        # NOTE(review): presumably this mirrors the filter block_ZIFA applies
        # internally for p0_thresh -- confirm against the ZIFA implementation.
        # The mask is per feature, so it is applied to the flattened names;
        # mean() also avoids integer division on Python 2.
        kept_features = (X == 0).mean(axis=0) <= p0_thresh
        flat_feature_names = flat_feature_names[kept_features]
    else:
        Z, model_params = ZIFA.fitModel(X, K=2)

    pd.DataFrame(Z, index=sample_names).to_csv(outfile + "_ZIFA_Z.csv")
    for key in ('A', 'mus', 'sigmas'):
        pd.DataFrame(model_params[key], index=flat_feature_names).to_csv(
            outfile + "_ZIFA_" + key + ".csv")
Ejemplo n.º 3
0
def main():
    """Command-line entry point: fit ZIFA on a CSV file and save the latent positions.

    The input CSV is read, its 'Unnamed: 0' column is dropped, and the
    remaining table is transposed before fitting. The estimated latent
    positions are written to the output file as comma-separated text.
    """
    parser = ArgumentParser(description="Fit a ZIFA model on the data.")
    parser.add_argument(
        '-b', '--block', action='store_true', default=False,
        help="Whether the block algorithm should be used.")
    parser.add_argument(
        '-d', '--dimensions', type=int, default=2,
        help="The number of dimensions [2].")
    parser.add_argument('input_file', type=str, help="The input CSV file.")
    parser.add_argument('output_file', type=str, help="The output CSV file.")
    args = parser.parse_args()

    df = read_csv(args.input_file)
    del df['Unnamed: 0']

    # The table is transposed before it is handed to the model.
    Y = np.transpose(np.array(df))

    fit = block_ZIFA.fitModel if args.block else ZIFA.fitModel
    Z, model_params = fit(Y, args.dimensions)

    np.savetxt(args.output_file, Z, delimiter=',')
Ejemplo n.º 4
0
def testAlgorithm():
    """Simulate clustered data and plot true vs. estimated latent positions.

    Three side-by-side panels are drawn: the true latent positions, the
    positions estimated by ZIFA, and the positions estimated by classic
    factor analysis. Points are coloured by their simulated cluster id.
    """
    random.seed(30)
    np.random.seed(32)
    n, d, k = 200, 20, 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    figure(figsize=[15, 5])

    # (subplot position, coordinates to draw, panel title)
    panels = (
        (131, Z,
         'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()),
        (132, Zhat, 'ZIFA Estimated Latent Positions'),
        (133, factor_analysis_Zhat,
         'Factor Analysis Estimated Latent Positions'),
    )
    for position, coords, heading in panels:
        subplot(position)
        for cluster in cluster_ids:
            members = ids == cluster
            scatter(coords[members, 0], coords[members, 1],
                    color=colors[cluster - 1], s=4)
            xlim([-4, 4])
            ylim([-4, 4])
            title(heading)

    show()
Ejemplo n.º 5
0
def testAlgorithm():
    """Compare ZIMM, K-means and agglomerative clustering on simulated data.

    Simulated data is clustered with the three methods, the fraction of
    misclassified samples is printed for each, and a 3x2 grid of scatter
    plots is drawn, saved to 'example_output.png' and shown. The plots use
    either the true latent positions, the ZIFA estimate, or the classic
    factor analysis estimate as coordinates, coloured by the relevant labels.

    Written so that it runs on both Python 2 and Python 3 (print is called
    with a single parenthesised argument and ``range`` replaces ``xrange``).
    """
    random.seed(30)
    np.random.seed(32)
    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, true_ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    zimm_ids, params = ZIMM.fitModel(Y, n_clusters)
    kmeans_ids = KMeans(n_clusters).fit_predict(Y)
    hc_ids = AgglomerativeClustering(n_clusters).fit_predict(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    factor_analysis_Zhat = FactorAnalysis(n_components=k).fit_transform(Y)

    # calc_error returns replacement labels together with per-sample errors.
    zimm_ids, zimm_errors = calc_error(true_ids, zimm_ids)
    kmeans_ids, kmeans_errors = calc_error(true_ids, kmeans_ids)
    hc_ids, hc_errors = calc_error(true_ids, hc_ids)
    print('Fraction misclassified by ZIMM: %f' % np.mean(zimm_errors))
    print('Fraction misclassified by Kmeans: %f' % np.mean(kmeans_errors))
    print('Fraction misclassified by AgglomerativeClustering: %f' %
          np.mean(hc_errors))

    figure(figsize=[10, 13])

    # (subplot position, coordinates, labels used for colouring, panel title)
    panels = (
        (321, Z, true_ids,
         'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()),
        (322, Zhat, zimm_ids,
         'ZIMM labels,\nZIFA Estimated Latent Positions'),
        (323, Zhat, kmeans_ids, 'K-means, with ZIFA'),
        (324, Zhat, hc_ids, 'Agglomerative Clustering, with ZIFA'),
        (325, factor_analysis_Zhat, kmeans_ids,
         'K-means,\nwith Classic Factor Analysis'),
        (326, factor_analysis_Zhat, hc_ids,
         'Agglomerative Clustering,\nwith Classic Factor Analysis'),
    )
    for position, coords, labels, heading in panels:
        subplot(position)
        for cluster in range(n_clusters):
            members = labels == cluster
            # cluster - 1 keeps the original colour assignment (label 0 wraps
            # around to the last colour).
            scatter(coords[members, 0], coords[members, 1],
                    color=colors[cluster - 1], s=12)
            xlim([-4, 4])
            ylim([-4, 4])
            title(heading)

    tight_layout()
    savefig('example_output.png')
    show()
Ejemplo n.º 6
0
        genes_of_interest.append(df_trans.columns.values[i])

    genes = genes_of_interest

# Keep only the rows whose index is in the selected gene list and save them
# next to the input file given on the command line.
subset_df = df_clean[df_clean.index.isin(genes)]
subset_df.to_csv(os.path.join(os.path.dirname(sys.argv[1]), "_DFresult.txt"),
                 sep="\t")

variance = subset_df.var(axis=0)  # variance of each column

if dim_red_method == 'ZIFA':
    # ZIFA is fitted on log(1 + x) transformed values.
    f = lambda x: np.log(1 + x)
    logDF = subset_df.applymap(f)

    # The table is transposed before fitting.
    transposed_ZIFA = logDF.transpose()
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    Z_trans, MP_trans = ZIFA.fitModel(transposed_ZIFA.values, 2)

    # Split the two latent dimensions into separate coordinate lists.
    X = [row[0] for row in Z_trans]
    Y = [row[1] for row in Z_trans]
    # The column names say 'tSNE' although these coordinates come from ZIFA;
    # they are kept unchanged because they are runtime column labels.
    df1 = pd.DataFrame({
        'tSNEx': X,
        'tSNEy': Y,
        'variance': variance,
        'classif': classification_vector.values
    })

if dim_red_method == 'TSNE':
    pca = PCA(n_components=15)
Ejemplo n.º 7
0
Run ZIFA algorithm on input file, print the result; this is intended to test the method.
Implementation of the result was done in ipython notebook.

ZIFA_test1.py [input_file]

'''

import pandas as pd
import numpy as np
import sys

from ZIFA import ZIFA
#from ZIFA import block_ZIFA

print('script started')

# The tab-separated input file is given as the first command-line argument.
input_file = sys.argv[1]
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
# and parse_dates=True reproduces its defaults.
df = pd.read_csv(input_file, sep="\t", index_col=0, parse_dates=True)
# ZIFA is fitted on log(1 + x) transformed values.
f = lambda x: np.log(1 + x)
df1 = df.applymap(f)
print(df1)
print('completed read in DF')

# Fit a two-dimensional model. block_ZIFA.fitModel takes the same arguments
# if the block algorithm is wanted instead (it must be imported first).
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
Z, model_params = ZIFA.fitModel(df1.values, 2)

print('ZIFA finished')
print(Z)
Ejemplo n.º 8
0
def unitTests():
    """
    Regression-check ZIFA and block ZIFA against stored reference values.

    Data is simulated with fixed seeds; after every fit the input matrix is
    verified to be unmodified, and the last latent row plus a few fitted
    parameters are compared with hard-coded numbers within
    ``absolute_tolerance``.
    """

    version_banner = (
        "\n\n\n****Running unit tests!\n"
        "IMPORTANT: These unit tests pass with:\n"
        "    Python version 2.7.10 (your version: %s)\n"
        "    numpy 1.13.1 (your version: %s)\n"
        "    scipy 0.18.1 (your version: %s)\n"
        "    sklearn 0.16.1 (your version: %s)"
    )
    print(version_banner % (platform.python_version(), np.__version__,
                            scipy.__version__, sklearn.__version__))
    print(
        "Different versions of Python or those packages may yield slightly different results and fail to pass the asserts unless you increase the absolute_tolerance parameter, set below."
    )
    print(
        "If your configuration yields significantly different results, please contact [email protected].\n\n"
    )

    absolute_tolerance = 1e-8

    def within_tolerance(actual, expected):
        # Plain comparison with the configured absolute tolerance.
        return np.allclose(actual, expected, atol=absolute_tolerance)

    def same_magnitude(actual, expected):
        # For Z and A only absolute values are compared, because some package
        # versions appear to flip the sign (which is fine and will not affect
        # results).
        return np.allclose(np.abs(actual),
                           np.abs(expected),
                           atol=absolute_tolerance)

    random.seed(35)
    np.random.seed(32)

    n, d, k = 200, 20, 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)

    # The two fits share the same expected parameters; only the last latent
    # row is slightly different (though highly correlated) because ZIFA runs
    # one extra half-step of EM.
    single_block_cases = (
        (ZIFA.fitModel, [1.50067515, 0.04742477]),
        (block_ZIFA.fitModel, [1.49712162, 0.05823952]),
    )
    for fit, expected_last_row in single_block_cases:
        Zhat, params = fit(Y, k)
        assert np.allclose(Y, old_Y)
        assert same_magnitude(Zhat[-1, :], expected_last_row)
        assert same_magnitude(params['A'][0, :], [0.66884415, -0.17173555])
        assert within_tolerance(params['decay_coef'], 0.10458794970222711)
        assert within_tolerance(params['sigmas'][0], 0.30219903)

    # Same data, now split into three blocks.
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert same_magnitude(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])

    # Second scenario: fewer samples than features, three latent dimensions.
    n, d, k = 50, 60, 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert same_magnitude(Zhat[-1, :], [-1.69609638, -0.5475882, 0.08008015])

    # A fresh draw of simulated data is used for the final plain-ZIFA check.
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert same_magnitude(Zhat[-1, :],
                          [-0.63075905, -0.77361427, -0.11544281])
    assert np.allclose(Y, old_Y)

    print('Tests passed with absolute tolerance %2.3e!' % absolute_tolerance)
Ejemplo n.º 9
0
 def fit_transform(self, data):
     """Fit ZIFA with ``self.k`` latent dimensions and return the embedding.

     The fitted model parameters returned by ZIFA are stored on
     ``self.model``.

     NOTE(review): the input is shifted by -1 before fitting; presumably this
     undoes a +1 offset applied upstream -- confirm against the caller.
     """
     latent_positions, fitted_params = ZIFA.fitModel(data - 1, self.k)
     self.model = fitted_params
     return latent_positions
Ejemplo n.º 10
0
# Remaining command-line options. The parser object is created above this
# excerpt; cmd_args.input is read below, so an input option is presumably
# registered there as well.
parser.add_argument("-o", "--output", dest="output", type=str, required=True)
parser.add_argument("-g", "--genes", dest="genes", type=str, default=None)
parser.add_argument("-d", "--dim", dest="dim", type=int, default=2)
parser.add_argument("-s", "--seed", dest="seed", type=int, default=None)
parser.add_argument("--clean", dest="clean", type=str, default=None)
cmd_args = parser.parse_args()

# Read data
cb.message.info("Reading data...")
x = cb.data.ExprDataSet.read_dataset(cmd_args.input).normalize()
# Optional cleaning step, only run when --clean is given.
if cmd_args.clean:
    x = utils.clean_dataset(x, cmd_args.clean)
# Optional gene selection: the --genes value is used as a key into x.uns and
# the result selects columns of the dataset.
# NOTE(review): `.exprs` is only extracted in this branch; without --genes,
# x stays whatever the steps above returned -- confirm np.log1p accepts it.
if cmd_args.genes is not None:
    x = x[:, x.uns[cmd_args.genes]].exprs

# Run ZIFA
# Seed numpy's global RNG only when a seed was requested.
if cmd_args.seed is not None:
    np.random.seed(cmd_args.seed)
# The timed window covers the log1p transform, the optional densification
# and the model fit.
start_time = time.time()
x = np.log1p(x)
# Sparse input is converted to a dense array before fitting.
if spsp.issparse(x):
    x = x.toarray()
# Only the latent positions are kept; the fitted parameters are discarded.
z, _ = ZIFA.fitModel(x, cmd_args.dim)
elapsed_time = time.time() - start_time

# Save result
# Latent positions and elapsed time are written under "latent" and "time"
# within the output path.
cb.data.write_hybrid_path(z, "%s//latent" % cmd_args.output)
cb.data.write_hybrid_path(elapsed_time, "%s//time" % cmd_args.output)

cb.message.info("Done!")
Ejemplo n.º 11
0
Run ZIFA algorithm on input file, print the result; this is intended to test the method.
Implementation of the result was done in ipython notebook.

ZIFA_test1.py [input_file]

'''

import pandas as pd
import numpy as np
import sys

from ZIFA import ZIFA
#from ZIFA import block_ZIFA

print('script started')

# The tab-separated input file is given as the first command-line argument.
input_file = sys.argv[1]
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
# and parse_dates=True reproduces its defaults.
df = pd.read_csv(input_file, sep="\t", index_col=0, parse_dates=True)
# ZIFA is fitted on log(1 + x) transformed values.
f = lambda x: np.log(1 + x)
df1 = df.applymap(f)
print(df1)
print('completed read in DF')

# Fit a two-dimensional model. block_ZIFA.fitModel takes the same arguments
# if the block algorithm is wanted instead (it must be imported first).
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
Z, model_params = ZIFA.fitModel(df1.values, 2)

print('ZIFA finished')
print(Z)
Ejemplo n.º 12
0
    for i in indices_of_interest:
        genes_of_interest.append(df_trans.columns.values[i])

    genes = genes_of_interest

# Keep only the rows whose index is in the selected gene list and save them
# next to the input file given on the command line.
subset_df = df_clean[df_clean.index.isin(genes)]
subset_df.to_csv(os.path.join(os.path.dirname(sys.argv[1]), "_DFresult.txt"),
                 sep="\t")

variance = subset_df.var(axis=0)  # variance of each column

if dim_red_method == 'ZIFA':
    # ZIFA is fitted on log(1 + x) transformed values.
    f = lambda x: np.log(1 + x)
    logDF = subset_df.applymap(f)

    # The table is transposed before fitting.
    transposed_ZIFA = logDF.transpose()
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    Z_trans, MP_trans = ZIFA.fitModel(transposed_ZIFA.values, 2)

    # Split the two latent dimensions into separate coordinate lists.
    X = [row[0] for row in Z_trans]
    Y = [row[1] for row in Z_trans]
    # The column names say 'tSNE' although these coordinates come from ZIFA;
    # they are kept unchanged because they are runtime column labels.
    df1 = pd.DataFrame({
        'tSNEx': X,
        'tSNEy': Y,
        'variance': variance,
        'classif': classification_vector.values
    })

if dim_red_method == 'TSNE':
    pca = PCA(n_components=15)
    pcaF = pca.fit(df_clean) #THIS WILL NOT TAKE A SUBSET OF THE GENES

    X= (pca.components_).transpose()
    #X = subset_df.transpose()
    n_samples, n_features = X.shape[0],X.shape[1]