Exemple #1
0
def unitTests():
    """
    Regression-test ZIFA and block ZIFA against stored reference values.

    Two simulated data sets are generated from fixed random seeds and fitted
    with ZIFA.fitModel and block_ZIFA.fitModel; selected entries of the
    results are compared with previously recorded numbers.

    Latent positions (Zhat) and loadings (A) are compared by absolute value:
    a latent factor is only determined up to its sign, so a sign flip between
    package versions is harmless and must not fail the test. Every result
    that passed the previous signed comparison still passes.

    Raises:
        AssertionError: if any fitted value deviates from its reference.
    """
    def same_up_to_sign(actual, expected):
        # Compare magnitudes only; see the docstring for the rationale.
        return np.allclose(np.abs(actual), np.abs(expected))

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    _, Y, _, _ = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = ZIFA.fitModel(Y, k)
    assert same_up_to_sign(Zhat[-1, :], [1.50067515, 0.04742477])
    assert same_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert np.allclose(params['decay_coef'], 0.10458794970222711)
    assert np.allclose(params['sigmas'][0], 0.30219903)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    # this is slightly different (though highly correlated) because ZIFA runs one extra half-step of EM
    assert same_up_to_sign(Zhat[-1, :], [1.49712162, 0.05823952])
    assert same_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert np.allclose(params['decay_coef'], 0.10458794970222711)
    assert np.allclose(params['sigmas'][0], 0.30219903)

    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert same_up_to_sign(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])

    # Second configuration: fewer samples than features, three latent dimensions.
    n = 50
    d = 60
    k = 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    _, Y, _, _ = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert same_up_to_sign(Zhat[-1, :], [-1.69609638, -0.5475882, 0.08008015])

    # A fresh data set is drawn here on purpose: the reference values below
    # depend on the random state left behind by the previous draw.
    _, Y, _, _ = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert same_up_to_sign(Zhat[-1, :], [-0.63075905, -0.77361427, -0.11544281])

    print('Tests passed!')
Exemple #2
0
def testAlgorithm():
    """
    Simulate zero-inflated data and plot true versus estimated latent positions.

    Draws one simulated data set from fixed random seeds, fits block ZIFA and
    scikit-learn's FactorAnalysis to it, and shows a three-panel figure: the
    true latent positions, the ZIFA estimate and the factor-analysis estimate.
    Points are coloured by cluster. The function returns nothing; it blocks
    in plt.show() until the figure window is closed.
    """
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    factor_analysis_Zhat = FactorAnalysis(n_components=k).fit_transform(Y)

    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(set(ids))

    def plot_panel(position, points, panel_title):
        """Scatter the first two columns of `points`, one colour per cluster."""
        plt.subplot(position)
        for cluster_id in cluster_ids:
            in_cluster = ids == cluster_id
            # colors is indexed with cluster_id - 1; presumably the simulated
            # cluster ids start at 1 -- confirm against the generator.
            plt.scatter(points[in_cluster, 0],
                        points[in_cluster, 1],
                        color=colors[cluster_id - 1],
                        s=4)
        # Axis limits and title do not depend on the cluster, so they are set
        # once per panel instead of once per loop iteration.
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title(panel_title)

    plt.figure(figsize=[15, 5])
    plot_panel(131, Z,
               'True Latent Positions\nFraction of Zeros %2.3f' %
               (Y == 0).mean())
    plot_panel(132, Zhat, 'ZIFA Estimated Latent Positions')
    plot_panel(133, factor_analysis_Zhat,
               'Factor Analysis Estimated Latent Positions')

    plt.show()
Exemple #3
0
def train_ZIFA(input_data,
               feature_names,
               sample_names,
               outfile,
               use_block=False):
    """
    Fit a two-factor ZIFA model and write its results to CSV files.

    Args:
        input_data: sequence of views; each view is a sequence of arrays that
            are stacked row-wise (samples across groups). The stacked views
            are then joined column-wise (features across views).
        feature_names: sequence of per-view feature-name arrays; concatenated
            to label the rows of the parameter tables.
        sample_names: sequence of sample-name arrays; concatenated to label
            the rows of the latent-position table.
        outfile: path prefix for the output files.
        use_block: fit with block_ZIFA instead of ZIFA.

    Writes `<outfile>_ZIFA_Z.csv` (latent positions) and
    `<outfile>_ZIFA_A.csv`, `<outfile>_ZIFA_mus.csv`,
    `<outfile>_ZIFA_sigmas.csv` (model parameters). Returns None.
    """
    from ZIFA import ZIFA

    # Use the `input_data` argument; the previous body iterated over an
    # undefined name `data`.
    X = np.hstack([np.vstack(view) for view in input_data])

    # Drop samples that contain nothing but NaNs.
    keep_sample = (~np.isnan(X)).sum(axis=1) > 0
    sample_names = np.concatenate(sample_names)[keep_sample]
    X = X[keep_sample, :]

    # One flat name per column of X.
    feature_names = np.concatenate(feature_names)

    if use_block:
        from ZIFA import block_ZIFA
        p0_thresh = 0.95
        Z, model_params = block_ZIFA.fitModel(X, K=2, p0_thresh=p0_thresh)
        # Keep only the names of features whose zero fraction is within the
        # threshold, so that they line up with the fitted parameters. The
        # mask has one entry per column of X and therefore has to be applied
        # to the flat (already concatenated) name array.
        # NOTE(review): assumes block_ZIFA drops columns by this same rule --
        # confirm against the installed ZIFA version.
        feature_names = feature_names[(X == 0).mean(axis=0) <= p0_thresh]
    else:
        Z, model_params = ZIFA.fitModel(X, K=2)

    pd.DataFrame(Z, index=sample_names).to_csv(outfile + "_ZIFA_Z.csv")
    for key in ('A', 'mus', 'sigmas'):
        pd.DataFrame(model_params[key],
                     index=feature_names).to_csv(outfile + "_ZIFA_" + key +
                                                 ".csv")
Exemple #4
0
def ziMean(adata, groupby, organism="mmusculus"):
    """Calculate dropout-adjusted mean expression per gene and group.

    A block ZIFA model is fitted to a gene-filtered copy of `adata` to obtain
    its decay coefficient. For every gene in every group the dropout
    probability is estimated as exp(-decay_coef * m**2), where m is the mean
    of the non-zero values; the number of expressing cells is scaled up
    accordingly (capped at the group size) and a new mean is computed from
    the imputed counts.

    The result is stored in `adata.uns["gene_call"]`; `adata` itself is
    returned. Filtering and fitting are done on a copy.
    """

    import ZIFA.block_ZIFA as zf
    import scanpy as sc

    filtered = filter_genes(adata.copy(), organism=organism)
    sc.pp.filter_genes(filtered, min_counts=1)

    # Only the fitted parameters are needed, not the latent positions.
    _, fitted_params = zf.fitModel(filtered.X, 2, singleSigma=True)
    dc = fitted_params["decay_coef"]
    expression_df = get_adata_df(filtered)

    def ZImean(countmatrix):
        """Return {gene: adjusted mean} for the rows of `countmatrix`. Can be used with groupby.aggregate."""

        import numpy as np

        adjusted_means = {}
        for gene, expression in countmatrix.iterrows():
            detected = expression[expression > 0]
            n_cells = len(expression)
            n_detected = len(detected)

            if n_detected == 0:
                adjusted_means[gene] = 0
                continue

            non_zero_mean = np.mean(detected)
            dropout_prob = np.exp(-dc * non_zero_mean**2)
            # Estimated number of expressing cells had there been no dropout.
            n_estimated = int(
                np.round(n_detected / (1 - dropout_prob), decimals=0))
            n_estimated = min(n_estimated, n_cells)

            imputed = [non_zero_mean] * n_estimated + [0] * (n_cells -
                                                             n_estimated)
            adjusted_means[gene] = np.mean(imputed)

        return adjusted_means

    grouped = expression_df.groupby(filtered.obs[groupby], axis=1)
    adata.uns.update({"gene_call": grouped.aggregate(ZImean)})
    return adata
Exemple #5
0
def testAlgorithm():
    """
    Visual sanity check of block ZIFA on simulated data.

    Generates one simulated data set from fixed random seeds, fits block ZIFA
    and scikit-learn's FactorAnalysis, and displays three scatter panels
    (true latent positions, ZIFA estimate, factor-analysis estimate) coloured
    by cluster. Returns nothing; blocks in plt.show() until the window closes.
    """
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(set(ids))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    # (subplot position, points to draw, panel title)
    panels = [
        (131, Z, 'True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean()),
        (132, Zhat, 'ZIFA Estimated Latent Positions'),
        (133, factor_analysis_Zhat, 'Factor Analysis Estimated Latent Positions'),
    ]

    plt.figure(figsize=[15, 5])
    for position, points, panel_title in panels:
        plt.subplot(position)
        for cluster_id in cluster_ids:
            members = ids == cluster_id
            # colors is indexed with cluster_id - 1; presumably cluster ids
            # are 1-based -- confirm against the generator.
            plt.scatter(points[members, 0], points[members, 1], color=colors[cluster_id - 1], s=4)
        # Limits and title are per panel, not per cluster, so set them once.
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title(panel_title)

    plt.show()
Exemple #6
0
def main():
    """
    Command-line entry point: fit ZIFA to a CSV file and save the projection.

    Reads the positional `input_file`, drops its 'Unnamed: 0' column,
    transposes the remaining table and fits either ZIFA or (with -b/--block)
    block ZIFA with the requested number of dimensions. The latent positions
    are written to `output_file` as comma-separated text.
    """
    parser = ArgumentParser(description="Fit a ZIFA model on the data.")
    parser.add_argument('-b', '--block', action='store_true', default=False,
                        help="Whether the block algorithm should be used.")
    parser.add_argument('-d', '--dimensions', type=int, default=2,
                        help="The number of dimensions [2].")
    parser.add_argument('input_file', type=str, help="The input CSV file.")
    parser.add_argument('output_file', type=str, help="The output CSV file.")
    options = parser.parse_args()

    frame = read_csv(options.input_file)
    del frame['Unnamed: 0']

    # The table is transposed before fitting; presumably the CSV stores one
    # gene per row and fitModel expects one sample per row -- confirm.
    Y = np.array(frame).T

    fit = block_ZIFA.fitModel if options.block else ZIFA.fitModel
    Z, _model_params = fit(Y, options.dimensions)

    np.savetxt(options.output_file, Z, delimiter=',')
Exemple #7
0
                valid_set_index.append(index)
        print valid_set_index
        #valid_set_index=np.random.choice(train_data.shape[0],size=s,replace=False)
        train_set_index = [
            x for x in range(train_data.shape[0]) if x not in valid_set_index
        ]
        valid_data = train_data[valid_set_index, :]
        #valid_valid=train_valid[valid_set_index,:]
        #train_data=train_data[train_set_index,:]
        #train_valid=train_valid[train_set_index,:]
        Y = valid_data
        valid_Y = labeled_label[valid_set_index]
        print 'before shape: ', Y.shape
        #Y=Y[:, np.sum(Y >1e-6, axis=0)/float(Y.shape[0])>0.9]#keep genes that are expressed in 90% samples
        print 'after shape: ', Y.shape
        code, model_params = block_ZIFA.fitModel(Y, 100)
        print code.shape
        #print Z

    else:
        #if args.n_component==0:
        #    pca=PCA()
        #else:
        print 'fitting data:' + args.fit
        if args.use_nmf == 1:
            #transform_data =
            if args.validation_cell_types > 0:
                transform_data = output_dict['test_X_TPMgn0']
                #print 'vct!'
            else:
                #print 'not vct'
Exemple #8
0
## set the Pancreatic folder as the working directory
from datetime import datetime
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import numpy
# Load the comma-separated matrix, skipping its header line.
Y = numpy.loadtxt("Results/forZifa.csv", delimiter=",", skiprows=1)
startTime = datetime.now()
# Fit block ZIFA with four latent dimensions.
Z4, model_params = block_ZIFA.fitModel(Y, 4)
# Report the elapsed fitting time. The parenthesised single-argument form
# prints the same text on Python 2 and is valid syntax on Python 3.
print(datetime.now() - startTime)
numpy.savetxt("Results/Z4.csv", Z4, delimiter=",")
X = Y
Exemple #9
0
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

# Example: embed an expression matrix with block ZIFA, cluster the embedding
# with k-means and score the clustering against the known labels.
# fitModel expects samples as rows and genes as columns, so the matrix read
# from yan/yan.csv is transposed first.
# pandas' own `.values` is used for the conversion to arrays because numpy is
# not imported in this script.
X = pd.read_csv('yan/yan.csv', header=None).values
X = X.transpose()

label = pd.read_csv('yan/yan_label.csv')
y = label.values
label = y.ravel()

Z, model_params = block_ZIFA.fitModel(X, 5)

# The largest label is used as the number of clusters.
# NOTE(review): this is only correct if the labels are 1..c -- confirm.
c = label.max()
kk = KMeans(n_clusters=c)
julei = kk.fit(Z)
julei = julei.labels_

print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten()))
print('ARI value is %f \n' % ari(julei.flatten(), label.flatten()))
print('HOM value is %f \n' % metrics.homogeneity_score(julei, label))
print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
def unitTests():
    """
    Regression-test ZIFA and block ZIFA against stored reference values.

    Simulated data sets are generated from fixed random seeds and fitted with
    both implementations. Selected latent positions and parameters must match
    the recorded numbers within `absolute_tolerance`, and the input matrix
    must be left unmodified by every fit.

    Raises:
        AssertionError: if a value deviates or the input matrix was changed.
    """
    version_banner = (
        "\n\n\n****Running unit tests!\nIMPORTANT: These unit tests pass with:\n"
        "    Python version 2.7.10 (your version: %s)\n"
        "    numpy 1.13.1 (your version: %s)\n"
        "    scipy 0.18.1 (your version: %s)\n"
        "    sklearn 0.16.1 (your version: %s)")
    installed_versions = (platform.python_version(), np.__version__,
                          scipy.__version__, sklearn.__version__)
    print(version_banner % installed_versions)
    print(
        "Different versions of Python or those packages may yield slightly different results and fail to pass the asserts unless you increase the absolute_tolerance parameter, set below."
    )
    print(
        "If your configuration yields significantly different results, please contact [email protected].\n\n"
    )

    absolute_tolerance = 1e-8

    def matches(actual, reference):
        """Signed comparison within the absolute tolerance."""
        return np.allclose(actual, reference, atol=absolute_tolerance)

    def matches_up_to_sign(actual, reference):
        """Magnitude-only comparison within the absolute tolerance."""
        return np.allclose(np.abs(actual),
                           np.abs(reference),
                           atol=absolute_tolerance)

    random.seed(35)
    np.random.seed(32)

    n, d, k = 200, 20, 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)

    # --- plain ZIFA ---
    Zhat, params = ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)

    # for Z and A, we compare the absolute values of the parameters because some package versions appear to flip the sign (which is fine and will not affect results)
    assert matches_up_to_sign(Zhat[-1, :], [1.50067515, 0.04742477])
    assert matches_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert matches(params['decay_coef'], 0.10458794970222711)
    assert matches(params['sigmas'][0], 0.30219903)

    # --- block ZIFA, default number of blocks ---
    Zhat, params = block_ZIFA.fitModel(Y, k)
    assert np.allclose(Y, old_Y)
    # this is slightly different (though highly correlated) because ZIFA runs one extra half-step of EM
    assert matches_up_to_sign(Zhat[-1, :], [1.49712162, 0.05823952])
    assert matches_up_to_sign(params['A'][0, :], [0.66884415, -0.17173555])
    assert matches(params['decay_coef'], 0.10458794970222711)
    assert matches(params['sigmas'][0], 0.30219903)

    # --- block ZIFA, three blocks ---
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert matches_up_to_sign(Zhat[-1, :], [9.84455438e-01, 4.50924335e-02])

    # Second configuration: more features than samples, three latent dimensions.
    n, d, k = 50, 60, 3
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = block_ZIFA.fitModel(Y, k, n_blocks=3)
    assert np.allclose(Y, old_Y)
    assert matches_up_to_sign(Zhat[-1, :],
                              [-1.69609638, -0.5475882, 0.08008015])

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)
    old_Y = deepcopy(Y)
    Zhat, params = ZIFA.fitModel(Y, k)
    print(Zhat[-1, :])
    assert matches_up_to_sign(Zhat[-1, :],
                              [-0.63075905, -0.77361427, -0.11544281])
    assert np.allclose(Y, old_Y)

    print('Tests passed with absolute tolerance %2.3e!' % absolute_tolerance)
Exemple #11
0
 def __init__(self, matrix, K, barcodes=None):
     """Fit block ZIFA with K latent dimensions to `matrix` and cluster the result.

     Stores the latent positions in `self.results`, the fitted parameters in
     `self.model_params` and a Cluster built from the positions in `self.clu`.
     """
     # Run the DR base-class initialiser first (this class inherits from DR).
     DR.__init__(self, matrix=matrix, barcodes=barcodes)
     self.name = "ZIFA"

     latent, fitted_params = block_ZIFA.fitModel(matrix, K)
     self.results = latent
     self.model_params = fitted_params

     # Cluster is given the latent positions cast to double precision.
     self.clu = Cluster(self.results.astype("double"), autoplot=False)
Exemple #12
0
	print('Fraction of zeros: %2.3f; decay coef: %2.3f' % ((Y == 0).mean(), decay_coef))

	return X, Y, Z.transpose(), cluster_ids


# Demo script: simulate data, fit block ZIFA and factor analysis, and plot
# the latent positions.

# Seed both random number generators before simulating the data.
random.seed(35)
np.random.seed(32)
# Arguments passed to the data generator and to the fits below.
n = 200
d = 20
k = 2
sigma = .3
n_clusters = 3
decay_coef = .1
X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef)

# Fit block ZIFA and, for comparison, FactorAnalysis with k components.
Zhat, params = block_ZIFA.fitModel(Y, k)
colors = ['red', 'blue', 'green']
cluster_ids = sorted(list(set(ids)))
model = FactorAnalysis(n_components = k)
factor_analysis_Zhat = model.fit_transform(Y)
figure(figsize = [15, 5])
# Panel 1: the Z returned by the generator, one colour per cluster.
subplot(131)
for id in cluster_ids:
	# colors is indexed with id - 1; presumably cluster ids are 1-based -- TODO confirm
	scatter(Z[ids == id, 0], Z[ids == id, 1], color = colors[id - 1], s = 4)
	title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
	xlim([-4, 4])
	ylim([-4, 4])
# Panel 2: the Zhat returned by block ZIFA.
subplot(132)
for id in cluster_ids:
	scatter(Zhat[ids == id, 0], Zhat[ids == id, 1], color = colors[id - 1], s = 4)
	xlim([-4, 4])
 def run(self, data):
     """Fit block ZIFA with `self.k` dimensions to log1p-transformed `data`.

     Sparse input is converted to a dense array first. The matrix is
     transposed before fitting and the latent positions are transposed back.

     Returns:
         A tuple `([latent_positions_transposed], 0)`.
     """
     dense = data.toarray() if sparse.issparse(data) else data
     log_counts = np.log1p(dense)
     latent, _ = block_ZIFA.fitModel(log_counts.T, self.k)
     return [latent.T], 0
Exemple #14
0
def runZIFA():
    """
    Run block ZIFA on a tab-separated matrix and write the result as JSON.

    Command-line arguments:
        sys.argv[1]: input file. The first line is a header whose fields
            (except the first) are the cell names; on every following line
            the first field is dropped and the remaining fields are parsed as
            floats. Values below 1e-7 are set to 0.
        sys.argv[2]: output folder. Receives `log.zifa.txt` (everything
            printed while fitting) and `output.json`.

    On success `output.json` holds one list per latent dimension ('PC1' to
    'PC5', only those that exist) plus 'text' with the cell names. If fitting
    fails, `output.json` holds 'displayed_error' and 'original_error' (the
    captured log) and the exception is re-raised.
    """
    random.seed(42)
    np.random.seed(42)

    # Single-argument calls print the same text on Python 2 and Python 3.
    print('Number of arguments: %d arguments.' % len(sys.argv))
    print('Argument List: %s' % str(sys.argv))

    inputfilename = sys.argv[1]
    outputfolder = sys.argv[2]
    log_path = outputfolder + "/log.zifa.txt"
    json_path = outputfolder + "/output.json"

    with open(inputfilename, 'r') as infile:
        lines = infile.readlines()

    # Header line: drop the first field, keep the cell names.
    cell_names = lines.pop(0).rstrip('\r\n').split('\t')[1:]

    input_alldimensions = []
    for line in lines:
        fields = line.rstrip('\r\n').split('\t')[1:]
        values = [float(field) for field in fields]
        # Treat values below 1e-7 as exact zeros.
        input_alldimensions.append(
            [0.0 if value < 0.0000001 else value for value in values])

    # Transpose the parsed table so that each row belongs to one cell.
    alldim = np.asarray(input_alldimensions).transpose()

    previous_stdout = sys.stdout
    try:
        with open(log_path, 'w') as log_file:
            # Capture everything fitModel prints in the log file.
            sys.stdout = log_file
            try:
                Zhat, _ = block_ZIFA.fitModel(alldim,
                                              min(5, len(cell_names)))
            finally:
                # Always restore stdout -- also when fitting fails -- so that
                # it never stays bound to the closed log file.
                sys.stdout = previous_stdout
    except Exception as err:
        # err.args may be empty, and exceptions are not indexable on Python 3.
        errorMsg = str(err.args[0]) if err.args else str(err)
        if errorMsg.startswith("Your input matrix contains no zeros"):
            displayed_error = "Zifa is not converging. This can be due to an input matrix which contains no zeros. Zifa input should be log read counts. You can try another filtering/normalization but this may not solve the issue for this dataset."
        else:
            displayed_error = errorMsg
        with open(log_path, 'r') as log_file:
            original_error = log_file.read()
        output_json = {
            'displayed_error': displayed_error,
            'original_error': original_error,
        }
        with open(json_path, 'w') as outfile:
            json.dump(output_json, outfile)
        raise

    component_keys = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    components = dict((key, []) for key in component_keys)
    cell_labels = []
    for i, row in enumerate(Zhat):
        # zip stops at the shorter sequence, so a row with fewer than five
        # dimensions only fills the components it has.
        for key, value in zip(component_keys, row):
            components[key].append(value)
        cell_labels.append(cell_names[i])

    # Only components that received values are written.
    output_json = {}
    for key in component_keys:
        if components[key]:
            output_json[key] = components[key]
    output_json['text'] = cell_labels

    with open(json_path, 'w') as outfile:
        json.dump(output_json, outfile)
import numpy as np
from ZIFA import ZIFA
from ZIFA import block_ZIFA
import pandas as pd

# This is an example of how to read in a real data set stored in a file called input.table.
# Genes are columns, samples are rows, and the values are separated by a single space.
# If you do not want to install pandas, you can also use np.loadtxt: https://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html

# Parse the space-separated table (pandas uses the first line as the header
# by default) and convert it to a plain numpy array.
file = pd.read_csv('input.table', sep=' ')
table = np.array(file)
# Fit block ZIFA with 5 latent dimensions.
Z, model_params = block_ZIFA.fitModel(table, 5)
# Save Z as text with two decimal places.
np.savetxt('output.ZIFA.table', Z, fmt='%.2f')
Exemple #16
0
# Seed numpy's random number generator.
np.random.seed(32)

# Load expression data
rnaseq_file = os.path.join('data', input_file)

rnaseq_df = pd.read_table(rnaseq_file, index_col=0)
# Transpose so that the columns of the file become the rows; the transposed
# index is what labels the rows of the output tables below.
rnaseq_df = rnaseq_df.T
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
# ndarray and is available in old and new pandas versions alike.
rnaseq_exp = rnaseq_df.values

# Perform uMAP dimension reduction on expression data
if method == "umap":
    embedding = umap.UMAP(n_neighbors=10, min_dist=0.1,
                          metric='correlation').fit_transform(rnaseq_exp)
    umap_out = pd.DataFrame(embedding, columns=['1', '2'])
    umap_out.index = rnaseq_df.index
    umap_out.index.name = 'id'
    umap_out_file = os.path.join('../features',
                                 input_file + '_rnaseq_umap_features.tsv')
    umap_out.to_csv(umap_out_file, sep='\t')

# Perform ZIFA dimension reduction on expression data
elif method == "ZIFA":
    k = 2
    Zhat, params = block_ZIFA.fitModel(rnaseq_exp, k)
    zifa_out = pd.DataFrame(Zhat, columns=['1', '2'])
    zifa_out.index = rnaseq_df.index
    zifa_out.index.name = 'id'
    zifa_out_file = os.path.join('../features',
                                 input_file + '_rnaseq_ZIFA_features.tsv')
    zifa_out.to_csv(zifa_out_file, sep='\t')
 def fit_transform(self, data):
     """Fit block ZIFA with `self.k` dimensions to `data`.

     The second value returned by fitModel is kept in `self.model`; the
     first (the embedding) is returned.
     """
     fit_result = block_ZIFA.fitModel(data, self.k)
     latent, self.model = fit_result
     return latent
Exemple #18
0
 def apply(self):
     """Fit block ZIFA to `self.matrix` and store the latent positions in `self.results`.

     Uses `self.n_components` dimensions and `self.n_blocks` blocks; the
     fitted parameters are discarded. Returns None.
     """
     self.results, _ = block_ZIFA.fitModel(
         self.matrix,
         self.n_components,
         n_blocks=self.n_blocks,
     )