Example 1
def _compute_ksstatistic((cache_dir, images, control_images,
                          normalization_name, preprocess_file)):
    import numpy as np
    import sys
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import normalizations
    from cpf.profiling.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, variables, _ = cache.load(images,
                                              normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images,
                                                   normalization=normalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
        variables = preprocessor.variables
    #downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j],
                              normalizeddata[:, j],
                              signed=True)[0]
    return profile
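For context, the loop above computes one two-sample Kolmogorov-Smirnov statistic per feature, comparing the treated distribution against the controls. The sketch below reproduces that pattern on synthetic data using SciPy's standard ks_2samp; note that cpf's signed=True variant also encodes the direction of the shift, while SciPy's statistic is always non-negative. All data here is illustrative.

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.RandomState(0)
control = rng.normal(0.0, 1.0, size=(500, 4))  # stand-in for control_data
treated = rng.normal(0.3, 1.0, size=(200, 4))  # stand-in for normalizeddata

profile = np.empty(control.shape[1])
for j in range(control.shape[1]):
    # ks_2samp returns (statistic, pvalue); keep the statistic.
    profile[j] = ks_2samp(control[:, j], treated[:, j])[0]
print(profile)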
Example 2
def _compute_svmnormalvector((cache_dir, images, control_images,
                              normalization_name, preprocess_file, rfe)):
    import numpy as np
    import sys
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import RobustLinearNormalization, normalizations
    from sklearn.svm import LinearSVC
    from cpf.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images, normalization=normalization)
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    # Draw a random subset of controls the same size as the treated sample.
    downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because the original is read-only (normal_vector.flags.writeable == False)
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
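This example treats profiling as a classification problem: fit a linear SVM that separates treated cells from an equal-sized sample of control cells, and use the normal vector of the separating hyperplane as the profile. A minimal self-contained sketch of that core step, on synthetic data:

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
treated = rng.normal(0.5, 1.0, size=(200, 6))
control = rng.normal(0.0, 1.0, size=(200, 6))

x = np.vstack((treated, control))
y = np.array([1] * len(treated) + [0] * len(control))

clf = LinearSVC(C=1.0).fit(x, y)
# For a binary problem, coef_ has shape (1, n_features); its single row
# is the hyperplane normal that serves as the profile.
normal_vector = clf.coef_[0]
print(normal_vector.shape)  # (6,)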
Example 3
def _compute_ksstatistic((cache_dir, images, control_images, normalization_name,
                          preprocess_file)):
    import numpy as np 
    import sys
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import normalizations
    from cpf.profiling.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, variables, _ = cache.load(images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images, normalization=normalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
        variables = preprocessor.variables
    #downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
Example 4
def _compute_svmnormalvector((cache_dir, images, control_images,
                              normalization_name, preprocess_file, rfe)):
    import numpy as np
    import sys
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import RobustLinearNormalization, normalizations
    from sklearn.svm import LinearSVC
    from cpf.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(
        images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images,
                                                   normalization=normalization)
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[
        np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because the original is read-only (normal_vector.flags.writeable == False)
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
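_compute_rfe is internal to cpf and is not shown here, but judging from its use it returns a boolean mask of the features retained by recursive feature elimination. Under that assumption, scikit-learn's RFE can play the same role in a standalone sketch:

import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
x = np.vstack((rng.normal(0.5, 1.0, size=(200, 6)),
               rng.normal(0.0, 1.0, size=(200, 6))))
y = np.array([1] * 200 + [0] * 200)

normal_vector = np.array(LinearSVC(C=1.0).fit(x, y).coef_[0])
selector = RFE(LinearSVC(C=1.0), n_features_to_select=3).fit(x, y)
# selector.support_ marks the surviving features; zero out the rest,
# mirroring the rfe branch above.
normal_vector[~selector.support_] = 0
print(normal_vector)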
Example 5
def _compute_group_mean((cache_dir, images, normalization_name, 
                         preprocess_file, method)):
    try:
        import numpy as np
        import cpf
        from cpf.profiling.cache import Cache
        from cpf.profiling.normalization import normalizations
        from scipy.stats import norm as Gaussian
        from scipy.stats import mode
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)
        
        cellcount = np.ones(1) * data.shape[0]
        if method == 'cellcount':
            return cellcount
        
        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan

        data = data[~np.isnan(np.sum(data, 1)), :]

        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan

        if preprocess_file:
            preprocessor = cpf.util.unpickle1(preprocess_file)
            data = preprocessor(data)

        if method == 'mean':
            return np.mean(data, axis=0)
        elif method == 'mean+std':
            return np.hstack((np.mean(data, axis=0), np.std(data, axis=0)))
        elif method == 'mode':
            return mode(data, axis=0)
        elif method == 'median':
            return np.median(data, axis=0)
        elif method == 'median+mad':
            c = Gaussian.ppf(3/4.)
            d = np.median(data, axis=0)
            return np.hstack((d,
                              np.median((np.fabs(data-d)) / c, axis=0)))
        elif method == 'gmm2':
            max_sample_size = 2000
            if data.shape[0] > max_sample_size:
                data = data[np.random.random_integers(0, data.shape[0] - 1,
                                                      size=max_sample_size), :]
            from sklearn.decomposition import PCA
            from sklearn.mixture import GMM
            pca = PCA(n_components=0.99).fit(data)
            pca_data = pca.transform(data)
            #gmm = GMM(2, covariance_type='full', n_iter=100000, thresh=1e-7).fit(pca_data)
            gmm = GMM(2, covariance_type='full').fit(pca_data)
            return pca.inverse_transform(gmm.means_).flatten()
        elif method == 'deciles':
            return np.hstack(map(lambda d: np.percentile(data, d, axis=0),
                                 range(10, 100, 10)))
        elif method == 'mean+deciles':
            return np.hstack((np.mean(data, axis=0),
                              np.hstack(map(lambda d: np.percentile(data, d, axis=0),
                                            range(10, 100, 10)))))
    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
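A note on the 'median+mad' branch: dividing the median absolute deviation by Gaussian.ppf(3/4.) ≈ 0.6745 rescales it into a consistent estimator of the standard deviation for normally distributed data. A quick numerical check:

import numpy as np
from scipy.stats import norm as Gaussian

rng = np.random.RandomState(0)
data = rng.normal(10.0, 2.0, size=100000)

c = Gaussian.ppf(3 / 4.)  # ~0.6745
d = np.median(data)
mad_sigma = np.median(np.fabs(data - d)) / c
print(mad_sigma, np.std(data))  # both close to 2.0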
Example 6
def _compute_group_mean((cache_dir, images, normalization_name,
                         preprocess_file, method)):
    try:
        import numpy as np
        import cpf
        from cpf.profiling.cache import Cache
        from cpf.profiling.normalization import normalizations
        from scipy.stats import norm as Gaussian
        from scipy.stats import mode
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)

        cellcount = np.ones(1) * data.shape[0]
        if method == 'cellcount':
            return cellcount

        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan

        data = data[~np.isnan(np.sum(data, 1)), :]

        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan

        if preprocess_file:
            preprocessor = cpf.util.unpickle1(preprocess_file)
            data = preprocessor(data)

        if method == 'mean':
            return np.mean(data, axis=0)
        elif method == 'mean+std':
            return np.hstack((np.mean(data, axis=0), np.std(data, axis=0)))
        elif method == 'mode':
            return mode(data, axis=0)
        elif method == 'median':
            return np.median(data, axis=0)
        elif method == 'median+mad':
            c = Gaussian.ppf(3 / 4.)
            d = np.median(data, axis=0)
            return np.hstack((d, np.median((np.fabs(data - d)) / c, axis=0)))
        elif method == 'gmm2':
            max_sample_size = 2000
            if data.shape[0] > max_sample_size:
                data = data[np.random.random_integers(
                    0, data.shape[0] - 1, size=max_sample_size), :]
            from sklearn.decomposition import PCA
            from sklearn.mixture import GMM
            pca = PCA(n_components=0.99).fit(data)
            pca_data = pca.transform(data)
            #gmm = GMM(2, covariance_type='full', n_iter=100000, thresh=1e-7).fit(pca_data)
            gmm = GMM(2, covariance_type='full').fit(pca_data)
            return pca.inverse_transform(gmm.means_).flatten()
        elif method == 'deciles':
            return np.hstack(
                map(lambda d: np.percentile(data, d, axis=0),
                    range(10, 100, 10)))
        elif method == 'mean+deciles':
            return np.hstack((np.mean(data, axis=0),
                              np.hstack(
                                  map(lambda d: np.percentile(data, d, axis=0),
                                      range(10, 100, 10)))))
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
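The 'gmm2' branch summarizes a group by fitting a two-component Gaussian mixture in PCA space and returning the component means mapped back to feature space. The sklearn.mixture.GMM class used above has since been removed from scikit-learn; below is a sketch of the same computation with the current GaussianMixture API, on synthetic data:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
data = np.vstack((rng.normal(0.0, 1.0, size=(500, 10)),
                  rng.normal(3.0, 1.0, size=(500, 10))))

# Keep enough components to explain 99% of the variance, as above.
pca = PCA(n_components=0.99).fit(data)
pca_data = pca.transform(data)
gmm = GaussianMixture(n_components=2, covariance_type='full').fit(pca_data)
# Map the two component means back to the original feature space.
profile = pca.inverse_transform(gmm.means_).flatten()
print(profile.shape)  # (20,) == 2 * n_features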