def _compute_svmnormalvector((cache_dir, images, control_images,
                              normalization_name, preprocess_file, rfe)):
    import numpy as np
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import normalizations
    from sklearn.svm import LinearSVC
    from cpf.profiling.profile_svmnormalvector import _compute_rfe
    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(
        images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(
        control_images, normalization=normalization)
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    # Downsample the controls so both classes have the same size.
    downsampled = control_data[
        np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because the original is immutable
        # (normal_vector.flags.writeable == False).
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
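# A minimal sketch (not part of the original module) of the idea above:
# train a linear SVM to separate treated from control cells and read the
# fitted hyperplane's normal vector off coef_. Data here is synthetic.
def _demo_svm_normal_vector():
    import numpy as np
    from sklearn.svm import LinearSVC
    rng = np.random.RandomState(0)
    control = rng.normal(0, 1, (200, 5))   # "control" population
    treated = rng.normal(0, 1, (200, 5))
    treated[:, 2] += 2.0                   # shift one feature in "treated"
    x = np.vstack((treated, control))
    y = np.array([1] * len(treated) + [0] * len(control))
    normal_vector = LinearSVC(C=1.0).fit(x, y).coef_[0]
    # The largest-magnitude coefficient should be feature 2, the one shifted.
    return normal_vector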
def _compute_ksstatistic((cache_dir, images, control_images,
                          normalization_name, preprocess_file)):
    import numpy as np
    import cpf
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import normalizations
    from cpf.profiling.ks_2samp import ks_2samp
    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, variables, _ = cache.load(images,
                                              normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images,
                                                   normalization=normalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
        variables = preprocessor.variables
    # One signed KS statistic per feature: treated vs. control distribution.
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
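# A small sketch (synthetic data, not part of the original module) of the
# per-feature profile computed above. scipy's ks_2samp is unsigned; the
# cpf.profiling.ks_2samp variant used above additionally carries a sign
# indicating the direction of the distribution shift.
def _demo_ks_profile():
    import numpy as np
    from scipy.stats import ks_2samp
    rng = np.random.RandomState(0)
    control = rng.normal(0, 1, (500, 3))
    treated = rng.normal(0, 1, (500, 3))
    treated[:, 0] += 1.0                   # shift feature 0 only
    profile = np.array([ks_2samp(control[:, j], treated[:, j])[0]
                        for j in range(treated.shape[1])])
    return profile                         # feature 0 has the largest statistic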
def _compute_group_mean((cache_dir, images, normalization_name,
                         preprocess_file, method)):
    try:
        import numpy as np
        import cpf
        from cpf.profiling.cache import Cache
        from cpf.profiling.normalization import normalizations
        from scipy.stats import norm as Gaussian
        from scipy.stats import mode
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)
        cellcount = np.ones(1) * data.shape[0]
        if method == 'cellcount':
            return cellcount
        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan
        # Discard rows that contain NaNs.
        data = data[~np.isnan(np.sum(data, 1)), :]
        if len(data) == 0:
            return np.empty(len(colnames)) * np.nan
        if preprocess_file:
            preprocessor = cpf.util.unpickle1(preprocess_file)
            data = preprocessor(data)
        if method == 'mean':
            return np.mean(data, axis=0)
        elif method == 'mean+std':
            return np.hstack((np.mean(data, axis=0), np.std(data, axis=0)))
        elif method == 'mode':
            return mode(data, axis=0)
        elif method == 'median':
            return np.median(data, axis=0)
        elif method == 'median+mad':
            # MAD scaled by 1 / Gaussian.ppf(3/4) so it estimates the
            # standard deviation for normally distributed data.
            c = Gaussian.ppf(3 / 4.)
            d = np.median(data, axis=0)
            return np.hstack((d, np.median(np.fabs(data - d) / c, axis=0)))
        elif method == 'gmm2':
            max_sample_size = 2000
            if data.shape[0] > max_sample_size:
                data = data[np.random.random_integers(
                    0, data.shape[0] - 1, size=max_sample_size), :]
            from sklearn.decomposition import PCA
            from sklearn.mixture import GMM
            pca = PCA(n_components=0.99).fit(data)
            pca_data = pca.transform(data)
            gmm = GMM(2, covariance_type='full').fit(pca_data)
            return pca.inverse_transform(gmm.means_).flatten()
        elif method == 'deciles':
            return np.hstack([np.percentile(data, d, axis=0)
                              for d in range(10, 100, 10)])
        elif method == 'mean+deciles':
            return np.hstack((np.mean(data, axis=0),
                              np.hstack([np.percentile(data, d, axis=0)
                                         for d in range(10, 100, 10)])))
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
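# A worked check (not part of the original module) of the 'median+mad'
# scaling above: dividing the MAD by c = Gaussian.ppf(3/4) ~= 0.6745 makes
# it a consistent estimator of the standard deviation for normal data.
def _demo_median_mad():
    import numpy as np
    from scipy.stats import norm as Gaussian
    rng = np.random.RandomState(0)
    data = rng.normal(10.0, 2.0, 100000)   # mean 10, std 2
    c = Gaussian.ppf(3 / 4.)
    d = np.median(data)
    mad = np.median(np.fabs(data - d)) / c
    return d, mad                          # approximately (10.0, 2.0)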
def profile_mean(cache_dir, group_name, filter=None, parallel=Uniprocessing(),
                 normalization=RobustLinearNormalization, preprocess_file=None,
                 show_progress=True, method='mean', full_group_header=False):
    group, colnames_group = cpf.db.group_map(group_name, reverse=True,
                                             filter=filter)
    keys = group.keys()
    parameters = [(cache_dir, group[g], normalization.__name__,
                   preprocess_file, method) for g in keys]

    if "CPA_DEBUG" in os.environ:
        DEBUG_NGROUPS = 5
        logging.warning('In debug mode. Using only a few groups (n=%d) to '
                        'create profile' % DEBUG_NGROUPS)
        parameters = parameters[:DEBUG_NGROUPS]
        keys = keys[:DEBUG_NGROUPS]

    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    # Adjust the output column names to match the summary method.
    if method == 'mean+std':
        variables = variables + ['std_' + v for v in variables]
    elif method == 'median+mad':
        variables = variables + ['mad_' + v for v in variables]
    elif method == 'gmm2':
        variables = (['m1_' + v for v in variables] +
                     ['m2_' + v for v in variables])
    elif method == 'deciles':
        variables = ['decile_%02d_%s' % (dec, v)
                     for dec in range(10, 100, 10) for v in variables]
    elif method == 'mean+deciles':
        variables = variables + ['decile_%02d_%s' % (dec, v)
                                 for dec in range(10, 100, 10)
                                 for v in variables]
    elif method == 'cellcount':
        variables = ['Cells_Count']

    return Profiles.compute(keys, variables, _compute_group_mean, parameters,
                            parallel=parallel, group_name=group_name,
                            show_progress=show_progress,
                            group_header=(colnames_group if full_group_header
                                          else None))
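# A hypothetical call sketch (paths and group name are made up) showing how
# profile_mean is typically driven after loading a properties file:
#
#   cpf.properties.LoadFile('myscreen.properties')
#   profiles = profile_mean('/data/cache', 'Well', method='median+mad',
#                           normalization=RobustLinearNormalization)
#
# The returned Profiles object pairs each group key with one row of
# per-feature summary values named by `variables`.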
def profile_svmnormalvector(cache_dir, group_name, control_filter, filter=None,
                            rfe=False, job=None, parallel=Uniprocessing(),
                            normalization=RobustLinearNormalization,
                            preprocess_file=None):
    group, colnames_group = cpf.db.group_map(group_name, reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict((row[:-2], row[-2])
                          for row in cpf.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file, rfe)
                  for k in keys]

    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    # `memoize`, `memoizer`, and `memoization_dir` are assumed to be defined
    # elsewhere in this module. As originally written, the `job` branch never
    # bound `fn`, so bind it explicitly in every branch.
    if job:
        i = job - 1  # 1-based job index; not used below
        fn = memoize(_compute_svmnormalvector)
    elif memoization_dir is None:
        fn = _compute_svmnormalvector
    else:
        fn = memoizer(_compute_svmnormalvector)
    return Profiles.compute(keys, variables, fn, parameters,
                            parallel=parallel, group_name=group_name)
def profile_ksstatistic(cache_dir, group_name, control_filter, plate_group,
                        filter=None, parallel=Uniprocessing(),
                        normalization=RobustLinearNormalization,
                        preprocess_file=None):
    group, colnames_group = cpf.db.group_map(group_name, reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpf.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(set(
                r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]))

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file)
                  for k in keys]

    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    return Profiles.compute(keys, variables, _compute_ksstatistic, parameters,
                            parallel=parallel, group_name=group_name)
normalizations = dict((c.__name__, c)
                      for c in [RobustLinearNormalization,
                                RobustStdNormalization, StdNormalization,
                                DummyNormalization])

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    parser = OptionParser(
        "usage: %prog [-m method] PROPERTIES-FILE CACHE-DIR PREDICATE")
    parser.add_option('-m', '--method', dest='method', action='store',
                      default='RobustStdNormalization',
                      help='normalization method (one of: %s)' %
                           ', '.join(sorted(normalizations.keys())))
    options, args = parser.parse_args()

    if len(args) != 3:
        parser.error('Incorrect number of arguments')
    properties_file, cache_dir, predicate = args
    cpf.properties.LoadFile(properties_file)

    from cpf.profiling.cache import Cache
    cache = Cache(cache_dir)
    normalizer = normalizations[options.method](cache)
    normalizer._create_cache(predicate)
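# Example invocation (module path, file names, and predicate are
# hypothetical; adjust to your setup):
#
#   python -m cpf.profiling.normalization -m RobustStdNormalization \
#       myscreen.properties /data/cache "Image_Metadata_Plate = 'Plate1'"
#
# This loads the properties file, instantiates the chosen normalization
# over the cache, and precomputes its cached normalization values for the
# images matching PREDICATE.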