def _transform_cell_feats((cache_dir, images, normalization_name,
                           output_filename, key, header)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames = cache.load(
            images, normalization=normalization)
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan
        normalizeddata = normalizeddata[~np.isnan(np.sum(normalizeddata, 1)), :]
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan
        # Save the features to CSV, one row per cell.
        import csv
        filename = output_filename + "-" + "-".join(key) + ".csv"
        f = open(filename, 'w')
        w = csv.writer(f)
        w.writerow(header)
        for vector in normalizeddata:
            w.writerow(tuple(key) + tuple(vector))
        f.close()
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
def profile_gmm(cache_dir, group_name, ncomponents=50, filter=None,
                ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)
    subsampled = np.vstack(subsamples)

    meanvector = np.mean(subsampled, 0)
    mean_centered = subsampled - meanvector

    # Perform PCA via the SVD of the mean-centered data.
    U, s, V = linalg.svd(mean_centered, full_matrices=False)
    percvar_expl = s ** 2 / np.sum(s ** 2)
    scores = np.dot(U, np.diag(s))
    loadings = np.transpose(V)

    # Find the number of PCs required to explain x% of variance.
    cutoffpercentage = 80
    percvar_cum = np.cumsum(percvar_expl)
    npc = np.nonzero(percvar_cum > float(cutoffpercentage) / 100)[0][0]
    if npc < 20:
        npc = 20

    # Fit a Gaussian mixture model to the projected subsample.
    gmm = GMM(ncomponents, cvtype='full')
    gmm.fit(scores[:, :npc], n_iter=100000, thresh=1e-7)

    parameters = [(cache_dir, group[g], gmm, meanvector, loadings[:, :npc])
                  for g in keys]
    variables = ['Component %d' % i for i in range(ncomponents)]
    return Profiles.compute(keys, variables, _compute_mixture_probabilities,
                            parameters, ipython_profile, group_name=group_name)
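# A toy illustration of the PCA-then-GMM step above, kept separate from the
# pipeline. This is a sketch only: it assumes the current scikit-learn API
# (GaussianMixture) rather than the legacy GMM class used above, and the
# array sizes and component count are arbitrary.
def _demo_gmm_profile():
    import numpy as np
    from numpy import linalg
    from sklearn.mixture import GaussianMixture
    rng = np.random.RandomState(0)
    toy_subsample = rng.randn(500, 30)      # stand-in for subsampled cell features
    toy_mean = toy_subsample.mean(axis=0)
    U, s, V = linalg.svd(toy_subsample - toy_mean, full_matrices=False)
    percvar_cum = np.cumsum(s ** 2 / np.sum(s ** 2))
    npc = max(20, np.nonzero(percvar_cum > 0.8)[0][0])  # 80%-variance rule, floor of 20
    toy_scores = np.dot(U, np.diag(s))[:, :npc]
    gm = GaussianMixture(n_components=5, covariance_type='full').fit(toy_scores)
    return gm.predict_proba(toy_scores).mean(axis=0)    # one probability per component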
def profile_svmnormalvector(cache_dir, group_name, control_filter, filter=None,
                            rfe=False, ipython_profile=None, job=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict((row[:-2], row[-2])
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]), rfe)
                  for k in keys]

    if job:
        i = job - 1
        # Wrap the function, not its result, so the computed value is cached.
        memoized(_compute_svmnormalvector)(parameters[i])
    else:
        return Profiles.compute(keys, variables,
                                memoized(_compute_svmnormalvector),
                                parameters, ipython_profile,
                                group_name=group_name)
def profile_factoranalysis_mean(cache_dir, group_name, nfactors=5, filter=None,
                                ipython_profile=None, save_model=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)

    # Standardize the subsample before fitting the factor-analysis model.
    mean = np.mean(subsamples, axis=0)
    subsampled_data = subsamples - mean
    stdev = np.std(subsampled_data, axis=0)
    subsampled_data = subsampled_data / stdev

    nfactors = min(nfactors, subsampled_data.shape[1])
    variables = ['Factor %d' % (i + 1) for i in range(nfactors)]

    fa_node = nodes.FANode(input_dim=None, output_dim=nfactors, dtype=None,
                           max_cycles=30)
    print 'Training'
    fa_node.train(subsampled_data)
    fa_node.stop_training()

    if save_model:
        cpa.util.pickle(save_model, fa_node)

    parameters = [(cache_dir, group[g], fa_node, mean, stdev) for g in keys]
    return Profiles.compute(keys, variables,
                            _compute_group_projection_and_mean, parameters,
                            ipython_profile, group_name=group_name)
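# A toy illustration of the factor-analysis profile above. A sketch only:
# sklearn.decomposition.FactorAnalysis stands in for MDP's FANode, and the
# data sizes and factor count are arbitrary.
def _demo_factoranalysis_profile():
    import numpy as np
    from sklearn.decomposition import FactorAnalysis
    rng = np.random.RandomState(0)
    subsampled = rng.randn(400, 20)          # stand-in for the pooled subsample
    mean = subsampled.mean(axis=0)
    stdev = subsampled.std(axis=0)
    fa = FactorAnalysis(n_components=5).fit((subsampled - mean) / stdev)
    group_cells = rng.randn(50, 20)          # cells from one treatment group
    projected = fa.transform((group_cells - mean) / stdev)
    return projected.mean(axis=0)            # one 5-dim profile for the group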
def profile_ksstatistic(cache_dir, group_name, control_filter, filter=None,
                        ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict((row[:-2], row[-2])
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k])) for k in keys]
    return Profiles.compute(keys, variables, _compute_ksstatistic,
                            parameters, ipython_profile, group_name=group_name)
def _compute_mixture_probabilities((cache_dir, images, gmm, meanvector,
                                    loadings)):
    import numpy as np
    from cpa.util.cache import Cache, RobustLinearNormalization
    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames = cache.load(
        images, normalization=RobustLinearNormalization)
    # Project the group's cells onto the PCA loadings, then average the
    # posterior mixture-component probabilities over cells.
    mean_centered = normalizeddata - meanvector
    projected = np.dot(mean_centered, loadings)
    mixture_probabilities = gmm.predict_proba(projected)
    return mixture_probabilities.mean(0)
def _compute_ksstatistic((cache_dir, images, control_images)):
    import numpy as np
    from cpa.util.cache import Cache, RobustLinearNormalization
    from cpa.util.ks_2samp import ks_2samp
    cache = Cache(cache_dir)
    normalizeddata, variables = cache.load(
        images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(
        control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    # One signed KS statistic per feature, comparing treated to control cells.
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
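# cpa.util.ks_2samp extends the two-sample KS test with a signed variant. The
# sketch below shows one common definition -- the largest ECDF gap, carrying
# the sign of that gap -- and is an assumption about the convention, not a
# copy of the cpa implementation.
def _demo_signed_ks_2samp(a, b):
    import numpy as np
    a, b = np.sort(a), np.sort(b)
    all_vals = np.concatenate([a, b])
    # Empirical CDFs of both samples, evaluated at every observed value.
    cdf_a = np.searchsorted(a, all_vals, side='right') / float(len(a))
    cdf_b = np.searchsorted(b, all_vals, side='right') / float(len(b))
    d = cdf_b - cdf_a
    return d[np.argmax(np.abs(d))]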
def _compute_svmnormalvector((cache_dir, images, control_images, rfe)):
    import numpy as np
    from cpa.util.cache import Cache, RobustLinearNormalization
    from sklearn.svm import LinearSVC
    from cpa.util.profile_svmnormalvector import _compute_rfe
    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames = cache.load(
        images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(
        control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    # Downsample the controls so the two classes are balanced.
    downsampled = control_data[np.random.randint(0, len(control_data),
                                                 len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Zero out features rejected by recursive feature elimination.
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
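# The SVM "normal vector" profile in miniature: fit a linear SVM separating
# treated from control cells and read the separating hyperplane's normal off
# coef_. A self-contained sketch on toy data; the shift and C are arbitrary.
def _demo_svm_normal_vector():
    import numpy as np
    from sklearn.svm import LinearSVC
    rng = np.random.RandomState(0)
    treated = rng.randn(200, 10)
    treated[:, 0] += 1.0                     # treatment shifts feature 0
    control = rng.randn(200, 10)
    x = np.vstack((treated, control))
    y = np.array([1] * len(treated) + [0] * len(control))
    clf = LinearSVC(C=1.0).fit(x, y)
    return clf.coef_[0]                      # largest weight falls on feature 0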
def profile_mean(cache_dir, group_name, filter=None, ipython_profile=None,
                 normalization=RobustLinearNormalization):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = normalization(cache).colnames
    keys = group.keys()
    parameters = [(cache_dir, group[g], normalization.__name__) for g in keys]
    return Profiles.compute(keys, variables, _compute_group_mean, parameters,
                            ipython_profile, group_name=group_name)
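# The per-group computation behind profile_mean (see _compute_group_mean below)
# just drops cells with any NaN feature and averages the rest; a toy sketch:
def _demo_group_mean():
    import numpy as np
    cells = np.array([[1.0, 2.0],
                      [np.nan, 3.0],         # this cell is dropped
                      [3.0, 4.0]])
    clean = cells[~np.isnan(np.sum(cells, 1)), :]
    return np.mean(clean, axis=0)            # array([ 2.,  3.])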
def _compute_group_mean((cache_dir, images, normalization_name)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames = cache.load(
            images, normalization=normalization)
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan
        normalizeddata = normalizeddata[~np.isnan(np.sum(normalizeddata, 1)), :]
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan
        return np.mean(normalizeddata, axis=0)
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
def _compute_group_subsample((cache_dir, images)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, RobustLinearNormalization
        cache = Cache(cache_dir)
        normalizeddata, normalized_colnames = cache.load(
            images, normalization=RobustLinearNormalization)
        np.random.shuffle(normalizeddata)
        # Keep every 100th cell of the shuffled data as the subsample.
        normalizeddata_sample = [x for i, x in enumerate(normalizeddata)
                                 if i % 100 == 0]
        return normalizeddata_sample
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        e = sys.exc_info()[1]
        print >>sys.stderr, "Error: %s" % (e,)
        return None
def _compute_group_projection_and_mean((cache_dir, images, fa_node, mean,
                                        stdev)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, RobustLinearNormalization
        cache = Cache(cache_dir)
        normalizeddata, normalized_colnames = cache.load(
            images, normalization=RobustLinearNormalization)
        # Standardize with the subsample statistics, then project onto the factors.
        normalizeddata = (normalizeddata - mean) / stdev
        normalizeddata_projected = fa_node.execute(normalizeddata)
        return np.mean(normalizeddata_projected, axis=0)
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        e = sys.exc_info()[1]
        print >>sys.stderr, "Error: %s" % (e,)
        return None
def profile_ksstatistic(cache_dir, group_name, control_filter, plate_group,
                        filter=None, parallel=Uniprocessing()):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(set(r for image in treated_images
                            for r in control_images_by_plate[plate_by_image[image]]))

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k])) for k in keys]
    return Profiles.compute(keys, variables, _compute_ksstatistic,
                            parameters, parallel=parallel,
                            group_name=group_name)
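# The plate-aware control matching above, in miniature: controls are pooled per
# plate, and each treated image is matched to the controls on its own plate.
# The image names and plate keys below are hypothetical.
def _demo_plate_matched_controls():
    control_images_by_plate = {'plate1': ['c1', 'c2'], 'plate2': ['c3']}
    plate_by_image = {'t1': 'plate1', 't2': 'plate2'}
    def control_images(treated_images):
        return list(set(r for image in treated_images
                        for r in control_images_by_plate[plate_by_image[image]]))
    return control_images(['t1', 't2'])      # ['c1', 'c2', 'c3'] in some order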