Example #1
def _transform_cell_feats((cache_dir, images, normalization_name, output_filename, key, header)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames = cache.load(images,
                                                    normalization=normalization)
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan

        normalizeddata = normalizeddata[
                ~np.isnan(np.sum(normalizeddata,1)),:]

        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan

        # save the features to csv
        import csv
        filename = output_filename + "-" + "-".join(key) + ".csv"
        f = open(filename, 'w')
        w = csv.writer(f)
        w.writerow(header)
        for vector in normalizeddata:
            w.writerow(tuple(key) + tuple(vector))
        f.close()

    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
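The CSV-dumping step in isolation, as a minimal runnable sketch (the key, header, and data are hypothetical stand-ins for the values the function receives):

import csv
import numpy as np

key = ('plate1', 'A01')                 # hypothetical group key
header = ('plate', 'well', 'f1', 'f2')  # hypothetical column names
normalizeddata = np.array([[0.1, 0.2], [0.3, 0.4]])

# One CSV per group: header row, then key columns followed by each feature vector.
with open('features-' + '-'.join(key) + '.csv', 'w') as f:
    w = csv.writer(f)
    w.writerow(header)
    for vector in normalizeddata:
        w.writerow(tuple(key) + tuple(vector))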
Example #2
def profile_gmm(cache_dir, group_name, ncomponents=50, filter=None, 
                ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True, filter=filter)

    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)

    subsampled = np.vstack(subsamples)
    meanvector = np.mean(subsampled, 0)
    mean_centered = subsampled - meanvector

    #perform PCA
    U, s, V = linalg.svd(mean_centered, full_matrices=False)
    percvar_expl = s ** 2 / np.sum(s ** 2)
    scores = np.dot(U, np.diag(s))
    loadings = np.transpose(V)

    # Find the number of PCs required to explain x% of variance
    cutoffpercentage = 80
    percvar_cum = np.cumsum(percvar_expl)
    npc = np.nonzero(percvar_cum > float(cutoffpercentage) / 100)[0][0]
    if npc < 20: 
        npc = 20
   
    # GMM
    gmm = GMM(ncomponents, cvtype='full')
    gmm.fit(scores[:, :npc], n_iter=100000, thresh=1e-7)

    parameters = [(cache_dir, group[g], gmm, meanvector, loadings[:, :npc])
                  for g in keys]
    variables = ['Component %d' % i for i in range(ncomponents)]
    return Profiles.compute(keys, variables, _compute_mixture_probabilities, 
                            parameters, ipython_profile, group_name=group_name)
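The SVD-based variance cutoff above, as a self-contained sketch on synthetic data (the 80% threshold and the floor of 20 components mirror profile_gmm; sizes are illustrative):

import numpy as np
from numpy import linalg

rng = np.random.RandomState(0)
mean_centered = rng.randn(500, 50)
mean_centered -= mean_centered.mean(axis=0)

# PCA via SVD, then cumulative fraction of variance explained.
U, s, V = linalg.svd(mean_centered, full_matrices=False)
percvar_cum = np.cumsum(s ** 2 / np.sum(s ** 2))

# First index where cumulative variance passes 80%, floored at 20 components.
npc = max(int(np.nonzero(percvar_cum > 0.80)[0][0]), 20)
print(npc)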
Example #3
def profile_svmnormalvector(cache_dir,
                            group_name,
                            control_filter,
                            filter=None,
                            rfe=False,
                            ipython_profile=None,
                            job=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name,
                                             reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict(
        (row[:-2], row[-2]) for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [
            r for image in treated_images
            for r in control_images_by_plate[plate_by_image[image]]
        ]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]), rfe)
                  for k in keys]
    if job:
        i = job - 1
        memoized(_compute_svmnormalvector)(parameters[i])
    else:
        return Profiles.compute(keys,
                                variables,
                                memoized(_compute_svmnormalvector),
                                parameters,
                                ipython_profile,
                                group_name=group_name)
Example #4
def profile_factoranalysis_mean(cache_dir, group_name, nfactors=5, filter=None, 
                                ipython_profile=None, save_model=None):
    cache = Cache(cache_dir)

    group, colnames_group = cpa.db.group_map(group_name, reverse=True, filter=filter)

    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)

    mean = np.mean(subsamples, axis=0)   
    subsampled_data = subsamples - mean
    stdev = np.std(subsampled_data, axis=0)
    subsampled_data = subsampled_data / stdev

    nfactors = min(nfactors, subsampled_data.shape[1])
    variables = ['Factor %d' % (i + 1) for i in range(nfactors)]
    fa_node = nodes.FANode(input_dim=None, output_dim=nfactors, dtype=None, max_cycles=30)
    print 'Training'
    fa_node.train(subsampled_data)
    fa_node.stop_training()

    if save_model:
        cpa.util.pickle(save_model, fa_node)
    
    parameters = [(cache_dir, group[g], fa_node, mean, stdev)
                  for g in keys]
    return Profiles.compute(keys, variables, _compute_group_projection_and_mean, 
                            parameters, ipython_profile, group_name=group_name)
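A sketch of the standardize-then-project pattern with sklearn's FactorAnalysis standing in for MDP's FANode (a swapped-in equivalent, not the dependency the code actually uses; data is synthetic):

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
subsamples = rng.randn(200, 30)

# Standardize exactly as profile_factoranalysis_mean does.
mean = np.mean(subsamples, axis=0)
stdev = np.std(subsamples - mean, axis=0)
standardized = (subsamples - mean) / stdev

# Fit a 5-factor model and project; FANode.train/execute play this role above.
fa = FactorAnalysis(n_components=5)  # swapped-in stand-in for nodes.FANode
projected = fa.fit(standardized).transform(standardized)
print(projected.shape)  # (200, 5)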
Example #5
def profile_ksstatistic(cache_dir,
                        group_name,
                        control_filter,
                        filter=None,
                        ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name,
                                             reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict(
        (row[:-2], row[-2]) for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [
            r for image in treated_images
            for r in control_images_by_plate[plate_by_image[image]]
        ]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]))
                  for k in keys]

    return Profiles.compute(keys,
                            variables,
                            _compute_ksstatistic,
                            parameters,
                            ipython_profile,
                            group_name=group_name)
Example #6
def _compute_mixture_probabilities((cache_dir, images, gmm, meanvector, loadings)):
    import numpy as np
    from cpa.util.cache import Cache, RobustLinearNormalization
    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames = cache.load(images, normalization=RobustLinearNormalization)
    mean_centered = normalizeddata - meanvector
    projected = np.dot(mean_centered, loadings)
    mixture_probabilities = gmm.predict_proba(projected)
    return mixture_probabilities.mean(0)
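The posterior-averaging step, sketched with sklearn's current GaussianMixture in place of the legacy GMM class used above (a swapped-in API; the projected data is synthetic):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
projected = rng.randn(300, 5)  # stands in for the PCA-projected cell features

# Full-covariance mixture; averaging per-cell posteriors mirrors
# gmm.predict_proba(projected).mean(0) above.
gm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
gm.fit(projected)
print(gm.predict_proba(projected).mean(0))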
Example #7
def _compute_ksstatistic((cache_dir, images, control_images)):
    import numpy as np
    import sys
    from cpa.util.cache import Cache, RobustLinearNormalization
    from cpa.util.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalizeddata, variables = cache.load(images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    #downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
Example #8
def _compute_ksstatistic((cache_dir, images, control_images)):
    import numpy as np 
    import sys
    from cpa.util.cache import Cache, RobustLinearNormalization
    from cpa.util.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalizeddata, variables = cache.load(images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(control_images, normalization=RobustLinearNormalization)
    print normalizeddata.shape, control_data.shape
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    #downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
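A self-contained version of the per-feature loop using scipy's ks_2samp (unsigned; cpa.util.ks_2samp additionally returns a signed statistic). The data and the 0.5 shift are illustrative:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.RandomState(0)
control_data = rng.randn(1000, 4)
treated_data = rng.randn(300, 4) + 0.5  # shifted so the statistic is nonzero

# One KS statistic per feature column, as in _compute_ksstatistic.
profile = np.empty(treated_data.shape[1])
for j in range(treated_data.shape[1]):
    profile[j] = ks_2samp(control_data[:, j], treated_data[:, j])[0]
print(profile)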
Example #9
def _compute_svmnormalvector((cache_dir, images, control_images, rfe)):
    import numpy as np
    import sys
    from cpa.util.cache import Cache, RobustLinearNormalization
    from sklearn.svm import LinearSVC
    from cpa.util.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames = cache.load(images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
Example #10
def profile_mean(cache_dir, group_name, filter=None, ipython_profile=None,
                 normalization=RobustLinearNormalization):
    cache = Cache(cache_dir)

    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = normalization(cache).colnames

    keys = group.keys()
    parameters = [(cache_dir, group[g], normalization.__name__)
                  for g in keys]

    return Profiles.compute(keys, variables, _compute_group_mean, parameters,
                            ipython_profile, group_name=group_name)
Example #11
def _compute_group_mean((cache_dir, images, normalization_name)):
    try:
        import numpy as np
        from cpa.util.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames = cache.load(images,
                                                    normalization=normalization)
        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan

        normalizeddata = normalizeddata[
                ~np.isnan(np.sum(normalizeddata,1)),:]

        if len(normalizeddata) == 0:
            return np.empty(len(normalized_colnames)) * np.nan

        return np.mean(normalizeddata, axis = 0)
    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
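The NaN handling in _compute_group_mean, shown in isolation on toy data:

import numpy as np

data = np.array([[1.0, 2.0],
                 [np.nan, 3.0],
                 [5.0, 6.0]])

# Drop any row containing a NaN, then average the remainder, as above.
clean = data[~np.isnan(np.sum(data, 1)), :]
print(np.mean(clean, axis=0))  # [3. 4.]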
Example #12
def _compute_group_subsample((cache_dir, images)):
    try:
        import sys
        import numpy as np
        from cpa.util.cache import Cache, RobustLinearNormalization
        cache = Cache(cache_dir)
        normalizeddata, normalized_colnames = cache.load(images, normalization=RobustLinearNormalization)
        np.random.shuffle(normalizeddata)
        normalizeddata_sample = [x for i, x in enumerate(normalizeddata) if i % 100 == 0]
        return normalizeddata_sample
    except: # catch *all* exceptions
        from traceback import print_exc
        print_exc(None, sys.stderr)
        e = sys.exc_info()[1]
        print >>sys.stderr, "Error: %s" % (e,)
        return None
Example #13
def _compute_svmnormalvector((cache_dir, images, control_images, rfe)):
    import numpy as np
    import sys
    from cpa.util.cache import Cache, RobustLinearNormalization
    from sklearn.svm import LinearSVC
    from cpa.util.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames = cache.load(
        images, normalization=RobustLinearNormalization)
    control_data, control_colnames = cache.load(
        control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[
        np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
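A runnable sketch of the normal-vector extraction on synthetic data (the class labels and C=1.0 mirror the function; sizes and features are illustrative):

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
treated = rng.randn(100, 10) + 1.0  # stands in for normalizeddata
controls = rng.randn(100, 10)       # stands in for the downsampled controls

# Label treated 1 and controls 0; the fitted hyperplane's normal is the profile.
x = np.vstack((treated, controls))
y = np.array([1] * len(treated) + [0] * len(controls))
clf = LinearSVC(C=1.0)
normal_vector = clf.fit(x, y).coef_[0]
print(normal_vector.shape)  # (10,)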
Example #14
def _compute_group_projection_and_mean((cache_dir, images, fa_node, mean, stdev)):
    try:
        import sys
        import numpy as np
        from cpa.util.cache import Cache, RobustLinearNormalization
        cache = Cache(cache_dir)
        normalizeddata, normalized_colnames = cache.load(images, normalization=RobustLinearNormalization)
        normalizeddata = (normalizeddata - mean) / stdev
        normalizeddata_projected = fa_node.execute(normalizeddata)
        normalizeddata_projected_mean = np.mean(normalizeddata_projected, axis = 0)
        return normalizeddata_projected_mean
    except: # catch *all* exceptions
        from traceback import print_exc
        print_exc(None, sys.stderr)
        e = sys.exc_info()[1]
        print >>sys.stderr, "Error: %s" % (e,)
        return None
Example #15
def profile_ksstatistic(cache_dir, group_name, control_filter, plate_group,
                        filter=None, parallel=Uniprocessing()):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True, 
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(set(r for image in treated_images
                            for r in control_images_by_plate[plate_by_image[image]]))

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]))
                  for k in keys]

    return Profiles.compute(keys, variables, _compute_ksstatistic, 
                            parameters, parallel=parallel, 
                            group_name=group_name)