def _compute_group_mean(params):
    (cache_dir, images, normalization_name,
     preprocess_file, method) = params
    try:
        import numpy as np
        import cpa
        from cpa.profiling.cache import Cache
        from cpa.profiling.normalization import normalizations
        from scipy.stats import norm as Gaussian
        from scipy.stats import mode
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)
        
        # Cell count is taken before NaN filtering.
        cellcount = np.ones(1) * data.shape[0]
        if method == 'cellcount':
            return cellcount
        
        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        # Drop any cell (row) that contains a NaN feature value.
        data = data[~np.isnan(np.sum(data, 1)), :]

        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        if preprocess_file:
            preprocessor = cpa.util.unpickle1(preprocess_file)
            data = preprocessor(data)

        if method == 'mean':
            return np.mean(data, axis=0)
        elif method == 'mean+std':
            return np.hstack((np.mean(data, axis=0), np.std(data, axis=0)))
        elif method == 'mode':
            # scipy.stats.mode returns (modes, counts); keep the modes only.
            return mode(data, axis=0)[0].ravel()
        elif method == 'median':
            return np.median(data, axis=0)
        elif method == 'median+mad':
            # Scale the MAD by 1/Phi^-1(3/4) ~ 1/0.6745 so that it estimates
            # the standard deviation for normally distributed data.
            c = Gaussian.ppf(3 / 4.)
            d = np.median(data, axis=0)
            return np.hstack((d, np.median(np.fabs(data - d) / c, axis=0)))
        elif method == 'gmm2':
            # Subsample for speed before fitting the mixture model.
            max_sample_size = 2000
            if data.shape[0] > max_sample_size:
                data = data[np.random.randint(0, data.shape[0],
                                              size=max_sample_size), :]
            from sklearn.decomposition import PCA
            from sklearn.mixture import GMM
            # GMM was removed in scikit-learn 0.20; on modern versions use
            # sklearn.mixture.GaussianMixture instead.
            pca = PCA(n_components=0.99).fit(data)
            pca_data = pca.transform(data)
            gmm = GMM(2, covariance_type='full').fit(pca_data)
            return pca.inverse_transform(gmm.means_).flatten()
        elif method == 'deciles':
            # np.percentile accepts a sequence of percentiles; flatten in
            # row-major order so the layout is decile-major.
            deciles = np.percentile(data, range(10, 100, 10), axis=0)
            return deciles.flatten()
        elif method == 'mean+deciles':
            deciles = np.percentile(data, range(10, 100, 10), axis=0)
            return np.hstack((np.mean(data, axis=0), deciles.flatten()))
    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
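# A minimal sketch (not part of the original module) illustrating why the
# 'median+mad' branch above divides by c = Gaussian.ppf(3/4.) ~ 0.6745: the
# scaled MAD is a consistent estimator of the standard deviation for
# Gaussian data.
def _mad_sigma_demo():
    import numpy as np
    from scipy.stats import norm as Gaussian
    rng = np.random.RandomState(0)
    data = rng.normal(loc=5.0, scale=2.0, size=100000)
    c = Gaussian.ppf(3 / 4.)
    mad_sigma = np.median(np.fabs(data - np.median(data))) / c
    print(np.std(data), mad_sigma)  # both close to 2.0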
def _compute_group_mean(params):
    (cache_dir, images, normalization_name,
     preprocess_file) = params
    try:
        import numpy as np
        import cpa
        from cpa.profiling.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)
        
        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        # Drop any cell (row) that contains a NaN feature value.
        data = data[~np.isnan(np.sum(data, 1)), :]

        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        if preprocess_file:
            preprocessor = cpa.util.unpickle1(preprocess_file)
            data = preprocessor(data)

        return np.mean(data, axis=0)
    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
def _compute_ksstatistic(params):
    (cache_dir, images, control_images, normalization_name,
     preprocess_file) = params
    import numpy as np 
    import sys
    import cpa
    from cpa.profiling.cache import Cache, normalizations
    from cpa.profiling.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, variables, _ = cache.load(images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images, normalization=normalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
        variables = preprocessor.variables
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j], normalizeddata[:, j],
                              signed=True)[0]
    return profile
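# Hedged sketch of the signed two-sample KS statistic assumed above.
# cpa.profiling.ks_2samp is a project-local variant; a plausible signed D
# (magnitude from scipy, sign from the direction of the largest CDF gap)
# could look like the hypothetical helper below. This is not the project's
# implementation, and its sign convention is an assumption.
def _signed_ks(a, b):
    import numpy as np
    from scipy.stats import ks_2samp as scipy_ks_2samp
    d = scipy_ks_2samp(a, b)[0]
    # Evaluate both empirical CDFs on the pooled sample and take the sign
    # of the largest gap.
    xs = np.sort(np.concatenate([a, b]))
    cdf_a = np.searchsorted(np.sort(a), xs, side='right') / float(len(a))
    cdf_b = np.searchsorted(np.sort(b), xs, side='right') / float(len(b))
    gaps = cdf_a - cdf_b
    sign = np.sign(gaps[np.argmax(np.abs(gaps))])
    return sign * d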
def _transform_cell_feats(params):
    (cache_dir, images, normalization_name, output_filename, key,
     header) = params
    try:
        import numpy as np
        from cpa.profiling.cache import Cache, normalizations
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames, _ = cache.load(images,
                                                    normalization=normalization)
        if len(normalizeddata) == 0:
            return np.full(len(normalized_colnames), np.nan)

        # Drop any cell (row) that contains a NaN feature value.
        normalizeddata = normalizeddata[
            ~np.isnan(np.sum(normalizeddata, 1)), :]

        if len(normalizeddata) == 0:
            return np.full(len(normalized_colnames), np.nan)

        # Save the normalized features to a per-group CSV file.
        import csv
        key = [str(k) for k in key]
        filename = output_filename + "-" + "-".join(key) + ".csv"
        with open(filename, 'w', newline='') as f:
            w = csv.writer(f)
            w.writerow(header)
            for vector in normalizeddata:
                w.writerow(tuple(key) + tuple(vector))
        return [-1]

    except: # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
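# Hypothetical usage sketch for _transform_cell_feats (the path, key, and
# header below are made up): each call writes one CSV per group, named
# e.g. features-Plate1-A01.csv, with the group key prepended to each row.
#   result = _transform_cell_feats(('/path/to/cache', images,
#                                   'RobustLinearNormalization', 'features',
#                                   ('Plate1', 'A01'),
#                                   ['Plate', 'Well'] + colnames))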
def _compute_ksstatistic(params):
    (cache_dir, images, control_images,
     normalization_name, preprocess_file) = params
    import numpy as np
    import sys
    import cpa
    from cpa.profiling.cache import Cache
    from cpa.profiling.normalization import normalizations
    from cpa.profiling.ks_2samp import ks_2samp

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, variables, _ = cache.load(images,
                                              normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images,
                                                   normalization=normalization)
    assert len(control_data) >= len(normalizeddata)
    assert variables == control_colnames
    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
        variables = preprocessor.variables
    m = len(variables)
    profile = np.empty(m)
    for j in range(m):
        profile[j] = ks_2samp(control_data[:, j],
                              normalizeddata[:, j],
                              signed=True)[0]
    return profile
def _compute_svmnormalvector(params):
    (cache_dir, images, control_images, normalization_name, preprocess_file,
     rfe) = params
    import numpy as np
    import sys
    import cpa
    from cpa.profiling.cache import Cache
    from cpa.profiling.normalization import RobustLinearNormalization, normalizations
    from sklearn.svm import LinearSVC
    from cpa.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(
        images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images,
                                                   normalization=normalization)
    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[
        np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because coef_ can be read-only (normal_vector.flags.writeable
        # may be False); zero out features not selected by RFE.
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
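# Self-contained toy (not from the original code) showing the idea behind
# the SVM profile above: fit a linear SVM separating "treated" from
# "control" samples and read the separating hyperplane's normal vector
# from coef_. Features that drive the separation get the largest weights.
def _svm_normal_demo():
    import numpy as np
    from sklearn.svm import LinearSVC
    rng = np.random.RandomState(0)
    control = rng.normal(0.0, 1.0, size=(200, 5))
    treated = rng.normal(0.0, 1.0, size=(200, 5))
    treated[:, 2] += 3.0  # only feature 2 separates the classes
    x = np.vstack((treated, control))
    y = np.array([1] * len(treated) + [0] * len(control))
    normal_vector = LinearSVC(C=1.0).fit(x, y).coef_[0]
    print(normal_vector)  # the weight on feature 2 dominates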
def _compute_svmnormalvector(params):
    (cache_dir, images, control_images,
     normalization_name, preprocess_file, rfe) = params
    import numpy as np
    import sys
    import cpa
    from cpa.profiling.cache import Cache, RobustLinearNormalization, normalizations
    from sklearn.svm import LinearSVC
    from cpa.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(
        images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(
        control_images, normalization=normalization)
    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    # Downsample the controls so both classes are the same size.
    downsampled = control_data[
        np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because coef_ can be read-only (normal_vector.flags.writeable
        # may be False); zero out features not selected by RFE.
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
def _compute_group_mean(params):
    (cache_dir, images, normalization_name,
     preprocess_file, method) = params
    try:
        import numpy as np
        import cpa
        from cpa.profiling.cache import Cache
        from cpa.profiling.normalization import normalizations
        from scipy.stats import norm as Gaussian
        from scipy.stats import mode
        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        data, colnames, _ = cache.load(images, normalization=normalization)

        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        # Drop any cell (row) that contains a NaN feature value.
        data = data[~np.isnan(np.sum(data, 1)), :]

        if len(data) == 0:
            return np.full(len(colnames), np.nan)

        if preprocess_file:
            preprocessor = cpa.util.unpickle1(preprocess_file)
            data = preprocessor(data)

        if method == 'mean':
            return np.mean(data, axis=0)
        elif method == 'mean+std':
            return np.hstack((np.mean(data, axis=0), np.std(data, axis=0)))
        elif method == 'mode':
            # scipy.stats.mode returns (modes, counts); keep the modes only.
            return mode(data, axis=0)[0].ravel()
        elif method == 'median':
            return np.median(data, axis=0)
        elif method == 'median+mad':
            # Scale the MAD by 1/Phi^-1(3/4) ~ 1/0.6745 so that it estimates
            # the standard deviation for normally distributed data.
            c = Gaussian.ppf(3 / 4.)
            d = np.median(data, axis=0)
            return np.hstack((d, np.median(np.fabs(data - d) / c, axis=0)))
        elif method == 'gmm2':
            # Subsample for speed before fitting the mixture model.
            max_sample_size = 2000
            if data.shape[0] > max_sample_size:
                data = data[np.random.randint(0, data.shape[0],
                                              size=max_sample_size), :]
            from sklearn.decomposition import PCA
            from sklearn.mixture import GMM
            # GMM was removed in scikit-learn 0.20; on modern versions use
            # sklearn.mixture.GaussianMixture (see the sketch after this
            # function).
            pca = PCA(n_components=0.99).fit(data)
            pca_data = pca.transform(data)
            gmm = GMM(2, covariance_type='full').fit(pca_data)
            return pca.inverse_transform(gmm.means_).flatten()
    except:  # catch *all* exceptions
        from traceback import print_exc
        import sys
        print_exc(None, sys.stderr)
        return None
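# Hedged sketch of the 'gmm2' branch on modern scikit-learn (>= 0.20),
# where the GMM class was removed. GaussianMixture exposes the same means_
# attribute, so the PCA round-trip carries over unchanged; this is a
# modern equivalent, not the original code.
def _gmm2_modern(data):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.mixture import GaussianMixture
    # Keep enough components to explain 99% of the variance.
    pca = PCA(n_components=0.99).fit(data)
    pca_data = pca.transform(data)
    gmm = GaussianMixture(n_components=2, covariance_type='full').fit(pca_data)
    # Map the two mixture means back to the original feature space and
    # concatenate them into a single profile vector.
    return pca.inverse_transform(gmm.means_).flatten()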
def profile_mean(cache_dir,
                 group_name,
                 filter=None,
                 parallel=Uniprocessing(),
                 normalization=RobustLinearNormalization,
                 preprocess_file=None,
                 show_progress=True,
                 method='mean',
                 full_group_header=False):
    group, colnames_group = cpa.db.group_map(group_name,
                                             reverse=True,
                                             filter=filter)

    keys = list(group.keys())
    parameters = [(cache_dir, group[g], normalization.__name__,
                   preprocess_file, method) for g in keys]

    if "CPA_DEBUG" in os.environ:
        DEBUG_NGROUPS = 5
        logging.warning(
            'In debug mode. Using only a few groups (n=%d) to create profile' %
            DEBUG_NGROUPS)

        parameters = parameters[0:DEBUG_NGROUPS]
        keys = keys[0:DEBUG_NGROUPS]

    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames
    if method == 'mean+std':
        variables = variables + ['std_' + v for v in variables]
    elif method == 'median+mad':
        variables = variables + ['mad_' + v for v in variables]
    elif method == 'gmm2':
        variables = ['m1_' + v
                     for v in variables] + ['m2_' + v for v in variables]
    elif method == 'deciles':
        variables = [
            'decile_%02d_%s' % (dec, v) for dec in range(10, 100, 10)
            for v in variables
        ]
    elif method == 'mean+deciles':
        variables = variables + [
            'decile_%02d_%s' % (dec, v) for dec in range(10, 100, 10)
            for v in variables
        ]
    elif method == 'cellcount':
        variables = ['Cells_Count']
    return Profiles.compute(
        keys,
        variables,
        _compute_group_mean,
        parameters,
        parallel=parallel,
        group_name=group_name,
        show_progress=show_progress,
        group_header=colnames_group if full_group_header else None)
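# Hypothetical usage (the cache path and group name are placeholders, and
# a properties file must already be loaded via cpa.properties.LoadFile):
#   profiles = profile_mean('/path/to/cache', 'Well',
#                           normalization=RobustLinearNormalization,
#                           method='median+mad')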
def _compute_svmnormalvector(params):
    (cache_dir, images, control_images, rfe) = params
    import numpy as np
    import sys
    from cpa.profiling.cache import Cache, RobustLinearNormalization
    from sklearn.svm import LinearSVC
    from cpa.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalizeddata, normalized_colnames, _ = cache.load(images, normalization=RobustLinearNormalization)
    control_data, control_colnames, _ = cache.load(control_images, normalization=RobustLinearNormalization)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because coef_ can be read-only; zero out features not
        # selected by recursive feature elimination.
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
def profile_svmnormalvector(cache_dir,
                            group_name,
                            control_filter,
                            filter=None,
                            rfe=False,
                            job=None,
                            parallel=Uniprocessing(),
                            normalization=RobustLinearNormalization,
                            preprocess_file=None):
    group, colnames_group = cpa.db.group_map(group_name,
                                             reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict(
        (row[:-2], row[-2]) for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [
            r for image in treated_images
            for r in control_images_by_plate[plate_by_image[image]]
        ]

    keys = list(group.keys())
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file, rfe) for k in keys]

    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    if job:
        # Run a single group (1-based job index). memoize, memoizer, and
        # memoization_dir are assumed to be defined at module level.
        i = job - 1
        memoize(_compute_svmnormalvector)
    else:
        if memoization_dir is None:
            fn = _compute_svmnormalvector
        else:
            fn = memoizer(_compute_svmnormalvector)
        return Profiles.compute(keys,
                                variables,
                                fn,
                                parameters,
                                parallel=parallel,
                                group_name=group_name)
def profile_ksstatistic(cache_dir,
                        group_name,
                        control_filter,
                        plate_group,
                        filter=None,
                        parallel=Uniprocessing(),
                        normalization=RobustLinearNormalization,
                        preprocess_file=None):
    group, colnames_group = cpa.db.group_map(group_name,
                                             reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(
                set(r for image in treated_images
                    for r in control_images_by_plate[plate_by_image[image]]))

    keys = list(group.keys())
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file) for k in keys]

    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames
    return Profiles.compute(keys,
                            variables,
                            _compute_ksstatistic,
                            parameters,
                            parallel=parallel,
                            group_name=group_name)
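# Toy illustration (all data made up) of the plate-matched control lookup
# performed by the control_images closure above: treated images pull in
# every control image from the plates they appear on, de-duplicated.
def _control_lookup_demo():
    control_images_by_plate = {('P1',): [101, 102], ('P2',): [201]}
    plate_by_image = {1: ('P1',), 2: ('P1',), 3: ('P2',)}
    treated_images = [1, 3]
    controls = list(
        set(r for image in treated_images
            for r in control_images_by_plate[plate_by_image[image]]))
    print(sorted(controls))  # [101, 102, 201]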
normalizations = dict((c.__name__, c) for c in [
    RobustLinearNormalization, RobustStdNormalization, StdNormalization,
    DummyNormalization
])

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    parser = OptionParser(
        "usage: %prog [-r] [-m method] PROPERTIES-FILE CACHE-DIR PREDICATE")
    parser.add_option('-m',
                      '--method',
                      dest='method',
                      action='store',
                      default='RobustStdNormalization',
                      help='name of the normalization class to precompute')

    options, args = parser.parse_args()

    if len(args) != 3:
        parser.error('Incorrect number of arguments')
    properties_file, cache_dir, predicate = args

    cpa.properties.LoadFile(properties_file)

    from cpa.profiling.cache import Cache
    cache = Cache(cache_dir)
    normalizer = normalizations[options.method](cache)
    normalizer._create_cache(predicate)
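# Example invocation of this script (the script name, properties file,
# cache path, and SQL predicate are placeholders):
#   python <this_script>.py -m RobustStdNormalization \
#       example.properties /path/to/cache "Image_Metadata_Plate = 'P1'"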