Code example #1
File: ingest.py  Project: maozhiqiang/zounds
def ingest(dataset,
           cls,
           skip_if_exists=True,
           multi_process=False,
           multi_threaded=False,
           cores=None):

    pool = None

    if multi_process:
        pool = Pool(cores or cpu_count())
        map_func = pool.imap_unordered
    elif multi_threaded:
        pool = ThreadPool(cores or cpu_count())
        map_func = pool.imap_unordered
    else:
        map_func = map

    cls_args = repeat(cls)
    skip_args = repeat(skip_if_exists)

    map_func(ingest_one, izip(dataset, cls_args, skip_args))

    if pool is not None:
        # if we're ingesting using multiple processes or threads, the processing
        # should be parallel, but this method should be synchronous from the
        # caller's perspective
        pool.close()
        pool.join()
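A minimal, self-contained sketch of the same dispatch pattern shown above (process pool, thread pool, or plain map, followed by close()/join() so the call stays synchronous). The `work` function and the use of `multiprocessing.dummy` for the thread pool are illustrative choices, not part of the zounds API:

from multiprocessing import Pool, cpu_count
from multiprocessing.dummy import Pool as ThreadPool


def work(item):
    # hypothetical stand-in for ingest_one
    return item * 2


def run_all(items, multi_process=False, multi_threaded=False, cores=None):
    pool = None
    if multi_process:
        pool = Pool(cores or cpu_count())
        map_func = pool.imap_unordered
    elif multi_threaded:
        pool = ThreadPool(cores or cpu_count())
        map_func = pool.imap_unordered
    else:
        map_func = map

    # imap_unordered returns a lazy iterator, so consume it to force the work
    results = list(map_func(work, items))

    if pool is not None:
        # parallel internally, but synchronous from the caller's perspective
        pool.close()
        pool.join()
    return results


if __name__ == '__main__':
    print(run_all(range(10), multi_process=True))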
Code example #2
def _file_per_document(exportfile):
    if not os.path.exists(exportfile):
        print "%s doesn't exist!" % exportfile
        return
    dirs, _ = os.path.split(exportfile)
    docspath = os.path.join(dirs, 'documents')
    ensure_dirpath(docspath)
    expfile = open(exportfile, 'r')

    def wat(ammapobject):
        while True:
            ablob = ammapobject.readline()
            if not ablob:
                return
            yield ablob

    tpool = pool.ThreadPool(pool.cpu_count() * 64)
    gettingweird = wat(mmap.mmap(expfile.fileno(), 0, prot=mmap.PROT_READ))
    job = tpool.imap_unordered(
        _fpd, itertools.izip_longest(gettingweird, (), fillvalue=docspath))
    while True:
        try:
            job.next()
        except Exception:
            return
Code example #3
File: commonTips.py  Project: johnsonhongyi/pyQuant
def to_mp_run_async(cmd, urllist, *args):
    # n_t=time.time()
    # print "mp_async:%s" % len(urllist),
    pool = ThreadPool(cpu_count())
    # print arg
    # print cpu_count()
    # pool = multiprocessing.Pool(processes=8)
    # for code in codes:
    #     results=pool.apply_async(sl.get_multiday_ave_compare_silent_noreal,(code,60))
    # result=[]
    # results = pool.map(cmd, urllist)
    # for code in urllist:
    # result.append(pool.apply_async(cmd,(code,)))
    results = []
    for code in urllist:
        # result = pool.apply_async(cmd, (code, arg))
        # arg=(code)+','+(args)
        # print arg
        result = pool.apply_async(cmd, (code,) + args).get()
        results.append(result)
    pool.close()
    pool.join()
    # results = flatten(map(lambda x: x.get(), results))
    # results = flatten( results)
    # print "time:MP", (time.time() - n_t)
    return results
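Because `.get()` is called immediately after each `apply_async`, the loop above waits for every task before submitting the next one, so it runs essentially serially. A hedged sketch of the usual submit-first, collect-later variant, keeping the same hypothetical `cmd` and `urllist` arguments:

from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool


def to_mp_run_async_parallel(cmd, urllist, *args):
    pool = ThreadPool(cpu_count())
    # submit every task first so they actually run concurrently...
    async_results = [pool.apply_async(cmd, (code,) + args) for code in urllist]
    pool.close()
    pool.join()
    # ...then collect the results once all of them have finished
    return [r.get() for r in async_results]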
Code example #4
File: process_pool.py  Project: webvul/webfuzzer
    def __init__(self,
                 processes=None,
                 initializer=None,
                 initargs=(),
                 maxtasksperchild=None):
        self._setup_queues()
        self._taskqueue = SilentJoinableQueue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        if processes is None:
            try:
                processes = cpu_count()
            except NotImplementedError:
                processes = 1
        if processes < 1:
            raise ValueError("Number of processes must be at least 1")

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        self._processes = processes
        self._pool = []
        self._repopulate_pool()

        self._worker_handler = threading.Thread(target=Pool._handle_workers,
                                                args=(self, ),
                                                name='PoolWorkerHandler')
        self._worker_handler.daemon = True
        self._worker_handler._state = RUN
        self._worker_handler.start()

        self._task_handler = threading.Thread(
            target=Pool._handle_tasks,
            args=(self._taskqueue, self._quick_put, self._outqueue, self._pool,
                  self._cache),
            name='PoolTaskHandler')
        self._task_handler.daemon = True
        self._task_handler._state = RUN
        self._task_handler.start()

        self._result_handler = threading.Thread(target=Pool._handle_results,
                                                args=(self._outqueue,
                                                      self._quick_get,
                                                      self._cache),
                                                name='PoolResultHandler')
        self._result_handler.daemon = True
        self._result_handler._state = RUN
        self._result_handler.start()

        self._terminate = Finalize(
            self,
            self._terminate_pool,
            args=(self._taskqueue, self._inqueue, self._outqueue, self._pool,
                  self._worker_handler, self._task_handler,
                  self._result_handler, self._cache),
            exitpriority=15)
Code example #5
File: process_pool.py  Project: 0x554simon/w3af
    def __init__(self, processes=None, initializer=None, initargs=(),
                 maxtasksperchild=None):
        self._setup_queues()
        self._taskqueue = SilentJoinableQueue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        if processes is None:
            try:
                processes = cpu_count()
            except NotImplementedError:
                processes = 1
        if processes < 1:
            raise ValueError("Number of processes must be at least 1")

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        self._processes = processes
        self._pool = []
        self._repopulate_pool()

        self._worker_handler = threading.Thread(
            target=Pool._handle_workers,
            args=(self, ),
            name='PoolWorkerHandler'
            )
        self._worker_handler.daemon = True
        self._worker_handler._state = RUN
        self._worker_handler.start()

        self._task_handler = threading.Thread(
            target=Pool._handle_tasks,
            args=(self._taskqueue, self._quick_put, self._outqueue,
                  self._pool, self._cache),
            name='PoolTaskHandler')
        self._task_handler.daemon = True
        self._task_handler._state = RUN
        self._task_handler.start()

        self._result_handler = threading.Thread(
            target=Pool._handle_results,
            args=(self._outqueue, self._quick_get, self._cache),
            name='PoolResultHandler')
        self._result_handler.daemon = True
        self._result_handler._state = RUN
        self._result_handler.start()

        self._terminate = Finalize(
            self, self._terminate_pool,
            args=(self._taskqueue, self._inqueue, self._outqueue, self._pool,
                  self._worker_handler, self._task_handler,
                  self._result_handler, self._cache),
            exitpriority=15)
Code example #6
def learn(epochs=500, nsamples=int(1e5), init_weights=False):
    if not len(list(Sound.database.iter_ids())) > 0:
        print 'there are no sounds to learn from, exiting'
        return

    network = Network()

    if init_weights:
        recent_id = most_recent_id()
        print 'loading weights from', recent_id
        faae = EmbeddingPipeline(_id=recent_id)
        previous_network = faae.pipeline[1].network
        network.load_state_dict(previous_network.state_dict())
        print 'loaded weights from', recent_id
        del previous_network

    trainer = zounds.TripletEmbeddingTrainer(
        network,
        epochs=epochs,
        batch_size=64,
        anchor_slice=anchor_slice,
        deformations=[nearby, pitch_shift, time_stretch, additive_noise])

    pool = Pool(cpu_count())
    iterator = pool.imap_unordered(access_log_spectrogram, Sound)

    _id = 'Embedding{t}'.format(t=int(time.time() * 1e6))

    EmbeddingPipeline.process(_id=_id,
                              samples=iterator,
                              trainer=trainer,
                              bits=1024,
                              nsamples=nsamples)

    pool.close()
    pool.join()

    soundalike_client.add_trained_model(_id)

    print 'Learner most recent', _id

    print 'computing learned features'
    snd_class = with_hash(_id)
    tpool = ThreadPool(4)

    def compute_hashed(snd):
        _id = snd._id
        logger.debug('Computing {_id} hashed feature'.format(**locals()))
        snd_class.hashed.compute(_id=_id, persistence=snd)

    tpool.map(compute_hashed, snd_class)

    return _id
Code example #7
File: decorators.py  Project: phiotr/RV-Optimization
def parallel_execution(function):
    pool_size = pool.cpu_count() * 2
    jobs_pool = pool.ThreadPool(pool_size)

    def parallel_execution_wrapper(x_array, *args):
        # Split data into the parts
        data_parts = numpy.array_split(x_array, pool_size)
        # Execute the function on each part and combine all blocks into one numpy.array
        return numpy.concatenate(
            jobs_pool.map(lambda part_of_data: function(part_of_data, *args),
                          data_parts))

    return parallel_execution_wrapper
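A hedged usage sketch of the decorator above. It assumes `parallel_execution` and its `pool`/`numpy` imports are in scope as in the snippet; `scaled` is a made-up example function:

import numpy

@parallel_execution
def scaled(x_array, factor):
    # hypothetical element-wise computation applied to one chunk of the input
    return x_array * factor

# the input array is split into pool_size chunks, processed on the thread
# pool, and concatenated back into a single numpy array
result = scaled(numpy.arange(1000000), 2.0)
assert result.shape == (1000000,)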
Code example #8
File: crawl_landmark.py  Project: Dectinc/deep_vlad
def crawl():
    pool = Pool(cpu_count() - 2)
    image_list, num_images = load_image_list(args.list_file)
    print 'Loaded {} images'.format(num_images)
    cleaned_image_list, cleaned_num_images = clean_image_list(image_list)
    print '{} images to crawl'.format(cleaned_num_images)
    pbar = get_progress_bar(cleaned_num_images)

    for i, _ in enumerate(pool.imap(crawl_job, cleaned_image_list), 1):
        pbar.update(i)
    pbar.finish()
    Image.save_image_list(image_list, args.image_cache)
    Landmark.save_all(args.landmark_cache)
    logging.info('All done')
Code example #9
File: spl.py  Project: verbalsaintmars/srmparserlite
   def Start(this, a_sites):
      # Check One Big File existence
      l_siteMap = this.BigFileTest(a_sites)
      a_sites = [value for value in l_siteMap.itervalues()]
      # Sync time for criteria or not, also check criteria == 0
      this.TakeConfigs(a_sites)

      if a_sites.__len__() == 0:
         print("Nothing to do~~~ boring...")
         return

      l_tpool = mpo.ThreadPool(processes=mpo.cpu_count())
      l_result = l_tpool.map_async(parser.Parser(), a_sites, 2)
      l_result.get()
Code example #10
File: crawl_landmark.py  Project: bo-xiong/deep_vlad
def crawl():
    pool = Pool(cpu_count() - 2)
    image_list, num_images = load_image_list(args.list_file)
    print 'Loaded {} images'.format(num_images)
    cleaned_image_list, cleaned_num_images = clean_image_list(image_list)
    print '{} images to crawl'.format(cleaned_num_images)
    pbar = get_progress_bar(cleaned_num_images)

    for i, _ in enumerate(pool.imap(crawl_job, cleaned_image_list), 1):
        pbar.update(i)
    pbar.finish()
    Image.save_image_list(image_list, args.image_cache)
    Landmark.save_all(args.landmark_cache)
    logging.info('All done')
Code example #11
File: random_samples.py  Project: maozhiqiang/zounds
    def _total_samples(self, cls, feature, _ids):
        pool = ThreadPool(cpu_count())

        feature_filter = self.feature_filter

        def x(_id):
            f = feature(_id=_id, persistence=cls)
            filtered = feature_filter(f)
            return len(filtered)

        if self.parallel:
            total_samples = sum(pool.imap_unordered(x, _ids))
        else:
            total_samples = sum(map(x, _ids))
        return total_samples
Code example #12
    def __init__(self):
        # Python 2.x
        super(FindActiveServer, self).__init__()
        # Python 3.x
        # super().__init__()
        self.host_list = []
        self.default_port = 80
        self.port = self.default_port
        self.protocol = None
        self.url_path = None
        self.regex = None
        self.request_timeout = None
        self.default_num_threads = min(cpu_count() * 4, 100)
        self.num_threads = None
        self.queue = Queue.Queue()
        self.pool = None
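The min(cpu_count() * 4, 100) default over-subscribes the CPUs because probing hosts is I/O-bound, while the cap keeps the thread count from growing without bound on large machines. A standalone sketch of the same sizing rule (check_host is a hypothetical worker):

from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool


def check_host(host):
    # hypothetical network probe; real code would attempt a request here
    return host


def find_active(host_list, num_threads=None):
    # I/O-bound work: allow several threads per core, but cap the total
    num_threads = num_threads or min(cpu_count() * 4, 100)
    tpool = ThreadPool(num_threads)
    try:
        return tpool.map(check_host, host_list)
    finally:
        tpool.close()
        tpool.join()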
Code example #13
File: random_samples.py  Project: maozhiqiang/zounds
    def _process(self, data):
        cls, feature = data

        # compute the total number of samples in our dataset
        _ids = list(cls.database.iter_ids())
        total_samples = self._total_samples(cls, feature, _ids)
        print 'Total samples', total_samples

        while True:
            if self.parallel:
                pool = ThreadPool(cpu_count())
                list(
                    pool.imap_unordered(
                        lambda _id: self._update_reservoir(
                            _id, cls, feature, total_samples), _ids))
            else:
                for _id in _ids:
                    self._update_reservoir(_id, cls, feature, total_samples)

            yield self.reservoir.get()
Code example #14
File: commonTips.py  Project: johnsonhongyi/pyQuant
def to_mp_run(cmd, urllist):
    # n_t=time.time()
    print "mp:%s" % len(urllist),

    pool = ThreadPool(cpu_count())
    # pool = ThreadPool(4)
    # print cpu_count()
    # pool = multiprocessing.Pool(processes=8)
    # for code in codes:
    #     results=pool.apply_async(sl.get_multiday_ave_compare_silent_noreal,(code,60))
    # result=[]
    results = pool.map(cmd, urllist)
    # for code in urllist:
    # result.append(pool.apply_async(cmd,(code,)))

    pool.close()
    pool.join()
    results = flatten(results)
    # print "time:MP", (time.time() - n_t)
    return results
Code example #15
    def __call__(this, a_site):
        """
      a_site =>
      Site1 = {
         "name": "Site_1",
         "dir": r"/myfiles/Source/vsProject/srmparserlite/pplog/",
         "criteria": (site1pack11,),
         "type": "config",  # ignore other parameter. gen same
         splsync_{nu}.log on each site
         "dayoffset": 1}
      """
        a_site["dir"] = os.path.normpath(a_site["dir"])

        l_siteCriterionTuple = tuple((a_site, l_criterion) for l_criterion in a_site["criteria"])

        l_tpool = mpo.ThreadPool(processes=mpo.cpu_count())
        """
      l_siteCriterionTuple => ((a_site, site1pack11),(a_site, site1pack12))
      """
        l_result = l_tpool.map_async(this.ParseCriterion, l_siteCriterionTuple, 2)
        l_result.get()
Code example #16
def parse_command_line():
    """ Parse the command line."""

    parser = ArgumentParser(description='Computes and stores the coefficients '
                            'defining the eccentricity expansion of the '
                            'various tidal terms. The coefficients are '
                            'computed exactly with no numerical roundoff as '
                            'rational numbers, but converted and stored as '
                            '16 significant figures floating point numbers. '
                            'Uses multiple CPUs to carry out the computation '
                            'and stores enough information to re-use a '
                            'previous computation to lower order.')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default='eccentricity_expansion_coefficients',
                        help='The filename to store the computed expansion '
                        'coefficients to. The file is overwritten if it '
                        'exists. Default: "%(default)s".')
    parser.add_argument('-a',
                        '--ab-file',
                        type=str,
                        default='alpha_beta_values',
                        help='The filename in which to store the exact '
                        'alpha and beta values computed while calculating '
                        'the expansion coefficients. Default: "%(default)s"')
    parser.add_argument('-c',
                        '--cpus',
                        type=int,
                        default=cpu_count(),
                        help='The number of processes to use for the '
                        'calculation. Default: %(default)d.')
    parser.add_argument('-p',
                        '--max-power',
                        type=int,
                        default=50,
                        help='Expansion is calculated up to this power of '
                        'the eccentricity. Default: %(default)d.')
    return parser.parse_args()
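A hedged sketch of how such a parser is typically consumed, passing --cpus straight to a process pool; compute_term is a made-up worker and parse_command_line is the function above:

from multiprocessing import Pool


def compute_term(power):
    # hypothetical computation of the expansion terms for one power
    return power


if __name__ == '__main__':
    args = parse_command_line()
    pool = Pool(processes=args.cpus)   # --cpus defaults to cpu_count()
    terms = pool.map(compute_term, range(args.max_power + 1))
    pool.close()
    pool.join()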
Code example #17
#symbol_infinity="inf",
#symbol_infinity_correction=0)
#plot_example()
#pp.title("Infinity symbol: 'inf'")

##pp.savefig(build_path + "ex_misc.pdf", format="pdf")

if __name__ == '__main__':
    if build_all:
        # clear and create path
        if os.path.exists(build_path):
            shutil.rmtree(build_path)
            time.sleep(0.5)
        os.makedirs(build_path)

        p = Pool(pool.cpu_count())
        r = []
        for key, func in locals().copy().iteritems():
            if isinstance(func, FunctionType) and "make_" in key:
                r += [p.apply_async(func, {})]

        for proc in r:
            proc.get()
    else:
        #         make_grids_on()
        #         make_fancy_grids()
        #         make_grid_locators()
        #         make_scale()
        #         make_markers()
        #         make_circle()
        #         make_interpolation()
Code example #18
    plot_example()
    pp.title("Infinity symbol: 'inf'")

    pp.savefig(build_path + "ex_misc.pdf", format="pdf")



if __name__ == '__main__':
    if build_all:
    # clear and create path
        if os.path.exists(build_path):
            shutil.rmtree(build_path)
            time.sleep(0.5)
        os.makedirs(build_path)

        p = Pool(pool.cpu_count())
        r = []
        for key, func in list(locals().copy().items()):
            if isinstance(func, FunctionType) and "make_" in key:
                r += [p.apply_async(func, {})]

        for proc in r:
            proc.get()
    else:
#         make_grids_on()
#         make_fancy_grids()
#         make_grid_locators()
#         make_scale()
#         make_markers()
#         make_circle()
#         make_interpolation()
Code example #19
def feature_extraction(dirpath,
                       suffix_seg,
                       suffix_int,
                       num_LMs,
                       downsample,
                       clustering,
                       features,
                       recurse=False,
                       select_IDs='all',
                       assign_landmarks_kwargs='default',
                       compute_TFOR=True,
                       transform_to_TFOR_kwargs='default',
                       perform_CBE_TFOR_kwargs='default',
                       compute_CFOR=True,
                       perform_CBE_CFOR_kwargs='default',
                       processes=None,
                       dask_graph_path=None,
                       profiling=False,
                       verbose=False):
    """Extract latent features from fluorescence distributions of single-cell
    segmentations by point cloud sampling and cluster-based embedding.

    This is a dask pipeline that applies point-cloud sampling from
    `katachi.tools.assign_landmarks`, transformation to the TFOR (optional)
    from `katachi.tools.find_TFOR` and cluster-based embedding (either on TFOR
    data or by constructing a CFOR, or both) from `katachi.tools.perform_CBE`
    to a dataset of single-cell segmentations that has been generated by
    `katachi.pipelines.segmentation` or an equivalent approach.

    WARNING: Not all options provided by this pipeline have been extensively
    tested. Use with prudence!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    suffix_seg : string
        File suffix that identifies target segmentation files as produced by
        `katachi.pipelines.segmentation`. This will usually be "seg.tif" but
        could contain more information to distinguish different segmentations.
    suffix_int : string
        File suffix that identifies target intensity files matching the shape
        of the target segmentation files. Each retrieved segmentation file must
        have a matching intensity file.
    num_LMs : int
        The number of landmarks to extract for each cell.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction.
        See `katachi.tools.perform_CBE` for more information.
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction.
        See `katachi.tools.perform_CBE` for more information.
        Special case: both elements of clustering (i.e. `algorithm` and
        `n_clusters`) may themselves be tuples. In this case, their first and
        second elements will be used in CBE on TFOR and CFOR, respectively.
    features : list of strings
        List containing any number of cluster features to be extracted.
        See `katachi.tools.perform_CBE` for more information.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
    select_IDs : 'all' or list of strings, optional, default 'all'
        If 'all' (default), all detected input files (i.e. all samples) are
        used. Instead, a list of strings containing IDs (as assigned by
        `katachi.tools.initialize`) can be passed, in which case only samples
        whose IDs are in the list are used. If there are IDs in the list for
        which no matching files were found, a warning is shown.
    assign_landmarks_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for assign_landmarks function.
        See `katachi.tools.assign_landmarks.assign_landmarks` for information
        about available options.
        See section "Prepare kwargs for landmark assignment" in this function
        for information on default settings.
    compute_TFOR : bool, optional, default True
        If True, the prim frame of reference is computed and CBE is performed
        on the TFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    transform_to_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for transform_to_TFOR function.
        See `katachi.tools.find_TFOR.transform_to_TFOR` for information
        about available options.
        See section "Prepare kwargs for transformation to TFOR" in this
        function for information on default settings.
    perform_CBE_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to TFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on TFOR" in this function for
        information on default settings.
    compute_CFOR : bool, optional, default True
        If True, the cell frame of reference is computed and CBE is performed
        on the CFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    perform_CBE_CFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to CFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on CFOR" in this function for
        information on default settings.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (but dask is still required for CBE!).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # Function to select pairs of files (seg, dir) and create paths
    def prepare_fpaths(dirpath, fnames):

        # Find segmentation files
        seg_names = [
            fname for fname in fnames if fname.endswith(suffix_seg + ".tif")
        ]

        # Exclude files not in select_IDs
        if not select_IDs == 'all':
            seg_names = [
                fname for fname in seg_names
                if any([fname.startswith(ID) for ID in select_IDs])
            ]

        # Get IDs
        seg_IDs = [fname[:10] for fname in seg_names]

        # Get matching intensity files
        int_names = []
        for ID in seg_IDs:
            int_name = [
                fname for fname in fnames
                if fname.startswith(ID) and fname.endswith(suffix_int + ".tif")
            ]
            try:
                int_names.append(int_name[0])
            except IndexError:
                raise IOError("Could not find matching intensity file for " +
                              "segmentation file with ID " + ID)

        # Create path
        seg_paths = [os.path.join(dirpath, name) for name in seg_names]
        int_paths = [os.path.join(dirpath, name) for name in int_names]

        # Return results
        return [(seg_paths[i], int_paths[i]) for i in range(len(seg_paths))]

    # Remove .tif if it was specified with the suffix
    if suffix_seg.endswith(".tif"): suffix_seg = suffix_seg[:-4]
    if suffix_int.endswith(".tif"): suffix_int = suffix_int[:-4]

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = prepare_fpaths(dirpath, fnames)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += prepare_fpaths(dpath, fnames)

    # Test if all samples in select_IDs are present
    if not select_IDs == 'all':
        fpaths_IDs = [os.path.split(fp[0])[1][:10] for fp in fpaths]
        orphan_IDs = [ID for ID in select_IDs if ID not in fpaths_IDs]
        if any(orphan_IDs):
            warn(
                "No matching files found for some of the IDs in select_IDs: " +
                ", ".join(orphan_IDs))

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Handle processes
    if processes is None:
        processes = cpu_count() // 2

    # More checks
    if not compute_TFOR and not compute_CFOR:
        raise IOError("At least one of compute_TFOR or compute_CFOR must be " +
                      "set to True.")

    # Report
    if verbose:
        print "Detected", len(fpaths), "target file pairs."

    #--------------------------------------------------------------------------

    ### Prepare kwargs for landmark assignment

    # Default kwargs for landmark assignment
    la_kwargs = dict()
    la_kwargs['save_centroids'] = True
    la_kwargs['fpath_out'] = None
    la_kwargs['show_cells'] = None
    la_kwargs['verbose'] = False
    la_kwargs['global_prep_func'] = None
    la_kwargs['global_prep_params'] = None
    la_kwargs['local_prep_func'] = None
    la_kwargs['local_prep_params'] = None
    la_kwargs['landmark_func'] = 'default'
    la_kwargs['landmark_func_params'] = None

    # User-specified kwargs for landmark assignment
    if assign_landmarks_kwargs != 'default':
        for kw in assign_landmarks_kwargs.keys():
            la_kwargs[kw] = assign_landmarks_kwargs[kw]

    # Safety check
    if la_kwargs['fpath_out'] is not None:
        raise IOError(
            "`assign_landmarks_kwargs['fpath_out']` must be set to " +
            "`None`, otherwise files will overwrite each other.")

    #--------------------------------------------------------------------------

    ### Prepare kwargs for TFOR transformation

    # Default kwargs for transformation to TFOR
    TFOR_kwargs = dict()
    TFOR_kwargs['n_points'] = 3000
    TFOR_kwargs['verbose'] = False
    TFOR_kwargs['show'] = False

    # User-specified kwargs for TFOR
    if transform_to_TFOR_kwargs != 'default':
        for kw in transform_to_TFOR_kwargs.keys():
            TFOR_kwargs[kw] = transform_to_TFOR_kwargs[kw]

    # Safety check
    if not compute_TFOR and transform_to_TFOR_kwargs != 'default':
        warn("Non-default kwargs were passed for transformation to TFOR but " +
             "compute_TFOR is set to False!")

    #--------------------------------------------------------------------------

    ### Prepare args for CBE

    # Handle differing clustering inputs for TFOR and CFOR
    if type(clustering[0]) == tuple:
        clustering_TFOR = (clustering[0][0], clustering[1][0])
        clustering_cfor = (clustering[0][1], clustering[1][1])
    else:
        clustering_TFOR = clustering_cfor = clustering

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on TFOR

    # Default kwargs for CBE
    cbe_TFOR_kwargs = dict()
    cbe_TFOR_kwargs['normalize_vol'] = None
    cbe_TFOR_kwargs['presample'] = None
    cbe_TFOR_kwargs['cfor'] = None
    cbe_TFOR_kwargs['standardize'] = False
    cbe_TFOR_kwargs['custom_feature_funcs'] = None
    cbe_TFOR_kwargs['dask_graph_path'] = None
    cbe_TFOR_kwargs['processes'] = processes
    cbe_TFOR_kwargs['profiling'] = False
    cbe_TFOR_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_TFOR_kwargs['save_metadata'] = True
    cbe_TFOR_kwargs['save_presampled'] = False
    cbe_TFOR_kwargs['save_cfor'] = False
    cbe_TFOR_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_TFOR_kwargs != 'default':
        for kw in perform_CBE_TFOR_kwargs.keys():
            cbe_TFOR_kwargs[kw] = perform_CBE_TFOR_kwargs[kw]

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on CFOR

    # Default kwargs for CBE
    cbe_cfor_kwargs = dict()
    cbe_cfor_kwargs['normalize_vol'] = True
    cbe_cfor_kwargs['presample'] = None
    cbe_cfor_kwargs['cfor'] = ('PD', 3)
    cbe_cfor_kwargs['standardize'] = True
    cbe_cfor_kwargs['custom_feature_funcs'] = None
    cbe_cfor_kwargs['dask_graph_path'] = None
    cbe_cfor_kwargs['processes'] = processes
    cbe_cfor_kwargs['profiling'] = False
    cbe_cfor_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_cfor_kwargs['save_metadata'] = True
    cbe_cfor_kwargs['save_presampled'] = False
    cbe_cfor_kwargs['save_cfor'] = True
    cbe_cfor_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_CFOR_kwargs != 'default':
        for kw in perform_CBE_CFOR_kwargs.keys():
            cbe_cfor_kwargs[kw] = perform_CBE_CFOR_kwargs[kw]

    #--------------------------------------------------------------------------

    ### If desired: run sequentially

    if processes == 1:

        if verbose: print "Processing target file pairs sequentially..."

        # Landmark extraction
        if verbose: print "--Assigning landmarks..."
        fpaths_lm = []
        for seg_path, int_path in fpaths:
            assign_landmarks(seg_path, int_path, num_LMs, **la_kwargs)
            fpaths_lm.append((seg_path, int_path[:-4] + "_LMs.npy"))

        # Computing the TFOR and performing CBE on TFOR
        if compute_TFOR:

            # Run the transformation to TFOR
            if verbose: print "--Transforming to TFOR..."
            fpaths_TFOR = []
            for seg_path, lm_path in fpaths_lm:
                transform_to_TFOR(seg_path, lm_path, **TFOR_kwargs)
                fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

            # Performing CBE on TFOR
            if verbose: print "--Performing CBE on TFOR..."
            cbe(fpaths_TFOR, downsample, clustering_TFOR, features,
                **cbe_TFOR_kwargs)

        # Performing CBE on CFOR
        if compute_CFOR:
            if verbose: print "--Performing CBE on CFOR..."
            lm_paths = [fpath[1] for fpath in fpaths_lm]
            cbe(lm_paths, downsample, clustering_cfor, features,
                **cbe_cfor_kwargs)

        # Done
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    dask_graph = dict()

    # For each input...
    fpaths_lm = []
    fpaths_TFOR = []
    for idx, fpath in enumerate(fpaths):

        # Landmark extraction nodes
        seg_path, int_path = fpath
        asgn_lms = partial(assign_landmarks, **la_kwargs)
        dask_graph["asgn_lms_%i" % idx] = (asgn_lms, seg_path, int_path,
                                           num_LMs)
        lm_path = int_path[:-4] + "_LMs.npy"
        fpaths_lm.append(lm_path)

        # Transform to TFOR
        if compute_TFOR:

            # Transform to TFOR
            tf2TFOR = partial(transform_to_TFOR, **TFOR_kwargs)
            tf2TFOR_await = lambda _, s, lmp: tf2TFOR(s, lmp)
            dask_graph["tf2TFOR_%i" % idx] = (tf2TFOR_await,
                                              "asgn_lms_%i" % idx, seg_path,
                                              lm_path)
            fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

    # Perform CBE on TFOR
    if compute_TFOR:
        cbe_TFOR = partial(cbe, **cbe_TFOR_kwargs)
        cbe_TFOR_await = lambda _, lmp, ds, cl, fe: cbe_TFOR(lmp, ds, cl, fe)
        dask_graph["CBE_TFOR"] = (cbe_TFOR_await, [
            "tf2TFOR_%i" % idx for idx in range(len(fpaths))
        ], fpaths_TFOR, downsample, clustering_TFOR, features)

    # Perform CBE on CFOR
    if compute_CFOR:

        cbe_cfor = partial(cbe, **cbe_cfor_kwargs)
        cbe_cfor_await = lambda _, lmp, ds, cl, fe: cbe_cfor(lmp, ds, cl, fe)

        # Don't parallelize CBEs; wait for TFOR-CBE to finish
        if compute_TFOR:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, "CBE_TFOR", fpaths_lm,
                                      downsample, clustering_cfor, features)
        else:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, [
                "asgn_lms_%i" % idx for idx in range(len(fpaths))
            ], fpaths_lm, downsample, clustering_cfor, features)

    # Create dask graph
    if dask_graph_path is not None:
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # Report
    if verbose: print "Processing target file pairs in parallel..."

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_CFOR')
        else:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_TFOR')

    # Run the pipeline (with resource profiling)
    if profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_CFOR')
                visualize([prof, rprof], save=False)
        else:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_TFOR')
                visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
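A hedged usage sketch of feature_extraction based on the docstring above; the directory, suffixes, and parameter values are purely illustrative:

# Illustrative call only: every path, suffix and parameter value below is an
# assumption chosen to match the docstring, not a known working configuration.
feature_extraction(
    dirpath="data/experiment_01",
    suffix_seg="seg.tif",                 # segmentation files end in ...seg.tif
    suffix_int="int.tif",                 # matching intensity files (hypothetical suffix)
    num_LMs=500,                          # landmarks sampled per cell
    downsample=("ddds", 200000),          # density-dependent downsampling of the merged cloud
    clustering=("kmeans", 10),            # cluster centers used for CBE
    features=["kNN-distEuclid", "count-near"],
    recurse=True,
    compute_TFOR=True,
    compute_CFOR=True,
    processes=None,                       # default: half of the available CPUs
    verbose=True)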
def full_segmentation(dirpath, channel, IDs=None, lin_unmix=False, 
                      recurse=False, ignore_old=True, fname_prefix=None,
                      processes=None, subprocesses=1,
                      profiling=False, verbose=False,
                      unmix_params=(0.0, 1.0, 20),
                      segment_params={'median_size'     : 3,
                                      'gaussian_sigma'  : 3,
                                      'max_offset'      : 10,
                                      'offset_step'     : 1,
                                      'clean_small'     : 1000,
                                      'clean_big'       : 1000000,
                                      'expansion_sigma' : 3} ,
                      use_legacy_unmix=False,
                      use_legacy_seg=False):
    """Segment single cells from 3D stacks of membrane-labeled tissues.

    This is a dask pipeline that applies linear unmixing (optional) from
    `katachi.tools.linearly_unmix` and 3D single-cell segmentation from
    `katachi.tools.segment` to a dataset that has previously been initialized
    using `katachi.pipelines.initialization`.

    WARNING: The approach used here has been developed for the Zebrafish
    posterior lateral line primordium. It is likely not readily applicable to
    other tissues!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    channel : string
        The channel to be used for segmentation.
    IDs : list of strings or None, optional, default None
        If None, all files found within dirpath that have the `channel` suffix
        are processed. If a list of strings (IDs) is given, only files with one
        of the given IDs as prefix are processed.
    lin_unmix : string or False, optional, default False
        If a string is given, linear unmixing will be performed, otherwise not.
        The string must be the channel designation of the 'contaminant'.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
    ignore_old : bool, optional, default True
        If True, files that already have a matching segmentation in the same
        directory will be ignored.
    fname_prefix : str or None, optional
        If not None, only file names that start with the given string are used.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (dask is not used).
    subprocesses : int, optional, default 1
        Number of processes that can be spawned for multiprocessing during
        linear unmixing. IMPORTANT: Note that the total number of processes
        running can reach up to `processes * subprocesses`! The default (1)
        runs sequentially (no multiprocessing code is used).
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    unmix_params: tuple, optional, default (0.0, 1.0, 20)
        Parameters for linear unmixing. For the default approach, it is simply
        the `a_range` tuple, which designates the values of `a` to be scanned
        in the form `(start, stop, n_steps)`. For more information, see
        `katachi.tools.linearly_unmix.unmix_linear`.
        For the legacy approach, the `unmix_params` tuple instead contains
        `(a_range, thresh)`. For more information, see
        `katachi.tools.linearly_unmix.unmix_linear_legacy`.
    segment_params : dict, optional
        Dict specifying parameters for segmentation. For more information see
        `katachi.tools.segment.segment_3D`.
    use_legacy_unmix : bool, optional, default False
        If True, the old parametric approach is used instead of the new one.
        Note that this requires adjustment of the unmix_params.
        Running in this mode triggers a DeprecationWarning.
    use_legacy_seg : bool, optional, default False
        If True, the old segmentation pipeline is used instead of the new one.
        Note that this requires adjustment of the segment_params.
        Running in this mode triggers a DeprecationWarning.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # Function to select file names and create paths
    def prepare_fpaths(fpath, fnames):

        # Select correct channel
        channel_fnames = [fname for fname in fnames
                          if fname.endswith(channel+".tif")]

        # Ignore files that have already been segmented
        if ignore_old:
            lu = ""
            if lin_unmix: lu = "_linUnmix"
            channel_fnames = [fname for fname in channel_fnames
                              if fname[:-4]+lu+"_seg.tif" not in fnames]
            
        # Ignore channels that don't match any of the given IDs
        if IDs is not None:
            channel_fnames = [fname for fname in channel_fnames if
                              any([fname.startswith(ID) for ID in IDs])]

        # Ignore channels with the wrong prefix
        if fname_prefix:
            channel_fnames = [fname for fname in channel_fnames
                              if fname.startswith(fname_prefix)]
            
        # Create full paths
        fpaths = [os.path.join(fpath, fname) for fname in channel_fnames]

        # Return results
        return fpaths

    # Clean channel if specified with file ending
    if channel.endswith(".tif"):
        channel = channel[:-4]

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = prepare_fpaths(dirpath, fnames)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += prepare_fpaths(dpath, fnames)

    # Check
    if len(fpaths) == 0 and ignore_old:
        with catch_warnings():
            simplefilter("always")
            warn("No matching files found in target directory! Doing nothing!"+
                 " Could be that all matching files have already been"+
                 " processed and are ignored now because `ignore_old=True`!")
        return
    elif len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Check for linear unmixing files
    if lin_unmix:

        for fpath in fpaths:
            if not os.path.isfile(fpath.replace(channel, lin_unmix)):
                raise IOError("File(s) for the contaminant channel '" +
                              lin_unmix + "' for linear unmixing not found. ")

    # Warn about use of deprecated approaches
    if use_legacy_unmix:
        warn("Using legacy linear unmixing is deprecated!", DeprecationWarning)
    if use_legacy_seg:
        warn("Using legacy segmentation is deprecated!", DeprecationWarning)

    # Report
    if verbose:
        print "-- Detected", len(fpaths), "target files."


    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask/multiprocessing)

    if processes == 1:

        if verbose: print "Processing target files sequentially..."

        if lin_unmix:
            if verbose: print "--Unmixing..."
            for fi,fpath in enumerate(fpaths):
                fpath_conta = fpath.replace(channel, lin_unmix)
                if not use_legacy_unmix:
                    unmix_linear(fpath, fpath_conta, subprocesses,
                                 unmix_params)
                else:
                    unmix_linear_legacy(fpath, fpath_conta, subprocesses,
                                        unmix_params[0], unmix_params[1])
                fpaths[fi] = fpath[:-4] + "_linUnmix.tif"

        if verbose: print "--Segmenting..."
        for fpath in fpaths:
            if not use_legacy_seg:
                segment_3D(fpath, params=segment_params)
            else:
                segment_3D_legacy(fpath, params=segment_params)

        if verbose: print "Processing complete!"
        return


    #--------------------------------------------------------------------------

    ### Prepare dask dict

    if verbose: print "Processing target files in parallel..."

    dask_graph = dict()

    # With linear unmixing
    if lin_unmix:

        # Wrapper to enable waiting for unmixing before segmentation is started
        def await_linear_unmixing(fpath, fpath_conta, subprocesses,
                                  unmix_params):
            if not use_legacy_unmix:
                unmix_linear(fpath, fpath_conta, subprocesses, unmix_params)
            else:
                unmix_linear_legacy(fpath, fpath_conta, subprocesses,
                                    unmix_params[0], unmix_params[1])
            return fpath[:-4] + "_linUnmix.tif"

        # Unmixing
        for fi, fpath in enumerate(fpaths):
            fpath_conta = fpath.replace(channel, lin_unmix)
            dask_graph["unmix_%i" % fi] = (await_linear_unmixing,
                                           fpath, fpath_conta, subprocesses,
                                           unmix_params)
            fpaths[fi] = fpath[:-4] + "_linUnmix.tif"

        # Segmentation
        for fi, fpath in enumerate(fpaths):
            if not use_legacy_seg:
                dask_graph["segment_%i" % fi] = (segment_3D, "unmix_%i" % fi,
                                                 False, segment_params)
            else:
                dask_graph["segment_%i" % fi] = (segment_3D_legacy,
                                                 "unmix_%i" % fi,
                                                 False, segment_params)

    # Without linear unmixing
    else:
        for fi, fpath in enumerate(fpaths):
            dask_graph["segment_%i" % fi] = (segment_3D, fpath,
                                             False, segment_params)

    # Collect the results once all segmentation tasks have finished
    dask_graph['done'] = (lambda x: "done",
                          ["segment_%i" % fi for fi in range(len(fpaths))])


    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
            visualize([prof,rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
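Similarly, a hedged usage sketch of full_segmentation; the directory, channel designations, and sample ID are illustrative only:

# Illustrative call only: directory, channel names and ID are assumptions.
full_segmentation(
    dirpath="data/experiment_01",
    channel="membrane",           # suffix of the channel used for segmentation
    IDs=["0123456789"],           # restrict processing to one sample ID (optional)
    lin_unmix="cytoplasm",        # 'contaminant' channel to unmix first, or False
    recurse=True,
    ignore_old=True,
    processes=None,               # default: half of the available CPUs
    subprocesses=2,               # extra processes for linear unmixing
    verbose=True)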
def cbe(fpaths_lm,
        downsample,
        clustering,
        features,
        normalize_vol=False,
        presample=None,
        cfor=None,
        standardize='default',
        custom_feature_funcs=None,
        bw_method=None,
        dask_graph_path=None,
        processes=None,
        profiling=False,
        suffix_out='default',
        save_metadata=True,
        save_presampled=False,
        save_cfor=False,
        verbose=False,
        legacy=False):
    """Create a feature space from a set of point clouds by cluster-based
    embedding (CBE).

    This includes the following steps:
        1. Loading a set of point clouds
        2. Normalizing point clouds by volume (optional)
        3. Down-sampling of each point cloud individually (optional)
            - Available options are random, kmeans or custom downsampling
        4. Making point clouds invariant to spatial transformation (optional)
            - Also called the "Cell Frame Of Reference" (CFOR)
            - There are currently 2 ways of accomplishing this
                - Transform to pairwise distance space (PD)
                - Transform to PCA space (PCA) [DEPRECATED]
            - It is also possible to pass a custom transform function.
        5. Merging point clouds
        6. Downsampling of merged point clouds (optional but recommended!)
            - Reduces computational cost/scaling of subsequent step
            - Options are density-dep., kmeans, random or custom downsampling
        7. Extracting cluster centers as common reference points
            - Options are kmeans, dbscan and custom clustering
        8. Extracting "cluster features" relative to the reference points
            - Done with dask for efficient chaining of operations
            - Multiple feature options available, see below
        9. Saving the resulting feature space as well as intermediate results

    Cluster features that can be extracted:
        - "kNN-distsManh"  : Manhatten distance in all dimensions of each
                             cluster to the mean point of its k nearest
                             neighbor landmarks.
        - "kNN-distEuclid" : Euclidean distance of each cluster to the mean
                             point of its k nearest neighbor landmarks.
        - "NN-distsManh"   : Manhatten distance in all dimensions of each
                             cluster to the nearest neighboring landmark.
        - "NN-distEuclid"  : Euclidean distance of each cluster to the nearest
                             neighboring landmark.
        - "count-near"     : Number of landmarks near to the cluster, where
                             'near' is the mean distance of the k nearest
                             neighbor landmarks of the cluster.
        - "count-assigned" : Number of landmarks assigned to the cluster during
                             the clustering itself.
        - "kde"            : KDE estimated from cell landmarks sampled for each
                             cluster center.
        - custom features  : See custom_feature_funcs in parameters.

    Feature computations are in part dependent on each other. To make this both
    efficient and readable/elegant, dask is used for chaining the feature
    extraction steps appropriately.

    At the end, features are concatenated into a single array of shape
    (cells, features) and then saved for each input stack separately.

    Parameters
    ----------
    fpaths_lm : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing cellular landmarks as generated by
        `katachi.tools.assign_landmarks` or `...find_TFOR`.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction. Available algorithms are
        "ddds" (density-dependent downsampling), "kmeans" (perform kmeans and
        use cluster centers as new points) or "random". If "default" is passed,
        "ddds" is used.
        Example: ("ddds", 200000).
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for downsampling.
        The call signature is
        `all_lms_ds = downsample[0](all_lms, downsample)`
        where all_lms is an array of shape (all_landmarks, dimensions) holding
        all input landmarks merged into one point cloud. Since the `downsample`
        tuple itself is passed, additional arguments can be specified in
        additional elements of that tuple. all_lms_ds must be an array of shape
        (output_size, dimensions).
        If None, no downsampling is performed. This is not recommended for
        inputs of relevant sizes (total landmarks > 20000).
        WARNING: downsampling (especially by ddds) can be very expensive for
        large numbers of cells. In those cases, it is recommended to first run
        a representative subsets of the cells and then use the resulting CBE
        clusters to extract features for the entire dataset (using the
        `previous` setting in the `clustering` argument).
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction. Available algorithms are
        "kmeans" or "dbscan". If "default" is passed, "kmeans" is used.
        Example: ('kmeans', 10)
        Alternatively, one may pass a tuple `('previous', clustering_object)`,
        where `clustering_object` is a previously fitted clustering instance
        similar to an instantiated and fitted sklearn.cluster.KMeans object. It
        must have the attribute `cluster_centers_`, which is an array of shape
        (clusters, dimensions) and the method `predict`, which given an array
        of shape `(all_landmarks, dimensions)` will return cluster labels for
        each landmark. Clustering objects from previous runs are stored in
        the metadata under the key `"clustobj-"+identifier`.
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for clustering.
        The call signature is
        `clust_labels, clust_centers = clustering[0](all_lms, clustering)`
        where all_lms is an array of shape (all_landmarks, dimensions) holding
        all input landmarks merged into one point cloud (and downsampled in the
        previous step). Since the `clustering` tuple itself is passed,
        additional arguments can be specified in additional elements of that
        tuple. `clust_labels` must be a 1D integer array assigning each input
        landmark to a corresponding cluster center. `clust_centers` must be an
        array of shape (clusters, dimensions) and contain the coordinates of
        the cluster centers. The first axis must be ordered such that the
        integers in `clust_labels` index it correctly. The number of clusters
        must match n_clusters.
    features : list of strings
        List containing any number of cluster features to be extracted. The
        strings noted in the explanation above are allowed. If custom feature
        extraction functions are passed (see below), their names must also be
        included in this list.
        Example: ["kNN-distEuclid", "count-near"]
    normalize_vol : bool, optional, default False
        If True, the volume of each input point cloud is normalized by dividing
        each landmark vector magnitude by the sum of all magnitudes.
    presample : tuple (algorithm, output_size) or None, optional, default None
        If not None, the algorithm specified is used to downsample each input
        cloud individually to output_size points. Available algorithms are
        "kmeans" (perform kmeans and use cluster centers as new points) or
        "random".
        Example: ('random', 50)
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for downsampling.
        The call signature is
        ```for cell in range(lms.shape[0]):
               lms_ds[cell,:,:] = presample[0](lms[cell,:,:], presample)```
        where lms is an array of shape (cells, landmarks, dimensions) holding
        the set of input point clouds. Since the `presample` tuple itself is
        passed, additional arguments can be specified in additional elements of
        that tuple. lms_ds must be an array of shape
        (cells, output_size, dimensions).
        If None, no presampling is performed.
    cfor : tuple (algorithm, dimensions) or None, optional, default None
        A tuple specifying the algorithm to use for recasting the landmarks in
        a space that is invariant to spatial transformations. There are two
        options available: "PD" (pairwise distance transform) and "PCA"
        (per-cell PCA and transform).
        For "PD", the total complement of pairwise distances between all points
        is computed and then subsampled to `dimensions` by selecting a
        corresponding number of distance percentiles spaced linearly from
        the 10th to the 90th percentile (inclusive).
        For "PCA", the number of dimensions in the resulting space is equal to
        the number of dimensions of the input (should be 3). The `dimensions`
        part of the argument is ignored (but it must still be supplied!).
        If "default" is passed, "PD" is used.
        Example 1: ('PD', 6)
        Example 2: ('default', 6)  # defaults to 'PD'
        Example 3: ('PCA', 3)
        Alternatively, if a callable is passed instead of a string, that
        callable is used for the transformation.
        The call signature is
        ```for cell in range(lms.shape[0]):
               lms_cfor[cell,:,:] = cfor[0](lms[cell,:,:], cfor)```
        where lms is an array of shape (cells, landmarks, dimensions) holding
        the set of input point clouds. Since the `cfor` tuple itself is passed,
        additional arguments can be specified in additional elements of that
        tuple. `lms_cfor` must be an array of shape
        (cells, landmarks, cfor[1]).
        If None, no transformation is performed; cells are left in the original
        3D space.
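        As an illustration, the idea behind "PD" roughly corresponds to the
        following sketch (for intuition only; the actual implementation in
        `pd_transform` may differ in its details):
        ```import numpy as np
           from scipy.spatial.distance import cdist
           def pd_sketch(cell_lms, dimensions):
               dists = cdist(cell_lms, cell_lms)       # pairwise distances
               pcts = np.linspace(10, 90, dimensions)  # 10th..90th percentiles
               # per landmark: percentiles of its distances to all other points
               return np.percentile(dists, pcts, axis=1).T```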
    standardize : bool or 'default', optional, default 'default'
        If True, the point cloud dimensions of the merged CFOR point cloud are
        standardized to zero mean and unit variance. This is also propagated
        to the individual clouds used for feature extraction and for saving
        in case the CFOR is being saved.
        If 'default', standardization is performed only if cfor is set to "PD".
        If False, no standardization is performed.
    custom_feature_funcs : list of tuples or None, optional, default None
        List used to specify one or more custom feature extraction functions.
        Each custom function is specified through a tuple in the list that
        is structured as such:
            `(feature_name, extraction_func, parent_names, other_params)`
        where `feature_name` is the name of the feature as it appears in the
        `features` argument, `extraction_func` is a callable, `parent_names`
        is a list of parent feature names (as they appear in the `features`
        argument) used as input to `extraction_func`, and `other_params` is a
        list of other parameters for `extraction_func`.
        The call signature is
        ```dask_graph[feature_name+"_%i" % c] =
               (extraction_func, [parent+"_%i" % c for parent in parent_names],
                other_params, lms[c,:,:], clust_centers, clust_labels[c]) ```
        within the dask graph, where `c` is the index of a cell.
        The callable must therefore accept a list of parent features (can be
        an empty list), a list of other parameters (can also be empty), the
        (preprocessed) landmarks of the given cell, the cluster centers and
        the cluster labels of the given cell.
        It must return a 1D array of float values; the feature vector for the
        current cell `c`.
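        As an illustration, a minimal sketch of such a custom function (the
        feature name and the computed quantity are hypothetical):
        ```import numpy as np
           def mean_dist_assigned(parents, params, cell_lms, centers, labels):
               # mean distance of each cluster's assigned landmarks to its center
               out = np.zeros(centers.shape[0])
               for k in range(centers.shape[0]):
                   assigned = cell_lms[labels == k]
                   if assigned.size:
                       out[k] = np.linalg.norm(assigned - centers[k],
                                               axis=1).mean()
               return out```
        It could then be registered via
        `custom_feature_funcs=[("mean-distAssigned", mean_dist_assigned, [], [])]`
        and by adding "mean-distAssigned" to `features`.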
    bw_method : str, scalar, callable or None, optional, default None
        The method used to calculate the estimator bandwidth for the gaussian
        kde when computing the "kde" feature. This can be ‘scott’, ‘silverman’,
        a scalar constant or a callable. If a scalar, this will be used
        directly as `kde.factor`. If a callable, it should take a gaussian_kde
        instance as only parameter and return a scalar. If None (default),
        ‘scott’ is used. This is ignored if "kde" is not in `features`.
        < Modified from `scipy.stats.gaussian_kde` doc string. >
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        summarizes the feature extraction pipeline for the first 3 cells.
        Note: If the resulting graph contains multiple separate graphs, the
        only relevant graph is the one leading into `fspace` as an end result.
    processes : int or None, optional, default None
        Number of processes to use in multiprocessed and dask-controlled
        operations. If None, a number equal to half the available CPUs is used.
        If `1` (one), no multiprocessing is performed and `dask.get` is used
        instead of `dask.threaded.get`.
    profiling : bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    suffix_out : 'default' or dict, optional, default 'default'
        If 'default', the output is saved using '_PRES', '_CFOR', '_DS', and
        '_CBE' as suffixes for the presampled landmarks (if `presample` is not
        None), for the CFOR-transformed landmarks (if `cfor` is not None), for
        the overlaid downsampling (if `downsample` is not None; note that this
        is not saved explicitly but is part of the suffix for the CBE-embedded
        feature space), and for the CBE-embedded feature space, respectively.
        The suffixes are chained as appropriate. If a dict is passed, each of
        these suffixes can be specified manually using the keys 'PRES', 'CFOR',
        'DS', 'CBE' and 'META'.
        The suffix specified in 'META' is added to all relevant metadata
        dictionary keys. For any suffixes not specified in the suffix_out dict,
        the 'default' suffix is used.
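        Example (hypothetical custom suffixes):
        {'PRES': '_PRESAMPLED', 'CBE': '_CBE128'}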
    save_metadata : bool, optional, default True
        If True, cluster samples, cluster labels and a feature header are saved
        to the metadata of each input stack as appropriate.
    save_presampled : bool, optional, default False
        If True, the result of the presampling step is saved with the suffix
        "PRES" for later use.
    save_cfor : bool, optional, default False
        If True, the result of the cfor step is saved with the suffix "CFOR"
        for later use.
    verbose : bool, optional, default False
        If True, more information is printed.
    legacy : bool, optional, default False
        If True (and standardize is also set to True), the feature extraction
        is not performed in standardized space. Instead, the cluster centroids
        are transformed back to the un-standardized space.
        Triggers a deprecation warning.
    """

    #--------------------------------------------------------------------------

    ### Load data

    if verbose: print "Loading data..."

    # Handle cases of single paths
    if type(fpaths_lm) == str:
        fpaths_lm = [fpaths_lm]
    if len(fpaths_lm) == 1:
        warn(
            "fpaths_lm specifies only a single path. Usually, multiple paths" +
            " are specified so that many samples can be overlayed for" +
            " feature extraction!")

    # Import the landmark data
    # Note: The order of fpaths_lm is maintained and an index array is created!
    lms = []
    lms_idx = []
    for idx, fpath_lm in enumerate(fpaths_lm):
        try:
            lms_in = np.load(fpath_lm)
            lms.append(lms_in)
            lms_idx += [idx for i in range(lms_in.shape[0])]
        except:
            print "Attempting to load landmark data from " + str(fpath_lm),
            print "failed with this error:"
            raise
    lms_idx = np.array(lms_idx, dtype=np.int)
    lms = np.concatenate(lms)
    if verbose: print "Total input data shape:", lms.shape

    # Check if downsampling is specified
    if downsample is None:
        warn("It is highly recommended to use downsampling (unless the data " +
             "set is very small)!")

    # Handle processes being None
    if processes is None:
        processes = cpu_count() // 2

    # Handle standardize being default
    if standardize == 'default':
        standardize = False
        if cfor is not None and cfor[0] == 'PD':
            standardize = True

    # Handle legacy mode
    if legacy:
        warn("Running in LEGACY mode! This is DEPRECATED!", DeprecationWarning)

    #--------------------------------------------------------------------------

    ### Normalize volume [per cell]

    if normalize_vol:
        if verbose: print "Normalizing volumes..."
        lms = vol_normalize(lms, verbose=verbose)

    #--------------------------------------------------------------------------

    ### Individual downsampling (presampling) [per cell]

    if presample is not None:
        if verbose: print "Presampling..."

        # Prep
        lms_ps = np.zeros((lms.shape[0], presample[1], lms.shape[2]))

        # Random subsampling
        if presample[0] == 'random':
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = ds.random_subsample(
                    lms[cell, :, :], presample[1])

        # Kmeans-based downsampling
        elif presample[0] == 'kmeans':
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = ds.kmeans_subsample(
                    lms[cell, :, :], presample[1])

        # Custom downsampling function
        elif callable(presample[0]):
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = presample[0](lms[cell, :, :], presample)

        # Handle other cases
        else:
            raise ValueError("Invalid presampling method: " +
                             str(presample[0]))

        # Assign the downsampled data back
        lms = lms_ps

    #--------------------------------------------------------------------------

    ### Transform to "Cell Frame Of Reference" (CFOR) [per cell]

    if cfor is not None:
        if verbose: print "Transforming to CFOR..."

        # Prep
        lms_cfor = np.zeros((lms.shape[0], lms.shape[1], cfor[1]))

        # Pairwise distance transform
        if cfor[0] == 'PD' or cfor[0] == 'default':
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = pd_transform(lms[cell, :, :],
                                                    percentiles=cfor[1])

        # PCA transform
        elif cfor[0] == 'PCA':
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = PCA().fit_transform(lms[cell, :, :])

        ## RBF transform by Nystroem embedding
        ## REMOVED: This does not create matched dimensions and thus cannot be
        ##          used for this purpose.
        #if cfor[0] == 'RBF':
        #    for cell in range(lms.shape[0]):
        #        Ny = kernel_approximation.Nystroem(kernel='rbf',
        #                                           gamma=1/lms.shape[1],
        #                                           n_components=cfor[1],
        #                                           random_state=42)
        #        lms_cfor[cell,:,:] = Ny.fit_transform(lms[cell,:,:])

        # Custom CFOR transform
        elif callable(cfor[0]):
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = cfor[0](lms[cell, :, :], cfor)

        # Handle other cases
        else:
            raise ValueError("Invalid CFOR method: " + str(cfor[0]))

        # Assign the CFOR data back
        lms = lms_cfor

    #--------------------------------------------------------------------------

    ### Collective downsampling (all cells overlaid) [altogether]
    #   Note: This is done to improve cluster retrieval and to make it more
    #         efficient. It does not affect the feature extraction afterwards.

    # Flatten cells of all samples together
    all_lms = lms.reshape((lms.shape[0] * lms.shape[1], lms.shape[2]))

    # For CFOR-PD: standardize the dimensions
    if standardize and not legacy:

        # Standardize pooled landmarks
        cloud_means = all_lms.mean(axis=0)
        cloud_stds = all_lms.std(axis=0)
        all_lms = (all_lms - cloud_means) / cloud_stds

        # Overwrite unpooled landmarks for feature extraction in standard space
        lms = all_lms.reshape((lms.shape[0], lms.shape[1], lms.shape[2]))

    # Downsampling
    if downsample is not None and clustering[0] != 'previous':
        if verbose: print "Downsampling merged cloud..."

        # Default is density dependent downsampling
        if downsample[0] == 'default' or downsample[0] == 'ddds':
            all_lms_ds = ds.ddds(all_lms,
                                 downsample[1],
                                 presample=downsample[1],
                                 processes=processes)

        # Alternative: kmeans downsampling
        elif downsample[0] == 'kmeans':
            all_lms_ds = ds.kmeans_subsample(all_lms, downsample[1])

        # Alternative: random downsampling
        elif downsample[0] == 'random':
            all_lms_ds = ds.random_subsample(all_lms, downsample[1])

        # Custom downsampling
        elif callable(downsample[0]):
            all_lms_ds = downsample[0](all_lms, downsample)

        # Handle other cases
        else:
            raise ValueError("Invalid downsampling method: " +
                             str(downsample[0]))

    # No downsampling
    else:
        all_lms_ds = all_lms

    # LEGACY: Standardization after downsampling and without overwriting the
    #         unpooled landmarks!
    if legacy and standardize:
        cloud_means = all_lms_ds.mean(axis=0)
        cloud_stds = all_lms_ds.std(axis=0)
        all_lms_ds = (all_lms_ds - cloud_means) / cloud_stds

    #--------------------------------------------------------------------------

    ### Find reference points by clustering [altogether]

    if verbose: print "Clustering to find reference points..."

    # Default: kmeans clustering
    if clustering[0] == 'default' or clustering[0] == 'kmeans':

        # Perform clustering
        my_clust = MiniBatchKMeans(n_clusters=clustering[1], random_state=42)
        my_clust.fit(all_lms_ds)

        # Get labels and centroids
        clust_labels = my_clust.labels_
        clust_centers = my_clust.cluster_centers_

        # Predict labels for whole data set (if downsampled)
        if downsample is not None:
            clust_labels = my_clust.predict(all_lms)

    # To be added: DBSCAN
    elif clustering[0] == 'dbscan':
        raise NotImplementedError("And likely never will be...")

    # Using a given (already fitted) clustering object
    elif clustering[0] == 'previous':
        my_clust = clustering[1]
        clust_centers = my_clust.cluster_centers_
        clust_labels = my_clust.predict(all_lms)

    # Custom alternative
    elif callable(clustering[0]):
        clust_labels, clust_centers = clustering[0](all_lms, clustering)

    # Handle other cases
    else:
        raise ValueError("Invalid clustering method: " + str(clustering[0]))

    # LEGACY: Back-transform of centroids to un-standardized space
    #         In legacy, feature extraction was done on the un-standardized
    #         space, using the back-transformed centroids
    if legacy and standardize:
        clust_centers = clust_centers * cloud_stds + cloud_means

    # Unpool cluster labels
    clust_labels = clust_labels.reshape((lms.shape[0], lms.shape[1]))

    #--------------------------------------------------------------------------

    ### Extract features relative to reference points [per cell]

    if verbose: print "Extracting cluster features..."

    # Init dask graph
    dask_graph = dict()

    # For each cell...
    for c in range(lms.shape[0]):

        # Node to compute kdtree
        dask_graph["kdtree_%i" % c] = (fe.build_kdtree, lms[c, :, :])

        # Nodes for the features
        dask_graph["kNN-distsManh_%i" % c] = (fe.feature_distsManhatten_kNN,
                                              "kdtree_%i" % c, lms[c, :, :],
                                              clust_centers)

        dask_graph["kNN-distEuclid_%i" % c] = (fe.feature_distEuclidean_kNN,
                                               "kNN-distsManh_%i" % c,
                                               lms.shape[2])

        dask_graph["NN-distsManh_%i" % c] = (fe.feature_distsManhatten_NN,
                                             "kdtree_%i" % c, lms[c, :, :],
                                             clust_centers)

        dask_graph["NN-distEuclid_%i" % c] = (fe.feature_distEuclidean_NN,
                                              "NN-distsManh_%i" % c,
                                              lms.shape[2])

        dask_graph["count-near_%i" % c] = (fe.feature_count_near, [
            "kdtree_%i" % c, "kNN-distEuclid_%i" % c
        ], lms[c, :, :], clust_centers)

        dask_graph["count-assigned_%i" % c] = (fe.feature_count_assigned,
                                               clust_centers, clust_labels[c])

        dask_graph["kde_%i" % c] = (fe.feature_kde, lms[c, :, :],
                                    clust_centers, bw_method)

        # Nodes for custom feature extraction functions
        if custom_feature_funcs is not None:
            for custom_func in custom_feature_funcs:
                custom_parents = [
                    parent + "_%i" % c for parent in custom_func[2]
                ]
                dask_graph[custom_func[0] +
                           "_%i" % c] = (custom_func[1], custom_parents,
                                         custom_func[3], lms[c, :, :],
                                         clust_centers, clust_labels[c])

        # Node to collect requested features for a cell
        dask_graph["fvector_%i" % c] = (fe.assemble_cell,
                                        [f + "_%i" % c
                                         for f in features], features)

        # Render example graph for first 3 cells
        if c == 2 and dask_graph_path is not None:
            from dask.dot import dot_graph
            dask_graph["fspace"] = (fe.assemble_fspace,
                                    ["fvector_%i" % c for c in range(3)])
            dot_graph(dask_graph, filename=dask_graph_path)

    # Final node to put per-cell features into a feature space
    dask_graph["fspace"] = (fe.assemble_fspace,
                            ["fvector_%i" % c for c in range(lms.shape[0])])

    # Run without multiprocessing
    if processes == 1:
        with ProgressBar(dt=1):
            fspace, fheader = dask.get(dask_graph, 'fspace')

    # Run with multiprocessing
    else:

        # Set number of threads
        dask.set_options(pool=ThreadPool(processes))

        # Run the pipeline (no profiling)
        if not profiling:
            with ProgressBar(dt=1):
                fspace, fheader = dask.threaded.get(dask_graph, 'fspace')

        # Run the pipeline (with resource profiling)
        if profiling:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    fspace, fheader = dask.threaded.get(dask_graph, 'fspace')
                visualize([prof, rprof], save=False)

    #--------------------------------------------------------------------------

    ### Save [per stack], report and return

    if verbose: print "Saving result..."

    # For each stack...
    for sample_idx, sample_fpath in enumerate(fpaths_lm):

        # Prepare suffix
        suffix = ""

        # Save individually downsampled landmark distributions if desired
        if presample is not None and save_presampled:
            if suffix_out == 'default' or 'PRES' not in suffix_out.keys():
                suffix = suffix + "_PRES"
            else:
                suffix = suffix + suffix_out['PRES']
            np.save(sample_fpath[:-4] + suffix,
                    lms_ps[lms_idx == sample_idx, :, :])

        # Save CFOR if desired
        if cfor is not None and save_cfor:
            if suffix_out == 'default' or 'CFOR' not in suffix_out.keys():
                suffix = suffix + "_CFOR"
            else:
                suffix = suffix + suffix_out['CFOR']
            np.save(sample_fpath[:-4] + suffix,
                    lms[lms_idx == sample_idx, :, :])

        # Include downsampling in suffix
        if downsample is not None:
            if suffix_out == 'default' or 'DS' not in suffix_out.keys():
                suffix = suffix + '_DS'
            else:
                suffix = suffix + suffix_out['DS']

        # Save shape space
        if suffix_out == 'default' or 'CBE' not in suffix_out.keys():
            suffix = suffix + "_CBE"
        else:
            suffix = suffix + suffix_out['CBE']
        np.save(sample_fpath[:-4] + suffix, fspace[lms_idx == sample_idx, :])

        # Save new metadata
        if save_metadata:

            # Construct metadata path
            dirpath, fname = os.path.split(sample_fpath)
            fpath_meta = os.path.join(dirpath,
                                      fname[:10] + "_stack_metadata.pkl")

            # Open metadata
            with open(fpath_meta, 'rb') as metafile:
                meta_dict = pickle.load(metafile)

            # Prepare metadata suffix
            if suffix_out == 'default' or 'META' not in suffix_out.keys():
                if suffix[0] == '_':
                    m_suffix = suffix[1:]
                else:
                    m_suffix = suffix
            else:
                if suffix[0] == '_':
                    m_suffix = suffix[1:] + suffix_out['META']
                else:
                    m_suffix = suffix + suffix_out['META']

            # Slightly awkward addition of TFOR tag
            if 'TFOR' in fpaths_lm[0]:
                m_suffix = 'TFOR_' + m_suffix

            # Add new metadata
            meta_dict["clustobj-" + m_suffix] = my_clust
            meta_dict["clusters-" + m_suffix] = clust_centers
            meta_dict["labels-" +
                      m_suffix] = clust_labels[lms_idx == sample_idx]
            meta_dict["features-" + m_suffix] = fheader

            # Write metadata
            with open(fpath_meta, 'wb') as metafile:
                pickle.dump(meta_dict, metafile, pickle.HIGHEST_PROTOCOL)

    # Report and return
    if verbose: print "Processing complete!"
    return
コード例 #22
0
def unmix_linear_legacy(fpath_dirty,
                        fpath_conta,
                        processes=None,
                        a_range=(0.0, 1.0, 40),
                        thresh=0.7,
                        accept_arrays=False,
                        save_result=True,
                        return_result=False,
                        show=False,
                        verbose=False):
    """Clean a 'dirty stack' by removing bleed-through from a 'contaminant
    stack' using simple linear unmixing.

    The linear unmixing approach used here computes the following:

    CLEAN = DIRTY - a * CONTAMINANT

    where `a` is determined by computing the image correlation of CLEAN and
    CONTAMINANT over a range of different possible values of `a` and selecting
    the lowest value of `a` that reduces the correlation to below a fraction of
    the original correlation given by `thresh`. `thresh` depends on how much
    true (non-bleed-through) correlation is expected between the images.
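
    In other words (a sketch of the selection rule, mirroring the code below):

        reduced_corr(a) = corr(DIRTY - a * CONTAMINANT, CONTAMINANT)
                          / corr(DIRTY, CONTAMINANT)
        a_selected      = smallest tested `a` with reduced_corr(a) <= thresh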

    Parameters
    ----------
    fpath_dirty : string
        The path (either local from cwd or global) to the 'dirty' stack that
        should be cleaned. The stack should be a single channel and time point.
    fpath_conta : string
        The path (either local from cwd or global) to the 'contaminant' stack
        that should be cleaned. Must have the same shape as the 'dirty' stack.
    processes : int or None, optional
        Number of processes that may be used for multiprocessing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (no multiprocessing code is used).
    a_range : tuple (0.0, stop, n_steps), optional, default (0.0, 1.0, 40)
        Range of values of `a` to be tested. Must start with 0.0 and end with
        a positive float (stop). n_steps is the number of regular steps tested
        between 0.0 and stop (see np.linspace).
    thresh : float, optional, default 0.7
        Threshold for the fraction of the original correlation that may remain
        for a value of `a` to be accepted. `thresh` relates to the desired
        correlation of the unmixed (reduced) image as:
        `thresh = reduced_corr / original_corr`
    accept_arrays : bool, optional, default False
        If True, fpath_dirty and fpath_conta are expected to be already loaded
        image arrays instead of paths.
    save_result : bool, optional, default True
        If True, the unmixed image will be saved as a tif file with the suffix
        `_linUnmix.tif`.
    return_result : bool, optional, default False
        If True, the unmixed image and the selected value of `a` are returned.
    show : bool, optional, default False
        If True, a plot is displayed showing the image correlations as a
        function of `a` and indicating the selected value of `a`.
    verbose : bool, optional, default False
        If True, more information is printed.


    Notes
    -----
    The default values chosen for a_range and thresh are based on the unmixing
    of nuclear NLS-tdTomato bleed-through into membranous Lyn:EGFP in 3D stacks
    of the zebrafish lateral line primordium acquired at the Zeiss LSM880.
    """

    #--------------------------------------------------------------------------

    ### Load the files and prepare the data

    if not accept_arrays:

        if verbose: print "Loading stacks..."

        # Add .tif to filenames if necessary
        if not fpath_dirty.endswith('.tif'):
            fpath_dirty = fpath_dirty + '.tif'
        if not fpath_conta.endswith('.tif'):
            fpath_conta = fpath_conta + '.tif'

        # Try loading the dirty channel
        try:
            img_dirty = imread(fpath_dirty)
        except:
            print "Attempting to load dirty stack failed with this error:"
            raise

        # Try loading the contaminant channel
        try:
            img_conta = imread(fpath_conta)
        except:
            print "Attempting to load contaminant stack failed with this error:"
            raise

    # If the input was provided as arrays already
    else:
        img_dirty = fpath_dirty
        img_conta = fpath_conta

    # Prepare range of `a` to be tested
    a_arr = np.linspace(*a_range)

    #--------------------------------------------------------------------------

    ### Compute correlations across a range of possible `a` (serial)
    # TODO [ENH]: Check and stop when `thresh` is reached.

    if processes == 1:

        if verbose: print "Computing correlations sequentially..."

        corrs = []
        for a in a_arr:
            corrs.append(_get_img_corrcoef_legacy((a, img_dirty, img_conta)))
        corrs = np.array(corrs)

    #--------------------------------------------------------------------------

    ### Compute correlations across a range of possible `a` (parallel)

    else:

        if verbose: print "Computing correlations in parallel..."

        # If necessary: choose number of processes (half of available cores)
        if processes is None:
            processes = cpu_count() // 2

        # Prepare for multiprocessing
        my_pool = multiprocessing.Pool(processes=processes)
        param_list = [(a, img_dirty, img_conta) for a in a_arr]

        # Execute function on the input range
        corrs = my_pool.map(_get_img_corrcoef_legacy, param_list)

        # Clean up
        my_pool.close()
        my_pool.join()
        corrs = np.array(corrs)

    #--------------------------------------------------------------------------

    ### Select the lowest `a` with relative correlation <= thresh

    if verbose: print "Setting threshold and creating clean image..."

    reduced_corrs = corrs / corrs[0]
    target_a = a_arr[np.where(reduced_corrs <= thresh)[0][0]]

    # Plot
    if show:

        fig, ax1 = plt.subplots()
        ax1.plot(a_arr, corrs, c='b')
        ax1.set_ylabel('Image corr coeff', color='b')
        ax1.tick_params('y', colors='b')

        ax2 = ax1.twinx()
        ax2.plot(a_arr, corrs / corrs[0], c='g')
        ax2.set_ylabel('Relative image corr coeff', color='g')
        ax2.tick_params('y', colors='g')

        ax1.vlines(target_a,
                   corrs.min(),
                   corrs.max(),
                   color="r",
                   label='target value of `a`')
        ax1.legend(frameon=False)

        ax1.set_xlabel('Factor `a`')

        plt.show()

    #--------------------------------------------------------------------------

    ### Generate the unmixed image

    img_clean = img_dirty - target_a * img_conta
    img_clean[img_clean < 0] = 0
    img_clean = img_clean.astype(np.uint8)

    #--------------------------------------------------------------------------

    ### Write the result and return

    if save_result:
        if verbose: print "Saving result..."
        imsave(fpath_dirty[:-4] + '_linUnmix.tif', img_clean, bigtiff=True)

    if verbose: print "Processing complete!"

    if return_result:
        return img_clean, target_a
    else:
        return
コード例 #23
0
ファイル: commonTips.py プロジェクト: johnsonhongyi/pyQuant
def get_cpu_count():
    return cpu_count()
コード例 #24
0
def unmix_linear(fpath_dirty,
                 fpath_conta,
                 processes=None,
                 a_range=(0.0, 1.0, 20),
                 accept_arrays=False,
                 save_result=True,
                 return_result=False,
                 show=False,
                 verbose=False):
    """Clean a 'dirty stack' by removing bleed-through from a 'contaminant
    stack' using simple linear unmixing.

    The linear unmixing approach used here computes the following:

    CLEAN = DIRTY - a * CONTAMINANT

    where `a` is determined by minimizing CORR over a range of possible `a`,
    with CORR defined as follows:

    CORR  = image_correlation( CONTAMINANT, | CLEAN - mean(CLEAN) | )

    Note that using the absolute value of the mean-background subtracted CLEAN
    images ensures that high values of `a` are punished since overly unmixed
    regions start correlating with the CONTAMINANT again. An intuitive way of
    looking at this is to argue that the unmixing should not reduce the signal
    of the DIRTY channel below its normal background (here approximated by the
    mean), hence such overreductions are punished.
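
    As a minimal sketch of this objective for a single value of `a` (for
    intuition only; the actual computation is performed by the helper
    `_get_corrected_img_corrcoef`, and the use of np.corrcoef here is an
    assumption):

        clean = dirty - a * conta
        corr = np.corrcoef(conta.ravel(),
                           np.abs(clean - clean.mean()).ravel())[0, 1]

    The selected `a` is then the one that minimizes this correlation over the
    tested range (np.argmin in the code below).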

    Parameters
    ----------
    fpath_dirty : string
        The path (either local from cwd or global) to the 'dirty' stack that
        should be cleaned. The stack should be a single channel and time point.
    fpath_conta : string
        The path (either local from cwd or global) to the 'contaminant' stack
        that should be cleaned. Must have the same shape as the 'dirty' stack.
    processes : int or None, optional
        Number of processes that may be used for multiprocessing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (no multiprocessing code is used).
    a_range : tuple (start, stop, n_steps), optional, default (0.0, 1.0, 20)
        Range of values of `a` to be tested. Must start with 0.0 and end with
        a positive float (stop). n_steps is the number of regular steps tested
        between 0.0 and stop (see np.linspace).
    accept_arrays : bool, optional, default False
        If True, fpath_dirty and fpath_conta are expected to be already loaded
        image arrays instead of paths.
    save_result : bool, optional, default True
        If True, the unmixed image will be saved as a tif file with the suffix
        `_linUnmix.tif`.
    return_result : bool, optional, default False
        If True, the unmixed image and the selected value of `a` are returned.
    show : bool, optional, default False
        If True, a plot is displayed showing the image correlations as a
        function of `a` and indicating the selected value of `a`.
    verbose : bool, optional, default False
        If True, more information is printed.
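
    Example (hypothetical file names):

        unmix_linear("prim_LynEGFP.tif", "prim_NLStdTomato.tif",
                     processes=4, show=True, verbose=True)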
    """

    #--------------------------------------------------------------------------

    ### Load the files and prepare the data

    if not accept_arrays:

        if verbose: print "Loading stacks..."

        # Add .tif to filenames if necessary
        if not fpath_dirty.endswith('.tif'):
            fpath_dirty = fpath_dirty + '.tif'
        if not fpath_conta.endswith('.tif'):
            fpath_conta = fpath_conta + '.tif'

        # Try loading the dirty channel
        try:
            img_dirty = imread(fpath_dirty)
        except:
            print "Attempting to load dirty stack failed with this error:"
            raise

        # Try loading the contaminant channel
        try:
            img_conta = imread(fpath_conta)
        except:
            print "Attempting to load contaminant stack failed with this error:"
            raise

    # If the input was provided as arrays already
    else:
        img_dirty = fpath_dirty
        img_conta = fpath_conta

    # Prepare range of `a` to be tested
    a_arr = np.linspace(*a_range)

    #--------------------------------------------------------------------------

    ### Compute correlations across a range of possible `a` (serial)

    if processes == 1:

        if verbose: print "Computing correlations sequentially..."

        corrs = []
        for a in a_arr:
            corrs.append(_get_corrected_img_corrcoef(
                (a, img_dirty, img_conta)))
        corrs = np.array(corrs)

    #--------------------------------------------------------------------------

    ### Compute correlations across a range of possible `a` (parallel)

    else:

        if verbose: print "Computing correlations in parallel..."

        # If necessary: choose number of processes (half of available cores)
        if processes is None:
            processes = cpu_count() // 2

        # Prepare for multiprocessing
        my_pool = multiprocessing.Pool(processes=processes)
        param_list = [(a, img_dirty, img_conta) for a in a_arr]

        # Execute function on the input range
        corrs = my_pool.map(_get_corrected_img_corrcoef, param_list)

        # Clean up
        my_pool.close()
        my_pool.join()
        corrs = np.array(corrs)

    #--------------------------------------------------------------------------

    ### Get optimal unmixing factor based on minimum cross correlation

    if verbose: print "Detecting optimal factor and creating clean image..."

    # Get factor
    target_a = a_arr[np.argmin(corrs)]

    # Plot
    if show:

        fig, ax1 = plt.subplots()
        ax1.plot(a_arr, corrs, c='b')
        ax1.set_ylabel('Image corr coeff', color='b')
        ax1.tick_params('y', colors='b')

        ax2 = ax1.twinx()
        ax2.plot(a_arr, corrs / corrs[0], c='g')
        ax2.set_ylabel('Relative image corr coeff', color='g')
        ax2.tick_params('y', colors='g')

        ax1.vlines(target_a,
                   corrs.min(),
                   corrs.max(),
                   color="r",
                   label='target value of `a`')
        ax1.legend(frameon=False)

        ax1.set_xlabel('Factor `a`')

        plt.show()

    #--------------------------------------------------------------------------

    ### Generate the unmixed image

    img_clean = img_dirty - target_a * img_conta
    img_clean[img_clean < 0] = 0
    img_clean = img_clean.astype(np.uint8)

    #--------------------------------------------------------------------------

    ### Write the result and return

    if save_result:
        if verbose: print "Saving result..."
        imsave(fpath_dirty[:-4] + '_linUnmix.tif', img_clean, bigtiff=True)

    if verbose: print "Processing complete!"

    if return_result:
        return img_clean, target_a
    else:
        return
コード例 #25
0
def atlas_construction(train_dirpath,
                       predict_dirpath,
                       ref_channel,
                       sec_channel,
                       recurse=False,
                       ignore_self=True,
                       ignore_old=False,
                       train_IDs=None,
                       predict_IDs=None,
                       processes=None,
                       profiling=False,
                       verbose=False,
                       outlier_removal_ref=None,
                       outlier_removal_sec=None,
                       outlier_removal_cov=None,
                       covariates_to_use=None,
                       regressor='MO-SVR',
                       outlier_params_ref={},
                       outlier_params_sec={},
                       outlier_params_cov={},
                       regressor_params={'kernel': 'rbf'},
                       atlas_params={
                           'zscore_X': False,
                           'zscore_y': False,
                           'pca_X': False,
                           'pca_y': False,
                           'rezscore_X': False,
                           'rezscore_y': False,
                           'subselect_X': None,
                           'subselect_y': None,
                           'add_covariates': None
                       }):
    """Predict a secondary channel's feature space based on a reference channel
    through regression fitted on appropriate training data.

    This is a dask pipeline that applies atlas prediction from
    `katachi.tools.predict_atlas` to feature space datasets constructed with
    `katachi.pipelines.feature_extraction`.

    Parameters
    ----------
    train_dirpath : string
        The path (either local from cwd or global) to the directory with the
        training data on which the model will be fitted.
    predict_dirpath : string
        The path (either local from cwd or global) to the directory with the
        reference dataset for which a prediction will be constructed.
    ref_channel : string or list of strings
        The channel to be used as reference (usually the shape space),
        including the full processing suffix, or a list of multiple such
        suffixes.
        Example: 'seg_LMs_TFOR_SUBS_CBE'
    sec_channel : string or list of strings
        The channel for which a prediction is made, including the full
        processing suffix, or a list of multiple such suffixes.
        Example: 'NLStdTomato_LMs_TFOR_SUBS_CBE'
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
    ignore_self : bool, optional, default True
        If True, predictions are not run for primordia used in training.
    ignore_old : bool, optional, default False
        If True, files that already have a matching prediction in the same
        directory will be ignored.
        WARNING: This feature has not been implemented for this pipeline!
    train_IDs : list of strings or None
        If None, all matching files in `train_dirpath` are used for training.
        If a list of strings (IDs), only the samples matching the IDs are used.
    predict_IDs : list of strings or None
        If None, all matching files in `predict_dirpath` are used for
        prediction. If a list of strings (IDs), only the samples matching the 
        IDs are used.
    processes : int or None, optional, default None
        Number of processes available for use during multi-processed model
        fitting and prediction. Works for 'MO-SVR' and 'MT-Lasso' regressors.
        WARNING: The 'MLP' regressor also performs multi-processing but does
        not seem to support an n_jobs argument...
        If None, half of the available CPUs are used.
        If set to 1, the code is run without use of dask.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    outlier_removal_ref : string or None, default None
        If None, no outlier removal is done on reference feature spaces.
        Otherwise this must be a string denoting the method for outlier removal
        (one of `absolute_thresh`, `percentile_thresh`,
        `merged_percentile_thresh` or `isolation_forest`). Note that outlier
        removal is only done on training data, not on prediction data.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_removal_sec : string or None, default None
        If None, no outlier removal is done on the target feature spaces.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    outlier_removal_cov : string or None, optional, default None
        If None, no outlier removal is done based on covariate information.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    covariates_to_use : string, list of strings or None, optional, default None
        A string denoting the selection tree to select a covariate to be used
        for outlier detection from the HierarchicalData covariate object. Can
        also be a list of multiple such strings, in which case the covariates
        are merged into an fspace. The specified covariates must each be single
        numeric columns.
    regressor : string or sklearn regressor instance, optional, default 'MO-SVR'
        Regressor to be used in the atlas pipeline for prediction. If a string,
        must be one of 'MO-SVR', 'MT-ENetCV', 'MT-Lasso' or 'MLP'. See doc 
        string of katachi.tools.predict_atlas.predict_atlas for more info.
    outlier_params_ref : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the reference feature space.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_params_sec : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the target feature space.
    outlier_params_cov : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the covariates. The default is to fall back to the defaults of
        katachi.utilities.outlier_removal.RemoveOutliers.
    regressor_params : dict, optional, default is a standard RBF MO-SVR
        dictionary for the chosen regressor's instantiation. See doc string of
        katachi.tools.predict_atlas.predict_atlas function for more info.
    atlas_params : dict, optional, default as shown in the signature
        kwarg dictionary for AtlasPipeline instantiation. See doc string of
        katachi.tools.predict_atlas.predict_atlas function for more info.
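
    Example (hypothetical directory paths; channel suffixes as in the examples
    above):

        atlas_construction("data/train", "data/predict",
                           ref_channel="seg_LMs_TFOR_SUBS_CBE",
                           sec_channel="NLStdTomato_LMs_TFOR_SUBS_CBE",
                           recurse=True, processes=4, verbose=True)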
    """

    #--------------------------------------------------------------------------

    ### Construct lists of files to include

    if verbose: print "Retrieving matching datasets..."

    def prepare_fpaths(fpath, fnames, channel, IDs):

        # Select only files matching the IDs
        if IDs is not None:
            fnames = [
                fname for fname in fnames
                if any([fname.startswith(ID) for ID in IDs])
            ]

        # Select correct file names
        channel_fnames = [
            fname for fname in fnames
            if any([fname.endswith(c + ".npy") for c in channel])
        ]

        # Ignore files that already have predictions
        if ignore_old:
            raise NotImplementedError("This would be annoying to implement " +
                                      "and likely won't ever be needed.")

        # Create full paths
        fpaths = [os.path.join(fpath, fname) for fname in channel_fnames]

        # Return results
        return fpaths

    # Handle single channel suffices
    if type(ref_channel) == str:
        ref_channel = [ref_channel]
    if type(sec_channel) == str:
        sec_channel = [sec_channel]

    # Clean channel if specified with file ending
    ref_channel = [
        rc[:-4] if rc.endswith(".npy") else rc for rc in ref_channel
    ]
    sec_channel = [
        sc[:-4] if sc.endswith(".npy") else sc for sc in sec_channel
    ]

    # Run for single dir
    if not recurse:

        # Get training data
        fnames = os.listdir(train_dirpath)
        fpaths_ref_train = prepare_fpaths(train_dirpath, fnames, ref_channel,
                                          train_IDs)
        fpaths_sec_train = prepare_fpaths(train_dirpath, fnames, sec_channel,
                                          train_IDs)

        # Get prediction data
        fnames = os.listdir(predict_dirpath)
        fpaths_ref_predict = prepare_fpaths(predict_dirpath, fnames,
                                            ref_channel, predict_IDs)

    # Run for multiple subdirs
    if recurse:

        # Get training data
        fpaths_ref_train = []
        fpaths_sec_train = []
        for dpath, _, fnames in os.walk(train_dirpath):
            fpaths_ref_train += prepare_fpaths(dpath, fnames, ref_channel,
                                               train_IDs)
            fpaths_sec_train += prepare_fpaths(dpath, fnames, sec_channel,
                                               train_IDs)

        # Get prediction data
        fpaths_ref_predict = []
        for dpath, _, fnames in os.walk(predict_dirpath):
            fpaths_ref_predict += prepare_fpaths(dpath, fnames, ref_channel,
                                                 predict_IDs)

    # Remove training data from prediction data
    if ignore_self:
        fpaths_ref_predict = [
            f for f in fpaths_ref_predict if f not in fpaths_ref_train
        ]

    # Check
    if len(fpaths_ref_train) == 0:
        raise IOError("No reference files found in training directory.")
    if len(fpaths_sec_train) == 0:
        raise IOError("No secondary files found in training directory.")
    if len(fpaths_ref_predict) == 0:
        raise IOError("No reference files found in prediction directory.")
    if not len(fpaths_ref_train) == len(fpaths_sec_train):
        raise IOError("Found unequal number of reference and secondary" +
                      " files in the traning directory." + " Ref files: " +
                      str(len(fpaths_ref_train)) + " Sec files: " +
                      str(len(fpaths_sec_train)))

    # Report
    if verbose:
        print "-- Detected", len(fpaths_ref_train), "training file pairs."
        print "-- Detected", len(fpaths_ref_predict), "prediction files."

    #--------------------------------------------------------------------------

    ### If desired: run 'sequentially' (without dask)

    if processes == 1:

        if verbose: print "Running pipeline without multiprocessing/dask..."

        predict_atlas(fpaths_ref_train,
                      fpaths_sec_train,
                      fpaths_ref_predict,
                      outlier_removal_ref=outlier_removal_ref,
                      outlier_removal_sec=outlier_removal_sec,
                      outlier_removal_cov=outlier_removal_cov,
                      covariates_to_use=covariates_to_use,
                      regressor=regressor,
                      n_jobs=1,
                      save_predictions=True,
                      save_pipeline=True,
                      verbose=False,
                      outlier_options_ref=outlier_params_ref,
                      outlier_options_sec=outlier_params_sec,
                      outlier_options_cov=outlier_params_cov,
                      regressor_options=regressor_params,
                      pipeline_options=atlas_params)

        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    if verbose: print "Running pipeline with multiprocessing/dask..."

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Create the dask 'graph'
    dask_graph = {
        'done': (predict_atlas, fpaths_ref_train, fpaths_sec_train,
                 fpaths_ref_predict, outlier_removal_ref, outlier_removal_sec,
                 outlier_removal_cov, covariates_to_use, regressor, processes,
                 True, True, False, outlier_params_ref, outlier_params_sec,
                 outlier_params_cov, regressor_params, atlas_params)
    }

    #--------------------------------------------------------------------------

    ### Run with dask

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.get(dask_graph, 'done')
            visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
コード例 #26
0
ファイル: remote_run.py プロジェクト: psmit/asr-python-htk
 def max_tasks(cls):
     return cpu_count()
def initialize_dir(dirpath,
                   idpath,
                   meta_dict,
                   recurse=False,
                   IDR_data=False,
                   IDR_IDs=None,
                   ignore_old=True,
                   fname_prefix=None,
                   fname_suffix=None,
                   processes=None,
                   profiling=False,
                   verbose=False):
    """Intialize the data structure for a directory of new image stacks.
    
    This is a dask pipeline that applies the function `initialize_stack` from
    `katachi.tools.initialize` to an entire directory.
    
    See `katachi.tools.initialize.initialize_stack` for more information.
    
    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    idpath : string or None
        Path of the text file containing previously generated IDs.
        Necessary to ensure that newly generated IDs are unique.
    meta_dict : dict 
        A dictionary containing the initial (user-defined) metadata for the
        stack. See Notes below for the keys that must be included.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
        This is ignored if `IDR_data` is True, as recursing through subfolders
        is not supported on IDR data.
    IDR_data : bool, optional, default False
        If True, the data is expected to already be grouped into subdirectories
        named according to already assigned IDs, as this is how the data was
        deposited on the IDR database.
    IDR_IDs : list of IDs or None, optional, default None
        If IDR_data is True, a list of IDs can be passed to specify a subset of 
        samples for which this pipeline is to be run. 
    ignore_old : bool, optional, default True
        If True, files that already have a known ID listed in the ID file will
        be ignored. This is not supported for IDR data, so if IDR_data is True
        and ignore_old is True, an error is raised.
    fname_prefix : str or None, optional
        If not None, only file names that start with the given string are used.
    fname_suffix : str or None, optional
        If not None, only file names that end with the given string (or with
        the given string + .tif) are used.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (dask is not used).
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask]. 
    verbose : bool, optional, default False
        If True, more information is printed.
    
    Notes
    -----
    The meta_dict dictionary must contain the following entries:
    - 'channels'   : A list of strings naming the channels in order. Must not 
                     contain characters that cannot be used in file names.
    - 'resolution' : A list of floats denoting the voxel size of the input
                     stack in order ZYX.
    It may optionally contain other entries as well.
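
    A minimal example of such a dictionary (channel names and voxel sizes are
    placeholders):

        meta_dict = {"channels"   : ["LynEGFP", "NLStdTomato"],
                     "resolution" : [1.0, 0.1, 0.1]}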
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    if verbose: print "Detecting target files..."

    # Function to select file names and create paths
    def get_fnames_ready(fnames, fpath, known_ids=None):

        fnames = fnames[:]

        fnames = [fname for fname in fnames if fname.endswith(".tif")]

        if ignore_old:
            fnames = [
                fname for fname in fnames
                if not any([fname.startswith(ID) for ID in known_ids])
            ]

        if fname_prefix:
            fnames = [
                fname for fname in fnames if fname.startswith(fname_prefix)
            ]
        if fname_suffix:
            fnames = [
                fname for fname in fnames
                if fname.endswith(fname_suffix +
                                  ".tif") or fname.endswith(fname_suffix)
            ]

        fpaths = [os.path.join(fpath, fname) for fname in fnames]

        return fpaths

    # If this is run on IDR data, most of the work is already done!
    if IDR_data:

        # Handle inputs
        if ignore_old:
            raise IOError(
                "`ignore_old` is not supported for IDR data. Be " +
                "careful when running this so as to avoid over" +
                "writing important metadata. Aborting for now; set " +
                "`ignore_old` to False to prevent this error.")
        if IDR_IDs is None:
            IDR_IDs = [
                ID for ID in os.listdir(dirpath)
                if os.path.isdir(os.path.join(dirpath, ID)) and len(ID) == 10
            ]

        # Write the metadata files; all else is already done
        if verbose: print "Creating metadata files for IDR data..."
        for ID in IDR_IDs:
            meta_path = os.path.join(dirpath, ID, ID + '_stack_metadata.pkl')
            with open(meta_path, 'wb') as outfile:
                pickle.dump(meta_dict, outfile, pickle.HIGHEST_PROTOCOL)
        if verbose: print "Processing complete!"
        return

    # If needed, load previously generated IDs (to exclude those files)
    if ignore_old:
        try:
            with open(idpath, "r") as infile:
                known_ids = [line.strip() for line in infile.readlines()]
        except:
            print("Attempting to load existing IDs from id_file failed " +
                  "with this error:")
            raise
    else:
        known_ids = None

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = get_fnames_ready(fnames, dirpath, known_ids=known_ids)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += get_fnames_ready(fnames, dpath, known_ids)

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Report
    if verbose:
        print "-- Detected", len(fpaths), "target files."

    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask)

    if processes == 1:
        if verbose: print "Processing target files sequentially..."
        for fpath in fpaths:
            initialize_stack(fpath, idpath, meta_dict, verbose=False)
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    if verbose: print "Processing target files in parallel..."

    dask_graph = dict()
    for i, fpath in enumerate(fpaths):
        dask_graph["initialize_%i" % i] = (initialize_stack, fpath, idpath,
                                           meta_dict, False)
    dask_graph['done'] = (lambda x: "done",
                          ["initialize_%i" % i for i in range(len(fpaths))])

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
            visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
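The parallel branch above uses dask's low-level dictionary graphs: each value is a tuple of a callable followed by its arguments, and any argument that names another key (or a list of keys) is replaced by that task's result. A minimal, self-contained sketch of the same pattern follows; the `square` helper and the tiny graph are illustrative only, while the dask calls (`dask.set_options`, `dask.threaded.get`, `ProgressBar`) mirror those used above.

from multiprocessing.pool import ThreadPool

import dask
import dask.threaded
from dask.diagnostics import ProgressBar


def square(x):
    return x * x

# Keys are task names; values are (callable, arg, ...) tuples. String
# arguments that match other keys (also inside lists) are replaced by
# the corresponding task results before the callable is invoked.
graph = {"square_%i" % i: (square, i) for i in range(4)}
graph["done"] = (sum, ["square_%i" % i for i in range(4)])

dask.set_options(pool=ThreadPool(2))         # bound the worker thread count
with ProgressBar(dt=1):
    print(dask.threaded.get(graph, "done"))  # 0 + 1 + 4 + 9 = 14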
Code example #28
File: fm.py Project: verbalsaintmars/srmparserlite
def Start(this, *a_rootDirs):
    l_tpool = mpo.ThreadPool(processes=mpo.cpu_count())
    l_result = l_tpool.map_async(this.ScanDir, a_rootDirs, 1)
    return l_result.get()
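`map_async` returns an `AsyncResult` immediately; calling `.get()` on it blocks until every input has been processed and re-raises any exception thrown inside a worker. A standalone sketch of the same call pattern, with a placeholder `scan_dir` worker and hypothetical directories:

import os
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool


def scan_dir(root):
    # Placeholder worker: count the entries directly under `root`.
    return root, len(os.listdir(root))

pool = ThreadPool(processes=cpu_count())
async_result = pool.map_async(scan_dir, ["/tmp", "/var"], 1)  # chunksize=1
print(async_result.get())  # blocks; worker exceptions re-raise here
pool.close()
pool.join()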
Code example #29
File: validate_axioms.py Project: wangyu-/sosp19ae
for (a, b), space in zip(axioms, spaces):
    if space is None:
        continue
    print now(), "Checking:\n{}".format(a)
    assert a.is_forall()
    variables = [
        z3.Const(a.var_name(i), a.var_sort(i))
        for i in range(a.num_vars())
    ]
    assert all(len(x) == len(variables) for x in space)
    func = body_to_function(variables, a.body())
    n_proved = 0
    n_skipped = 0
    print "checking {} combinations...".format(len(space))
    total_combinations += len(space)
    pool = Pool(cpu_count())  # fork after computing func
    results = []
    for s in space:
        if False:
            # this is useful for better error reporting
            st = check_axiom(s)
            print st
            if 'skipped' in st:
                n_skipped += 1
            else:
                n_proved += 1
        else:
            results.append(
                pool.apply_async(check_axiom, [s],
                                 callback=print_function))
    # get all results, in order to raise exceptions if they occurred
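The snippet is cut off at the point where the queued `AsyncResult` objects still have to be drained: `apply_async` defers worker exceptions until `.get()` is called, so the `results` list must be consumed even though the callback already printed each outcome. A hedged, standalone sketch of that drain pattern (the `check` worker below is a stand-in, not the original `check_axiom`):

from multiprocessing import Pool, cpu_count


def check(x):
    # Stand-in worker; raises for one input to show exception propagation.
    if x == 3:
        raise ValueError("combination %d failed" % x)
    return "proved %d" % x

if __name__ == "__main__":
    pool = Pool(cpu_count())
    results = [pool.apply_async(check, [x]) for x in range(5)]
    pool.close()
    pool.join()
    for r in results:
        print(r.get())  # re-raises the worker's exception here, if any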
Code example #30
def feature_engineering(dirpath,
                        channels,
                        IDs=None,
                        recurse=False,
                        overwrite_previous=False,
                        seg_channel="",
                        no_lms=False,
                        no_tfor=False,
                        no_cfor=False,
                        mem_d=3,
                        M=8,
                        save_baselines=True,
                        processes=None,
                        dask_graph_path=None,
                        profiling=False,
                        verbose=False):
    """Extract a series of measurements from segmented images and point clouds.

    This is a dask pipeline that runs the covariate extraction functions in
    `katachi.tools.get_image_covariates` & `katachi.tools.get_cloud_covariates`
    on datasets that have been initialized, segmented and feature-extracted
    using other katachi pipelines.

    WARNING: The approach used here has been developed for the Zebrafish
    posterior lateral line primordium. It is likely not readily applicable to
    other tissues!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    channels : list
        A list of channels from which to extract channel-specific covariates.
        For each channel, a tif file must be present whose name ends with
        `channel+".tif"`, and a .npy file whose name ends with either
        `channel+"_LMs_TFOR.npy"` (recommended) or `channel+"_LMs.npy"`.
        The channels will be used as class attributes in the output object and
        therefore must not contain characters incompatible with this use.
    IDs : list of strings or None, optional, default None
        If a list of strings (IDs) is given, only samples within dirpath whose
        names match one of these IDs will be processed.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of dirpath.
    overwrite_previous : bool, optional, default False
        If True and a covariate file already exists for a given sample, that
        file will be deleted and a completely new file will be written in its
        place. If False and a covariate file already exists for a given sample,
        the new covariates will be added to it if they have a different name.
        For covariates with identical names, the new will overwrite the old.
    seg_channel : str or "", optional, default ""
        If for some reason the target directories are expected to contain more
        than one file ending with "_seg.tif", seg_channel can be specified to
        identify the correct target file, which will have the form
        `<basename> + seg_channel + "_seg.tif"`.
        Note that having multiple segmentation files in one target directory is
        deprecated in general.
    no_lms : bool, optional, default False
        If True, it is expected that no landmark data is available. In this
        case, only image covariates are computed.
    no_tfor : bool, optional, default False
        If True, it is expected that no TFOR landmark data is available. In
        this case, untransformed landmarks are loaded and covariates that
        depend on TFOR data are not computed (specifically pcl_covars_sample
        and pcl_covars_tissue).
    no_cfor : bool, optional, default False
        If True, the CFOR-based moments and baseline will not be computed and
        no CFOR data is required at any point.
    mem_d : int, optional, default 3
        Estimated diameter (in pixels) of the membrane region in the shell of a
        single cell. Used for extraction of intensity-based covariates.
    M : int, optional, default 8
        Highest-level moments to extract from point cloud. The moments array
        constructed will have shape (M+1,M+1,M+1).
    save_baselines : bool, optional, default True
        Whether to save the flattened moments arrays as feature space baselines
        of shape (N_cells, N_features), where N_features is (M+1)**3.
        If True, two files are created for each channel: one for the base
        moments (usually TFOR, unless no_tfor is set to True or no TFOR data is
        available) and one for the PD-transformed (rotationally invariant) and
        volume-normalized (CFOR) cells, suffixed "_baseline.npy" and
        "_CFORbaseline.npy", respectively.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (dask is not used).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling : bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    if verbose: print "Retrieving matching datasets..."

    # Function to select suitable datasets and create paths
    def prepare_fpaths(fpath, fnames):

        # Keep only those in specified IDs
        if IDs is not None:
            fnames = [
                fname for fname in fnames
                if any([fname.startswith(ID) for ID in IDs])
            ]

        # Find the metadata file
        meta_file = None
        for fname in fnames:
            if fname.endswith("_stack_metadata.pkl"):
                meta_file = fname
                meta_path = os.path.join(fpath, meta_file)

        # Quit if no metadata file is found
        if meta_file is None:
            return None

        # Find segmentation file
        seg_file = [
            fname for fname in fnames
            if fname.endswith(seg_channel + "_seg.tif")
        ]

        # Handle failure cases
        if len(seg_file) == 0:
            return None
        if len(seg_file) > 1:
            raise IOError(
                "More than one segmentation file (*_seg.tif) found " +
                "in directory " + fpath + ". Use seg_channel kwarg to " +
                "specify which file to use.")
        else:
            seg_file = seg_file[0]
            seg_path = os.path.join(fpath, seg_file)

        # Find TFOR segmentation landmarks
        tfor_path = []
        if not no_tfor and not no_lms:

            # Search for the file
            tfor_file = [
                fname for fname in fnames
                if fname.endswith(seg_channel + "_seg_LMs_TFOR.npy")
            ]

            # Give up if nothing is found
            if len(tfor_file) == 0:
                return None

            # Else keep the result
            tfor_file = tfor_file[0]
            tfor_path = os.path.join(fpath, tfor_file)

        # Find channel landmark files
        lm_paths = []
        if not no_lms:
            for channel in channels:

                # Search for TFOR landmarks
                if not no_tfor:
                    lm_file = [
                        fname for fname in fnames
                        if fname.endswith(channel + "_LMs_TFOR.npy")
                    ]
                else:
                    lm_file = []

                # Search for non-TFOR landmarks
                if len(lm_file) == 0:
                    lm_file = [
                        fname for fname in fnames
                        if fname.endswith(channel + "_LMs.npy")
                    ]
                    if not no_tfor:
                        warn("No TFOR landmarks found for channel " + channel +
                             ". " + "Using standard landmarks.")

                # Give up if nothing is found
                if not lm_file:
                    return None

                # Else keep the result
                lm_file = lm_file[0]
                lm_path = os.path.join(fpath, lm_file)
                lm_paths.append(lm_path)

        # Find CFOR-transformed channel landmark files
        cfor_paths = []
        if not no_cfor and not no_lms:
            for channel in channels:

                # Get CFOR landmark paths; give up if nothing is found
                cfor_file = [
                    fname for fname in fnames
                    if channel in fname and fname.endswith('CFOR.npy')
                ]
                if not cfor_file:
                    return None
                cfor_path = os.path.join(fpath, cfor_file[0])
                cfor_paths.append(cfor_path)

        # Find image files
        img_paths = []
        for channel in channels:

            # Search for image files
            img_file = [
                fname for fname in fnames if fname.endswith(channel + ".tif")
            ]

            # Give up if nothing is found
            if not img_file:
                return None

            # Else keep the result
            img_file = img_file[0]
            img_path = os.path.join(fpath, img_file)
            img_paths.append(img_path)

        # Return the paths
        return {
            "meta_path": meta_path,
            "seg_path": seg_path,
            "tfor_path": tfor_path,
            "lm_paths": lm_paths,
            "img_paths": img_paths,
            "cfor_paths": cfor_paths
        }

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        all_paths = [prepare_fpaths(dirpath, fnames)]
        if all_paths[0] is None:
            raise IOError("The specified path does not contain the required " +
                          "files (and recurse=False).")

    # Run for multiple subdirs
    if recurse:
        all_paths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths = prepare_fpaths(dpath, fnames)
            if fpaths is not None:
                all_paths.append(fpaths)
        if not all_paths:
            raise IOError("Could not find any data directories containing " +
                          "all required files.")

    # Report
    if verbose: print "-- Retrieved", len(all_paths), "matching data sets."

    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask/multiprocessing)

    if processes == 1:

        if verbose: print "Processing target files sequentially..."

        # For each dataset...
        for paths in all_paths:

            # Load previously generated covariates file (if available)
            has_previous = False
            if not overwrite_previous:
                mroot, mfile = os.path.split(paths["meta_path"])
                prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
                if os.path.isfile(prevfpath):
                    with open(prevfpath, 'rb') as prevfile:
                        covars = pickle.load(prevfile)
                    has_previous = True

            # Load data
            img_seg = imread(paths["seg_path"])
            if not no_lms and not no_tfor:
                tfor_lms = np.load(paths["tfor_path"])
            with open(paths["meta_path"], 'rb') as metafile:
                meta_dict = pickle.load(metafile)

            # Extract image covariates
            covars = gic.get_img_covars_sample(
                "_", img_seg=img_seg, covars=covars if has_previous else None)
            covars = gic.get_img_covars_tissue("_",
                                               img_seg=img_seg,
                                               covars=covars)
            covars = gic.get_img_covars_cell_seg("_",
                                                 '_',
                                                 img_seg=img_seg,
                                                 metadata=meta_dict,
                                                 covars=covars)
            for c, channel in enumerate(channels):
                covars = gic.get_img_covars_cell_int("_",
                                                     paths["img_paths"][c],
                                                     channel,
                                                     mem_d,
                                                     img_seg=img_seg,
                                                     covars=covars)

            # Extract point cloud covariates
            if not no_tfor and not no_lms:
                covars = gcc.get_pcl_covars_sample("_",
                                                   "_",
                                                   tfor_lms=tfor_lms,
                                                   metadata=meta_dict,
                                                   covars=covars)
                covars = gcc.get_pcl_covars_tissue("_",
                                                   "_",
                                                   tfor_lms=tfor_lms,
                                                   metadata=meta_dict,
                                                   covars=covars)
            if not no_lms:
                for c, channel in enumerate(channels):
                    covars = gcc.get_pcl_covars_cell(
                        paths["lm_paths"][c],
                        channel,
                        M=M,
                        no_cfor=no_cfor,
                        fpath_lms_cfor=paths["cfor_paths"][c],
                        covars=covars)

                    # Saving the moments as a baseline feature space
                    if save_baselines:

                        # Prep base path
                        bp = paths["lm_paths"][c][:-4]

                        # Save TFOR baseline
                        m = covars.pcl.cell._gad(channel).moments
                        np.save(bp + "_baseline.npy", m)

                        # Save CFOR baseline
                        if not no_cfor:
                            m = covars.pcl.cell._gad(channel).moments_cfor
                            np.save(bp + "_CFORbaseline.npy", m)

            # Saving the extracted covariates
            mroot, mfile = os.path.split(paths["meta_path"])
            outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
            with open(outfpath, 'wb') as outfile:
                pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL)

        # Report and return
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict
    # Note: This is slightly suboptimal because some datasets have to be
    #       reloaded multiple times. However, it seems difficult to solve this
    #       in a way that permits carrying them over.

    if verbose: print "Processing target files in parallel..."

    dask_graph = dict()

    # For each dataset...
    for idx, paths in enumerate(all_paths):

        # Getting previous covariates: function
        def get_previous_covariates(prevfpath):
            with open(prevfpath, 'rb') as prevfile:
                covars = pickle.load(prevfile)
            return covars

        # Get previous covars (if existing and desired)
        has_previous = False
        if not overwrite_previous:
            mroot, mfile = os.path.split(paths["meta_path"])
            prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
            if os.path.isfile(prevfpath):
                dask_graph['prev_covars_%i' % idx] = (get_previous_covariates,
                                                      prevfpath)
                has_previous = True

        # Extract image covariates
        dask_graph["img_sample_%i" % idx] = (gic.get_img_covars_sample,
                                             paths["seg_path"])
        dask_graph["img_tissue_%i" % idx] = (gic.get_img_covars_tissue,
                                             paths["seg_path"])
        dask_graph["img_cell_seg_%i" % idx] = (gic.get_img_covars_cell_seg,
                                               paths["seg_path"],
                                               paths["meta_path"])
        for c, channel in enumerate(channels):
            dask_graph["img_cell_int_%s_%i" %
                       (channel, idx)] = (gic.get_img_covars_cell_int,
                                          paths["seg_path"],
                                          paths["img_paths"][c], channel,
                                          mem_d)

        # Extract point cloud covariates
        if not no_tfor and not no_lms:
            dask_graph["pcl_sample_%i" % idx] = (gcc.get_pcl_covars_sample,
                                                 paths["tfor_path"],
                                                 paths["meta_path"])
            dask_graph["pcl_tissue_%i" % idx] = (gcc.get_pcl_covars_tissue,
                                                 paths["tfor_path"],
                                                 paths["meta_path"])
        if not no_lms:
            for c, channel in enumerate(channels):
                dask_graph["pcl_cell_%s_%i" %
                           (channel, idx)] = (gcc.get_pcl_covars_cell,
                                              paths["lm_paths"][c], channel, M,
                                              no_cfor, paths["cfor_paths"][c])

                # Saving the moments as a baseline feature space
                if save_baselines:

                    # Baseline saving function
                    def save_baseline(covars, channel, basepath, no_cfor):

                        # Save TFOR baseline
                        m = covars.pcl.cell._gad(channel).moments
                        np.save(basepath + "_baseline.npy", m)

                        # Save CFOR baseline
                        if not no_cfor:
                            m = covars.pcl.cell._gad(channel).moments_cfor
                            np.save(basepath + "_CFORbaseline.npy", m)

                        # Forward result
                        return covars

                    # Add to graph
                    basepath = paths["lm_paths"][c][:-4]
                    dask_graph["pcl_cell_blsave_%s_%i" %
                               (channel, idx)] = (save_baseline,
                                                  "pcl_cell_%s_%i" %
                                                  (channel, idx), channel,
                                                  basepath, no_cfor)

        # Merging the extracted covariates: function
        def merge_covariates(covars_list):
            covars = covars_list[0]
            for cv in covars_list[1:]:
                covars._merge(cv)
            return covars

        # Merging the extracted covariates: input name list construction
        covars_list = [
            "img_sample_%i" % idx,
            "img_tissue_%i" % idx,
            "img_cell_seg_%i" % idx
        ]
        covars_list += [
            "img_cell_int_%s_%i" % (channel, idx) for channel in channels
        ]
        if not no_tfor and not no_lms:
            covars_list += ["pcl_sample_%i" % idx, "pcl_tissue_%i" % idx]
        if save_baselines and not no_lms:
            covars_list += [
                "pcl_cell_blsave_%s_%i" % (channel, idx)
                for channel in channels
            ]
        elif not no_lms:
            covars_list += [
                "pcl_cell_%s_%i" % (channel, idx) for channel in channels
            ]
        if has_previous:
            covars_list += ['prev_covars_%i' % idx]

        # Merging the extracted covariates: dask call
        dask_graph["merge_results_%i" % idx] = (merge_covariates, covars_list)

        # Saving the extracted covariates
        def save_covariates(covars, outfpath):
            with open(outfpath, 'wb') as outfile:
                pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL)

        mroot, mfile = os.path.split(paths["meta_path"])
        outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
        dask_graph["save_results_%i" % idx] = (save_covariates,
                                               "merge_results_%i" % idx,
                                               outfpath)

    # Collecting the results
    dask_graph['done'] = (lambda x: "done", [
        "save_results_%i" % idx for idx in range(len(all_paths))
    ])

    # Saving the graph visualization
    if dask_graph_path is not None:
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
            visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
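As a usage illustration only, a call consistent with the parameters documented above might look like the following; the directory path and channel names are hypothetical placeholders.

# Hypothetical invocation; "data/pLLP_run01" and the channel names are
# placeholders and not part of the pipeline itself.
feature_engineering("data/pLLP_run01",
                    channels=["nlsGFP", "lynRFP"],
                    recurse=True,          # walk subdirectories of dirpath
                    seg_channel="nlsGFP",  # disambiguate the *_seg.tif file
                    mem_d=3,
                    M=8,
                    processes=4,           # dask thread pool size
                    verbose=True)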