Example #1
    def transform_mpi(self, X, keep_order=True, ncpu=4, n_components=None):
        """ Sample as transform but using multiprocessing """
        n = X.shape[0]
        if self.batch_size is None:
            batch_size = 12 * len(self.mean_)
        else:
            batch_size = self.batch_size
        batch_list = [(i, min(i + batch_size, n))
                      for i in range(0, n + batch_size, batch_size) if i < n]

        # ====== run MPI jobs ====== #
        def map_func(batch):
            start, end = batch
            x = super(MiniBatchPCA, self).transform(X=X[start:end])
            # doing dimensionality reduction here saves a lot of memory
            # for inter-process transfer
            if n_components is not None:
                x = x[:, :n_components]
            # just need to return the start for ordering
            yield start, x

        mpi = MPI(batch_list,
                  func=map_func,
                  ncpu=ncpu,
                  batch=1,
                  hwm=ncpu * 12,
                  backend='python')
        # ====== process the return ====== #
        X_transformed = []
        for start, x in mpi:
            X_transformed.append((start, x))
        if keep_order:
            X_transformed = sorted(X_transformed, key=lambda x: x[0])
        X_transformed = np.concatenate([x[-1] for x in X_transformed], axis=0)
        return X_transformed
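
The same batch / parallel map / ordered merge pattern can be sketched with the standard library alone. The snippet below only illustrates the idea behind `keep_order`; it uses multiprocessing.Pool and a placeholder transform, not the odin MPI class:

# Minimal sketch of the batch -> parallel map -> ordered merge pattern.
# Illustrative only; run under `if __name__ == '__main__':` on spawn-based platforms.
import numpy as np
from multiprocessing import Pool

def _transform_batch(job):
    start, end, X = job                 # each job carries its slice boundaries
    return start, X[start:end] * 2.0    # placeholder "transform"

def transform_parallel(X, batch_size=1024, ncpu=4, keep_order=True):
    n = X.shape[0]
    jobs = [(i, min(i + batch_size, n), X) for i in range(0, n, batch_size)]
    with Pool(ncpu) as pool:
        results = list(pool.imap_unordered(_transform_batch, jobs))
    if keep_order:                      # workers may return batches out of order
        results.sort(key=lambda r: r[0])
    return np.concatenate([r[1] for r in results], axis=0)
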
Example #2
def mutual_info_estimate(representations: np.ndarray,
                         factors: np.ndarray,
                         continuous_representations: bool = True,
                         continuous_factors: bool = False,
                         n_neighbors: int = 3,
                         n_cpu: int = 1,
                         seed: int = 1,
                         verbose: bool = False):
    r""" Nonparametric method for estimating entropy from k-nearest neighbors
  distances (note: this implementation uses multi-processing)

  Parameters
  -----------

  Return
  --------
  matrix `[num_latents, num_factors]`, estimated mutual information between
    each representation and each factor

  References
  ------------
  A. Kraskov, H. Stogbauer and P. Grassberger, “Estimating mutual information”.
    Phys. Rev. E 69, 2004.
  B. C. Ross “Mutual Information between Discrete and Continuous Data Sets”.
    PLoS ONE 9(2), 2014.
  L. F. Kozachenko, N. N. Leonenko, “Sample Estimate of the Entropy of a
    Random Vector”, Probl. Peredachi Inf., 23:2 (1987), 9-16
  """
    from sklearn.feature_selection import (mutual_info_classif,
                                           mutual_info_regression)
    mutual_info = mutual_info_regression if continuous_factors else \
      mutual_info_classif
    num_latents = representations.shape[1]
    num_factors = factors.shape[1]
    # iterate over each factor
    mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

    # repeat for each factor
    def func(idx):
        mi = mutual_info(representations,
                         factors[:, idx],
                         discrete_features=not continuous_representations,
                         n_neighbors=n_neighbors,
                         random_state=seed)
        return idx, mi

    jobs = list(range(num_factors))
    if n_cpu < 2:
        it = (func(i) for i in jobs)
    else:
        it = MPI(jobs=jobs, func=func, ncpu=n_cpu, batch=1)
    if verbose:
        from tqdm import tqdm
        it = tqdm(it, desc='Estimating mutual information', total=len(jobs))
    for i, mi in it:
        mi_matrix[:, i] = mi
    return mi_matrix
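
A quick way to exercise `mutual_info_estimate` on toy data is a single-process call (n_cpu=1), so the odin MPI dependency is not touched; the shapes and factor values below are arbitrary assumptions for illustration:

# Illustrative call on random toy data (single process, no MPI needed).
import numpy as np

rng = np.random.RandomState(0)
representations = rng.randn(500, 8)          # [n_samples, num_latents]
factors = rng.randint(0, 5, size=(500, 3))   # discrete ground-truth factors
mi = mutual_info_estimate(representations, factors,
                          continuous_representations=True,
                          continuous_factors=False,
                          n_cpu=1)
print(mi.shape)  # (8, 3) -> [num_latents, num_factors]
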
Example #3
  def test_mpi(self):
    X = batching(n=512, batch_size=np.random.randint(low=12000, high=80000))

    def map_func(batch):
      for b in batch:
        yield b
    mpi = MPI(X, map_func=map_func, ncpu=12, buffer_size=8,
        maximum_queue_size=12 * 8)

    Y = [i for i in mpi]
    self.assertEqual(len(X), len(Y))
    self.assertEqual(sum(j - i for i, j in X), sum(j - i for i, j in Y))
    self.assertTrue(all(i == j for i, j in zip(
        sorted(X, key=lambda x: x[0]),
        sorted(Y, key=lambda x: x[0])
    )))
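
The invariant this test checks (a parallel map may reorder results but must preserve them as a multiset) can be reproduced with the standard library; the explicit list of (start, end) pairs below merely stands in for whatever `batching` returns:

# Same multiset-equality check against multiprocessing.Pool (illustrative).
from multiprocessing import Pool

def _identity(b):
    return b

if __name__ == '__main__':
    X = [(i, i + 100) for i in range(0, 51200, 100)]  # stand-in for batching(...)
    with Pool(4) as pool:
        Y = list(pool.imap_unordered(_identity, X))
    assert len(X) == len(Y)
    assert sum(j - i for i, j in X) == sum(j - i for i, j in Y)
    assert sorted(X) == sorted(Y)  # order may differ, content must not
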
Example #4
def mutual_info_estimate(representations,
                         factors,
                         continuous_representations=True,
                         continuous_factors=False,
                         n_neighbors=3,
                         random_state=1234):
    r""" Nonparametric method for estimating entropy from k-nearest neighbors
  distances (note: this implementation uses multi-processing)

  Return:
    matrix `[num_latents, num_factors]`, estimated mutual information between
      each representation and each factor

  References:
    A. Kraskov, H. Stogbauer and P. Grassberger, “Estimating mutual information”.
      Phys. Rev. E 69, 2004.
    B. C. Ross “Mutual Information between Discrete and Continuous Data Sets”.
      PLoS ONE 9(2), 2014.
    L. F. Kozachenko, N. N. Leonenko, “Sample Estimate of the Entropy of a
      Random Vector”, Probl. Peredachi Inf., 23:2 (1987), 9-16
  """
    from sklearn.feature_selection import (mutual_info_classif,
                                           mutual_info_regression)
    mutual_info = mutual_info_regression if continuous_factors else \
      mutual_info_classif
    num_latents = representations.shape[1]
    num_factors = factors.shape[1]
    # iterate over each factor
    mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

    # repeat for each factor
    def func(idx):
        mi = mutual_info(representations,
                         factors[:, idx],
                         discrete_features=not continuous_representations,
                         n_neighbors=n_neighbors,
                         random_state=random_state)
        return idx, mi

    for i, mi in MPI(jobs=list(range(num_factors)),
                     func=func,
                     ncpu=min(max(1,
                                  get_cpu_count() - 1), 10),
                     batch=1):
        mi_matrix[:, i] = mi
    return mi_matrix
Example #5
def get_pdf_text(path: str) -> dict:
    from PyPDF2 import PdfFileReader
    from odin.utils.mpi import MPI

    def read_text(fpath):
        with open(fpath, 'rb') as f:
            reader = PdfFileReader(f)
            text = []
            for i in range(reader.numPages):
                page = reader.getPage(i)
                text.append(page.extractText())
        return (fpath, text)

    results = dict()
    for filepath, text in MPI(jobs=_to_files(path),
                              func=read_text,
                              ncpu=4,
                              batch=1):
        results[filepath] = text
    return results
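
For comparison, a serial version of the same extraction looks as follows; glob-based file discovery stands in for the `_to_files` helper, which is not shown above:

# Serial equivalent of get_pdf_text (illustrative; no worker pool).
import glob
import os
from PyPDF2 import PdfFileReader

def get_pdf_text_serial(path: str) -> dict:
    results = {}
    for fpath in glob.glob(os.path.join(path, '*.pdf')):  # stand-in for _to_files(path)
        with open(fpath, 'rb') as f:
            reader = PdfFileReader(f)
            results[fpath] = [reader.getPage(i).extractText()
                              for i in range(reader.numPages)]
    return results
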
Example #6
    def launch(self, job_overrides):
        setup_globals()
        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = self.config.hydra.sweep.dir
        Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
        LOGGER.info("Launching {} jobs locally".format(len(job_overrides)))

        def run_task(job):
            idx, overrides = job
            LOGGER.info("\t#{} : {}".format(
                idx, " ".join(filter_overrides(overrides))))
            sweep_config = self.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                # id is concatenated overrides here
                sweep_config.hydra.job.id = '_'.join(sorted(overrides))
                sweep_config.hydra.job.num = idx
            HydraConfig().set_config(sweep_config)
            ret = run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
            configure_log(self.config.hydra.hydra_logging,
                          self.config.hydra.verbose)
            return (idx, ret)

        if self.ncpu > 1:
            jobs = list(enumerate(job_overrides))
            runs = sorted([
                ret for ret in MPI(
                    jobs=jobs, func=run_task, ncpu=int(self.ncpu), batch=1)
            ])
            runs = [i[1] for i in runs]
        else:
            runs = [run_task(job)[1] for job in enumerate(job_overrides)]
        return runs
Example #7
def fast_tsne(*X,
              n_components=2,
              n_samples=None,
              perplexity=30.0,
              early_exaggeration=8.0,
              learning_rate=200.0,
              n_iter=1000,
              n_iter_without_progress=300,
              min_grad_norm=1e-7,
              metric="euclidean",
              init="random",
              verbose=0,
              random_state=1234,
              method='barnes_hut',
              angle=0.5,
              n_jobs=4):
    """
  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.

  n_samples : {int, None}
      if given, downsample the data to the given number of samples

  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.

  early_exaggeration : float, optional (default: 8.0)
      Controls how tight natural clusters in the original space are in
      the embedded space and how much space will be between them. For
      larger values, the space between natural clusters will be larger
      in the embedded space. Again, the choice of this parameter is not
      very critical. If the cost function increases during initial
      optimization, the early exaggeration factor or the learning rate
      might be too high.

  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
      the learning rate is too high, the data may look like a 'ball' with any
      point approximately equidistant from its nearest neighbours. If the
      learning rate is too low, most points may look compressed in a dense
      cloud with few outliers. If the cost function gets stuck in a bad local
      minimum increasing the learning rate may help.

  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.

  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations so
      this value is rounded to the next multiple of 50.

  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.

  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
      If metric is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on each
      pair of instances (rows) and the resulting value recorded. The callable
      should take two arrays from X as input and return a value indicating
      the distance between them. The default is "euclidean" which is
      interpreted as squared euclidean distance.

  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components).
      PCA initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.

  verbose : int, optional (default: 0)
      Verbosity level.

  random_state : int, RandomState instance or None, optional (default: None)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`.  Note that different initializations might result in
      different local minima of the cost function.

  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact'
      will run on the slower, but exact, algorithm in O(N^2) time. The
      exact algorithm should be used when nearest-neighbor errors need
      to be better than 3%. However, the exact method cannot scale to
      millions of examples.

  angle : float (default: 0.5)
      Only used if method='barnes_hut'
      This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
      'angle' is the angular size (referred to as theta in [3]) of a distant
      node as measured from a point. If this size is below 'angle' then it is
      used as a summary node of all points contained within it.
      This method is not very sensitive to changes in this parameter
      in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
      computation time and angle greater 0.8 has quickly increasing error.
  """
    assert len(X) > 0, "No input is given!"
    if isinstance(X[0], (tuple, list)):
        X = X[0]
    if not all(isinstance(x, np.ndarray) for x in X):
        raise ValueError(
            "`X` can only be list of numpy.ndarray or numpy.ndarray")
    # ====== kwarg for creating T-SNE class ====== #
    kwargs = dict(locals())
    del kwargs['X']
    n_samples = kwargs.pop('n_samples', None)
    # ====== downsampling ====== #
    if n_samples is not None:
        n_samples = int(n_samples)
        assert n_samples > 0
        new_X = []
        rand = random_state if isinstance(random_state, np.random.RandomState) \
            else np.random.RandomState(seed=random_state)
        for x in X:
            if x.shape[0] > n_samples:
                ids = rand.permutation(x.shape[0])[:n_samples]
                x = x[ids]
            new_X.append(x)
        X = new_X
    # ====== import proper T-SNE ====== #
    tsne_version = None
    try:
        from tsnecuda import TSNE
        from tsnecuda.NaiveTSNE import NaiveTSNE as _exact_TSNE
        tsne_version = 'cuda'
    except ImportError:
        # wprint("Install CUDA-TSNE from `https://github.com/CannyLab/tsne-cuda` "
        #        "for significant speed up.")
        try:
            from MulticoreTSNE import MulticoreTSNE as TSNE
            tsne_version = 'multicore'
        except ImportError:
            wprint(
                "Install MulticoreTSNE from `pip install git+https://github.com/DmitryUlyanov/Multicore-TSNE.git`"
                ' to accelerate the T-SNE on multiple CPU cores.')
            try:
                from sklearn.manifold import TSNE
                tsne_version = 'sklearn'
            except Exception as e:
                raise e
    # ====== modify kwargs ====== #
    if tsne_version == 'cuda':
        kwargs['random_seed'] = kwargs['random_state']
        kwargs['theta'] = angle
        if method == 'exact':
            TSNE = _exact_TSNE
            del kwargs['theta']
        del kwargs['random_state']
        del kwargs['n_jobs']
        del kwargs['angle']
        del kwargs['method']
    elif tsne_version == 'multicore':
        pass
    else:
        del kwargs['n_jobs']
    # ====== getting cached values ====== #
    results = []
    X_new = []
    for i, x in enumerate(X):
        md5 = md5_checksum(x)
        key = _create_key(kwargs, md5)
        if key in _cached_values:
            results.append((i, _cached_values[key]))
        else:
            X_new.append((i, md5, x))

    # ====== perform T-SNE ====== #
    def apply_tsne(j):
        idx, md5, x = j
        tsne = TSNE(**kwargs)
        return (idx, md5, tsne.fit_transform(x))

    # only 1 X, no need for MPI
    if len(X_new) == 1:
        idx, md5, x = apply_tsne(X_new[0])
        results.append((idx, x))
        _cached_values[_create_key(kwargs, md5)] = x
    else:
        mpi = MPI(jobs=X_new,
                  func=apply_tsne,
                  batch=1,
                  ncpu=min(len(X_new),
                           cpu_count() - 1))
        for idx, md5, x in mpi:
            results.append((idx, x))
            _cached_values[_create_key(kwargs, md5)] = x
    # ====== return and clean ====== #
    results = sorted(results, key=lambda a: a[0])
    results = [r[1] for r in results]
    return results[0] if len(results) == 1 else results
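
A minimal call of the function above, assuming the module-level helpers it relies on (MPI, md5_checksum, _create_key, _cached_values, wprint) are importable alongside it:

# Illustrative call on random data; fast_tsne falls back to sklearn's TSNE
# when the CUDA and multicore variants are not installed.
import numpy as np

X = np.random.RandomState(1234).randn(1000, 50)
Y = fast_tsne(X, n_components=2, perplexity=30.0, random_state=1234)
print(Y.shape)  # expected: (1000, 2)
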
Example #8
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached for periodically flushed
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                'sum1' == feat_name[-4:] or \
                'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, only if we have more than 0 sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # else all other kind of data save to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # 12 + 8
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some error might happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to external log file
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not good idea to save the extractor all the time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
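
The accumulate-then-flush idea that drives this processor can be isolated into a few lines; the `store` callback below is a placeholder, not the odin Dataset API:

# Minimal sketch of the periodic cache-and-flush pattern used above.
from collections import defaultdict
import numpy as np

def process_stream(results, cache_limit=10, store=lambda name, arr: None):
    cache = defaultdict(list)
    n_processed = 0
    for name, X in results:         # each result: (feature name, ndarray)
        cache[name].append(X)
        n_processed += 1
        if n_processed % cache_limit == 0:
            for feat_name, chunks in cache.items():
                store(feat_name, np.concatenate(chunks, axis=0))
            cache.clear()
    for feat_name, chunks in cache.items():  # flush whatever remains
        store(feat_name, np.concatenate(chunks, axis=0))
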
Example #9
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
        ]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        # found exist pca model
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None,
                               whiten=False,
                               copy=True,
                               batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name,
              func=map_pca,
              ncpu=None,
              batch=1,
              hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True,
                   print_report=True,
                   name='PCA')
    for n in mpi:
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
Example #10
              batch_size=batch_size,
              semi_weight=10,
              verbose=False)
  except Exception as e:
    print("Error:", e)
    print("Error Config:", name)
    return

  print("Finish training %-4s layer:%d hdim:%-3d zdim:%d in %.2f(s)" %
        (model.id, n, h, z, time.time() - start_time))
  with open(os.path.join(path, name), 'wb') as f:
    pickle.dump(model, f)


if not no_train:
  mpi = MPI(jobs=jobs, func=run_training, ncpu=ncpu, batch=1)
  for i, j in enumerate(mpi):
    if i % 5 == 0:
      print(" == Training %d/%d jobs ==" % (i + 1, len(jobs)))


# ===========================================================================
# Generate scores file for all model
# ===========================================================================
def run_scoring(args):
  n, h, z, model = args
  name = model.id + '_%d_%d_%d' % (n, h, z)
  with open(os.path.join(path, name), 'rb') as f:
    model = pickle.load(f)

    start_time = time.time()
Example #11
def unsupervised_clustering_scores(factors: np.ndarray,
                                   representations: Optional[np.ndarray] = None,
                                   predictions: Optional[np.ndarray] = None,
                                   algorithm: str = 'both',
                                   random_state: int = 1,
                                   n_cpu: int = 1,
                                   verbose: bool = True) -> Dict[str, float]:
  r""" Calculating the unsupervised clustering Scores:

    - ASW : silhouette_score ([-1, 1], higher is better)
        is calculated using the mean intra-cluster distance and the
        mean nearest-cluster distance (b) for each sample. Values near 0
        indicate overlapping clusters
    - ARI : adjusted_rand_score ([-1, 1], higher is better)
        A similarity measure between two clusterings by considering all pairs
        of samples and counting pairs that are assigned in the same or
        different clusters in the predicted and true clusterings.
        Similarity score between -1.0 and 1.0. Random labelings have an ARI
        close to 0.0. 1.0 stands for perfect match.
    - NMI : normalized_mutual_info_score ([0, 1], higher is better)
        Normalized Mutual Information between two clusterings.
        1.0 stands for perfectly complete labeling
    - UCA : unsupervised_clustering_accuracy ([0, 1], higher is better)
        accuracy of the linear assignment between predicted labels and
        ground-truth labels.
    - HOS : homogeneity_score ([0, 1], higher is better)
        A clustering result satisfies homogeneity if all of its clusters
        contain only data points which are members of a single class.
        1.0 stands for perfectly homogeneous
    - COS : completeness_score ([0, 1], higher is better)
        A clustering result satisfies completeness if all the data points
        that are members of a given class are elements of the same cluster.
        1.0 stands for perfectly complete labeling

  Arguments:
    factors : a Matrix.
      Categorical factors (i.e. one-hot encoded), or multiple factors.
    algorithm : {'kmeans', 'gmm', 'both'}.
      The clustering algorithm for assigning the cluster from representations

  Return:
    Dict mapping score alias to its scalar value

  Note:
    The time complexity grows exponentially as the number of labels increases
  """
  if factors.ndim == 1:
    factors = np.expand_dims(factors, axis=-1)
  assert representations is not None or predictions is not None, \
    "either representations or predictions must be provided"
  ### preprocessing factors
  # multinomial :
  # binary :
  # multibinary :
  factor_type = 'multinomial'
  if np.all(np.unique(factors) == [0., 1.]):
    if np.all(np.sum(factors, axis=1) == 1.):
      factor_type = 'binary'
    else:
      factor_type = 'multibinary'
  # start scoring
  if factor_type == 'binary':
    return _clustering_scores(X=representations,
                              z=predictions,
                              y=np.argmax(factors, axis=1),
                              algo=algorithm,
                              random_state=random_state)
  if factor_type in ('multinomial', 'multibinary'):

    def _get_scores(idx):
      y = factors[:, idx]
      if factor_type == 'multinomial':
        uni = {v: i for i, v in enumerate(sorted(np.unique(y)))}
        y = np.array([uni[i] for i in y])
      else:
        y = y.astype(np.int32)
      return _clustering_scores(X=representations,
                                z=predictions,
                                y=y,
                                algo=algorithm,
                                random_state=random_state)

    scores = defaultdict(list)
    if factors.shape[1] == 1:
      verbose = False
    prog = tqdm(desc="Scoring clusters",
                total=factors.shape[1],
                disable=not verbose)
    if n_cpu == 1:
      it = (_get_scores(idx) for idx in range(factors.shape[1]))
    else:
      it = MPI(jobs=list(range(factors.shape[1])),
               func=_get_scores,
               batch=1,
               ncpu=n_cpu)
    for s in it:
      prog.update(1)
      for k, v in s.items():
        scores[k].append(v)
    return {k: np.mean(v) for k, v in scores.items()}
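
An illustrative single-process call (assuming the `_clustering_scores` helper used above is available in the same module); the data here is random and only meant to show the expected shapes:

# Toy call: one categorical factor, k-means assignment from representations.
import numpy as np

rng = np.random.RandomState(1)
representations = rng.randn(300, 10)
factors = rng.randint(0, 4, size=(300,))
scores = unsupervised_clustering_scores(factors=factors,
                                        representations=representations,
                                        algorithm='kmeans',
                                        n_cpu=1,
                                        verbose=False)
print(scores)  # e.g. {'ASW': ..., 'ARI': ..., 'NMI': ..., 'UCA': ..., ...}
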
Example #12
    def run(self):
        if self.pca:
            from odin.ml import MiniBatchPCA
        if not hasattr(self, 'jobs'):
            raise Exception(
                'the Processor must has "jobs" attribute, which is '
                'the list of all jobs.')
        njobs = len(self.jobs) if self.njobs == 0 else self.njobs
        prog = Progbar(target=njobs)
        dataset = self.dataset
        datatype = self.datatype
        if self.ncpu is None:  # auto select number of CPU
            ncpu = min(njobs, int(1.2 * cpu_count()))
        else:
            ncpu = self.ncpu
        # ====== indices ====== #
        indices = defaultdict(list)
        # ====== MmapDict ====== #
        dicts = {}
        for name, dtype, stats in self.features_properties:
            if 'dict' in str(dtype).lower():
                dicts[name] = MmapDict(os.path.join(dataset.path, name))
        # ====== statistic ====== #
        statistic_able = {i[0]: i[-1] for i in self.features_properties}
        sum1 = defaultdict(int)
        sum2 = defaultdict(int)
        # init PCA
        pca = defaultdict(lambda *args, **kwargs: MiniBatchPCA(
            n_components=None,
            whiten=self.pca_whiten,
            copy=True,
            batch_size=None) if self.pca else None)
        # all data are cached for periodically flushed
        cache = defaultdict(list)
        if self.ncache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.ncache)
        ref_vars = {'start': defaultdict(int), 'processed_count': 0}

        # ====== helper ====== #
        def flush_feature(name, cache_data):
            if len(cache_data) > 0:
                cache_data = np.concatenate(cache_data, 0)
                # NOTE: if nb_samples < nb_features, fitting PCA
                # will cause an error
                if self.pca and statistic_able[name]:
                    pca[name].partial_fit(cache_data)
                # flush data
                if name in dataset:
                    dataset[name].append(cache_data)
                else:
                    dataset[(name, datatype)] = cache_data

        def wrapped_reduce(result):
            name, data = result
            ref_vars['processed_count'] += 1
            # check data
            if not isinstance(data, (tuple, list)):
                data = (data, )
            length = []  # store length of all data for validation
            # processing
            for prop, d in zip(self.features_properties, data):
                n, t, s = prop  # data-type-name, dtype, stats
                # mmapdict type:
                if 'dict' in str(t).lower():
                    dicts[n][name] = d.tolist() if isinstance(
                        d, np.ndarray) else d
                    del d
                    continue
                # auto-create new indices
                if len(d) not in length:
                    length.append(len(d))
                    indices[n].append([
                        name, ref_vars['start'][n],
                        ref_vars['start'][n] + len(d)
                    ])
                    ref_vars['start'][n] += len(d)
                # cache data, only if we have more than 0 sample
                if len(d) > 0:
                    cache[n].append(d.astype(t))
                    if self.save_stats and s:  # save stats
                        sum1[n] += np.sum(d, axis=0, dtype='float64')
                        sum2[n] += np.sum(np.power(d, 2),
                                          axis=0,
                                          dtype='float64')
                del d
            # ====== flush cache ====== #
            if ref_vars['processed_count'] % cache_limit == 0:  # 12 + 8
                for i, j in cache.items():
                    flush_feature(i, j)
                cache.clear()
            # ====== update progress ====== #
            return name

        # ====== processing ====== #
        mpi = MPI(self.jobs,
                  self.map,
                  wrapped_reduce,
                  ncpu=ncpu,
                  buffer_size=1,
                  maximum_queue_size=ncpu * 3)
        for name in mpi:
            prog.title = '%-20s' % name
            prog.add(1)
        # ====== end, flush the last time ====== #
        for i, j in cache.items():
            flush_feature(i, j)
        cache = None
        dataset.flush()
        # ====== saving indices ====== #
        for n, ids in indices.items():
            outpath = os.path.join(
                dataset.path,
                'indices' if n in self.primary_indices else 'indices_%s' % n)
            _ = MmapDict(outpath)
            for name, start, end in ids:
                _[name] = (int(start), int(end))
            _.flush()
            _.close()

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, pca, name, dataset):
            N = dataset[name].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - mean**2)
            if self.substitute_nan is not None:
                mean = np.where(np.isnan(mean), self.substitute_nan, mean)
                std = np.where(np.isnan(std), self.substitute_nan, std)
            else:
                assert not np.any(
                    np.isnan(mean)), 'Mean contains NaN, %s' % name
                assert not np.any(np.isnan(std)), 'Std contains NaN, %s' % name
            dataset[name + '_sum1'] = sum1
            dataset[name + '_sum2'] = sum2
            dataset[name + '_mean'] = mean
            dataset[name + '_std'] = std
            dataset[name + '_pca'] = pca

        # save all stats
        if self.save_stats:
            print('Saving statistics of each data ...')
            for n, d, s in self.features_properties:
                if s:  # save stats
                    print(' * Name:', n)
                    s1, s2, pca_ = sum1[n], sum2[n], pca[n]
                    save_mean_std(s1, s2, pca_, n, dataset)
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== all MmapDict flush() ====== #
        for d in dicts.values():
            d.flush()
            d.close()
Example #13
def count_frames(specifiers: List[str],
                 is_matrix: bool = False,
                 is_bool_index: bool = True,
                 progressbar: bool = False,
                 num_workers: int = 3,
                 concat_char: str = '&') -> List[int]:
    """
  Parameters
  ----------
  specifiers : list of `str`
    list of specifiers `["raw_mfcc_voxceleb.1.ark:42", ...]`
  is_matrix : `bool` (default=`False`)
    input data is matrix or vector
  is_bool_index : `bool` (default=`True`)
    if `True`, the loaded data is boolean index of speech activity detection,
    the length of audio file is calculated by summing the index array.
  concat_char : `str` (default='&')
    by concatenating multiple specifier using given character,
    multiple utterance could be sequentially loaded and concatenated.
    (e.g. 'raw_mfcc_sre18_dev.1.ark:3018396&raw_mfcc_sre18_dev.1.ark:5516398')

  Return
  ------
  List of integers (i.e. the frame counts)
  """
    _check_pykaldi()
    import kaldi.util.io as kio

    frame_counts = []
    fn_read = kio.read_matrix if bool(is_matrix) else kio.read_vector
    progress = tqdm(total=len(specifiers),
                    desc="Kaldi counting frame",
                    disable=not progressbar,
                    mininterval=0.0,
                    maxinterval=10.0)

    def _count(specs):
        res = []
        for idx, spec in specs:
            n = 0
            for s in spec.split(concat_char):
                # both feature and VAD is provided, then get the vad only
                dat = fn_read(s).numpy()
                if is_bool_index:  # sum of all True values
                    n += np.sum(dat)
                else:  # just get the first dimension
                    n += len(dat)
            # (utt_id, frame_count)
            res.append((int(idx), n))
        return res

    jobs = np.array_split([(i, s) for i, s in enumerate(specifiers)],
                          num_workers * 25)
    if num_workers == 1:
        for j in jobs:
            for r in _count(j):
                frame_counts.append(r)
            progress.update(n=len(j))
    else:
        from odin.utils.mpi import MPI
        for r in MPI(jobs=jobs, func=_count, ncpu=num_workers, batch=1):
            progress.update(n=len(r))
            frame_counts.extend(r)
    return [i[1] for i in sorted(frame_counts)]
Example #14
    def __iter__(self):
        # ====== check ====== #
        if self.__recipes is None:
            raise ValueError('You must "set_recipes" first')
        # ====== get start and end for indices ====== #
        n = self._indices.shape[0]
        start = _apply_approx(n, self._start)
        end = _apply_approx(n, self._end)
        indices = self._indices[start:end]
        outtype = self._outtype
        # ====== shuffle the indices ====== #
        rng = None
        if self._seed is not None:
            rng = np.random.RandomState(self._seed)
            indices = indices[rng.permutation(indices.shape[0])]
            # reset the seed
            self._seed = None
        # ====== create iter and its identity ====== #
        process_func = self.__recipes.process
        group_func = self.__recipes.group
        self.__recipes.prepare(
            batch_size=self._batch_size,
            seed=rng.randint(10e6) if rng is not None else None,
            shuffle_level=self._shuffle_level,
        )

        # ====== create wrapped functions ====== #
        def map_func(jobs):
            batch = []
            for name, start, end in jobs:
                start = int(start)
                end = int(end)
                # data can be list of Data, or just 1 Data
                if outtype is not None:
                    x = [
                        np.array(d[start:end], dtype=t)
                        for d, t in zip(self._data, outtype)
                    ]
                else:
                    x = [np.array(d[start:end]) for d in self._data]
                x = process_func(name, x)
                if x is not None:
                    batch.append(x)
            return group_func(batch)

        def reduce_func(results):
            # perform batch level permutation
            if rng is not None and self._shuffle_level > 1:
                permutation = rng.permutation(results[0].shape[0])
                # different shape NO shuffle
                results = [r[permutation] for r in results]
            # convert batch to tuple object if possible
            if isinstance(results, (tuple, list)) and len(results) == 1:
                results = results[0]
            elif isinstance(results, list):
                results = tuple(results)
            return results

        # ====== track and return ====== #
        it = MPI(indices,
                 map_func,
                 reduce_func,
                 ncpu=self.ncpu,
                 buffer_size=self.buffer_size,
                 maximum_queue_size=self.maximum_queue_size)
        self.__running_iter.append(it)
        return it
Example #15
    def __iter__(self):
        # ====== get start and end for indices ====== #
        start = _apply_approx(self.nb_files, self._start)
        end = _apply_approx(self.nb_files, self._end)
        all_keys = self.indices_keys[start:end]
        # ====== shuffle the indices ====== #
        rng = None
        shuffle_level = self._shuffle_level
        if self._seed is not None:
            rng = np.random.RandomState(self._seed)
            all_keys = all_keys[rng.permutation(self.nb_files)]
            if shuffle_level < 1:
                rng = None
            # reset the seed
            self._seed = None
        batch_size = self._batch_size
        batch_filter = self._batch_filter
        process_func = self._recipes.process
        # ====== prepare data, indices and dtype ====== #
        data_indices_dtype = []
        i = 0
        for dat in self._data:
            for d in dat._data:
                data_indices_dtype.append(
                    (d, dat.indices, self._output_dtype[i]))
                i += 1

        # ====== create wrapped functions ====== #
        def map_func(jobs):
            if self.buffer_size == 1:
                jobs = [jobs]
            # calculating batch results
            batch = []
            for name in jobs:
                X = []
                for dat, ids, dtype in data_indices_dtype:
                    start, end = ids[name]
                    # data can be list of Data, or just 1 Data
                    dat = dat[start:end]
                    if dat.dtype != dtype:
                        dat = dat.astype(dtype)
                    X.append(dat)
                X = process_func(name, X)
                # ignore None returned result
                if X is not None:
                    batch.append(X)
            # choose grouping function
            if self._batch_mode == 'batch':
                X = _batch_grouping(batch, batch_size, rng, batch_filter)
            elif self._batch_mode == 'file':
                X = _file_grouping(batch, batch_size, rng, batch_filter)
            return X

        # ====== track and return ====== #
        it = MPI(jobs=all_keys,
                 func=map_func,
                 ncpu=self.ncpu,
                 batch=self.buffer_size,
                 hwm=self.hwm,
                 backend=self.mpi_backend)
        self._running_iter.append(it)
        return iter(it)
Example #16
  def __init__(self,
               path="~/tensorflow_datasets/lego_faces",
               image_size=64,
               background_threshold=255):
    super().__init__()
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.exists(path):
      os.makedirs(path)
    ### download metadata
    meta_path = os.path.join(path, 'meta.csv')
    if not os.path.exists(meta_path):
      print("Download lego faces metadata ...")
      meta_path, _ = urlretrieve(url=LegoFaces.METADATA, filename=meta_path)
    import pandas as pd
    metadata = pd.read_csv(meta_path)
    metadata = metadata[metadata["Category Name"] == "Minifigure, Head"]
    ### check downloaded images
    image_folder = os.path.join(path, "dataset")
    if os.path.exists(image_folder):
      if md5_folder(image_folder) != LegoFaces.MD5:
        shutil.rmtree(image_folder)
    ### download data
    zip_path = os.path.join(path, "dataset.zip")
    if not os.path.exists(zip_path):
      print("Download zip lego faces dataset ...")
      zip_path, _ = urlretrieve(url=LegoFaces.DATASET, filename=zip_path)
    if not os.path.exists(image_folder):
      with zipfile.ZipFile(zip_path, mode="r") as f:
        print("Extract all lego faces images ...")
        f.extractall(path)
    ### load all images, downsample if necessary
    images = glob.glob(image_folder + '/*.jpg', recursive=True)
    if image_size != 128:
      image_folder = image_folder + '_%d' % int(image_size)
      if not os.path.exists(image_folder):
        os.mkdir(image_folder)
      if len(os.listdir(image_folder)) != len(images):
        shutil.rmtree(image_folder)
        os.mkdir(image_folder)
        from tqdm import tqdm
        images = [
            i for i in tqdm(MPI(jobs=images,
                                func=partial(_resize,
                                             image_size=image_size,
                                             outpath=image_folder),
                                ncpu=3,
                                batch=1),
                            total=len(images),
                            desc="Resizing images to %d" % image_size)
        ]
      else:
        images = glob.glob(image_folder + '/*.jpg', recursive=True)
    ### extract the heuristic factors
    metadata = {
        part_id: desc
        for part_id, desc in zip(metadata["Number"], metadata["Name"])
    }
    images_desc = {}
    for path in images:
      name = os.path.basename(path)[:-4]
      if name in metadata:
        desc = metadata[name]
      else:
        name = name.split('_')
        desc = metadata[name[0]]
      images_desc[path] = _process_desc(desc)
    ### tokenizing the description
    from PIL import Image

    def imread(p):
      img = Image.open(p, mode='r')
      arr = np.array(img, dtype=np.uint8)
      del img
      return arr

    self.image_size = image_size
    self.images = np.stack(
        [i for i in MPI(jobs=images, func=imread, ncpu=2, batch=1)])
    self.factors = _extract_factors(list(images_desc.keys()),
                                    list(images_desc.values()))
    ### remove images with background
    ids = np.array([
        True if np.min(i) <= int(background_threshold) else False
        for i in self.images
    ])
    self.images = self.images[ids]
    self.factors = self.factors[ids]
    ### split the dataset
    rand = np.random.RandomState(seed=1)
    n = len(self.images)
    ids = rand.permutation(n)
    self.train = (self.images[:int(0.8 * n)], self.factors[:int(0.8 * n)])
    self.valid = (self.images[int(0.8 * n):int(0.9 * n)],
                  self.factors[int(0.8 * n):int(0.9 * n)])
    self.test = (self.images[int(0.9 * n):], self.factors[int(0.9 * n):])
Example #17
def scrap_lego_faces(metadata, path, resize=64, n_processes=4):
  r""" This function does not filter out bad images """
  from tqdm import tqdm
  from PIL import Image

  def _download_image(meta, conn):
    part_id, desc = meta
    desc = desc.replace("Minifigure, ", "")
    return_path = []
    with warnings.catch_warnings():
      warnings.filterwarnings('ignore', category=InsecureRequestWarning)
      response = conn.request(
          "GET",
          f"https://www.bricklink.com/v2/catalog/catalogitem.page?P={part_id}",
          preload_content=False)
      img_url = re.search(
          rf"\bimg\.bricklink\.com\/ItemImage\/[A-Z]+\/[0-9]+\/{part_id}\.png\b",
          str(response.read(), 'utf-8'),
      )
      if img_url is not None:
        img_url = img_url.group(0)
        img_response = conn.request("GET",
                                    f"https://{img_url}",
                                    preload_content=False)
        image_path = f"{path}/{part_id}"
        # convert to jpg with white background
        image = Image.open(img_response).convert("RGBA")
        background = Image.new("RGBA", image.size, (255, 255, 255))
        image = Image.alpha_composite(background, image).convert("RGB")
        del background
        width, height = image.size
        ratio = width / height
        # split the image
        if ratio >= 1.6 or part_id:
          im = np.array(image)
          M = im.shape[0]
          N = im.shape[1] // 2
          halves = [
              im[x:x + M, y:y + N]
              for x in range(0, im.shape[0], M)
              for y in range(0, im.shape[1], N)
          ]
          image = [Image.fromarray(half, "RGB") for half in halves[:2]]
        else:
          image = [image]
        # crop to square image
        for idx, im in enumerate(image):
          width, height = im.size
          new_len = min(width, height)
          left = (width - new_len) / 2
          top = (height - new_len) / 2
          right = (width + new_len) / 2
          bottom = (height + new_len) / 2
          im = im.crop((left, top, right, bottom))
          # resize the image
          if resize is not None:
            im = im.resize((int(resize), int(resize)))
          # save image
          out = image_path + ('.jpg' if idx == 0 else ('_%d.jpg' % idx))
          im.save(out, "JPEG", quality=90)
          return_path.append(out)
          del im
    return return_path

  conn = PoolManager(
      num_pools=2,
      headers={
          "User-Agent":
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0"
      },
      maxsize=100,
      cert_reqs='CERT_NONE')
  all_images = []
  for image_path in tqdm(MPI(
      jobs=list(zip(metadata["Number"].values, metadata["Name"].values)),
      func=partial(_download_image, conn=conn),
      ncpu=max(1, int(n_processes)),
      batch=1,
  ),
                         desc="Download lego faces",
                         unit="image",
                         total=metadata.shape[0]):
    all_images += image_path
  return np.array(all_images)
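
The center-crop-and-resize logic inside `_download_image` can be isolated into a small PIL helper; a minimal sketch (the name `center_crop_square` is hypothetical, and integer division is used for the crop box, which may differ by one pixel from the code above):

from PIL import Image

def center_crop_square(im, resize=None):
  # crop the largest centered square from `im`, then optionally resize it
  width, height = im.size
  new_len = min(width, height)
  left = (width - new_len) // 2
  top = (height - new_len) // 2
  im = im.crop((left, top, left + new_len, top + new_len))
  if resize is not None:
    im = im.resize((int(resize), int(resize)))
  return im

# usage (hypothetical file names):
# center_crop_square(Image.open("part.png"), resize=64).save("part_64.jpg", "JPEG")
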
Beispiel #18
0
def fast_tsne(
    *X,
    n_components: int = 2,
    max_samples: Optional[int] = None,
    perplexity: float = 30.0,
    early_exaggeration: float = 12.0,
    learning_rate: float = 200.0,
    n_iter: int = 1000,
    n_iter_without_progress: int = 300,
    exaggeration_iter: int = 250,
    perplexity_max_iter: int = 100,
    min_grad_norm: float = 1e-7,
    method: str = 'barnes_hut',
    metric: str = "euclidean",
    init: str = "random",
    angle: float = 0.5,
    n_jobs: Optional[int] = 4,
    merge_inputs: bool = True,
    pca_preprocessing: bool = True,
    return_model: bool = False,
    random_state: int = 1,
    verbose: int = 0,
    framework: Literal['auto', 'sklearn', 'cuml'] = 'auto',
):
    """ t-Stochastic Nearest Neighbors.
  If the algorithm take unexpected long time for running, lower the
  `exaggeration_iter`, or reduce the amount of samples by downsampling
  the dataset.

  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.
  max_samples : {int, None}
      if given, downsample the data to the given number of samples
  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.
  early_exaggeration : float, optional (default: 12.0)
      Controls how tight natural clusters in the original space are in
      the embedded space and how much space will be between them. For
      larger values, the space between natural clusters will be larger
      in the embedded space. Again, the choice of this parameter is not
      very critical. If the cost function increases during initial
      optimization, the early exaggeration factor or the learning rate
      might be too high.
  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
      the learning rate is too high, the data may look like a 'ball' with any
      point approximately equidistant from its nearest neighbours. If the
      learning rate is too low, most points may look compressed in a dense
      cloud with few outliers. If the cost function gets stuck in a bad local
      minimum increasing the learning rate may help.
  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.
  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations so
      this value is rounded to the next multiple of 50.
  perplexity_max_iter : int, (default 100)
      The maximum number of iterations used to search for the Gaussian
      bandwidths that match the target perplexity (only passed to the cuML
      backend).
  exaggeration_iter : int, (default 250)
      Number of iterations spent in the early-exaggeration phase; set this
      higher to promote the growth of clusters (only passed to the cuML
      backend).
  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.
  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
      If metric is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on each
      pair of instances (rows) and the resulting value recorded. The callable
      should take two arrays from X as input and return a value indicating
      the distance between them. The default is "euclidean" which is
      interpreted as squared euclidean distance.
  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components).
      PCA initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.
  verbose : int, optional (default: 0)
      Verbosity level, a number from 0 to 6.
  random_state : int, RandomState instance or None, optional (default: 1)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`.  Note that different initializations might result in
      different local minima of the cost function.
  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact'
      will run on the slower, but exact, algorithm in O(N^2) time. The
      exact algorithm should be used when nearest-neighbor errors need
      to be better than 3%. However, the exact method cannot scale to
      millions of examples.
  angle : float (default: 0.5)
      Only used if method='barnes_hut'
      This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
      'angle' is the angular size (referred to as theta in [3]) of a distant
      node as measured from a point. If this size is below 'angle' then it is
      used as a summary node of all points contained within it.
      This method is not very sensitive to changes in this parameter
      in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
      computation time and angle greater 0.8 has quickly increasing error.
  return_model : bool (default: False)
      if `True`, also return the trained t-SNE model(s)
  merge_inputs : bool (default: True)
      if `True`, merge all input arrays into a single array for training
      t-SNE.
  """
    assert len(X) > 0, "No input is given!"
    if isinstance(X[0], (tuple, list)):
        X = X[0]
    if not all(isinstance(x, np.ndarray) for x in X):
        raise ValueError(
            "`X` can only be list of numpy.ndarray or numpy.ndarray")
    # ====== kwarg for creating T-SNE class ====== #
    kwargs = dict(locals())
    del kwargs['X']
    kwargs.pop('merge_inputs')
    kwargs.pop('return_model')
    kwargs.pop('max_samples')
    kwargs.pop('framework')
    kwargs.pop('pca_preprocessing')
    # ====== downsampling ====== #
    if max_samples is not None:
        max_samples = int(max_samples)
        assert max_samples > 0
        new_X = []
        rand = random_state if isinstance(random_state, np.random.RandomState) \
            else np.random.RandomState(seed=random_state)
        for x in X:
            if x.shape[0] > max_samples:
                ids = rand.permutation(x.shape[0])[:max_samples]
                x = x[ids]
            new_X.append(x)
        X = new_X
    # ====== import proper T-SNE ====== #
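    # Backend preference when `framework='auto'`:
    #   1. cuml.manifold.TSNE      - GPU-accelerated (RAPIDS), if installed
    #   2. MulticoreTSNE           - multi-core CPU implementation
    #   3. sklearn.manifold.TSNE   - fallback
    # Passing framework='sklearn' skips the first two imports entirely.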
    tsne_version = None
    if framework != 'sklearn':
        try:
            from cuml.manifold import TSNE
            tsne_version = 'cuda'
        except ImportError:
            warnings.warn("Install RAPIDSAI cuML GPUs-accelerated t-SNE")
            try:
                from MulticoreTSNE import MulticoreTSNE as TSNE
                tsne_version = 'multicore'
            except ImportError:
                warnings.warn(
                    "pip install "
                    "git+https://github.com/DmitryUlyanov/Multicore-TSNE.git")
    if tsne_version is None:
        from sklearn.manifold import TSNE
        tsne_version = 'sklearn'
    # ====== modify kwargs ====== #
    if tsne_version == 'cuda':
        del kwargs['n_jobs']
    elif tsne_version == 'multicore':
        del kwargs['perplexity_max_iter']
        del kwargs['exaggeration_iter']
    else:
        del kwargs['n_jobs']
        del kwargs['perplexity_max_iter']
        del kwargs['exaggeration_iter']
    # ====== getting cached values ====== #
    results = []
    X_new = []
    X_size = []
    if merge_inputs:
        X_size = [x.shape[0] for x in X]
        x = np.vstack(X) if len(X) > 1 else X[0]
        md5 = md5_checksum(x)
        key = _create_key(tsne_version, kwargs, md5)
        if key in _cached_values:
            results.append((0, _cached_values[key]))
        else:
            X_new.append((0, md5, x))
    else:
        for i, x in enumerate(X):
            md5 = md5_checksum(x)
            key = _create_key(tsne_version, kwargs, md5)
            if key in _cached_values:
                results.append((i, _cached_values[key]))
            else:
                X_new.append((i, md5, x))

    # ====== perform T-SNE ====== #
    def apply_tsne(j):
        idx, md5, x = j
        if pca_preprocessing:
            x = PCA(n_components=None,
                    random_state=random_state).fit_transform(x)
        tsne = TSNE(**kwargs)
        return (idx, md5, tsne.fit_transform(x),
                tsne if return_model else None)

    # collect trained models (entries are None if `return_model` is False)
    model = []
    # only 1 X, or a GPU/multicore backend: no need for MPI
    if len(X_new) == 1 or tsne_version in ('cuda', 'multicore'):
        for x in X_new:
            idx, md5, x, m = apply_tsne(x)
            results.append((idx, x))
            _cached_values[_create_key(tsne_version, kwargs, md5)] = x
            model.append(m)
    else:
        mpi = MPI(jobs=X_new,
                  func=apply_tsne,
                  batch=1,
                  ncpu=min(len(X_new),
                           cpu_count() - 1))
        for idx, md5, x, m in mpi:
            results.append((idx, x))
            _cached_values[_create_key(tsne_version, kwargs, md5)] = x
            model.append(m)
    # ====== return and clean ====== #
    if merge_inputs and len(X_size) > 1:
        indices = [0] + np.cumsum(X_size).tolist()
        results = [results[0][1][s:e] for s, e in zip(indices, indices[1:])]
    else:
        results = sorted(results, key=lambda a: a[0])
        results = [r[1] for r in results]
    results = results[0] if len(results) == 1 else results
    if return_model:
        return results, model
    del model
    return results
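
A short usage sketch for `fast_tsne` (toy arrays for illustration only; it assumes scikit-learn is installed so the pure-Python fallback backend is used):

import numpy as np

x = np.random.RandomState(1).randn(1000, 64)
# a single input returns a single embedding of shape (1000, 2)
z = fast_tsne(x, n_components=2, framework='sklearn')
# with merge_inputs=True (the default) several inputs are stacked, embedded
# together, then split back into one embedding per input
z1, z2 = fast_tsne(x[:400], x[400:], framework='sklearn')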