Example #1
File: magic.py Project: akv84/MAGIC
    def __init__(self,
                 knn=10,
                 decay=15,
                 t='auto',
                 n_pca=100,
                 knn_dist='euclidean',
                 n_jobs=1,
                 random_state=None,
                 verbose=1,
                 k=None,
                 a=None):
        if k is not None:
            knn = k
        if a is not None:
            decay = a
        self.knn = knn
        self.decay = decay
        self.t = t
        self.n_pca = n_pca
        self.knn_dist = knn_dist
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.graph = None
        self.X = None
        self.X_magic = None
        self._check_params()
        self.verbose = verbose
        tasklogger.set_level(verbose)
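A minimal usage sketch of this constructor (assuming the magic-impute package is installed; per the code above, the deprecated k/a aliases simply override knn/decay):

import magic

# Deprecated aliases take precedence over the new names when supplied.
op = magic.MAGIC(k=15, a=10, verbose=0)
assert op.knn == 15 and op.decay == 10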
Example #2
 def __init__(
     self,
     knn=10,
     knn_max=None,
     decay=2,
     t="auto",
     n_pca=100,
     solver="exact",
     knn_dist="euclidean",
     n_jobs=1,
     random_state=None,
     verbose=1,
     k=None,
     a=None,
 ):
     if k is not None:
         knn = k
     if a is not None:
         decay = a
     self.knn = knn
     self.knn_max = knn_max
     self.decay = decay
     self.t = t
     self.n_pca = n_pca
     self.knn_dist = knn_dist
     self.n_jobs = n_jobs
     self.random_state = random_state
     self.solver = solver
     self.graph = None
     self.X = None
     self.X_magic = None
     self._check_params()
     self.verbose = verbose
     tasklogger.set_level(verbose)
Example #3
    def set_params(self, **params):
        """Set the parameters of SCYN.

        Any parameters not given as named arguments will be left at their
        current value.

        Parameters
        ----------

        seq : string, optional, default: single-end
            The read type: single-end or paired-end

        bin_len : int, optional, default: 500
            The bin length in kilobases; the default of 500 corresponds to 500K
        
        ref : string, optional, default: hg19
            The reference genome version: hg19 or hg38

        reg : string, optional, default: *.bam
            The regular expression to match all BAM files in your input directory.
            For example, "*.bam" will match all BAM files ended with '.bam'
        
        mapq : int, optional, default: 40
            The mapping quality cutoff when calculating the reads coverage
        
        K : int, optional, default: 10
            The predefined number of changepoints for all chromosomes

        verbose : `int` or `boolean`, optional, default: 1
            If `True` or `> 0`, print log messages

        Returns
        -------
        self
        """

        # parameters
        if 'seq' in params and params['seq'] != self.seq:
            self.seq = params['seq']
            del params['seq']
        if 'bin_len' in params and params['bin_len'] != self.bin_len:
            self.bin_len = params['bin_len']
            del params['bin_len']
        if 'ref' in params and params['ref'] != self.ref:
            self.ref = params['ref']
            del params['ref']
        if 'reg' in params and params['reg'] != self.reg:
            self.reg = params['reg']
            del params['reg']
        if 'mapq' in params and params['mapq'] != self.mapq:
            self.mapq = params['mapq']
            del params['mapq']
        # K is documented above but was never consumed; handle it like the rest
        if 'K' in params and params['K'] != self.K:
            self.K = params['K']
            del params['K']
        if 'verbose' in params:
            self.verbose = params['verbose']
            tasklogger.set_level(self.verbose)
            del params['verbose']

        self._check_params()
        return self
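The method above follows a common idiom: write an attribute back only when the new value differs, consume the keyword, then re-validate. A self-contained sketch of that idiom (the Estimator class here is hypothetical, not part of SCYN):

class Estimator:
    def __init__(self, bin_len=500, ref='hg19'):
        self.bin_len = bin_len
        self.ref = ref

    def set_params(self, **params):
        # Write back only the values that actually change.
        for name in ('bin_len', 'ref'):
            if name in params and params[name] != getattr(self, name):
                setattr(self, name, params[name])
                del params[name]
        return self

assert Estimator().set_params(bin_len=1000).bin_len == 1000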
Example #4
 def __init__(self, seq='single-end', bin_len=500, ref='hg19', reg='*.bam', mapq=40, K=None, verbose=1):
     self.seq = seq
     self.bin_len = bin_len
     self.ref = ref
     self.reg = reg
     self.mapq = mapq
     self.verbose = verbose
     self._check_params()
     self.cnv = None
     self.meta_info = None
     self.segments = None
     self.bin_info = None
     self.K = K
     tasklogger.set_level(verbose)
Example #5
    def __init__(self, k=10, a=15, t='auto', n_pca=100,
                 knn_dist='euclidean', n_jobs=1, random_state=None,
                 verbose=1):
        self.k = k
        self.a = a
        self.t = t
        self.n_pca = n_pca
        self.knn_dist = knn_dist
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.graph = None
        self.X = None
        self.X_magic = None
        self._check_params()
        self.verbose = verbose
        tasklogger.set_level(verbose)
Example #6
 def __init__(
     self,
     n_filters,
     overlap=2,
     t=1,
     knn=5,
     decay=20,
     n_pca=100,
     n_eigenvectors=None,
     n_jobs=1,
     verbose=False,
     random_state=None,
     knn_X=None,
     knn_Y=None,
     knn_XY=None,
     decay_X=None,
     decay_Y=None,
     decay_XY=None,
     n_pca_X=None,
     n_pca_Y=None,
     n_pca_XY=0,
 ):
     self.n_filters = n_filters
     self.overlap = overlap
     self.t = t
     self.n_eigenvectors = n_eigenvectors
     self.n_jobs = joblib.effective_n_jobs(n_jobs=n_jobs)
     self.random_state = random_state
     self.verbose = verbose
     self.knn_X = utils.with_default(knn_X, knn)
     self.knn_Y = utils.with_default(knn_Y, knn)
     self.knn_XY = utils.with_default(knn_XY, knn)
     self.decay_X = utils.with_default(decay_X, decay)
     self.decay_Y = utils.with_default(decay_Y, decay)
     self.decay_XY = utils.with_default(decay_XY, decay)
     self.n_pca_X = utils.with_default(n_pca_X,
                                       n_pca) if n_pca_X != 0 else None
     self.n_pca_Y = utils.with_default(n_pca_Y,
                                       n_pca) if n_pca_Y != 0 else None
     self.n_pca_XY = utils.with_default(n_pca_XY,
                                        n_pca) if n_pca_XY != 0 else None
     tasklogger.set_level(self.verbose)
     super().__init__()
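The utils.with_default(value, default) calls above appear to substitute the shared default whenever a per-graph argument is None; a minimal stand-in under that assumption (the real helper lives in the project's utils module):

def with_default(value, default):
    # Treat None as "not provided"; any other value, including 0, is kept.
    return default if value is None else value

assert with_default(None, 5) == 5
assert with_default(3, 5) == 3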
Example #7
 def __init__(self,
              n=20,
              c_drop=0.5,
              p_pca=0.4,
              alpha=0.01,
              normalize=True,
              iteration=False,
              verbose=1):
     self.ZERO_VALUE = np.log10(1.01)
     self.n = n
     self.c_drop = c_drop
     self.p_pca = p_pca
     self.alpha = alpha
     self.normalize = normalize
     self.iteration = iteration
     self._check_params()
     self.verbose = verbose
     if self.normalize:
         self.ZERO_VALUE = 0.01
     tasklogger.set_level(verbose)
Example #8
def test_level():
    logger = tasklogger.set_level(2)
    assert logger.level == logging.DEBUG
    assert logger.logger.level == logging.DEBUG
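This test relies on tasklogger's integer verbosity mapping (0 or False gives WARNING, 1 or True gives INFO, 2 gives DEBUG, per the tasklogger documentation). A short sketch:

import tasklogger

tasklogger.set_level(2)    # DEBUG: both info and debug messages print
tasklogger.log_debug("visible at verbosity 2")
tasklogger.set_level(0)    # WARNING: info and debug are suppressed
tasklogger.log_info("suppressed at verbosity 0")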
Example #9
if __name__ == "__main__":
    args = parse_args()
    tasklogger.set_level(args.verbose)
    tasklogger.log_debug("Running MAGIC with arguments {}".format(
        args.__dict__))
    run_magic_from_file(**(args.__dict__))
Example #10
    def __init__(self,
                 n_components=2,
                 knn=5,
                 decay=40,
                 n_landmark=2000,
                 t='auto',
                 gamma=1,
                 n_pca=100,
                 knn_dist='euclidean',
                 mds_dist='euclidean',
                 mds='metric',
                 n_jobs=1,
                 random_state=None,
                 verbose=1,
                 potential_method=None,
                 alpha_decay=None,
                 njobs=None,
                 k=None,
                 a=None,
                 **kwargs):
        if k is not None:
            knn = k
        if a is not None:
            decay = a
        self.n_components = n_components
        self.decay = decay
        self.knn = knn
        self.t = t
        self.n_landmark = n_landmark
        self.mds = mds
        self.n_pca = n_pca
        self.knn_dist = knn_dist
        self.mds_dist = mds_dist
        self.random_state = random_state
        self.kwargs = kwargs

        self.graph = None
        self._diff_potential = None
        self.embedding = None
        self.X = None
        self.optimal_t = None

        if (alpha_decay is True and decay is None) or \
                (alpha_decay is False and decay is not None):
            warnings.warn(
                "alpha_decay is deprecated. Use `decay=None`"
                " to disable alpha decay in future.", FutureWarning)
            if not alpha_decay:
                self.decay = None

        if njobs is not None:
            warnings.warn("njobs is deprecated. Please use n_jobs in future.",
                          FutureWarning)
            n_jobs = njobs
        self.n_jobs = n_jobs

        if potential_method is not None:
            if potential_method == 'log':
                gamma = 1
            elif potential_method == 'sqrt':
                gamma = 0
            else:
                raise ValueError(
                    "potential_method {} not recognized. Please "
                    "use gamma between -1 and 1".format(potential_method))
            warnings.warn(
                "potential_method is deprecated. "
                "Setting gamma to {} to achieve"
                " {} transformation.".format(gamma, potential_method),
                FutureWarning)
        elif gamma > 0.99 and gamma < 1:
            warnings.warn(
                "0.99 < gamma < 1 is numerically unstable. "
                "Setting gamma to 0.99", RuntimeWarning)
            gamma = 0.99
        self.gamma = gamma

        if verbose is True:
            verbose = 1
        elif verbose is False:
            verbose = 0
        self.verbose = verbose
        self._check_params()
        tasklogger.set_level(verbose)
Example #11
    def set_params(self, **params):
        """Set the parameters on this estimator.

        Any parameters not given as named arguments will be left at their
        current value.

        Parameters
        ----------

        n_components : int, optional, default: 2
            number of dimensions in which the data will be embedded

        knn : int, optional, default: 5
            number of nearest neighbors on which to build kernel

        decay : int, optional, default: 40
            sets decay rate of kernel tails.
            If None, alpha decaying kernel is not used

        n_landmark : int, optional, default: 2000
            number of landmarks to use in fast PHATE

        t : int, optional, default: 'auto'
            power to which the diffusion operator is powered.
            This sets the level of diffusion. If 'auto', t is selected
            according to the knee point in the Von Neumann Entropy of
            the diffusion operator

        gamma : float, optional, default: 1
            Informational distance constant between -1 and 1.
            `gamma=1` gives the PHATE log potential, `gamma=0` gives
            a square root potential.

        n_pca : int, optional, default: 100
            Number of principal components to use for calculating
            neighborhoods. For extremely large datasets, using
            n_pca < 20 allows neighborhoods to be calculated in
            roughly log(n_samples) time.

        knn_dist : string, optional, default: 'euclidean'
            recommended values: 'euclidean', 'cosine', 'precomputed'
            Any metric from `scipy.spatial.distance` can be used as the
            distance metric for building the kNN graph. Custom distance
            functions of form `f(x, y) = d` are also accepted. If 'precomputed',
            `data` should be an n_samples x n_samples distance or
            affinity matrix. Distance matrices are assumed to have zeros
            down the diagonal, while affinity matrices are assumed to have
            non-zero values down the diagonal. This is detected automatically
            using `data[0,0]`. You can override this detection with
            `knn_dist='precomputed_distance'` or `knn_dist='precomputed_affinity'`.

        mds_dist : string, optional, default: 'euclidean'
            recommended values: 'euclidean' and 'cosine'
            Any metric from `scipy.spatial.distance` can be used as the
            distance metric for MDS

        mds : string, optional, default: 'metric'
            choose from ['classic', 'metric', 'nonmetric'].
            Selects which MDS algorithm is used for dimensionality reduction

        n_jobs : integer, optional, default: 1
            The number of jobs to use for the computation.
            If -1 all CPUs are used. If 1 is given, no parallel computing code
            is used at all, which is useful for debugging.
            For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
            n_jobs = -2, all CPUs but one are used

        random_state : integer or numpy.RandomState, optional, default: None
            The generator used to initialize SMACOF (metric, nonmetric) MDS
            If an integer is given, it fixes the seed
            Defaults to the global `numpy` random number generator

        verbose : `int` or `boolean`, optional (default: 1)
            If `True` or `> 0`, print status messages

        k : Deprecated. Use `knn` instead

        a : Deprecated. Use `decay` instead

        Examples
        --------
        >>> import phate
        >>> import matplotlib.pyplot as plt
        >>> tree_data, tree_clusters = phate.tree.gen_dla(n_dim=50, n_branch=5,
        ...                                               branch_length=50)
        >>> tree_data.shape
        (250, 50)
        >>> phate_operator = phate.PHATE(k=5, a=20, t=150)
        >>> tree_phate = phate_operator.fit_transform(tree_data)
        >>> tree_phate.shape
        (250, 2)
        >>> phate_operator.set_params(n_components=10)
        PHATE(a=20, alpha_decay=None, k=5, knn_dist='euclidean', mds='metric',
           mds_dist='euclidean', n_components=10, n_jobs=1, n_landmark=2000,
           n_pca=100, njobs=None, potential_method='log', random_state=None, t=150,
           verbose=1)
        >>> tree_phate = phate_operator.transform()
        >>> tree_phate.shape
        (250, 10)
        >>> # plt.scatter(tree_phate[:,0], tree_phate[:,1], c=tree_clusters)
        >>> # plt.show()

        Returns
        -------
        self
        """
        reset_kernel = False
        reset_potential = False
        reset_embedding = False

        # mds parameters
        if 'n_components' in params and \
                params['n_components'] != self.n_components:
            self.n_components = params['n_components']
            reset_embedding = True
            del params['n_components']
        if 'mds' in params and params['mds'] != self.mds:
            self.mds = params['mds']
            reset_embedding = True
            del params['mds']
        if 'mds_dist' in params and params['mds_dist'] != self.mds_dist:
            self.mds_dist = params['mds_dist']
            reset_embedding = True
            del params['mds_dist']

        # diff potential parameters
        if 't' in params and params['t'] != self.t:
            self.t = params['t']
            reset_potential = True
            del params['t']
        if 'potential_method' in params:
            if params['potential_method'] == 'log':
                params['gamma'] = 1
            elif params['potential_method'] == 'sqrt':
                params['gamma'] = 0
            else:
                raise ValueError("potential_method {} not recognized. Please "
                                 "use gamma between -1 and 1".format(
                                     params['potential_method']))
            warnings.warn(
                "potential_method is deprecated. Setting gamma to {} to "
                "achieve {} transformation.".format(
                    params['gamma'], params['potential_method']),
                FutureWarning)
            del params['potential_method']
        if 'gamma' in params and \
                params['gamma'] != self.gamma:
            self.gamma = params['gamma']
            reset_potential = True
            del params['gamma']

        # kernel parameters
        if 'k' in params and params['k'] != self.knn:
            self.knn = params['k']
            reset_kernel = True
            del params['k']
        if 'a' in params and params['a'] != self.decay:
            self.decay = params['a']
            reset_kernel = True
            del params['a']
        if 'knn' in params and params['knn'] != self.knn:
            self.knn = params['knn']
            reset_kernel = True
            del params['knn']
        if 'decay' in params and params['decay'] != self.decay:
            self.decay = params['decay']
            reset_kernel = True
            del params['decay']
        if 'n_pca' in params:
            if self.X is not None and params['n_pca'] >= np.min(self.X.shape):
                params['n_pca'] = None
            if params['n_pca'] != self.n_pca:
                self.n_pca = params['n_pca']
                reset_kernel = True
            # consume the key even when unchanged so it is not passed on
            del params['n_pca']
        if 'knn_dist' in params and params['knn_dist'] != self.knn_dist:
            self.knn_dist = params['knn_dist']
            reset_kernel = True
            del params['knn_dist']
        if 'n_landmark' in params and params['n_landmark'] != self.n_landmark:
            if self.n_landmark is None or params['n_landmark'] is None:
                # need a different type of graph, reset entirely
                self._reset_graph()
            else:
                self._set_graph_params(n_landmark=params['n_landmark'])
            self.n_landmark = params['n_landmark']
            del params['n_landmark']

        # parameters that don't change the embedding
        if 'n_jobs' in params:
            self.n_jobs = params['n_jobs']
            self._set_graph_params(n_jobs=params['n_jobs'])
            del params['n_jobs']
        if 'random_state' in params:
            self.random_state = params['random_state']
            self._set_graph_params(random_state=params['random_state'])
            del params['random_state']
        if 'verbose' in params:
            self.verbose = params['verbose']
            tasklogger.set_level(self.verbose)
            self._set_graph_params(verbose=params['verbose'])
            del params['verbose']

        if reset_kernel:
            # can't reset the graph kernel without making a new graph
            self._reset_graph()
        if reset_potential:
            self._reset_potential()
        if reset_embedding:
            self._reset_embedding()

        self._set_graph_params(**params)

        self._check_params()
        return self
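The reset flags above implement lazy invalidation along the pipeline kernel -> graph -> diffusion potential -> embedding: a kernel change rebuilds everything downstream, while an MDS-only change recomputes just the embedding. A schematic sketch of that cascade (hypothetical helper, not PHATE's API):

STAGES = ['graph', 'potential', 'embedding']

def stages_to_reset(first_dirty):
    # Everything at and after the first invalidated stage must be recomputed.
    return STAGES[STAGES.index(first_dirty):]

assert stages_to_reset('potential') == ['potential', 'embedding']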
Example #12
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used as the
        distance metric for building the kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA.
        If an integer is given, it fixes the seed.
        If `None`, uses the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC,
        either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, compare the output against a precomputed reference matrix
        and log the result
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.".format(output))
    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
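An invocation sketch (file names here are illustrative, not from the source):

# Load a cells x genes CSV, sqrt-transform, smooth with MAGIC, save to CSV.
run_magic_from_file(
    "data.csv",
    transform="sqrt",
    knn=5,
    decay=15,
    t_magic="auto",
    output="magic_smoothed.csv",
)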
Example #13
 def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
     # kwargs are ignored
     self.n_jobs = n_jobs
     self.verbose = verbose
     tasklogger.set_level(verbose)
     super().__init__(data, **kwargs)
Example #14
    def set_params(self, **params):
        """Set the parameters on this estimator.

        Any parameters not given as named arguments will be left at their
        current value.

        Parameters
        ----------

        k : int, optional, default: 10
            number of nearest neighbors on which to build kernel

        a : int, optional, default: 15
            sets decay rate of kernel tails.
            If None, alpha decaying kernel is not used

        t : int, optional, default: 'auto'
            power to which the diffusion operator is powered.
            This sets the level of diffusion. If 'auto', t is selected
            according to the R squared of the diffused data

        n_pca : int, optional, default: 100
            Number of principal components to use for calculating
            neighborhoods. For extremely large datasets, using
            n_pca < 20 allows neighborhoods to be calculated in
            roughly log(n_samples) time.

        knn_dist : string, optional, default: 'euclidean'
            recommended values: 'euclidean', 'cosine', 'precomputed'
            Any metric from `scipy.spatial.distance` can be used as the
            distance metric for building the kNN graph. If 'precomputed',
            `data` should be an n_samples x n_samples distance or
            affinity matrix

        n_jobs : integer, optional, default: 1
            The number of jobs to use for the computation.
            If -1 all CPUs are used. If 1 is given, no parallel computing code
            is used at all, which is useful for debugging.
            For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
            n_jobs = -2, all CPUs but one are used

        random_state : integer or numpy.RandomState, optional, default: None
            The generator used to initialize random PCA
            If an integer is given, it fixes the seed
            Defaults to the global `numpy` random number generator

        verbose : `int` or `boolean`, optional (default: 1)
            If `True` or `> 0`, print status messages

        Returns
        -------
        self
        """
        reset_kernel = False
        reset_imputation = False
        # diff potential parameters
        if 't' in params and params['t'] != self.t:
            self.t = params['t']
            reset_imputation = True
            del params['t']

        # kernel parameters
        if 'k' in params and params['k'] != self.k:
            self.k = params['k']
            reset_kernel = True
            del params['k']
        if 'a' in params and params['a'] != self.a:
            self.a = params['a']
            reset_kernel = True
            del params['a']
        if 'n_pca' in params and params['n_pca'] != self.n_pca:
            self.n_pca = params['n_pca']
            reset_kernel = True
            del params['n_pca']
        if 'knn_dist' in params and params['knn_dist'] != self.knn_dist:
            self.knn_dist = params['knn_dist']
            reset_kernel = True
            del params['knn_dist']

        # parameters that don't change the embedding
        if 'n_jobs' in params:
            self.n_jobs = params['n_jobs']
            self._set_graph_params(n_jobs=params['n_jobs'])
            del params['n_jobs']
        if 'random_state' in params:
            self.random_state = params['random_state']
            self._set_graph_params(random_state=params['random_state'])
            del params['random_state']
        if 'verbose' in params:
            self.verbose = params['verbose']
            tasklogger.set_level(self.verbose)
            self._set_graph_params(verbose=params['verbose'])
            del params['verbose']

        if reset_kernel:
            # can't reset the graph kernel without making a new graph
            self.graph = None
            reset_imputation = True
        if reset_imputation:
            self.X_magic = None

        self._check_params()
        return self
Example #15
def parse_args():
    parser = argparse.ArgumentParser(
        description='Run MAGIC for imputation of '
        'high-dimensional data.',
        epilog='For help, visit magic.readthedocs.io or '
        'krishnaswamylab.org/get-help',
        add_help=True,
        allow_abbrev=True)

    io_group = parser.add_argument_group('Data IO')
    filename = io_group.add_mutually_exclusive_group(required=True)
    filename.add_argument('--filename',
                          type=str,
                          default=None,
                          help='Input data. Allowed types: csv, tsv, mtx, '
                          'hdf5/h5 (10X format), directory/zip (10X format)')
    filename.add_argument('--validate',
                          action='store_true',
                          default=False,
                          help='Run MAGIC on a test dataset to ensure '
                          'output is correct.')
    sparse = io_group.add_mutually_exclusive_group()
    sparse.add_argument('--sparse',
                        action='store_true',
                        help='Use sparse data format',
                        dest='sparse',
                        default=None)
    sparse.add_argument('--dense',
                        action='store_false',
                        help='Use dense data format',
                        dest='sparse',
                        default=None)
    gene_names = io_group.add_mutually_exclusive_group()
    gene_names.add_argument('--gene-names',
                            action='store_true',
                            help='Use gene name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='gene_names',
                            default=True)
    gene_names.add_argument('--no-gene-names',
                            action='store_false',
                            help='Do not use gene names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='gene_names',
                            default=True)
    gene_names.add_argument('--gene-name-file',
                            type=str,
                            help='Use gene name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE',
                            dest='gene_names',
                            default=True)
    cell_names = io_group.add_mutually_exclusive_group()
    cell_names.add_argument('--cell-names',
                            action='store_true',
                            help='Use cell name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='cell_names',
                            default=True)
    cell_names.add_argument('--no-cell-names',
                            action='store_false',
                            help='Do not use cell names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='cell_names',
                            default=True)
    cell_names.add_argument('--cell-name-file',
                            type=str,
                            help='Use cell name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE',
                            dest='cell_names',
                            default=True)
    io_group.add_argument('--cell-axis',
                          type=str,
                          choices=['row', 'column'],
                          default='row',
                          help='States whether cells are on rows or columns '
                          '(csv, tsv, mtx)')
    io_group.add_argument('--gene-labels',
                          type=str,
                          default='both',
                          choices=['symbol', 'id', 'both'],
                          help='Choice of gene labels for 10X data'
                          ' (dir, zip, hdf5)')
    io_group.add_argument('--genome',
                          type=str,
                          default=None,
                          help='Genome name for 10X HDF5 data (hdf5)')
    io_group.add_argument(
        '--metadata-channels',
        type=str,
        nargs='+',
        default=[
            'Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist',
            'bead1'
        ],
        help='Names of channels to remove from fcs data (fcs)',
        metavar='CHANNEL')

    preprocess_group = parser.add_argument_group('Preprocessing')
    cell_filter = preprocess_group.add_mutually_exclusive_group()
    cell_filter.add_argument('--min-library-size',
                             type=int,
                             default=2000,
                             help='Filter cells with less than COUNTS counts',
                             dest='min_library_size',
                             metavar='COUNTS')
    cell_filter.add_argument('--no-cell-filter',
                             action='store_false',
                             default=2000,
                             dest='min_library_size',
                             help='Do not filter cells')
    gene_filter = preprocess_group.add_mutually_exclusive_group()
    gene_filter.add_argument(
        '--min-cells-per-gene',
        type=int,
        default=10,
        help='Filter genes with less than CELLS non-zero cells',
        dest='min_cells_per_gene',
        metavar='CELLS')
    gene_filter.add_argument('--no-gene-filter',
                             action='store_false',
                             default=10,
                             dest='min_cells_per_gene',
                             help='Do not filter genes')
    libnorm = preprocess_group.add_mutually_exclusive_group()
    libnorm.add_argument(
        '--normalize',
        action='store_true',
        default=True,
        dest='library_size_normalize',
        help='Normalize cells by total UMI count (library size)')
    libnorm.add_argument('--no-normalize',
                         action='store_false',
                         default=True,
                         dest='library_size_normalize',
                         help='Do not normalize cells')
    transform = preprocess_group.add_mutually_exclusive_group()
    transform.add_argument('--transform',
                           type=str,
                           default='sqrt',
                           choices=['sqrt', 'log', 'arcsinh'],
                           help='Sublinear data transformation function')
    transform.add_argument('--no-transform',
                           action='store_false',
                           default='sqrt',
                           dest='transform',
                           help='Do not transform data')
    preprocess_group.add_argument('--pseudocount',
                                  type=float,
                                  default=1,
                                  help='Pseudocount to add to genes prior '
                                  'to log transform',
                                  metavar='PCOUNT')
    preprocess_group.add_argument('--cofactor',
                                  type=float,
                                  default=5,
                                  help='Factor by which to divide genes prior '
                                  'to arcsinh transform')

    kernel_group = parser.add_argument_group('Kernel Computation')
    kernel_group.add_argument('-k',
                              '--knn',
                              type=int,
                              default=10,
                              dest='knn',
                              help='Number of nearest neighbors on which to '
                              'build kernel')
    decay = kernel_group.add_mutually_exclusive_group()
    decay.add_argument('-a',
                       '--decay',
                       type=int,
                       default=15,
                       dest='decay',
                       help='Sets decay rate of kernel tails')
    decay.add_argument('--no-decay',
                       action='store_false',
                       default=15,
                       dest='decay',
                       help='Do not use alpha decay')
    pca = kernel_group.add_mutually_exclusive_group()
    pca.add_argument('--pca',
                     type=int,
                     default=100,
                     dest='n_pca',
                     help='Number of principal components to use for '
                     'neighborhoods')
    pca.add_argument('--no-pca',
                     action='store_false',
                     default=100,
                     dest='n_pca',
                     help='Do not use PCA')
    kernel_group.add_argument('--knn-dist',
                              type=str,
                              default='euclidean',
                              help='Distance metric to use for calculating '
                              'neighborhoods. Recommended values are '
                              '"euclidean" and "cosine"',
                              metavar='DISTANCE')
    kernel_group.add_argument(
        '-t',
        '--threads',
        type=int,
        default=1,
        help='Use THREADS threads. If -1 all CPUs are used',
        metavar='THREADS',
        dest='n_jobs')
    kernel_group.add_argument('--seed',
                              type=int,
                              default=None,
                              help='Integer random seed',
                              metavar='SEED',
                              dest='random_state')
    verbose = kernel_group.add_mutually_exclusive_group()
    verbose.add_argument('-v',
                         '--verbose',
                         action='store_true',
                         default=True,
                         help='Print verbose output')
    verbose.add_argument('-q',
                         '--quiet',
                         action='store_false',
                         default=True,
                         help='Do not print verbose output',
                         dest='verbose')
    verbose.add_argument('-vv',
                         '--debug',
                         action='store_true',
                         default=False,
                         help='Print debugging output',
                         dest='debug')

    magic_group = parser.add_argument_group('MAGIC')
    magic_group.add_argument('--t-magic',
                             type=str,
                             default='auto',
                             help='Level of diffusion for MAGIC',
                             metavar='T')
    genes = magic_group.add_mutually_exclusive_group()
    genes.add_argument('--pca-only',
                       action='store_true',
                       default=False,
                       help='Return PCA on the smoothed matrix')
    genes.add_argument('--all-genes',
                       action='store_true',
                       default=False,
                       help='Return the entire smoothed matrix')
    genes.add_argument('--gene-list',
                       type=str,
                       nargs='+',
                       default=None,
                       help='List of genes to return from MAGIC, '
                       'either as integer indices or column names.',
                       metavar='GENE',
                       dest='genes')
    magic_group.add_argument('--output',
                             type=str,
                             default='magic.csv',
                             help='Output CSV file to save smoothed '
                             'data matrix',
                             metavar='FILE')

    args = parser.parse_args()

    if args.validate:
        tasklogger.set_level(2)
        tasklogger.log_info("Running MAGIC validation.")
        args.filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_small.csv"
        args.sparse = False
        args.gene_names = True
        args.cell_names = True
        args.cell_axis = "row"
        args.gene_labels = "both"
        args.genome = None
        args.metadata_channels = None
        args.min_library_size = 1
        args.min_cells_per_gene = 1
        args.library_size_normalize = True
        args.transform = 'sqrt'
        args.pseudocount = None
        args.cofactor = None
        args.knn = 3
        args.decay = 20
        args.n_pca = None
        args.knn_dist = "euclidean"
        args.n_jobs = 1
        args.random_state = 42
        args.verbose = True
        args.debug = True
        args.t_magic = "auto"
        args.all_genes = True
        args.output = "magic-validate.csv"

    # fix magic "genes" argument
    if args.all_genes:
        args.genes = "all_genes"
    elif args.pca_only:
        args.genes = "pca_only"
    else:
        try:
            args.genes = [int(g) for g in args.genes]
        except (TypeError, ValueError):
            # genes is None, or the list contains string gene names
            pass
    del args.all_genes
    del args.pca_only
    # fix t argument
    if args.t_magic != 'auto':
        try:
            args.t_magic = int(args.t_magic)
        except ValueError:
            parser.error("argument --t-magic: invalid int value: '{}'".format(
                args.t_magic))
    # fix debug argument
    if args.debug:
        args.verbose = 2
    del args.debug

    # store None values where appropriate
    if args.decay is False:
        args.decay = None
    if args.n_pca is False:
        args.n_pca = None
    if args.min_library_size is False:
        args.min_library_size = None
    if args.min_cells_per_gene is False:
        args.min_cells_per_gene = None

    # check for inappropriately set defaults
    try:
        filetype = check_filetype(args.filename)
    except RuntimeError as e:
        parser.error(str(e))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'fcs']:
        if '--gene-names' not in sys.argv:
            args.gene_names = None
        else:
            parser.error(
                "Cannot handle --gene-names with {} file".format(filetype))
        if '--cell-names' not in sys.argv:
            args.cell_names = None
        else:
            parser.error(
                "Cannot handle --cell-names with {} file".format(filetype))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'mtx']:
        if '--cell-axis' not in sys.argv:
            args.cell_axis = None
        else:
            parser.error(
                "Cannot handle --cell-axis with {} file".format(filetype))
    if filetype not in ['dir', 'zip', 'hdf5', 'h5']:
        if '--gene-labels' not in sys.argv:
            args.gene_labels = None
        else:
            parser.error(
                "Cannot handle --gene-labels with {} file".format(filetype))
    if filetype not in ['hdf5', 'h5']:
        if '--genome' not in sys.argv:
            args.genome = None
        else:
            parser.error(
                "Cannot handle --genome with {} file".format(filetype))
    if filetype not in ['fcs']:
        if '--metadata-channels' not in sys.argv:
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if not args.transform == 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if not args.transform == 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error("Cannot handle --cofactor with --transform {}".format(
                args.transform))
        else:
            args.cofactor = None

    return args
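The --no-cell-filter-style switches above pair store_false with a non-boolean default, so a single dest holds either the numeric cutoff or False (normalized to None afterwards). A self-contained sketch of that trick:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--min-library-size', type=int, default=2000,
                   dest='min_library_size', metavar='COUNTS')
group.add_argument('--no-cell-filter', action='store_false',
                   default=2000, dest='min_library_size')

assert parser.parse_args([]).min_library_size == 2000
assert parser.parse_args(['--no-cell-filter']).min_library_size is False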
Example #16
    def set_params(self, **params):
        """Set the parameters of I-Impute.

        Any parameters not given as named arguments will be left at their
        current value.

        Parameters
        ----------

        n : int, optional, default: 20
            number of nearest neighbors on which to build the kernel when
            calculating the affinity matrix.

        c_drop : float, optional, default: 0.5
            Dropout event cutoff. An entry whose dropout probability is less
            than c_drop is treated as a real observation and its original value
            is retained. Otherwise, the entry is imputed with the aid of
            information from similar cells.

        p_pca : float, optional, default: 0.4
            Percentage of variance explained by the selected PCA components.
            It determines the number of PCs used to calculate the distance
            between cells.

        alpha : float, optional, default: 0.01
            L1 penalty for Lasso regression.
        
        normalize : boolean, optional, default: True
            By default, I-Impute takes in an unnormalized matrix and performs library size normalization during 
            the denoising step. However, if your data is already normalized or normalization is not desired, you 
            can set normalize=False.

        iteration : boolean, optional, default: False
            If False, the imputation process runs only once (equivalent to
            C-Impute as described in the paper). If True, it iterates to
            achieve a self-consistent imputed matrix.

        verbose : `int` or `boolean`, optional, default: 1
            If `True` or `> 0`, print status messages

        Returns
        -------
        self
        """

        # kernel parameters
        if 'n' in params and params['n'] != self.n:
            self.n = params['n']
            del params['n']
        if 'c_drop' in params and params['c_drop'] != self.c_drop:
            self.c_drop = params['c_drop']
            del params['c_drop']
        if 'p_pca' in params and params['p_pca'] != self.p_pca:
            self.p_pca = params['p_pca']
            del params['p_pca']
        if 'alpha' in params and params['alpha'] != self.alpha:
            self.alpha = params['alpha']
            del params['alpha']
        if 'normalize' in params and params['normalize'] != self.normalize:
            self.normalize = params['normalize']
            del params['normalize']
        if 'iteration' in params and params['iteration'] != self.iteration:
            self.iteration = params['iteration']
            del params['iteration']
        if 'verbose' in params:
            self.verbose = params['verbose']
            tasklogger.set_level(self.verbose)
            del params['verbose']

        self._check_params()
        return self
Example #17
def Graph(data,
          n_pca=None,
          sample_idx=None,
          adaptive_k='sqrt',
          precomputed=None,
          knn=5,
          decay=10,
          distance='euclidean',
          thresh=1e-4,
          kernel_symm='+',
          gamma=None,
          n_landmark=None,
          n_svd=100,
          beta=1,
          n_jobs=-1,
          verbose=False,
          random_state=None,
          graphtype='auto',
          use_pygsp=False,
          initialize=True,
          **kwargs):
    """Create a graph built on data.

    Automatically selects the appropriate DataGraph subclass based on
    chosen parameters.
    Selection criteria:
    - if `graphtype` is given, this will be respected
    - otherwise:
    -- if `sample_idx` is given, an MNNGraph will be created
    -- if `precomputed` is not given, and either `decay` is `None` or
       `thresh` is given, a kNNGraph will be created
    -- otherwise, a TraditionalGraph will be created.

    Incompatibilities:
    - MNNGraph and kNNGraph cannot be precomputed
    - kNNGraph and TraditionalGraph do not accept sample indices

    Parameters
    ----------
    data : array-like, shape=[n_samples,n_features]
        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
        TODO: accept pandas dataframes

    n_pca : `int` or `None`, optional (default: `None`)
        number of PC dimensions to retain for graph building.
        If `None`, uses the original data.
        Note: if data is sparse, uses SVD instead of PCA
        TODO: should we subtract and store the mean?

    knn : `int`, optional (default: 5)
        Number of nearest neighbors (including self) to use to build the graph

    decay : `int` or `None`, optional (default: 10)
        Rate of alpha decay to use. If `None`, alpha decay is not used.

    distance : `str`, optional (default: `'euclidean'`)
        Any metric from `scipy.spatial.distance` can be used as the
        distance metric for building the kNN graph.
        TODO: actually sklearn.neighbors has even more choices

    thresh : `float`, optional (default: `1e-4`)
        Threshold above which to calculate alpha decay kernel.
        All affinities below `thresh` will be set to zero in order to save
        on time and memory constraints.

    kernel_symm : string, optional (default: '+')
        Defines method of MNN symmetrization.
        '+'  : additive
        '*'  : multiplicative
        'gamma' : min-max
        'none' : no symmetrization

    gamma: float (default: None)
        Min-max symmetrization constant or matrix. Only used if kernel_symm='gamma'.
        K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)`

    precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`)
        If the graph is precomputed, this variable denotes which graph
        matrix is provided as `data`.
        Only one of `precomputed` and `n_pca` can be set.

    beta : float, optional (default: 1)
        Multiply within-batch connections by `(1 - beta)`

    sample_idx: array-like
        Batch index for MNN kernel

    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
        Weights MNN kernel adaptively using the number of cells in
        each sample according to the selected method.

    n_landmark : `int`, optional (default: `None`)
        number of landmarks to use

    n_svd : `int`, optional (default: 100)
        number of SVD components to use for spectral clustering

    random_state : `int` or `None`, optional (default: `None`)
        Random state for random PCA

    verbose : `bool`, optional (default: `False`)
        Verbosity.
        TODO: should this be an integer instead to allow multiple
        levels of verbosity?

    n_jobs : `int`, optional (default: -1)
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used

    graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto')
        Manually selects graph type. Only recommended for expert users

    use_pygsp : `bool` (Default: `False`)
        If true, inherits from `pygsp.graphs.Graph`.

    initialize : `bool` (Default: `True`)
        If True, initialize the kernel matrix on instantiation

    **kwargs : extra arguments for `pygsp.graphs.Graph`

    Returns
    -------
    G : `DataGraph`

    Raises
    ------
    ValueError : if selected parameters are incompatible.
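
    Examples
    --------
    A minimal usage sketch of the selection logic, assuming this function
    is exposed as `graphtools.Graph`; the random data is illustrative only:

    >>> import numpy as np
    >>> import graphtools
    >>> data = np.random.normal(size=(100, 50))
    >>> G = graphtools.Graph(data, knn=5, decay=10)     # thresh > 0: kNNGraph
    >>> G = graphtools.Graph(data, decay=10, thresh=0)  # TraditionalGraph

    The `gamma` min-max symmetrization described above amounts to:

    >>> K = np.random.uniform(size=(4, 4))
    >>> gamma = 0.5
    >>> K = gamma * np.minimum(K, K.T) + (1 - gamma) * np.maximum(K, K.T)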
    """
    tasklogger.set_level(verbose)
    if sample_idx is not None and len(np.unique(sample_idx)) == 1:
        warnings.warn("Only one unique sample. " "Not using MNNGraph")
        sample_idx = None
        if graphtype == 'mnn':
            graphtype = 'auto'
    if graphtype == 'auto':
        # automatic graph selection
        if sample_idx is not None:
            # only mnn does batch correction
            graphtype = "mnn"
        elif precomputed is None and (decay is None or thresh > 0):
            # precomputed matrices require an exact graph; no decay, or
            # decay with a threshold, can use the faster kNNGraph
            graphtype = "knn"
        else:
            graphtype = "exact"

    # set base graph type
    if graphtype == "knn":
        basegraph = graphs.kNNGraph
        if precomputed is not None:
            raise ValueError("kNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` or "
                             "`precomputed=None`")
        if sample_idx is not None:
            raise ValueError("kNNGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")

    elif graphtype == "mnn":
        basegraph = graphs.MNNGraph
        if precomputed is not None:
            raise ValueError("MNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` and "
                             "`sample_idx=None` or `precomputed=None`")
    elif graphtype == "exact":
        basegraph = graphs.TraditionalGraph
        if sample_idx is not None:
            raise ValueError("TraditionalGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")
    else:
        raise ValueError("graphtype '{}' not recognized. Choose from "
                         "['knn', 'mnn', 'exact', 'auto']")

    # add landmark and PyGSP mixins if necessary
    parent_classes = [basegraph]
    msg = "Building {} graph".format(graphtype)
    if n_landmark is not None:
        parent_classes.append(graphs.LandmarkGraph)
        msg = msg + " with landmarks"
    if use_pygsp:
        parent_classes.append(base.PyGSPGraph)
        if len(parent_classes) > 2:
            msg = msg + " with PyGSP inheritance"
        else:
            msg = msg + " and PyGSP inheritance"

    tasklogger.log_debug(msg)

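    # compose the concrete class name from the chosen mixins, e.g.
    # [kNNGraph, LandmarkGraph] -> "kNNLandmarkGraph"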
    class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
    try:
        Graph = eval("graphs." + "".join(class_names) + "Graph")
    except (NameError, AttributeError):
        # the composed class name does not exist in the graphs module
        raise RuntimeError("unknown graph classes {}".format(parent_classes))

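    # gather constructor arguments: for each parameter name a parent class
    # accepts, look up a matching local variable of this function via eval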
    params = kwargs
    for parent_class in parent_classes:
        for param in parent_class._get_param_names():
            try:
                params[param] = eval(param)
            except NameError:
                # keyword argument not specified above - no problem
                pass

    # build graph and return
    tasklogger.log_debug("Initializing {} with arguments {}".format(
        parent_classes, ", ".join([
            "{}='{}'".format(key, value) for key, value in params.items()
            if key != "data"
        ])))
    return Graph(**params)
Example #18
0
    def set_params(self, **params):
        """Set the parameters on this estimator.

        Any parameters not given as named arguments will be left at their
        current value.

        Parameters
        ----------

        knn : int, optional, default: 5
            number of nearest neighbors on which to build kernel

        decay : int, optional, default: 1
            sets the decay rate of the kernel tails.
            If None, the alpha decaying kernel is not used

        t : int, optional, default: 3
            power to which the diffusion operator is raised.
            This sets the level of diffusion. If 'auto', t is selected
            according to the R squared of the diffused data

        n_pca : int, optional, default: 100
            Number of principal components to use for calculating
            neighborhoods. For extremely large datasets, using
            n_pca < 20 allows neighborhoods to be calculated in
            roughly log(n_samples) time.

        knn_dist : string, optional, default: 'euclidean'
            recommended values: 'euclidean', 'cosine'
            Any metric from `scipy.spatial.distance` may be used as the
            distance metric for building the kNN graph.

        n_jobs : integer, optional, default: 1
            The number of jobs to use for the computation.
            If -1 all CPUs are used. If 1 is given, no parallel computing code
            is used at all, which is useful for debugging.
            For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
            n_jobs = -2, all CPUs but one are used

        random_state : integer or numpy.RandomState, optional, default: None
            The generator used to initialize random PCA.
            If an integer is given, it fixes the seed.
            Defaults to the global `numpy` random number generator

        verbose : `int` or `boolean`, optional (default: 1)
            If `True` or `> 0`, print status messages

        k : deprecated; use `knn` instead

        a : deprecated; use `decay` instead

        Returns
        -------
        self
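
        Examples
        --------
        A hedged sketch of updating parameters after construction; `MAGIC`
        refers to this estimator class and the values are illustrative:

        >>> op = MAGIC()
        >>> op = op.set_params(knn=15, decay=1)  # invalidates cached kernel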
        """
        reset_kernel = False
        reset_imputation = False
        # diff potential parameters
        if "t" in params and params["t"] != self.t:
            self.t = params["t"]
            reset_imputation = True
            del params["t"]

        # kernel parameters
        if "k" in params and params["k"] != self.knn:
            warnings.warn(
                "Parameter `k` is deprecated and will be removed"
                " in a future version. Use `knn` instead",
                FutureWarning,
            )
            self.knn = params["k"]
            reset_kernel = True
            del params["k"]
        if "a" in params and params["a"] != self.decay:
            warnings.warn(
                "Parameter `a` is deprecated and will be removed"
                " in a future version. Use `decay` instead",
                FutureWarning,
            )
            self.decay = params["a"]
            reset_kernel = True
            del params["a"]
        if "knn" in params and params["knn"] != self.knn:
            self.knn = params["knn"]
            reset_kernel = True
            del params["knn"]
        if "knn_max" in params and params["knn_max"] != self.knn_max:
            self.knn_max = params["knn_max"]
            reset_kernel = True
            del params["knn_max"]
        if "decay" in params and params["decay"] != self.decay:
            self.decay = params["decay"]
            reset_kernel = True
            del params["decay"]
        if "n_pca" in params and params["n_pca"] != self.n_pca:
            self.n_pca = params["n_pca"]
            reset_kernel = True
            del params["n_pca"]
        if "knn_dist" in params and params["knn_dist"] != self.knn_dist:
            self.knn_dist = params["knn_dist"]
            reset_kernel = True
            del params["knn_dist"]

        # parameters that don't change the embedding
        if "solver" in params and params["solver"] != self.solver:
            self.solver = params["solver"]
            reset_imputation = True
            del params["solver"]
        if "n_jobs" in params:
            self.n_jobs = params["n_jobs"]
            self._set_graph_params(n_jobs=params["n_jobs"])
            del params["n_jobs"]
        if "random_state" in params:
            self.random_state = params["random_state"]
            self._set_graph_params(random_state=params["random_state"])
            del params["random_state"]
        if "verbose" in params:
            self.verbose = params["verbose"]
            tasklogger.set_level(self.verbose)
            self._set_graph_params(verbose=params["verbose"])
            del params["verbose"]

        if reset_kernel:
            # can't reset the graph kernel without making a new graph
            self.graph = None
            reset_imputation = True
        if reset_imputation:
            self.X_magic = None

        self._check_params()
        return self