def __init__(self, knn=10, decay=15, t='auto', n_pca=100,
             knn_dist='euclidean', n_jobs=1, random_state=None, verbose=1,
             k=None, a=None):
    if k is not None:
        knn = k
    if a is not None:
        decay = a
    self.knn = knn
    self.decay = decay
    self.t = t
    self.n_pca = n_pca
    self.knn_dist = knn_dist
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.graph = None
    self.X = None
    self.X_magic = None
    self._check_params()
    self.verbose = verbose
    tasklogger.set_level(verbose)
def __init__(
    self,
    knn=10,
    knn_max=None,
    decay=2,
    t="auto",
    n_pca=100,
    solver="exact",
    knn_dist="euclidean",
    n_jobs=1,
    random_state=None,
    verbose=1,
    k=None,
    a=None,
):
    if k is not None:
        knn = k
    if a is not None:
        decay = a
    self.knn = knn
    self.knn_max = knn_max
    self.decay = decay
    self.t = t
    self.n_pca = n_pca
    self.knn_dist = knn_dist
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.solver = solver
    self.graph = None
    self.X = None
    self.X_magic = None
    self._check_params()
    self.verbose = verbose
    tasklogger.set_level(verbose)
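A minimal usage sketch (not from the source): both constructors above remap the deprecated `k`/`a` aliases onto `knn`/`decay` before storing anything, so the two calls below configure the same operator. Assumes the MAGIC package is installed as `magic` (magic-impute); the toy data is illustrative.

import numpy as np
import magic

X = np.random.poisson(lam=1.0, size=(300, 60))              # toy count matrix
op_new = magic.MAGIC(knn=10, decay=2, n_pca=20, verbose=1)  # preferred parameter names
op_old = magic.MAGIC(k=10, a=2, n_pca=20, verbose=1)        # deprecated aliases, same effect
assert op_new.knn == op_old.knn and op_new.decay == op_old.decay
X_smooth = op_new.fit_transform(X)                          # smoothed expression matrix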
def set_params(self, **params):
    """Set the parameters of SCYN.

    Any parameters not given as named arguments will be left at their
    current value.

    Parameters
    ----------
    seq : string, optional, default: single-end
        The reads type: single-end or paired-end
    bin_len : int, optional, default: 500
        The bin length, default is 500K
    ref : string, optional, default: hg19
        The reference genome version: hg19 or hg38
    reg : string, optional, default: *.bam
        The regular expression to match all BAM files in your input
        directory. For example, "*.bam" will match all BAM files ending
        with '.bam'
    mapq : int, optional, default: 40
        The mapping quality cutoff when calculating the reads coverage
    K : int, optional, default: 10
        The predefined changepoint number for all chromosomes
    verbose : `int` or `boolean`, optional, default: 1
        If `True` or `> 0`, print log messages

    Returns
    -------
    self
    """
    # parameters
    if 'seq' in params and params['seq'] != self.seq:
        self.seq = params['seq']
        del params['seq']
    if 'bin_len' in params and params['bin_len'] != self.bin_len:
        self.bin_len = params['bin_len']
        del params['bin_len']
    if 'ref' in params and params['ref'] != self.ref:
        self.ref = params['ref']
        del params['ref']
    if 'reg' in params and params['reg'] != self.reg:
        self.reg = params['reg']
        del params['reg']
    if 'mapq' in params and params['mapq'] != self.mapq:
        self.mapq = params['mapq']
        del params['mapq']
    if 'verbose' in params:
        self.verbose = params['verbose']
        tasklogger.set_level(self.verbose)
        del params['verbose']
    self._check_params()
    return self
def __init__(self, seq='single-end', bin_len=500, ref='hg19', reg='*.bam',
             mapq=40, K=None, verbose=1):
    self.seq = seq
    self.bin_len = bin_len
    self.ref = ref
    self.reg = reg
    self.mapq = mapq
    self.verbose = verbose
    self._check_params()
    self.cnv = None
    self.meta_info = None
    self.segments = None
    self.bin_info = None
    self.K = K
    tasklogger.set_level(verbose)
def __init__(self, k=10, a=15, t='auto', n_pca=100, knn_dist='euclidean',
             n_jobs=1, random_state=None, verbose=1):
    self.k = k
    self.a = a
    self.t = t
    self.n_pca = n_pca
    self.knn_dist = knn_dist
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.graph = None
    self.X = None
    self.X_magic = None
    self._check_params()
    self.verbose = verbose
    tasklogger.set_level(verbose)
def __init__(
    self,
    n_filters,
    overlap=2,
    t=1,
    knn=5,
    decay=20,
    n_pca=100,
    n_eigenvectors=None,
    n_jobs=1,
    verbose=False,
    random_state=None,
    knn_X=None,
    knn_Y=None,
    knn_XY=None,
    decay_X=None,
    decay_Y=None,
    decay_XY=None,
    n_pca_X=None,
    n_pca_Y=None,
    n_pca_XY=0,
):
    self.n_filters = n_filters
    self.overlap = overlap
    self.t = t
    self.n_eigenvectors = n_eigenvectors
    self.n_jobs = joblib.effective_n_jobs(n_jobs=n_jobs)
    self.random_state = random_state
    self.verbose = verbose
    self.knn_X = utils.with_default(knn_X, knn)
    self.knn_Y = utils.with_default(knn_Y, knn)
    self.knn_XY = utils.with_default(knn_XY, knn)
    self.decay_X = utils.with_default(decay_X, decay)
    self.decay_Y = utils.with_default(decay_Y, decay)
    self.decay_XY = utils.with_default(decay_XY, decay)
    self.n_pca_X = utils.with_default(n_pca_X, n_pca) if n_pca_X != 0 else None
    self.n_pca_Y = utils.with_default(n_pca_Y, n_pca) if n_pca_Y != 0 else None
    self.n_pca_XY = utils.with_default(n_pca_XY, n_pca) if n_pca_XY != 0 else None
    tasklogger.set_level(self.verbose)
    super().__init__()
def __init__(self, n=20, c_drop=0.5, p_pca=0.4, alpha=0.01, normalize=True,
             iteration=False, verbose=1):
    self.ZERO_VALUE = np.log10(1.01)
    self.n = n
    self.c_drop = c_drop
    self.p_pca = p_pca
    self.alpha = alpha
    self.normalize = normalize
    self.iteration = iteration
    self._check_params()
    self.verbose = verbose
    if self.normalize:
        self.ZERO_VALUE = 0.01
    tasklogger.set_level(verbose)
def test_level():
    logger = tasklogger.set_level(2)
    assert logger.level == logging.DEBUG
    assert logger.logger.level == logging.DEBUG
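The test above relies on `set_level` returning the logger it configures and on level 2 mapping to `logging.DEBUG`. A small companion sketch of typical use; the behaviour of the other levels is an assumption based on common tasklogger usage, only level 2 is pinned by the test:

import tasklogger

logger = tasklogger.set_level(2)   # returns the configured logger, as the test asserts
tasklogger.log_debug("visible at level 2")
tasklogger.log_info("visible at level 1 and above")
tasklogger.set_level(1)            # back to the usual default verbosity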
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if not args.transform == 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if not args.transform == 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error("Cannot handle --cofactor with --transform {}".format(
                args.transform))
        else:
            args.cofactor = None

    return args


if __name__ == "__main__":
    args = parse_args()
    tasklogger.set_level(args.verbose)
    tasklogger.log_debug("Running MAGIC with arguments {}".format(
        args.__dict__))
    run_magic_from_file(**(args.__dict__))
def __init__(self, n_components=2, knn=5, decay=40, n_landmark=2000, t='auto',
             gamma=1, n_pca=100, knn_dist='euclidean', mds_dist='euclidean',
             mds='metric', n_jobs=1, random_state=None, verbose=1,
             potential_method=None, alpha_decay=None, njobs=None, k=None,
             a=None, **kwargs):
    if k is not None:
        knn = k
    if a is not None:
        decay = a
    self.n_components = n_components
    self.decay = decay
    self.knn = knn
    self.t = t
    self.n_landmark = n_landmark
    self.mds = mds
    self.n_pca = n_pca
    self.knn_dist = knn_dist
    self.mds_dist = mds_dist
    self.random_state = random_state
    self.kwargs = kwargs
    self.graph = None
    self._diff_potential = None
    self.embedding = None
    self.X = None
    self.optimal_t = None
    if (alpha_decay is True and decay is None) or \
            (alpha_decay is False and decay is not None):
        warnings.warn(
            "alpha_decay is deprecated. Use `decay=None`"
            " to disable alpha decay in future.", FutureWarning)
        if not alpha_decay:
            self.decay = None
    if njobs is not None:
        warnings.warn("njobs is deprecated. Please use n_jobs in future.",
                      FutureWarning)
        n_jobs = njobs
    self.n_jobs = n_jobs
    if potential_method is not None:
        if potential_method == 'log':
            gamma = 1
        elif potential_method == 'sqrt':
            gamma = 0
        else:
            raise ValueError(
                "potential_method {} not recognized. Please "
                "use gamma between -1 and 1".format(potential_method))
        warnings.warn(
            "potential_method is deprecated. "
            "Setting gamma to {} to achieve"
            " {} transformation.".format(gamma, potential_method),
            FutureWarning)
    elif gamma > 0.99 and gamma < 1:
        warnings.warn(
            "0.99 < gamma < 1 is numerically unstable. "
            "Setting gamma to 0.99", RuntimeWarning)
        gamma = 0.99
    self.gamma = gamma
    if verbose is True:
        verbose = 1
    elif verbose is False:
        verbose = 0
    self.verbose = verbose
    self._check_params()
    tasklogger.set_level(verbose)
def set_params(self, **params):
    """Set the parameters on this estimator.

    Any parameters not given as named arguments will be left at their
    current value.

    Parameters
    ----------
    n_components : int, optional, default: 2
        number of dimensions in which the data will be embedded
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 40
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_landmark : int, optional, default: 2000
        number of landmarks to use in fast PHATE
    t : int, optional, default: 'auto'
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    gamma : float, optional, default: 1
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph. Custom distance
        functions of form `f(x, y) = d` are also accepted.
        If 'precomputed', `data` should be an n_samples x n_samples
        distance or affinity matrix. Distance matrices are assumed
        to have zeros down the diagonal, while affinity matrices are
        assumed to have non-zero values down the diagonal. This is
        detected automatically using `data[0,0]`. You can override
        this detection with `knn_dist='precomputed_distance'` or
        `knn_dist='precomputed_affinity'`.
    mds_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds : string, optional, default: 'metric'
        choose from ['classic', 'metric', 'nonmetric'].
        Selects which MDS algorithm is used for dimensionality reduction
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize SMACOF (metric, nonmetric) MDS
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    k : Deprecated for `knn`
    a : Deprecated for `decay`

    Examples
    --------
    >>> import phate
    >>> import matplotlib.pyplot as plt
    >>> tree_data, tree_clusters = phate.tree.gen_dla(n_dim=50, n_branch=5,
    ...                                               branch_length=50)
    >>> tree_data.shape
    (250, 50)
    >>> phate_operator = phate.PHATE(k=5, a=20, t=150)
    >>> tree_phate = phate_operator.fit_transform(tree_data)
    >>> tree_phate.shape
    (250, 2)
    >>> phate_operator.set_params(n_components=10)
    PHATE(a=20, alpha_decay=None, k=5, knn_dist='euclidean', mds='metric',
       mds_dist='euclidean', n_components=10, n_jobs=1, n_landmark=2000,
       n_pca=100, njobs=None, potential_method='log', random_state=None,
       t=150, verbose=1)
    >>> tree_phate = phate_operator.transform()
    >>> tree_phate.shape
    (250, 10)
    >>> # plt.scatter(tree_phate[:,0], tree_phate[:,1], c=tree_clusters)
    >>> # plt.show()

    Returns
    -------
    self
    """
    reset_kernel = False
    reset_potential = False
    reset_embedding = False

    # mds parameters
    if 'n_components' in params and \
            params['n_components'] != self.n_components:
        self.n_components = params['n_components']
        reset_embedding = True
        del params['n_components']
    if 'mds' in params and params['mds'] != self.mds:
        self.mds = params['mds']
        reset_embedding = True
        del params['mds']
    if 'mds_dist' in params and params['mds_dist'] != self.mds_dist:
        self.mds_dist = params['mds_dist']
        reset_embedding = True
        del params['mds_dist']

    # diff potential parameters
    if 't' in params and params['t'] != self.t:
        self.t = params['t']
        reset_potential = True
        del params['t']
    if 'potential_method' in params:
        if params['potential_method'] == 'log':
            params['gamma'] = 1
        elif params['potential_method'] == 'sqrt':
            params['gamma'] = 0
        else:
            raise ValueError("potential_method {} not recognized. Please "
                             "use gamma between -1 and 1".format(
                                 params['potential_method']))
        warnings.warn(
            "potential_method is deprecated. Setting gamma to {} to "
            "achieve {} transformation.".format(
                params['gamma'], params['potential_method']),
            FutureWarning)
        del params['potential_method']
    if 'gamma' in params and \
            params['gamma'] != self.gamma:
        self.gamma = params['gamma']
        reset_potential = True
        del params['gamma']

    # kernel parameters
    if 'k' in params and params['k'] != self.knn:
        self.knn = params['k']
        reset_kernel = True
        del params['k']
    if 'a' in params and params['a'] != self.decay:
        self.decay = params['a']
        reset_kernel = True
        del params['a']
    if 'knn' in params and params['knn'] != self.knn:
        self.knn = params['knn']
        reset_kernel = True
        del params['knn']
    if 'decay' in params and params['decay'] != self.decay:
        self.decay = params['decay']
        reset_kernel = True
        del params['decay']
    if 'n_pca' in params:
        if self.X is not None and params['n_pca'] >= np.min(self.X.shape):
            params['n_pca'] = None
        if params['n_pca'] != self.n_pca:
            self.n_pca = params['n_pca']
            reset_kernel = True
        del params['n_pca']
    if 'knn_dist' in params and params['knn_dist'] != self.knn_dist:
        self.knn_dist = params['knn_dist']
        reset_kernel = True
        del params['knn_dist']
    if 'n_landmark' in params and params['n_landmark'] != self.n_landmark:
        if self.n_landmark is None or params['n_landmark'] is None:
            # need a different type of graph, reset entirely
            self._reset_graph()
        else:
            self._set_graph_params(n_landmark=params['n_landmark'])
        self.n_landmark = params['n_landmark']
        del params['n_landmark']

    # parameters that don't change the embedding
    if 'n_jobs' in params:
        self.n_jobs = params['n_jobs']
        self._set_graph_params(n_jobs=params['n_jobs'])
        del params['n_jobs']
    if 'random_state' in params:
        self.random_state = params['random_state']
        self._set_graph_params(random_state=params['random_state'])
        del params['random_state']
    if 'verbose' in params:
        self.verbose = params['verbose']
        tasklogger.set_level(self.verbose)
        self._set_graph_params(verbose=params['verbose'])
        del params['verbose']

    if reset_kernel:
        # can't reset the graph kernel without making a new graph
        self._reset_graph()
    if reset_potential:
        self._reset_potential()
    if reset_embedding:
        self._reset_embedding()

    self._set_graph_params(**params)
    self._check_params()
    return self
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing gene names, list gives an array of gene names,
          `False` means no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing gene names, list gives an array of gene names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing cell names, list gives an array of cell names,
          `False` means no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing cell names, list gives an array of cell names,
          or `False` means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns.
        If cell_axis=='row', data is of shape [n_cells, n_genes].
        If cell_axis=='column', data is of shape [n_genes, n_cells].
        Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length',
        'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization.
        If `None`, library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used.
        If `None`, genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 10
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC, either as integer indices or
        column names if input data is a pandas DataFrame. If "all_genes",
        the entire smoothed matrix is returned. If "pca_only", PCA on the
        smoothed data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data,
                                                 cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info("Filtering genes by min cells >= {}...".format(
            min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.")

    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent "
                "to expected")
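A hypothetical end-to-end call of the function above; the file names and parameter values are illustrative assumptions, not taken from the source:

run_magic_from_file(
    "my_counts.csv",              # hypothetical input file in csv format
    gene_names=True,
    cell_names=True,
    cell_axis="row",
    min_library_size=1000,
    transform="sqrt",
    knn=5,
    decay=15,
    verbose=1,
    output="my_counts.magic.csv",  # hypothetical output path
)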
def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
    # kwargs are ignored
    self.n_jobs = n_jobs
    self.verbose = verbose
    tasklogger.set_level(verbose)
    super().__init__(data, **kwargs)
def set_params(self, **params):
    """Set the parameters on this estimator.

    Any parameters not given as named arguments will be left at their
    current value.

    Parameters
    ----------
    k : int, optional, default: 10
        number of nearest neighbors on which to build kernel
    a : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    t : int, optional, default: 'auto'
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the R squared of the diffused data
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph. If 'precomputed',
        `data` should be an n_samples x n_samples distance or
        affinity matrix
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages

    Returns
    -------
    self
    """
    reset_kernel = False
    reset_imputation = False

    # diff potential parameters
    if 't' in params and params['t'] != self.t:
        self.t = params['t']
        reset_imputation = True
        del params['t']

    # kernel parameters
    if 'k' in params and params['k'] != self.k:
        self.k = params['k']
        reset_kernel = True
        del params['k']
    if 'a' in params and params['a'] != self.a:
        self.a = params['a']
        reset_kernel = True
        del params['a']
    if 'n_pca' in params and params['n_pca'] != self.n_pca:
        self.n_pca = params['n_pca']
        reset_kernel = True
        del params['n_pca']
    if 'knn_dist' in params and params['knn_dist'] != self.knn_dist:
        self.knn_dist = params['knn_dist']
        reset_kernel = True
        del params['knn_dist']

    # parameters that don't change the embedding
    if 'n_jobs' in params:
        self.n_jobs = params['n_jobs']
        self._set_graph_params(n_jobs=params['n_jobs'])
        del params['n_jobs']
    if 'random_state' in params:
        self.random_state = params['random_state']
        self._set_graph_params(random_state=params['random_state'])
        del params['random_state']
    if 'verbose' in params:
        self.verbose = params['verbose']
        tasklogger.set_level(self.verbose)
        self._set_graph_params(verbose=params['verbose'])
        del params['verbose']

    if reset_kernel:
        # can't reset the graph kernel without making a new graph
        self.graph = None
        reset_imputation = True
    if reset_imputation:
        self.X_magic = None

    self._check_params()
    return self
def parse_args():
    parser = argparse.ArgumentParser(
        description='Run MAGIC for imputation of '
        'high-dimensional data.',
        epilog='For help, visit magic.readthedocs.io or '
        'krishnaswamylab.org/get-help',
        add_help=True,
        allow_abbrev=True)

    io_group = parser.add_argument_group('Data IO')
    filename = io_group.add_mutually_exclusive_group(required=True)
    filename.add_argument('--filename', type=str, default=None,
                          help='Input data. Allowed types: csv, tsv, mtx, '
                          'hdf5/h5 (10X format), directory/zip (10X format)')
    filename.add_argument('--validate', action='store_true', default=False,
                          help='Run MAGIC on a test dataset to ensure '
                          'output is correct.')
    sparse = io_group.add_mutually_exclusive_group()
    sparse.add_argument('--sparse', action='store_true',
                        help='Use sparse data format', dest='sparse',
                        default=None)
    sparse.add_argument('--dense', action='store_false',
                        help='Use dense data format', dest='sparse',
                        default=None)
    gene_names = io_group.add_mutually_exclusive_group()
    gene_names.add_argument('--gene-names', action='store_true',
                            help='Use gene name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='gene_names', default=True)
    gene_names.add_argument('--no-gene-names', action='store_false',
                            help='Do not use gene names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='gene_names', default=True)
    gene_names.add_argument('--gene-name-file', type=str,
                            help='Use gene name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE', dest='gene_names', default=True)
    cell_names = io_group.add_mutually_exclusive_group()
    cell_names.add_argument('--cell-names', action='store_true',
                            help='Use cell name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='cell_names', default=True)
    cell_names.add_argument('--no-cell-names', action='store_false',
                            help='Do not use cell names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='cell_names', default=True)
    cell_names.add_argument('--cell-name-file', type=str,
                            help='Use cell name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE', dest='cell_names', default=True)
    io_group.add_argument('--cell-axis', type=str, choices=['row', 'column'],
                          default='row',
                          help='States whether cells are on rows or columns '
                          '(csv, tsv, mtx)')
    io_group.add_argument('--gene-labels', type=str, default='both',
                          choices=['symbol', 'id', 'both'],
                          help='Choice of gene labels for 10X data'
                          ' (dir, zip, hdf5)')
    io_group.add_argument('--genome', type=str, default=None,
                          help='Genome name for 10X HDF5 data (hdf5)')
    io_group.add_argument(
        '--metadata-channels', type=str, nargs='+',
        default=['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin',
                 'beadDist', 'bead1'],
        help='Names of channels to remove from fcs data (fcs)',
        metavar='CHANNEL')

    preprocess_group = parser.add_argument_group('Preprocessing')
    cell_filter = preprocess_group.add_mutually_exclusive_group()
    cell_filter.add_argument('--min-library-size', type=int, default=2000,
                             help='Filter cells with less than COUNTS counts',
                             dest='min_library_size', metavar='COUNTS')
    cell_filter.add_argument('--no-cell-filter', action='store_false',
                             default=2000, dest='min_library_size',
                             help='Do not filter cells')
    gene_filter = preprocess_group.add_mutually_exclusive_group()
    gene_filter.add_argument(
        '--min-cells-per-gene', type=int, default=10,
        help='Filter genes with less than CELLS non-zero cells',
        dest='min_cells_per_gene', metavar='CELLS')
    gene_filter.add_argument('--no-gene-filter', action='store_false',
                             default=2000, dest='min_cells_per_gene',
                             help='Do not filter genes')
    libnorm = preprocess_group.add_mutually_exclusive_group()
    libnorm.add_argument(
        '--normalize', action='store_true', default=True,
        dest='library_size_normalize',
        help='Normalize cells by total UMI count (library size)')
    libnorm.add_argument('--no-normalize', action='store_false', default=True,
                         dest='library_size_normalize',
                         help='Do not normalize cells')
    transform = preprocess_group.add_mutually_exclusive_group()
    transform.add_argument('--transform', type=str, default='sqrt',
                           choices=['sqrt', 'log', 'arcsinh'],
                           help='Sublinear data transformation function')
    transform.add_argument('--no-transform', action='store_false',
                           default='sqrt', dest='transform',
                           help='Do not transform data')
    preprocess_group.add_argument('--pseudocount', type=float, default=1,
                                  help='Pseudocount to add to genes prior '
                                  'to log transform',
                                  metavar='PCOUNT')
    preprocess_group.add_argument('--cofactor', type=float, default=5,
                                  help='Factor by which to divide genes '
                                  'prior to arcsinh transform')

    kernel_group = parser.add_argument_group('Kernel Computation')
    kernel_group.add_argument('-k', '--knn', type=int, default=10,
                              dest='knn',
                              help='Number of nearest neighbors on which to '
                              'build kernel')
    decay = kernel_group.add_mutually_exclusive_group()
    decay.add_argument('-a', '--decay', type=int, default=15, dest='decay',
                       help='Sets decay rate of kernel tails')
    decay.add_argument('--no-decay', action='store_false', default=15,
                       dest='decay', help='Do not use alpha decay')
    pca = kernel_group.add_mutually_exclusive_group()
    pca.add_argument('--pca', type=int, default=100, dest='n_pca',
                     help='Number of principal components to use for '
                     'neighborhoods')
    pca.add_argument('--no-pca', action='store_false', default=100,
                     dest='n_pca', help='Do not use PCA')
    kernel_group.add_argument('--knn-dist', type=str, default='euclidean',
                              help='Distance metric to use for calculating '
                              'neighborhoods. Recommended values are '
                              '"euclidean" and "cosine"',
                              metavar='DISTANCE')
    # note: the original text gave --threads the dest 'random_state', which
    # collides with --seed; the threads count feeds n_jobs downstream
    kernel_group.add_argument('-t', '--threads', type=int, default=1,
                              help='Use THREADS threads. '
                              'If -1 all CPUs are used',
                              metavar='THREADS', dest='n_jobs')
    kernel_group.add_argument('--seed', type=int, default=None,
                              help='Integer random seed', metavar='SEED',
                              dest='random_state')
    verbose = kernel_group.add_mutually_exclusive_group()
    verbose.add_argument('-v', '--verbose', action='store_true', default=True,
                         help='Print verbose output')
    verbose.add_argument('-q', '--quiet', action='store_false', default=True,
                         help='Do not print verbose output', dest='verbose')
    verbose.add_argument('-vv', '--debug', action='store_true', default=False,
                         help='Print debugging output', dest='debug')

    magic_group = parser.add_argument_group('MAGIC')
    magic_group.add_argument('--t-magic', type=str, default='auto',
                             help='Level of diffusion for MAGIC', metavar='T')
    genes = magic_group.add_mutually_exclusive_group()
    genes.add_argument('--pca-only', action='store_true', default=False,
                       help='Return PCA on the smoothed matrix')
    genes.add_argument('--all-genes', action='store_true', default=False,
                       help='Return the entire smoothed matrix')
    genes.add_argument('--gene-list', type=str, nargs='+', default=None,
                       help='List of genes to return from MAGIC, '
                       'either as integer indices or column names.',
                       metavar='GENE', dest='genes')
    magic_group.add_argument('--output', type=str, default='magic.csv',
                             help='Output CSV file to save smoothed '
                             'data matrix',
                             metavar='FILE')

    args = parser.parse_args()

    if args.validate:
        tasklogger.set_level(2)
        tasklogger.log_info("Running MAGIC validation.")
        args.filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_small.csv"
        args.sparse = False
        args.gene_names = True
        args.cell_names = True
        args.cell_axis = "row"
        args.gene_labels = "both"
        args.genome = None
        args.metadata_channels = None
        args.min_library_size = 1
        args.min_cells_per_gene = 1
        args.library_size_normalize = True
        args.transform = 'sqrt'
        args.pseudocount = None
        args.cofactor = None
        args.knn = 3
        args.decay = 20
        args.n_pca = None
        args.knn_dist = "euclidean"
        args.n_jobs = 1
        args.random_state = 42
        args.verbose = True
        args.debug = True
        args.t_magic = "auto"
        args.all_genes = True
        args.output = "magic-validate.csv"

    # fix magic "genes" argument
    if args.all_genes:
        args.genes = "all_genes"
    elif args.pca_only:
        args.genes = "pca_only"
    else:
        try:
            args.genes = [int(g) for g in args.genes]
        except TypeError:
            # string gene names
            pass
    del args.all_genes
    del args.pca_only

    # fix t argument
    if args.t_magic != 'auto':
        try:
            args.t_magic = int(args.t_magic)
        except TypeError:
            parser.error("argument --t-magic: invalid int value: '{}'".format(
                args.t_magic))

    # fix debug argument
    if args.debug:
        args.verbose = 2
    del args.debug

    # store None values where appropriate
    if args.decay is False:
        args.decay = None
    if args.n_pca is False:
        args.n_pca = None
    if args.min_library_size is False:
        args.min_library_size = None
    if args.min_cells_per_gene is False:
        args.min_cells_per_gene = None

    # check for inappropriately set defaults
    try:
        filetype = check_filetype(args.filename)
    except RuntimeError as e:
        parser.error(str(e))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'fcs']:
        if '--gene-names' not in sys.argv:
            args.gene_names = None
        else:
            parser.error(
                "Cannot handle --gene-names with {} file".format(filetype))
        if '--cell-names' not in sys.argv:
            args.cell_names = None
        else:
            parser.error(
                "Cannot handle --cell-names with {} file".format(filetype))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'mtx']:
        if '--cell-axis' not in sys.argv:
            args.cell_axis = None
        else:
            parser.error(
                "Cannot handle --cell-axis with {} file".format(filetype))
    if filetype not in ['dir', 'zip', 'hdf5', 'h5']:
        if '--gene-labels' not in sys.argv:
            args.gene_labels = None
        else:
            parser.error(
                "Cannot handle --gene-labels with {} file".format(filetype))
    if filetype not in ['hdf5', 'h5']:
        if '--genome' not in sys.argv:
            args.genome = None
        else:
            parser.error(
                "Cannot handle --genome with {} file".format(filetype))
    if filetype not in ['fcs']:
        if '--metadata-channels' not in sys.argv:
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if not args.transform == 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if not args.transform == 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error("Cannot handle --cofactor with --transform {}".format(
                args.transform))
        else:
            args.cofactor = None

    return args
def set_params(self, **params):
    """Set the parameters of I-Impute.

    Any parameters not given as named arguments will be left at their
    current value.

    Parameters
    ----------
    n : int, optional, default: 20
        number of nearest neighbors on which to build the kernel when
        calculating the affinity matrix.
    c_drop : float, optional, default: 0.5
        Dropout event cutoff. For an entry whose dropout probability is
        less than c_drop, we consider it a real observation and its
        original value remains. Otherwise, we impute it with the aid of
        information from similar cells.
    p_pca : float, optional, default: 0.4
        Percentage of variance explained by the selected components of
        PCA. It determines the number of PCs used to calculate the
        distance between cells.
    alpha : float, optional, default: 0.01
        L1 penalty for Lasso regression.
    normalize : boolean, optional, default: True
        By default, I-Impute takes in an unnormalized matrix and performs
        library size normalization during the denoising step. However, if
        your data is already normalized or normalization is not desired,
        you can set normalize=False.
    iteration : boolean, optional, default: False
        The imputation process runs only once when False (equivalent to
        C-Impute described in our paper). When True, the imputation
        process iterates n times to achieve a self-consistent
        imputation matrix.
    verbose : `int` or `boolean`, optional, default: 1
        If `True` or `> 0`, print status messages

    Returns
    -------
    self
    """
    # kernel parameters
    if 'n' in params and params['n'] != self.n:
        self.n = params['n']
        del params['n']
    if 'c_drop' in params and params['c_drop'] != self.c_drop:
        self.c_drop = params['c_drop']
        del params['c_drop']
    if 'p_pca' in params and params['p_pca'] != self.p_pca:
        self.p_pca = params['p_pca']
        del params['p_pca']
    if 'alpha' in params and params['alpha'] != self.alpha:
        self.alpha = params['alpha']
        del params['alpha']
    if 'normalize' in params and params['normalize'] != self.normalize:
        self.normalize = params['normalize']
        del params['normalize']
    if 'iteration' in params and params['iteration'] != self.iteration:
        self.iteration = params['iteration']
        del params['iteration']
    if 'verbose' in params:
        self.verbose = params['verbose']
        tasklogger.set_level(self.verbose)
        del params['verbose']
    self._check_params()
    return self
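A hedged usage sketch; the estimator class name `IImpute` is a placeholder assumption, since only the method bodies appear in these snippets:

op = IImpute(n=20, c_drop=0.5, verbose=1)                    # hypothetical constructor
op = op.set_params(c_drop=0.3, iteration=True, verbose=0)    # returns self for chaining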
def Graph(data,
          n_pca=None,
          sample_idx=None,
          adaptive_k='sqrt',
          precomputed=None,
          knn=5,
          decay=10,
          distance='euclidean',
          thresh=1e-4,
          kernel_symm='+',
          gamma=None,
          n_landmark=None,
          n_svd=100,
          beta=1,
          n_jobs=-1,
          verbose=False,
          random_state=None,
          graphtype='auto',
          use_pygsp=False,
          initialize=True,
          **kwargs):
    """Create a graph built on data.

    Automatically selects the appropriate DataGraph subclass based on
    chosen parameters.
    Selection criteria:
    - if `graphtype` is given, this will be respected
    - otherwise:
    -- if `sample_idx` is given, an MNNGraph will be created
    -- if `precomputed` is not given, and either `decay` is `None` or
       `thresh` is given, a kNNGraph will be created
    - otherwise, a TraditionalGraph will be created.

    Incompatibilities:
    - MNNGraph and kNNGraph cannot be precomputed
    - kNNGraph and TraditionalGraph do not accept sample indices

    Parameters
    ----------
    data : array-like, shape=[n_samples,n_features]
        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
        TODO: accept pandas dataframes
    n_pca : `int` or `None`, optional (default: `None`)
        number of PC dimensions to retain for graph building.
        If `None`, uses the original data.
        Note: if data is sparse, uses SVD instead of PCA
        TODO: should we subtract and store the mean?
    knn : `int`, optional (default: 5)
        Number of nearest neighbors (including self) to use to build
        the graph
    decay : `int` or `None`, optional (default: 10)
        Rate of alpha decay to use. If `None`, alpha decay is not used.
    distance : `str`, optional (default: `'euclidean'`)
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
        TODO: actually sklearn.neighbors has even more choices
    thresh : `float`, optional (default: `1e-4`)
        Threshold above which to calculate alpha decay kernel.
        All affinities below `thresh` will be set to zero in order to
        save on time and memory constraints.
    kernel_symm : string, optional (default: '+')
        Defines method of MNN symmetrization.
        '+' : additive
        '*' : multiplicative
        'gamma' : min-max
        'none' : no symmetrization
    gamma : float (default: None)
        Min-max symmetrization constant or matrix. Only used if
        kernel_symm='gamma'.
        K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)`
    precomputed : {'distance', 'affinity', 'adjacency', `None`},
        optional (default: `None`)
        If the graph is precomputed, this variable denotes which graph
        matrix is provided as `data`.
        Only one of `precomputed` and `n_pca` can be set.
    beta : float, optional (default: 1)
        Multiply within-batch connections by (1 - beta)
    sample_idx : array-like
        Batch index for MNN kernel
    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
        Weights MNN kernel adaptively using the number of cells in
        each sample according to the selected method.
    n_landmark : `int`, optional (default: 2000)
        number of landmarks to use
    n_svd : `int`, optional (default: 100)
        number of SVD components to use for spectral clustering
    random_state : `int` or `None`, optional (default: `None`)
        Random state for random PCA
    verbose : `bool`, optional (default: `True`)
        Verbosity.
        TODO: should this be an integer instead to allow multiple
        levels of verbosity?
    n_jobs : `int`, optional (default : 1)
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto')
        Manually selects graph type. Only recommended for expert users
    use_pygsp : `bool` (Default: `False`)
        If true, inherits from `pygsp.graphs.Graph`.
    initialize : `bool` (Default: `True`)
        If True, initialize the kernel matrix on instantiation
    **kwargs : extra arguments for `pygsp.graphs.Graph`

    Returns
    -------
    G : `DataGraph`

    Raises
    ------
    ValueError : if selected parameters are incompatible.
    """
    tasklogger.set_level(verbose)
    if sample_idx is not None and len(np.unique(sample_idx)) == 1:
        warnings.warn("Only one unique sample. "
                      "Not using MNNGraph")
        sample_idx = None
        if graphtype == 'mnn':
            graphtype = 'auto'
    if graphtype == 'auto':
        # automatic graph selection
        if sample_idx is not None:
            # only mnn does batch correction
            graphtype = "mnn"
        elif precomputed is None and (decay is None or thresh > 0):
            # precomputed requires exact graph
            # no decay or thresholded decay require knngraph
            graphtype = "knn"
        else:
            graphtype = "exact"

    # set base graph type
    if graphtype == "knn":
        basegraph = graphs.kNNGraph
        if precomputed is not None:
            raise ValueError("kNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` or "
                             "`precomputed=None`")
        if sample_idx is not None:
            raise ValueError("kNNGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")
    elif graphtype == "mnn":
        basegraph = graphs.MNNGraph
        if precomputed is not None:
            raise ValueError("MNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` and "
                             "`sample_idx=None` or `precomputed=None`")
    elif graphtype == "exact":
        basegraph = graphs.TraditionalGraph
        if sample_idx is not None:
            raise ValueError("TraditionalGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")
    else:
        raise ValueError("graphtype '{}' not recognized. Choose from "
                         "['knn', 'mnn', 'exact', 'auto']".format(graphtype))

    # add landmarks if necessary
    parent_classes = [basegraph]
    msg = "Building {} graph".format(graphtype)
    if n_landmark is not None:
        parent_classes.append(graphs.LandmarkGraph)
        msg = msg + " with landmarks"
    if use_pygsp:
        parent_classes.append(base.PyGSPGraph)
        if len(parent_classes) > 2:
            msg = msg + " with PyGSP inheritance"
        else:
            msg = msg + " and PyGSP inheritance"
    tasklogger.log_debug(msg)
    class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
    try:
        Graph = eval("graphs." + "".join(class_names) + "Graph")
    except NameError:
        raise RuntimeError("unknown graph classes {}".format(parent_classes))

    params = kwargs
    for parent_class in parent_classes:
        for param in parent_class._get_param_names():
            try:
                params[param] = eval(param)
            except NameError:
                # keyword argument not specified above - no problem
                pass

    # build graph and return
    tasklogger.log_debug("Initializing {} with arguments {}".format(
        parent_classes,
        ", ".join(["{}='{}'".format(key, value)
                   for key, value in params.items()
                   if key != "data"])))
    return Graph(**params)
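A short sketch of calling the factory above on array data (the data values are illustrative); with the defaults shown, sample_idx=None, decay=10 and thresh > 0 resolve graphtype to 'knn':

import numpy as np

data = np.random.normal(size=(500, 40))
G = Graph(data, n_pca=20, knn=5, decay=10, thresh=1e-4, verbose=True)
# G is a graphs.kNNGraph; LandmarkGraph / PyGSPGraph mixins are added only on request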
def set_params(self, **params):
    """Set the parameters on this estimator.

    Any parameters not given as named arguments will be left at their
    current value.

    Parameters
    ----------
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 1
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    t : int, optional, default: 3
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the R squared of the diffused data
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    k : Deprecated for `knn`
    a : Deprecated for `decay`

    Returns
    -------
    self
    """
    reset_kernel = False
    reset_imputation = False

    # diff potential parameters
    if "t" in params and params["t"] != self.t:
        self.t = params["t"]
        reset_imputation = True
        del params["t"]

    # kernel parameters
    if "k" in params and params["k"] != self.knn:
        warnings.warn(
            "Parameter `k` is deprecated and will be removed"
            " in a future version. Use `knn` instead",
            FutureWarning,
        )
        self.knn = params["k"]
        reset_kernel = True
        del params["k"]
    if "a" in params and params["a"] != self.decay:
        warnings.warn(
            "Parameter `a` is deprecated and will be removed"
            " in a future version. Use `decay` instead",
            FutureWarning,
        )
        self.decay = params["a"]
        reset_kernel = True
        del params["a"]
    if "knn" in params and params["knn"] != self.knn:
        self.knn = params["knn"]
        reset_kernel = True
        del params["knn"]
    if "knn_max" in params and params["knn_max"] != self.knn_max:
        self.knn_max = params["knn_max"]
        reset_kernel = True
        del params["knn_max"]
    if "decay" in params and params["decay"] != self.decay:
        self.decay = params["decay"]
        reset_kernel = True
        del params["decay"]
    if "n_pca" in params and params["n_pca"] != self.n_pca:
        self.n_pca = params["n_pca"]
        reset_kernel = True
        del params["n_pca"]
    if "knn_dist" in params and params["knn_dist"] != self.knn_dist:
        self.knn_dist = params["knn_dist"]
        reset_kernel = True
        del params["knn_dist"]

    # parameters that don't change the embedding
    if "solver" in params and params["solver"] != self.solver:
        self.solver = params["solver"]
        reset_imputation = True
        del params["solver"]
    if "n_jobs" in params:
        self.n_jobs = params["n_jobs"]
        self._set_graph_params(n_jobs=params["n_jobs"])
        del params["n_jobs"]
    if "random_state" in params:
        self.random_state = params["random_state"]
        self._set_graph_params(random_state=params["random_state"])
        del params["random_state"]
    if "verbose" in params:
        self.verbose = params["verbose"]
        tasklogger.set_level(self.verbose)
        self._set_graph_params(verbose=params["verbose"])
        del params["verbose"]

    if reset_kernel:
        # can't reset the graph kernel without making a new graph
        self.graph = None
        reset_imputation = True
    if reset_imputation:
        self.X_magic = None

    self._check_params()
    return self