def from_json(cls, path_to_folder, estimator=None, model=None, x_data=None, y_dataframe=None, sigma=None):
    """Load results from folder of results with json format

    The summary table is read from ``summary.csv`` if present, otherwise from
    ``summary.json``. The sequence-to-hash mapping is read from
    ``seqs/seq_to_hash.json`` or, for previous saving formats, from inside
    ``seqs.tar.gz``. The mapping is stored as both the bootstrap record and
    the convergence record of the returned object.

    Args:
        path_to_folder (str or Path): folder holding the saved results
        estimator, model, x_data, y_dataframe, sigma: passed unchanged to
            ``cls(...)`` when the result object is created

    Returns:
        the ``cls`` instance, with ``result_path`` set to the folder and
        ``large_dataset`` set to True
    """
    import json
    import tarfile

    result = cls(estimator=estimator, model=model, x_data=x_data, y_dataframe=y_dataframe, sigma=sigma)
    path_to_folder = Path(path_to_folder)
    info(f'loading data from {str(path_to_folder)}...')

    summary_csv = path_to_folder.joinpath('summary.csv')
    summary_json = path_to_folder.joinpath('summary.json')
    if summary_csv.exists():
        result.summary = pd.read_csv(summary_csv, index_col=0)
    elif summary_json.exists():
        result.summary = pd.read_json(summary_json)
    result.summary.index.name = 'seq'

    seqs_tar = path_to_folder.joinpath('seqs.tar.gz')
    if path_to_folder.joinpath('seqs').exists():
        info("'seqs' folder found")
        seq_to_hash = read_json(path_to_folder.joinpath('seqs', 'seq_to_hash.json'))
    elif seqs_tar.exists():
        # for previous saving formats
        info("'seqs.tar.gz' found")
        with tarfile.open(seqs_tar, mode='r:gz') as tf:
            try:
                seq_to_hash = json.load(tf.extractfile('seqs/seq_to_hash.json'))
            except (KeyError, AttributeError, ValueError):
                # KeyError: member missing from the archive;
                # AttributeError: extractfile returned None (member is not a regular file);
                # ValueError: member is not valid JSON.
                # In each case fall back to the alternative archive layout.
                seq_to_hash = json.load(tf.extractfile('results/seqs/seq_to_hash.json'))
    else:
        logging.warning(
            "'seqs' folder or 'seqs.tar.gz' not found - no individual sequence fitting results loaded"
        )
        seq_to_hash = None

    result._bs_record = seq_to_hash
    result._conv_record = seq_to_hash
    result.result_path = Path(path_to_folder)
    result.large_dataset = True
    return result
def func(target, norm_factor):
    """Scale each sample column of `target` by its entry in `norm_factor`.

    Columns of `target` that have no entry in `norm_factor` are skipped with
    a warning; the remaining columns are cast to float and multiplied by the
    factor stored under the column name.

    Returns:
        pd.DataFrame containing only the samples found in norm_factor,
        in their original column order
    """
    kept = []
    for sample in target.columns:
        if sample not in norm_factor.keys():
            logging.warning(
                f'Sample {sample} is not in total amount norm_factor, skip this sample'
            )
            continue
        kept.append(sample)

    subset = target[kept]
    return subset.apply(lambda col: col.astype('float') * norm_factor[col.name], axis=0)
def func(target, norm_factor):
    """Multiply the counts of every known sample in `target` by its factor.

    An error is logged when `target` is not a pd.DataFrame. Samples (columns)
    without an entry in `norm_factor` are reported with a warning and left
    out of the result; all other columns are converted to float and scaled
    by ``norm_factor[<column name>]``.
    """
    is_frame = isinstance(target, pd.DataFrame)
    if not is_frame:
        logging.error('name needs to be pd.DataFrame')

    known = []
    for sample in target.columns:
        if sample in norm_factor.keys():
            known.append(sample)
            continue
        logging.warning(
            f'Sample {sample} is not in spike-in norm_factor, skip this sample'
        )

    def _scale(column):
        factor = norm_factor[column.name]
        return column.astype('float') * factor

    selected = target[known]
    return selected.apply(_scale, axis=0)
def __init__(self, y_dataframe, x_data, model, x_label=None, y_label=None, seq_to_fit=None, sigma=None,
             bounds=None, init_guess=None, opt_method='trf', exclude_zero=False, metrics=None,
             rnd_seed=None, curve_fit_kwargs=None, replicates=None, bootstrap_num=0, bs_record_num=0,
             bs_method='pct_res', bs_stats=None, grouper=None, record_full=False, conv_reps=0,
             conv_init_range=None, conv_stats=None, note=None, large_dataset=False, verbose=1,
             result_path=None):
    """Set up a batch fitter over the rows of a table.

    Args:
        y_dataframe: table to fit; converted with ``table_object_to_dataframe``
            and stored as ``self.y_dataframe``
        x_data (pd.Series or list-like): x values; a Series is re-indexed by the
            table's columns, anything else must have one value per column
        model (callable): model stored on the fitter and passed to each fit
        seq_to_fit (list-like or int): explicit sequences to fit, or the number
            of leading rows of the table to fit; None leaves it unset (None)
        sigma: optional table whose shape must match the parsed ``y_dataframe``
        bounds: parameter bounds; None is replaced by ``(-np.inf, np.inf)``
        bootstrap_num (int): forced to 0 (with a warning) when ``x_data`` has
            fewer than two points
        grouper: only forwarded when ``bs_method == 'stratified'``
        note: free-form note stored on the fitter
        large_dataset (bool): stored on the fitter and on its results object
        result_path: if given, results are loaded with
            ``BatchFitResults.load_result`` instead of being created empty
        x_label, y_label, init_guess, opt_method, exclude_zero, metrics, rnd_seed,
        curve_fit_kwargs, replicates, bs_record_num, bs_method, bs_stats,
        record_full, conv_reps, conv_init_range, conv_stats, verbose:
            stored unchanged in ``self.fit_params`` for the single estimator

    Raises:
        TypeError / ValueError: through ``logging.error(..., error_type=...)``
            for an unknown ``seq_to_fit`` type, an ``x_data`` length mismatch
            or a ``sigma`` shape mismatch
    """
    from ..utility.func_tools import AttrScope
    super().__init__()

    logging.info('Creating the BatchFitter...')

    self.model = model
    self.note = note

    # parse y_dataframe
    from ..utility.file_tools import table_object_to_dataframe
    self.y_dataframe = table_object_to_dataframe(y_dataframe)

    # process seq_to_fit
    if seq_to_fit is not None:
        if isinstance(seq_to_fit, (list, np.ndarray, pd.Series)):
            self.seq_to_fit = list(seq_to_fit)
        elif isinstance(seq_to_fit, int):
            # use the parsed table: the raw argument may not be a DataFrame
            self.seq_to_fit = self.y_dataframe.index[:seq_to_fit].values
        else:
            logging.error(
                'Unknown seq_to_fit type, is it list-like or int?',
                error_type=TypeError)
    else:
        self.seq_to_fit = seq_to_fit

    # prep fitting params shared by all fittings
    if isinstance(x_data, pd.Series):
        # align x values with the column order of the parsed table
        self.x_data = x_data[self.y_dataframe.columns.values]
    elif len(x_data) != self.y_dataframe.shape[1]:
        logging.error(
            'x_data length and table column number does not match',
            error_type=ValueError)
    else:
        self.x_data = np.array(x_data)

    if sigma is not None:
        if np.shape(sigma) != np.shape(self.y_dataframe):
            logging.error(
                'Shape of sigma does not match the shape of y_dataframe',
                error_type=ValueError)
    self.sigma = sigma

    if bounds is None:
        bounds = (-np.inf, np.inf)

    if len(x_data) <= 1:
        logging.warning(
            "Number of data points less than 2, bootstrap will not be performed"
        )
        bootstrap_num = 0
    self.bootstrap = bootstrap_num > 0

    # contains arguments should pass to the single estimator
    self.fit_params = AttrScope(
        x_data=self.x_data,
        x_label=x_label,
        y_label=y_label,
        model=self.model,
        bounds=bounds,
        init_guess=init_guess,
        opt_method=opt_method,
        exclude_zero=exclude_zero,
        metrics=metrics,
        rnd_seed=rnd_seed,
        curve_fit_kwargs=curve_fit_kwargs,
        replicates=replicates,
        bootstrap_num=bootstrap_num,
        bs_record_num=bs_record_num,
        bs_method=bs_method,
        bs_stats=bs_stats,
        grouper=grouper if bs_method == 'stratified' else None,
        record_full=record_full,
        conv_reps=conv_reps,
        conv_init_range=conv_init_range,
        conv_stats=conv_stats,
        verbose=verbose,
    )

    if result_path is None:
        self.results = BatchFitResults(estimator=self)
    else:
        self.results = BatchFitResults.load_result(result_path)

    self.large_dataset = large_dataset
    self.results.large_dataset = large_dataset
    self.workers = None
    logging.info('BatchFitter created')
def fit(self, parallel_cores=1, point_estimate=True, replicates=False, bootstrap=False,
        convergence_test=False, stream_to=None, overwrite=False):
    """Run the estimation

    Builds one worker per sequence through ``self._worker_generator``, runs
    ``_work_fn`` on each (optionally in a process pool) and collects the
    per-worker summaries into ``self.results.summary`` (indexed by 'seq').

    Args:
        parallel_cores (int): number of parallel cores to use. Default 1
        point_estimate (bool): if perform point estimation, default True
        bootstrap (bool): if perform bootstrap uncertainty estimation, default False
        replicates (bool): if perform replicates for uncertainty estimation, default False
        convergence_test (bool): if perform convergence test, default False
        stream_to (str): Directly stream fitting results to disk if output path is given
            will create a folder with name of seq/hash with pickled dict of fitting results
        overwrite (bool): if overwrite existing results when stream to disk. Default False.

    Raises:
        ValueError: through ``logging.error`` when the fitter is in large-dataset
            mode and ``stream_to`` is not given
    """
    from functools import partial
    from yutility.log import Timer

    logging.info('Batch fitting starting...')

    with Timer():
        if self.large_dataset and stream_to is None:
            logging.error(
                'You are working with large dataset and stream_to needs to be specified',
                error_type=ValueError)
        if not self.large_dataset and stream_to is not None:
            self.large_dataset = True
            logging.warning(
                "You provided `stream_to` so the large_dataset method is used"
            )

        if self.large_dataset:
            self._hash()
            self.results.result_path = Path(stream_to)
            check_dir(self.results.result_path.joinpath('seqs'))
            dump_json(obj=self._seq_to_hash,
                      path=self.results.result_path.joinpath('seqs', 'seq_to_hash.json'))

        work_fn = partial(_work_fn,
                          point_estimate=point_estimate,
                          replicates=replicates,
                          bootstrap=bootstrap,
                          convergence_test=convergence_test)
        worker_generator = self._worker_generator(stream_to=stream_to, overwrite=overwrite)

        if parallel_cores > 1:
            import multiprocessing as mp
            # The context manager terminates the pool on exit, so worker
            # processes are not leaked; `map` blocks until all results are in.
            with mp.Pool(processes=int(parallel_cores)) as pool:
                logging.info(
                    'Use multiprocessing to fit in {} parallel threads...'.
                    format(parallel_cores))
                workers = pool.map(work_fn, worker_generator)
        else:
            # single thread
            logging.info('Fitting in a single thread...')
            workers = [work_fn(worker) for worker in worker_generator]

        self.results.summary = pd.DataFrame(
            {worker.name: worker.summary() for worker in workers}).transpose()
        self.results.summary.index.name = 'seq'

        # record result: in large-dataset mode only the seq-to-hash mapping is
        # kept in memory, otherwise the records of every worker are stored
        if self.bootstrap:
            if self.large_dataset:
                self.results._bs_record = self._seq_to_hash
            else:
                self.results._bs_record = {
                    worker.name: worker.results.uncertainty.records
                    for worker in workers
                }
        if convergence_test:
            if self.large_dataset:
                self.results._conv_record = self._seq_to_hash
            else:
                self.results._conv_record = {
                    worker.name: worker.results.convergence.records
                    for worker in workers
                }

        if self.large_dataset:
            self._hash_inv()
            self.results.to_json(output_dir=stream_to)

    logging.info('Fitting finished')
def test_logging_can_log():
    """Check that info/warning calls go through and that `error` with an
    `error_type` is expected to raise that exception type."""
    non_raising = (
        (logging.info, 'Some info'),
        (logging.warning, "Some warning"),
    )
    for emit, message in non_raising:
        emit(message)

    expect_value_error = raises(ValueError)
    with expect_value_error:
        logging.error("let's get some ValueError", error_type=ValueError)
def _fit(self, model=None, x_data=None, y_data=None, sigma=None, bounds="unspecified",
         metrics=None, init_guess=None, curve_fit_kwargs=None):
    """Fit `model` to one series with ``scipy.optimize.curve_fit``.

    Every argument left as None is resolved with ``update_none`` against the
    matching value on the instance (``self.model``, ``self.x_data``,
    ``self.y_data``) or on ``self.config``. ``bounds`` uses the sentinel
    string "unspecified" so that an explicit None can be passed.

    Args:
        model (callable): model function; its required parameters after the
            first one are the parameters to fit
        x_data, y_data: data passed to ``curve_fit``
        sigma: passed to ``curve_fit``; ignored when its length differs from x_data
        bounds: parameter bounds; None means ``(-np.inf, np.inf)``
        metrics (dict): name -> callable applied to the fitted parameters
        init_guess: initial parameter values; when missing or empty a random
            guess from [0, 1) is drawn for each parameter
        curve_fit_kwargs (dict): extra keyword arguments for ``curve_fit``

    Returns:
        dict with 'params' (pd.Series), 'pcov' (pd.DataFrame) and 'metrics'
        (pd.Series or None). If fitting or metric evaluation raises, a warning
        is logged and all values are NaN.
    """
    from scipy.optimize import curve_fit
    from ..utility.func_tools import update_none
    from ..utility.func_tools import get_func_params

    model = update_none(model, self.model)
    parameters = get_func_params(model, required_only=True)[1:]
    x_data = update_none(x_data, self.x_data)
    y_data = update_none(y_data, self.y_data)
    sigma = update_none(sigma, self.config.sigma)
    # sigma may legitimately be None; only compare lengths when it is given
    if sigma is not None and len(x_data) != len(sigma):
        sigma = None
        logging.debug('Sigma is ignored as it has different length as x_data')

    # isinstance guard: `==` on array-like bounds would compare elementwise
    if isinstance(bounds, str) and bounds == "unspecified":
        bounds = self.config.bounds
    if bounds is None:
        bounds = (-np.inf, np.inf)
    metrics = update_none(metrics, self.config.metrics)
    init_guess = update_none(init_guess, self.config.init_guess)
    curve_fit_kwargs = update_none(curve_fit_kwargs, self.config.curve_fit_kwargs)

    try:
        # np.size avoids the ambiguous truth value of a numpy array guess
        if init_guess is None or np.size(init_guess) == 0:
            # by default, use a random guess form (0, 1)
            init_guess = [np.random.random() for _ in parameters]
        if curve_fit_kwargs is None:
            curve_fit_kwargs = {}
        params, pcov = curve_fit(f=model, xdata=x_data, ydata=y_data,
                                 sigma=sigma, bounds=bounds, p0=init_guess, **curve_fit_kwargs)
        if metrics is not None:
            metrics_res = pd.Series({name: fn(params) for name, fn in metrics.items()})
        else:
            metrics_res = None
    except Exception as err:
        # Best-effort fitting: any failure yields NaN results instead of
        # aborting the batch. KeyboardInterrupt/SystemExit still propagate.
        if isinstance(err, RuntimeError):
            header = (
                f"RuntimeError on \n"
                f'\tx = {x_data}\n'
            )
        elif isinstance(err, ValueError):
            header = (
                f"ValueError on \n"
                f'\tx={x_data}\n'
            )
        else:
            header = (
                f"Other error observed on\n"
                f'\tx={x_data}\n'
            )
        logging.warning(
            header +
            f'\ty={y_data}\n'
            f'\tsigma={sigma}'
        )
        n_params = len(parameters)
        params = np.full(fill_value=np.nan, shape=n_params)
        pcov = np.full(fill_value=np.nan, shape=(n_params, n_params))
        if metrics is not None:
            metrics_res = pd.Series({name: np.nan for name in metrics})
        else:
            metrics_res = None

    return {
        'params': pd.Series(data=params, index=parameters),
        'pcov': pd.DataFrame(data=pcov, index=parameters, columns=parameters),
        'metrics': metrics_res
    }