def _hash(self):
    """De-duplicate rows before fitting"""

    def hash_series(row):
        """Return a positive hash value"""
        return ctypes.c_size_t(hash(tuple(row))).value

    self._y_dataframe_dup = self.y_dataframe.copy()
    if self.seq_to_fit is not None:
        # filter the seq to fit
        self._seq_to_fit_dup = self.seq_to_fit.copy()
        self.y_dataframe = self.y_dataframe.loc[self.seq_to_fit]
    # find seq to hash mapping
    self._seq_to_hash = self.y_dataframe.apply(hash_series, axis=1).to_dict()
    # only keep the first instance of each hash
    self.y_dataframe = self.y_dataframe[~self.y_dataframe.duplicated(keep='first')]
    if isinstance(self.sigma, pd.DataFrame):
        # sigma is only accepted as a pd.DataFrame
        self._sigma_dup = self.sigma.copy()
        # filter sigma table for only the first instance of each hash
        self.sigma = self.sigma.loc[self.y_dataframe.index]
        # convert seq --> hash
        self.sigma.rename(index=self._seq_to_hash, inplace=True)
    # convert seq --> hash
    self.y_dataframe.rename(index=self._seq_to_hash, inplace=True)
    if self.seq_to_fit is not None:
        self.seq_to_fit = [self._seq_to_hash[seq] for seq in self.seq_to_fit]
    logging.info('Shrink rows in table by removing duplicates: '
                 f'{self._y_dataframe_dup.shape[0]} --> {self.y_dataframe.shape[0]}')
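
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# Minimal demo of the de-duplication idea above: hash each row to a stable
# positive integer, then keep only the first occurrence of duplicated rows.
def _demo_row_hash_dedup():
    import ctypes

    import pandas as pd

    df = pd.DataFrame([[1, 2], [1, 2], [3, 4]], index=['seq_a', 'seq_b', 'seq_c'])
    seq_to_hash = {ix: ctypes.c_size_t(hash(tuple(row))).value
                   for ix, row in df.iterrows()}
    deduped = df[~df.duplicated(keep='first')]  # 'seq_b' collapses into 'seq_a'
    return seq_to_hash, deduped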
def fit(self, **kwargs):
    """Batch fit simulated results"""
    if self.seed is not None:
        np.random.seed(self.seed)
    if self.y_dataframe is None:
        self.simulate_samples()

    from ..estimate.least_squares_batch import BatchFitter
    fitter = BatchFitter(y_dataframe=self.y_dataframe, x_data=self.x_data,
                         model=self.model, large_dataset=True,
                         **self.batchfitter_kwargs)
    fitter.fit(convergence_test=self.batchfitter_kwargs['conv_reps'] > 0,
               bootstrap=self.batchfitter_kwargs['bootstrap_num'] > 0,
               point_estimate=True,
               stream_to=self.save_to, **kwargs)
    self.results = fitter.results
    self.results.summary.to_csv(self.save_to.joinpath('results.csv'))
    logging.info(f"Result saved to {self.save_to.joinpath('results.csv')}")
    # pack the per-sequence records and remove the unpacked folder
    os.system(f"cd {self.save_to} && tar -czf seqs.tar.gz seqs && rm -r seqs")
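
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The shell pipeline above breaks on paths containing spaces and hides
# failures. A minimal stdlib alternative, assuming `save_to` is a directory
# containing a `seqs/` folder to pack; `_pack_seqs_dir` is a hypothetical name.
def _pack_seqs_dir(save_to):
    import shutil
    import tarfile
    from pathlib import Path

    save_to = Path(save_to)
    seqs_dir = save_to / 'seqs'
    # archive the folder as seqs.tar.gz next to it, then remove the source
    with tarfile.open(save_to / 'seqs.tar.gz', 'w:gz') as tar:
        tar.add(str(seqs_dir), arcname='seqs')
    shutil.rmtree(seqs_dir)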
def __init__(self, dataset_dir, result_dir):
    """Survey estimation results

    - load fitting results from `result_dir/fit_summary.csv`
    - load truth and input count info from `dataset_dir/truth.csv` and `input_counts`

    Optional to include:
        - input_counts: counts of sequences in the input pool
        - mean_counts: mean counts across all samples (input and reacted)

    Return:
        results: table of estimated k, A, kA
        truth: table of true k, A, p0, kA, and input_counts
        seq_list: list of indices of sequences that could be estimated
    """
    allowed_col = [
        'k', 'A', 'kA', 'A_mean', 'k_mean', 'kA_mean', 'A_std', 'k_std', 'kA_std',
        'A_2.5%', 'k_2.5%', 'kA_2.5%', 'A_50%', 'k_50%', 'kA_50%',
        'A_97.5%', 'k_97.5%', 'kA_97.5%',
        'bs_A_mean', 'bs_k_mean', 'bs_kA_mean', 'bs_A_std', 'bs_k_std', 'bs_kA_std',
        'bs_A_2.5%', 'bs_k_2.5%', 'bs_kA_2.5%', 'bs_A_50%', 'bs_k_50%', 'bs_kA_50%',
        'bs_A_97.5%', 'bs_k_97.5%', 'bs_kA_97.5%',
        'rep_A_mean', 'rep_k_mean', 'rep_kA_mean', 'rep_A_std', 'rep_k_std', 'rep_kA_std'
    ]

    from pathlib import Path
    from ..utility.file_tools import read_pickle

    self.results = pd.read_csv(f'{result_dir}/fit_summary.csv', index_col=0)
    self.cols = self.results.columns[self.results.columns.isin(allowed_col)].values
    # sequences with no NaN in any of the recognized result columns
    self.seq_list = self.results[~self.results[self.cols].isna().any(axis=1)].index.values
    self._bs_prefix = 'bs_' if 'bs_kA_2.5%' in self.results.columns else ''

    if Path(dataset_dir).is_file():
        dataset = read_pickle(dataset_dir)
    else:
        dataset = read_pickle(dataset_dir + '/seq_table.pkl')
    self.truth = dataset.truth
    self.truth['input_counts'] = dataset.table.original.reindex(self.truth.index).s0
    self.truth['mean_counts'] = dataset.table.original.reindex(self.truth.index).mean(axis=1)

    logging.info(f'{self.truth.shape[0]} sequences simulated, '
                 f'{self.results.shape[0]} fitted, '
                 f'{len(self.seq_list)} have valid results')
def generate_params(param_input):
    """Parse a single distribution input and reformat as generated results"""
    from types import GeneratorType

    if isinstance(param_input, (list, np.ndarray, pd.Series)):
        if len(param_input) == uniq_seq_num:
            return param_input
        else:
            logging.info('Size of input param list and expected uniq_seq_num does not match, '
                         'resample to given uniq_seq_num with replacement')
            return np.random.choice(param_input, replace=True, size=uniq_seq_num)
    elif isinstance(param_input, GeneratorType):
        # assume it only generates one realization at a time
        return [next(param_input) for _ in range(uniq_seq_num)]
    elif callable(param_input):
        try:
            # if there is a uniq_seq_num parameter to pass
            param_output = param_input(size=uniq_seq_num)
            if isinstance(param_output, (list, np.ndarray, pd.Series)):
                return param_output
            elif isinstance(param_output, GeneratorType):
                return next(param_output)
            else:
                logging.error("Unknown input to draw a distribution value",
                              error_type=TypeError)
        except TypeError:
            # if `size` cannot be passed, assume it generates single samples
            param_output = param_input()
            if isinstance(param_output, GeneratorType):
                return [next(param_output) for _ in range(uniq_seq_num)]
            elif isinstance(param_output, (float, int)):
                return [param_input() for _ in range(uniq_seq_num)]
            else:
                logging.error("Unknown callable return type for distribution",
                              error_type=TypeError)
    else:
        logging.error("Unknown input to draw a distribution value", error_type=TypeError)
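
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The three kinds of `param_input` handled above, on a toy scale: a list-like
# (resampled with replacement when sizes mismatch), a generator (drawn value
# by value), and a callable (preferring a `size=` keyword when it is accepted).
def _demo_param_input_kinds(uniq_seq_num=5):
    import itertools

    import numpy as np

    # 1) list-like: resampled with replacement when length != uniq_seq_num
    from_list = np.random.choice([0.1, 0.5, 1.0], replace=True, size=uniq_seq_num)
    # 2) generator: one value drawn per sequence
    gen = (np.random.lognormal() for _ in itertools.count())
    from_gen = [next(gen) for _ in range(uniq_seq_num)]

    # 3) callable: called once with size=... when the signature allows it
    def draw(size):
        return np.random.uniform(0, 1, size=size)

    from_callable = draw(size=uniq_seq_num)
    return from_list, from_gen, from_callable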
def get_FitResult(self, seq=None):
    """Get FitResults from a JSON file"""
    from .least_squares import FitResults

    if self._bs_record is None:
        logging.error('No bootstrap or convergence test record found',
                      error_type=TypeError)
    else:
        seq_to_hash = self._bs_record
    if seq is None:
        return seq_to_hash

    if isinstance(seq_to_hash[seq], (list, tuple)):
        # new hierarchical format
        tg_ix, hash_ix = seq_to_hash[seq]
        result = FitResults.from_json(json_path=f'{hash_ix}.json',
                                      tarfile=self.result_path.joinpath('seqs', f'{tg_ix}.tar.gz'))
    else:
        # old format
        if self.result_path.joinpath('seqs', f"{seq_to_hash[seq]}.json").exists():
            logging.info(f"load result from {seq_to_hash[seq]}.json")
            result = FitResults.from_json(
                self.result_path.joinpath('seqs', f'{seq_to_hash[seq]}.json'))
        elif self.result_path.joinpath('seqs.tar.gz').exists():
            try:
                result = FitResults.from_json(
                    json_path=f'seqs/{seq_to_hash[seq]}.json',
                    tarfile=self.result_path.joinpath('seqs.tar.gz'))
            except Exception:
                # fall back to the legacy member layout inside the archive
                result = FitResults.from_json(
                    json_path=f'results/seqs/{seq_to_hash[seq]}.json',
                    tarfile=self.result_path.joinpath('seqs.tar.gz'))

    if result.data.x_data is None and self.data.y_dataframe is not None:
        # add from the data attribute
        result.data.x_data = self.data.x_data
        result.data.y_data = self.data.y_dataframe.loc[seq]

    return result
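
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# How a single JSON record can be read out of a seqs archive, assuming the
# layout written by the batch fitter (`<hash>.json` members inside a .tar.gz).
# tarfile raises KeyError when the member is missing, which is what the
# fallback branch above relies on.
def _read_json_from_targz(tar_path, member):
    import json
    import tarfile

    with tarfile.open(tar_path, 'r:gz') as tar:
        fileobj = tar.extractfile(member)  # KeyError if `member` is absent
        return json.load(fileobj)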
def __init__(self, data, data_unit=None, sample_list=None, seq_list=None, data_note=None,
             use_sparse=True, seq_metadata=None, sample_metadata=None, grouper=None,
             x_values=None, x_unit=None, note=None, dataset_metadata=None):

    # initialize metadata
    from datetime import datetime
    self.metadata = AttrScope(created_time=datetime.now(), note=note)
    # add metadata
    if dataset_metadata is not None:
        self.metadata.add(dataset_metadata)
    if sample_metadata is not None:
        self.metadata.samples = AttrScope(sample_metadata)
    if seq_metadata is not None:
        self.metadata.seqs = AttrScope(seq_metadata)
    logging.info('SeqData created')

    # add original seq_table
    self.table = AttrScope(original=SeqTable(data, columns=sample_list, index=seq_list,
                                             unit=data_unit, note=data_note,
                                             use_sparse=use_sparse))

    # add x values
    if x_values is None:
        self.x_values = None
        self.x_unit = None
    elif isinstance(x_values, (dict, pd.Series)):
        self.x_values = pd.Series(x_values)
        self.x_unit = x_unit
    elif isinstance(x_values, (list, np.ndarray)):
        self.x_values = pd.Series(x_values, index=self.table.original.samples)
        self.x_unit = x_unit
    else:
        logging.error('Unknown type for x_values', error_type=TypeError)

    if self.x_values is not None:
        # align x_values to samples and drop samples without an x value
        self.x_values = self.x_values[self.table.original.samples]
        self.x_values = self.x_values[~self.x_values.isna()]

    if grouper is not None:
        from .grouper import GrouperCollection
        self.grouper = GrouperCollection()
        self.grouper.add(**grouper)

    self.update_analysis()
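
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The x_values alignment above in isolation: order the series by the sample
# columns, then drop samples that have no x value.
def _demo_align_x_values():
    import pandas as pd

    samples = ['s0', 's1', 's2']
    x_values = pd.Series({'s0': 50.0, 's2': 250.0})  # 's1' has no concentration
    x_values = x_values.reindex(samples)   # align to sample order
    return x_values[~x_values.isna()]      # keep only samples with an x value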
def _update_norm_factor(self):
    """Update norm factor values based on current seq_data.total_amounts and seq_data.full_table"""
    if (self.full_table is not None) and (self.total_amounts is not None):
        for sample in self.full_table.columns:
            if sample not in self.total_amounts.keys():
                logging.info(f'Notice: {sample} is not in total_amount, skip this sample')
        self.norm_factor = {}
        from ..utility.func_tools import is_sparse
        for sample, amount in self.total_amounts.items():
            if sample in self.full_table.columns:
                if is_sparse(self.full_table[sample]):
                    self.norm_factor[sample] = amount / self.full_table[sample].sparse.to_dense().sum()
                else:
                    self.norm_factor[sample] = amount / self.full_table[sample].sum()
            else:
                logging.info(f"Notice: {sample} is not in full_table")
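
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The normalization factor above maps a column's total counts onto its
# measured total amount: factor = amount / column_sum.
def _demo_norm_factors():
    import pandas as pd

    full_table = pd.DataFrame({'s1': [10, 30], 's2': [5, 15]})
    total_amounts = {'s1': 2.0, 's2': 1.0}
    return {sample: amount / full_table[sample].sum()
            for sample, amount in total_amounts.items()
            if sample in full_table.columns}  # {'s1': 0.05, 's2': 0.05}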
def _hash_inv(self):
    """Recover the hashed results"""
    logging.info('Recovering original table from hash...')

    def get_summary(seq):
        return self.results.summary.loc[self._seq_to_hash[seq]]

    # map hash --> seq for the results summary
    self.results.summary = pd.Series(data=list(self._seq_to_hash.keys()),
                                     index=list(self._seq_to_hash.keys())).apply(get_summary)
    # recover the original y_dataframe
    self.y_dataframe = self._y_dataframe_dup.copy()
    del self._y_dataframe_dup
    # recover the original sigma if it exists
    if hasattr(self, '_sigma_dup'):
        self.sigma = self._sigma_dup.copy()
        del self._sigma_dup
    # recover the original seq_to_fit if it exists
    if hasattr(self, '_seq_to_fit_dup'):
        self.seq_to_fit = self._seq_to_fit_dup.copy()
        del self._seq_to_fit_dup
def run_subprocess(cmd, name=None, **kwargs):
    if name is None:
        name = cmd[0]
    logging.info(f"Running {name}...")
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=os.environ, **kwargs)
    logging.info('Output:')
    while True:
        output = p.stdout.readline()
        if not output:
            break
        # stdout is bytes unless text mode was requested; decode for readable logs
        if isinstance(output, bytes):
            output = output.decode(errors='replace')
        logging.info(output.rstrip())
    # reap the child process once the pipe is exhausted
    p.wait()
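
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# An alternative streaming loop using text mode: Popen(text=True) yields str
# lines directly, and iterating the pipe stops at EOF without the manual
# readline/break dance.
def _demo_stream_subprocess(cmd):
    import subprocess

    with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True) as p:
        for line in p.stdout:
            print(line.rstrip())
    return p.returncode  # set once the context manager has waited on the process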
def fit(self, parallel_cores=1, point_estimate=True, replicates=False, bootstrap=False,
        convergence_test=False, stream_to=None, overwrite=False):
    """Run the estimation

    Args:
        parallel_cores (int): number of parallel cores to use. Default 1
        point_estimate (bool): if True, perform point estimation. Default True
        bootstrap (bool): if True, perform bootstrap uncertainty estimation. Default False
        replicates (bool): if True, use replicates for uncertainty estimation. Default False
        convergence_test (bool): if True, perform convergence test. Default False
        stream_to (str): directly stream fitting results to disk if an output path is given;
            will create a folder named by seq/hash with a pickled dict of fitting results
        overwrite (bool): if True, overwrite existing results when streaming to disk.
            Default False
    """
    from yutility.log import Timer
    logging.info('Batch fitting starting...')

    with Timer():
        if self.large_dataset and stream_to is None:
            logging.error('You are working with a large dataset and stream_to needs to be specified',
                          error_type=ValueError)
        if not self.large_dataset and stream_to is not None:
            self.large_dataset = True
            logging.warning("You provided `stream_to` so the large_dataset method is used")
        if self.large_dataset:
            self._hash()
            self.results.result_path = Path(stream_to)
            check_dir(self.results.result_path.joinpath('seqs'))
            dump_json(obj=self._seq_to_hash,
                      path=self.results.result_path.joinpath('seqs', 'seq_to_hash.json'))

        from functools import partial
        work_fn = partial(_work_fn, point_estimate=point_estimate, replicates=replicates,
                          bootstrap=bootstrap, convergence_test=convergence_test)
        worker_generator = self._worker_generator(stream_to=stream_to, overwrite=overwrite)
        if parallel_cores > 1:
            import multiprocessing as mp
            pool = mp.Pool(processes=int(parallel_cores))
            logging.info(f'Use multiprocessing to fit in {parallel_cores} parallel processes...')
            workers = pool.map(work_fn, worker_generator)
        else:
            # single thread
            logging.info('Fitting in a single thread...')
            workers = [work_fn(worker) for worker in worker_generator]

        self.results.summary = pd.DataFrame(
            {worker.name: worker.summary() for worker in workers}).transpose()
        self.results.summary.index.name = 'seq'

        # record results
        if self.bootstrap:
            if self.large_dataset:
                self.results._bs_record = self._seq_to_hash
            else:
                self.results._bs_record = {worker.name: worker.results.uncertainty.records
                                           for worker in workers}
        if convergence_test:
            if self.large_dataset:
                self.results._conv_record = self._seq_to_hash
            else:
                self.results._conv_record = {worker.name: worker.results.convergence.records
                                             for worker in workers}

        if self.large_dataset:
            self._hash_inv()
            self.results.to_json(output_dir=stream_to)

        logging.info('Fitting finished')
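
# --- Illustrative sketch (hypothetical helpers, not part of the original module) ---
# The parallel dispatch pattern used above, on a toy scale: freeze the shared
# keyword arguments with functools.partial, then map over inputs with a
# process pool (the worker must be module-level so it can be pickled).
def _demo_square(x, offset=0):
    return x * x + offset


def _demo_parallel_map(parallel_cores=2):
    import multiprocessing as mp
    from functools import partial

    work_fn = partial(_demo_square, offset=1)
    if parallel_cores > 1:
        with mp.Pool(processes=parallel_cores) as pool:
            return pool.map(work_fn, range(8))
    return [work_fn(x) for x in range(8)]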
def main():
    """Main function for fitting"""
    from k_seq.estimate import BatchFitter
    from k_seq.model.kinetic import BYOModel

    work_table, x_data, sigma, seq_data = read_table()
    if args.bs_method.lower() == 'stratified':
        try:
            grouper = getattr(seq_data.grouper, args.stratified_grouper).group
        except AttributeError:
            logging.error('Cannot find grouper for stratified bootstrapping',
                          error_type=ValueError)
            sys.exit(1)
    else:
        grouper = None

    logging.info(f'exclude_zero: {args.exclude_zero}')
    logging.info(f'inverse_weight: {args.inverse_weight}')
    logging.info(f'fit_top_n: {args.fit_top_n}')
    logging.info(f'large_data: {args.large_data}')
    logging.info(f'convergence: {args.convergence_num > 0}')
    logging.info(f'bootstrap: {args.bootstrap_num > 0}')

    batch_fitter = BatchFitter(y_dataframe=work_table, x_data=x_data, sigma=sigma,
                               bounds=[[0, 0], [np.inf, 1]], metrics={'kA': kA},
                               model=BYOModel.reacted_frac(broadcast=False),
                               exclude_zero=args.exclude_zero, grouper=grouper,
                               bootstrap_num=args.bootstrap_num,
                               bs_record_num=args.bs_record_num,
                               bs_method=args.bs_method, bs_stats={},
                               conv_reps=args.convergence_num,
                               conv_init_range=((0, 10), (0, 1)), conv_stats={},
                               large_dataset=True, note=args.note, rnd_seed=args.seed)
    stream_to = args.output_dir if args.large_data else None
    batch_fitter.fit(parallel_cores=args.core_num, point_estimate=True,
                     bootstrap=args.bootstrap_num > 0,
                     convergence_test=args.convergence_num > 0,
                     stream_to=stream_to, overwrite=args.overwrite)

    batch_fitter.summary(save_to=f'{args.output_dir}/fit_summary.csv')
    batch_fitter.save_model(output_dir=args.output_dir, results=True,
                            bs_record=False, tables=True)
    # zip seq info and remove the unpacked folder
    os.system(f"cd {str(args.output_dir)} && tar -czf seq.tar.gz seqs && rm -r seqs")
        '--inverse_weight',
        dest='inverse_weight',
        default=False,
        action='store_true',
        help='Use counts (with pseudo-count 0.5) as the sigma in fitting')
    parser.add_argument('--seed', type=int, default=23, help='Random seed')

    args = parser.parse_args()
    check_dir(args.output_dir)
    dump_json(obj=vars(args), path=f"{args.output_dir}/config.json")
    args.output_dir = Path(args.output_dir).resolve()

    return args


if __name__ == '__main__':
    args = parse_args()
    # pkg_path = args.pkg_path
    # if pkg_path is not None and pkg_path not in sys.path:
    #     sys.path.insert(0, pkg_path)
    logging.add_console_handler()
    logging.add_file_handler(args.output_dir / "LOG")
    logging.set_level('info')
    logging.info(f"Log stream to {args.output_dir / 'LOG'}")
    info = logging.info

    with Timer():
        main()
def __init__(self, model, sample_n, x_data, save_to, param1_name, param1_range,
             param2_name, param2_range, param1_log=False, param2_log=False,
             model_kwargs=None, bootstrap_num=100, bs_record_num=50, bs_method='data',
             bs_stats=None, grouper=None, record_full=False, conv_reps=20,
             conv_stats=None, conv_init_range=None, fitting_kwargs=None, seed=23):
    from ..utility.func_tools import AttrScope
    from ..utility.file_tools import dump_json
    from pathlib import Path

    # assign model
    self.model = model
    self.parameters = None
    self.model_kwargs = model_kwargs if model_kwargs is not None else {}

    # assign parameters
    self.param1 = AttrScope(name=param1_name, range=param1_range, log=param1_log)
    self.param2 = AttrScope(name=param2_name, range=param2_range, log=param2_log)
    self.sample_n = sample_n

    # assign fitting specs
    fitting_kwargs = fitting_kwargs if fitting_kwargs is not None else {}
    self.batchfitter_kwargs = dict(
        bootstrap_num=bootstrap_num, bs_record_num=bs_record_num,
        bs_method=bs_method, bs_stats=bs_stats, grouper=grouper,
        record_full=record_full, conv_reps=conv_reps,
        conv_init_range=conv_init_range, conv_stats=conv_stats,
    )
    self.batchfitter_kwargs.update(fitting_kwargs)

    # assign x_data
    if not isinstance(x_data, pd.Series):
        x_data = pd.Series(x_data)
    self.x_data = x_data
    self.y_dataframe = None
    self.seed = seed
    self.results = None

    config = dict(
        sample_n=sample_n,
        x_data=list(x_data),
        param1_name=param1_name, param1_range=param1_range, param1_log=param1_log,
        param2_name=param2_name, param2_range=param2_range, param2_log=param2_log,
        bootstrap_num=bootstrap_num, bs_record_num=bs_record_num, bs_method=bs_method,
        bs_stats=list(bs_stats.keys()) if isinstance(bs_stats, dict) else bs_stats,
        conv_reps=conv_reps,
        conv_stats=list(conv_stats.keys()) if isinstance(conv_stats, dict) else conv_stats,
        conv_init_range=conv_init_range if conv_init_range is None else list(conv_init_range),
        seed=seed)

    # create saving path and save config
    self.save_to = Path(save_to)
    if not self.save_to.exists():
        self.save_to.mkdir(parents=True)
        logging.info(f'Output dir {str(self.save_to)} created')
    dump_json(config, path=self.save_to.joinpath('config.json'))
def simulate_samples(self, grid=True, const_err=None, rel_err=None, y_enforce_positive=True):
    """Simulate a set of samples (param1 and param2)"""
    logging.info(f"Simulating dataset with const_err: {const_err}, rel_err: {rel_err}, "
                 f"y_enforce_positive: {y_enforce_positive}...")
    if self.seed is not None:
        np.random.seed(self.seed)

    if grid:
        n_cell = int(np.sqrt(self.sample_n)) + 1
        if self.param1.log:
            param1 = np.logspace(np.log10(self.param1.range[0]),
                                 np.log10(self.param1.range[1]), n_cell)
        else:
            param1 = np.linspace(self.param1.range[0], self.param1.range[1], n_cell)
        if self.param2.log:
            param2 = np.logspace(np.log10(self.param2.range[0]),
                                 np.log10(self.param2.range[1]), n_cell)
        else:
            param2 = np.linspace(self.param2.range[0], self.param2.range[1], n_cell)
        # flattened grid: param1 varies fastest, param2 is repeated
        self.parameters = pd.DataFrame({
            self.param1.name: np.repeat(np.expand_dims(param1, -1), n_cell, -1).T.reshape(-1),
            self.param2.name: np.repeat(param2, n_cell)
        })
    else:
        self.parameters = pd.DataFrame({
            self.param1.name: _parameter_gen(self.param1.range, self.param1.log,
                                             size=self.sample_n),
            self.param2.name: _parameter_gen(self.param2.range, self.param2.log,
                                             size=self.sample_n)
        })

    def partial_model(param):
        y = self.model(self.x_data, **param.to_dict(), **self.model_kwargs)
        if not isinstance(y, pd.Series) and isinstance(self.x_data, pd.Series):
            y = pd.Series(y, index=self.x_data.index)
        return y

    self.y_dataframe = self.parameters.apply(partial_model, axis=1)
    if const_err is not None:
        self.y_dataframe += np.random.normal(loc=0, scale=const_err,
                                             size=self.y_dataframe.shape)
    if rel_err is not None:
        self.y_dataframe += np.random.normal(loc=0, scale=self.y_dataframe * rel_err,
                                             size=self.y_dataframe.shape)
    if y_enforce_positive:
        self.y_dataframe[self.y_dataframe < 0] = 0
    logging.info('Simulation done.')

    self.save_to.joinpath('data').mkdir(exist_ok=True)
    self.parameters.to_csv(self.save_to.joinpath('data', 'parameters.csv'))
    self.y_dataframe.to_csv(self.save_to.joinpath('data', 'y.csv'))
    self.x_data.to_csv(self.save_to.joinpath('data', 'x.csv'))
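
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The repeat/expand_dims/reshape dance above builds a flattened 2-D grid of
# (param1, param2) pairs; np.meshgrid expresses the same thing directly.
def _demo_param_grid(n_cell=3):
    import numpy as np

    param1 = np.logspace(-1, 2, n_cell)  # log-spaced axis (as with param1.log True)
    param2 = np.linspace(0, 1, n_cell)   # linear axis
    p1, p2 = np.meshgrid(param1, param2)
    # n_cell**2 pairs: param1 varies fastest, param2 is repeated
    return p1.ravel(), p2.ravel()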
def simulate_counts(uniq_seq_num, x_values, total_reads, p0_generator=None,
                    kinetic_model=None, count_model=None, total_amount_error=None,
                    param_sample_from_df=None, weights=None, replace=True, reps=1,
                    seed=None, note=None, save_to=None, **param_generators):
    from ..model import pool

    if seed is not None:
        np.random.seed(seed)

    # default models
    #   kinetic_model: BYO first-order model returning absolute amounts
    #   count_model: multinomial
    if kinetic_model is None:
        from ..model import kinetic
        kinetic_model = kinetic.BYOModel.amount_first_order(broadcast=False)
        logging.info('No kinetic model provided, use BYOModel.amount_first_order')
    if count_model is None:
        from ..model import count
        count_model = count.multinomial
        logging.info('No count model provided, use multinomial distribution')

    # compose sequence parameters from (priority high to low)
    #   1. p0
    #   2. param_generators
    #   3. param_sample_from_df
    param_table = pd.DataFrame(index=np.arange(uniq_seq_num))
    logging.info(f'param_table created, param_table shape {param_table.shape}')

    # if sampling p0 from a generator
    if p0_generator is not None:
        param_table['p0'] = PoolParamGenerator.sample_from_iid_dist(
            p0=p0_generator, uniq_seq_num=uniq_seq_num)['p0']
        logging.info(f'p0 added from distribution, param_table shape {param_table.shape}')

    # if extra param_generators were detected
    if param_generators != {}:
        temp_table = PoolParamGenerator.sample_from_iid_dist(uniq_seq_num=uniq_seq_num,
                                                             **param_generators)
        col_name = temp_table.columns[~temp_table.columns.isin(param_table.columns.values)]
        param_table = pd.concat([param_table, temp_table[col_name]], axis=1)
        logging.info(f'{list(param_generators.keys())} added from distribution, '
                     f'param_table shape {param_table.shape}')

    # if a param dataframe is provided
    if param_sample_from_df is not None:
        temp_table = PoolParamGenerator.sample_from_dataframe(
            df=param_sample_from_df, uniq_seq_num=uniq_seq_num,
            replace=replace, weights=weights)
        col_name = temp_table.columns[~temp_table.columns.isin(param_table.columns.values)]
        param_table = pd.concat([param_table, temp_table[col_name]], axis=1)
        logging.info(f'{col_name} added from dataframe, param_table shape {param_table.shape}')
    param_table.index.name = 'seq'

    # get pool model
    pool_model = pool.PoolModel(count_model=count_model, kinetic_model=kinetic_model,
                                param_table=param_table)
    x = {}
    Y = {}
    total_amount = {}
    if is_numeric(total_reads):
        total_reads = np.repeat(total_reads, len(x_values))
    for sample_ix, (c, n) in enumerate(zip(x_values, total_reads)):
        if reps is None or reps == 1:
            total_amount[f"s{sample_ix}"], Y[f"s{sample_ix}"] = pool_model.predict(c=c, N=n)
            x[f"s{sample_ix}"] = {'c': c, 'N': n}
        else:
            for rep in range(reps):
                total_amount[f"s{sample_ix}-{rep}"], Y[f"s{sample_ix}-{rep}"] = \
                    pool_model.predict(c=c, N=n)
                x[f"s{sample_ix}-{rep}"] = {'c': c, 'N': n}

    x = pd.DataFrame.from_dict(x, orient='columns')
    Y = pd.DataFrame.from_dict(Y, orient='columns')
    total_amount = pd.Series(total_amount)
    if total_amount_error is not None:
        if is_numeric(total_amount_error):
            total_amount += np.random.normal(loc=0, scale=total_amount_error,
                                             size=len(total_amount))
        elif callable(total_amount_error):
            total_amount = total_amount.apply(total_amount_error)
        else:
            logging.error('Unknown total_amount_error type', error_type=TypeError)
    x.index.name = 'param'
    Y.index.name = 'seq'
    total_amount.index.name = 'amount'

    from .seq_data import SeqData
    # input samples are marked with a negative concentration
    input_samples = x.loc['c']
    input_samples = list(input_samples[input_samples < 0].index)
    seq_table = SeqData(data=Y, x_values=x.loc['c'], note=note, data_unit='counts',
                        grouper={'input': input_samples,
                                 'reacted': [sample for sample in x.columns
                                             if sample not in input_samples]})
    seq_table.add_sample_total(total_amounts=total_amount.to_dict(),
                               full_table=seq_table.table.original)
    seq_table.table.abs_amnt = seq_table.sample_total.apply(target=seq_table.table.original)

    from .transform import ReactedFractionNormalizer
    reacted_frac = ReactedFractionNormalizer(input_samples=input_samples,
                                             reduce_method='median',
                                             remove_empty=True)
    seq_table.table.reacted_frac = reacted_frac.apply(seq_table.table.abs_amnt)

    from .filters import DetectedTimesFilter
    seq_table.table.seq_in_all_smpl_reacted_frac = DetectedTimesFilter(
        min_detected_times=seq_table.table.reacted_frac.shape[1])(seq_table.table.reacted_frac)
    seq_table.truth = param_table

    if save_to is not None:
        from pathlib import Path
        save_path = Path(save_to)
        if save_path.suffix == '':
            save_path.mkdir(parents=True, exist_ok=True)
            total_amount.to_csv(f'{save_path}/dna_amount.csv')
            x.to_csv(f'{save_path}/x.csv')
            Y.to_csv(f'{save_path}/Y.csv')
            param_table.to_csv(f'{save_path}/truth.csv')
            seq_table.to_pickle(f"{save_path}/seq_table.pkl")
        else:
            logging.error('save_to should be a directory', error_type=TypeError)

    return x, Y, total_amount, param_table, seq_table
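
# --- Illustrative sketch (hypothetical helper, not part of the original module) ---
# The default count model in isolation: sequencing reads for one sample drawn
# from a multinomial over the pool composition at depth `total_reads`.
def _demo_multinomial_counts(total_reads=1000):
    import numpy as np

    pool_fractions = np.array([0.5, 0.3, 0.2])  # fractions sum to 1
    return np.random.multinomial(total_reads, pool_fractions)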
def __init__(self, y_dataframe, x_data, model, x_label=None, y_label=None, seq_to_fit=None,
             sigma=None, bounds=None, init_guess=None, opt_method='trf', exclude_zero=False,
             metrics=None, rnd_seed=None, curve_fit_kwargs=None, replicates=None,
             bootstrap_num=0, bs_record_num=0, bs_method='pct_res', bs_stats=None,
             grouper=None, record_full=False, conv_reps=0, conv_init_range=None,
             conv_stats=None, note=None, large_dataset=False, verbose=1, result_path=None):
    from ..utility.func_tools import AttrScope, get_func_params

    super().__init__()
    logging.info('Creating the BatchFitter...')

    self.model = model
    self.note = note

    # parse y_dataframe
    from ..utility.file_tools import table_object_to_dataframe
    self.y_dataframe = table_object_to_dataframe(y_dataframe)

    # process seq_to_fit
    if seq_to_fit is not None:
        if isinstance(seq_to_fit, (list, np.ndarray, pd.Series)):
            self.seq_to_fit = list(seq_to_fit)
        elif isinstance(seq_to_fit, int):
            self.seq_to_fit = y_dataframe.index[:seq_to_fit].values
        else:
            logging.error('Unknown seq_to_fit type, is it list-like or int?',
                          error_type=TypeError)
    else:
        self.seq_to_fit = seq_to_fit

    # prep fitting params shared by all fittings
    if isinstance(x_data, pd.Series):
        self.x_data = x_data[y_dataframe.columns.values]
    elif len(x_data) != y_dataframe.shape[1]:
        logging.error('x_data length and table column number do not match',
                      error_type=ValueError)
    else:
        self.x_data = np.array(x_data)

    if sigma is not None:
        if np.shape(sigma) != np.shape(self.y_dataframe):
            logging.error('Shape of sigma does not match the shape of y_dataframe',
                          error_type=ValueError)
    self.sigma = sigma

    if bounds is None:
        bounds = (-np.inf, np.inf)

    if len(x_data) <= 1:
        logging.warning("Number of data points is less than 2, bootstrap will not be performed")
        bootstrap_num = 0
    self.bootstrap = bootstrap_num > 0

    # arguments that should pass to the single estimator
    self.fit_params = AttrScope(
        x_data=self.x_data,
        x_label=x_label,
        y_label=y_label,
        model=self.model,
        bounds=bounds,
        init_guess=init_guess,
        opt_method=opt_method,
        exclude_zero=exclude_zero,
        metrics=metrics,
        rnd_seed=rnd_seed,
        curve_fit_kwargs=curve_fit_kwargs,
        replicates=replicates,
        bootstrap_num=bootstrap_num,
        bs_record_num=bs_record_num,
        bs_method=bs_method,
        bs_stats=bs_stats,
        grouper=grouper if bs_method == 'stratified' else None,
        record_full=record_full,
        conv_reps=conv_reps,
        conv_init_range=conv_init_range,
        conv_stats=conv_stats,
        verbose=verbose,
    )

    if result_path is None:
        self.results = BatchFitResults(estimator=self)
    else:
        self.results = BatchFitResults.load_result(result_path)
    self.large_dataset = large_dataset
    self.results.large_dataset = large_dataset
    self.workers = None

    logging.info('BatchFitter created')
def test_logging_can_log():
    logging.info('Some info')
    logging.warning("Some warning")
    with raises(ValueError):
        logging.error("let's get some ValueError", error_type=ValueError)