def to_json(self, save_to_file=None):
    """Save the estimator configuration as a JSON file, except for `model`,
    `bs_stats`, `conv_stats` as these are not JSON-serializable
    """
    config_dict = self.to_dict()
    _ = config_dict.pop('model', None)
    if 'bs_stats' in config_dict.keys():
        config_dict['bs_stats'] = {key: repr(func) for key, func in config_dict['bs_stats'].items()}
    if 'conv_stats' in config_dict.keys():
        config_dict['conv_stats'] = {key: repr(func) for key, func in config_dict['conv_stats'].items()}
    if 'grouper' in config_dict.keys():
        config_dict['grouper'] = config_dict['grouper'].group

    if save_to_file:
        from pathlib import Path
        path = Path(save_to_file)
        if path.suffix == '.json':
            # it's a named json file
            check_dir(path.parent)
            dump_json(obj=config_dict, path=path, indent=2)
        elif path.suffix == '':
            # it's a directory
            check_dir(path)
            dump_json(obj=config_dict, path=str(path) + '/config.json', indent=2)
        else:
            logging.error('Unrecognized saving path', error_type=NameError)
    else:
        return dump_json(config_dict, indent=0)
def to_pickle(self, output_dir, bs_record=True, conv_record=True):
    """Save fitting results as a single pickled dict, suitable for small datasets.
    For large datasets `to_json` is preferred

    Args:
        output_dir (str): path to saved results, should have suffix of ``.pkl``
        bs_record (bool): whether to include the bootstrap records, default True
        conv_record (bool): whether to include the convergence records, default True
    """
    check_dir(Path(output_dir).parent)
    data_to_dump = {'summary': self.summary}
    if bs_record:
        bs_record = self.bs_record()
        if isinstance(bs_record, dict):
            # should be a loaded dict of pd.DataFrame
            data_to_dump['bs_record'] = bs_record
        else:
            logging.error('bs_record is not a loaded dict of pd.DataFrame',
                          error_type=TypeError)
    if conv_record:
        conv_record = self.conv_record()
        if isinstance(conv_record, dict):
            # should be a loaded dict of pd.DataFrame
            data_to_dump['conv_record'] = conv_record
        else:
            logging.error('conv_record is not a loaded dict of pd.DataFrame',
                          error_type=TypeError)
    dump_pickle(obj=data_to_dump, path=output_dir)
def __init__(self, estimator, bootstrap_num, bs_record_num, bs_method,
             grouper=None, bs_stats=None, record_full=False):
    self.bs_method = bs_method
    if bs_method == 'stratified':
        from ..data.grouper import Grouper
        if isinstance(grouper, Grouper):
            grouper = grouper.group
        if isinstance(grouper, dict):
            self.grouper = grouper
        else:
            logging.error('Unsupported grouper type for stratified bootstrap',
                          error_type=TypeError)
    self.estimator = estimator
    self.bootstrap_num = bootstrap_num
    self.bs_record_num = bs_record_num
    self.bs_stats = bs_stats
    self.record_full = record_full
def seq_variance(seq_table, grouper):
    """Get the spread (standard deviation) of sequence abundance across replicates,
    as defined by the grouper

    Returns:
        if single group, returns a pd.DataFrame with columns ('mean', 'sd')
        if multiple groups, returns two pd.DataFrame (mean, sd) with columns of each group
    """
    from .grouper import Grouper

    if isinstance(grouper, Grouper):
        sub_tables = grouper.get_table(target=seq_table)
    elif isinstance(grouper, (list, pd.Series, dict)):
        from .grouper import get_group
        sub_tables = get_group(seq_table, grouper)
    else:
        logging.error("Unknown type of grouper", TypeError)
        sub_tables = None

    if isinstance(sub_tables, pd.DataFrame):
        return pd.DataFrame({
            'mean': sub_tables.mean(axis=1),
            'sd': sub_tables.std(axis=1)
        })
    elif isinstance(sub_tables, dict):
        mean = {}
        sd = {}
        for key, subtable in sub_tables.items():
            mean[key] = subtable.mean(axis=1)
            sd[key] = subtable.std(axis=1)
        return pd.DataFrame(mean), pd.DataFrame(sd)
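# Illustrative usage sketch (not part of the library). It assumes `seq_variance`
# and the grouper helpers are importable, that a list grouper selects a single
# group of sample columns, and that a dict grouper maps group names to sample
# columns; the table values and sample names below are made up.
def _example_seq_variance():
    import numpy as np
    import pandas as pd
    table = pd.DataFrame(
        np.random.rand(5, 4),
        index=[f'seq_{i}' for i in range(5)],
        columns=['rep1', 'rep2', 'rep3', 'rep4'],
    )
    # single group (list-like grouper) -> one DataFrame with 'mean' and 'sd'
    single = seq_variance(table, grouper=['rep1', 'rep2', 'rep3', 'rep4'])
    # multiple groups (dict grouper) -> (mean, sd) DataFrames, one column per group
    mean, sd = seq_variance(table, grouper={'low': ['rep1', 'rep2'],
                                            'high': ['rep3', 'rep4']})
    return single, mean, sd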
def func(target, input_samples, reduce_method='median', remove_empty=True):
    method_mapper = {
        'med': np.nanmedian,
        'median': np.nanmedian,
        'mean': np.nanmean,
        'avg': np.nanmean
    }
    if callable(reduce_method):
        base = reduce_method(target[input_samples])
    else:
        if reduce_method.lower() in method_mapper.keys():
            base = method_mapper[reduce_method](target[input_samples], axis=1)
        else:
            logging.error('Unknown reduce_method', ValueError)
    mask = base > 0  # only keep sequences detected in the input samples
    reacted_frac = target.loc[mask, ~target.columns.isin(input_samples)].divide(base[mask], axis=0)
    if remove_empty:
        return reacted_frac.loc[reacted_frac.sum(axis=1) > 0]
    else:
        return reacted_frac
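# Illustrative usage sketch (not part of the library): computes reacted fractions
# from a toy abundance table, assuming the `func` helper above is callable
# directly. Column names are made up; 'input' is the unreacted pool, 's1'/'s2'
# are reacted samples.
def _example_reacted_frac():
    import pandas as pd
    table = pd.DataFrame(
        {'input': [10.0, 0.0, 5.0], 's1': [2.0, 1.0, 0.0], 's2': [4.0, 3.0, 0.0]},
        index=['seq_a', 'seq_b', 'seq_c'],
    )
    # seq_b is dropped (not detected in input); seq_c is dropped when remove_empty=True
    return func(table, input_samples=['input'], reduce_method='median')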
def main():
    """Main function for fitting"""
    from k_seq.estimate import BatchFitter
    from k_seq.model.kinetic import BYOModel

    work_table, x_data, sigma, seq_data = read_table()
    if args.bs_method.lower() == 'stratified':
        try:
            grouper = getattr(seq_data.grouper, args.stratified_grouper).group
        except AttributeError:
            logging.error('Can not find grouper for stratified bootstrapping',
                          error_type=ValueError)
            sys.exit(1)
    else:
        grouper = None

    logging.info(f'exclude_zero: {args.exclude_zero}')
    logging.info(f'inverse_weight: {args.inverse_weight}')
    logging.info(f'fit_top_n: {args.fit_top_n}')
    logging.info(f'large_data: {args.large_data}')
    logging.info(f'convergence: {args.convergence_num > 0}')
    logging.info(f'bootstrap: {args.bootstrap_num > 0}')

    batch_fitter = BatchFitter(
        y_dataframe=work_table, x_data=x_data, sigma=sigma,
        bounds=[[0, 0], [np.inf, 1]],
        metrics={'kA': kA},
        model=BYOModel.reacted_frac(broadcast=False),
        exclude_zero=args.exclude_zero,
        grouper=grouper,
        bootstrap_num=args.bootstrap_num,
        bs_record_num=args.bs_record_num,
        bs_method=args.bs_method,
        bs_stats={},
        conv_reps=args.convergence_num,
        conv_init_range=((0, 10), (0, 1)),
        conv_stats={},
        large_dataset=True,
        note=args.note,
        rnd_seed=args.seed
    )
    stream_to = args.output_dir if args.large_data else None
    batch_fitter.fit(parallel_cores=args.core_num,
                     point_estimate=True,
                     bootstrap=args.bootstrap_num > 0,
                     convergence_test=args.convergence_num > 0,
                     stream_to=stream_to,
                     overwrite=args.overwrite)
    batch_fitter.summary(save_to=f'{args.output_dir}/fit_summary.csv')
    batch_fitter.save_model(output_dir=args.output_dir,
                            results=True, bs_record=False, tables=True)
    # archive and remove the per-sequence results folder
    os.system(f"cd {str(args.output_dir)} && tar -czf seq.tar.gz seqs && rm -r seqs")
def format_stat(res):
    if isinstance(res, (int, float, bool, dict)):
        return res
    elif isinstance(res, pd.Series):
        return res.to_dict()
    else:
        logging.error('Unrecognized return value for bs_stats', error_type=TypeError)
def target(self, value):
    """Updating the target also updates the mask"""
    if value is None:
        self._target = None
    elif isinstance(value, pd.DataFrame):
        # otherwise do the filtering, then update the mask
        self._target = value
        self.mask = self.get_mask(target=value)
    else:
        logging.error("target can only be pd.DataFrame or SeqTable", error_type=TypeError)
def get_uncertainty_accuracy(self, param, pred_type='bs_ci95'):
    """Return the accuracy of uncertainty estimation, i.e. whether the
    uncertainty range includes the truth"""
    if pred_type in ['bs_ci95', 'bootstrap_ci95']:
        return self._get_bs_ci95_accuracy(param=param)
    elif pred_type in ['bs_sd', 'bootstrap_sd']:
        return self._get_bs_sd_accuracy(param=param)
    elif pred_type in ['rep_sd']:
        return self._get_rep_sd_accuracy(param=param)
    else:
        logging.error("Unknown pred_type, choose from 'bs_ci95', 'bs_sd', 'rep_sd'")
def add_curve(data, plot_args):
    if isinstance(data, dict):
        y_ = model(xs, **data)
    elif isinstance(data, pd.Series):
        y_ = model(xs, **data.to_dict())
    else:
        logging.error('Unknown parameter input type, should be pd.Series or dict',
                      error_type=TypeError)
    ax.plot(xs, y_, marker=None, **plot_args)
def get_file_list(file_root, pattern=None, file_list=None, black_list=None, full_path=True):
    """Return files under the given `file_root` that match `pattern` if applicable;
    folders are not included

    Args:
        file_root (str or list of str): root directory/directories to search
        pattern (str): optional, include all the files under directories if None
        file_list (list of str): optional, only include the files whose names are in `file_list`
        black_list (list of str): optional, file names included in `black_list` will be excluded
        full_path (bool): if True (default) return full paths, otherwise only file names

    Returns:
        list of pathlib.Path (full paths) or list of str (file names)
    """
    if pattern is None:
        pattern = '*'
    else:
        pattern = '*{}*'.format(pattern)
    if black_list is None:
        black_list = []

    if isinstance(file_root, (str, Path)):
        files = [file for file in Path(file_root).glob(pattern) if file.name not in black_list]
    elif isinstance(file_root, list):
        files = []
        for root_path in file_root:
            files += [file for file in Path(root_path).glob(pattern) if file.name not in black_list]
    else:
        logging.error('file_root should be a string or list of strings', error_type=TypeError)

    if file_list is not None:
        files = [file for file in files if str(file.name) in file_list]

    if full_path:
        return files
    else:
        return [file.name for file in files]
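# Illustrative usage sketch (not part of the library): creates a temporary
# directory with a few files and lists the ones matching a pattern. It assumes
# `get_file_list` is importable; file names are made up.
def _example_get_file_list():
    import tempfile
    from pathlib import Path
    root = Path(tempfile.mkdtemp())
    for name in ('sample_1.fastq', 'sample_2.fastq', 'notes.txt'):
        (root / name).touch()
    fastq_paths = get_file_list(root, pattern='fastq')               # full paths
    fastq_names = get_file_list(root, pattern='fastq', full_path=False,
                                black_list=['sample_2.fastq'])       # names only
    return fastq_paths, fastq_names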
def __iter__(self):
    """Group iterator to return a generator of sub-tables"""
    if self.target is None:
        logging.error('seq_data.target is None, please assign before iteration',
                      error_type=ValueError)
    if self.type == 0:
        if self.axis == 0:
            return (self.target.loc[ix] for ix in self.group)
        else:
            return (self.target[ix] for ix in self.group)
    else:
        return (self.get_table(group) for group in self.group.keys())
def split(self, target=None, remove_zero=False):
    if target is None:
        target = self.target
    if target is None:
        logging.error("Please indicate target seq_table to group")
    if self.type == 0:
        return self.get_table(target=target, remove_zero=remove_zero)
    else:
        return {
            group: self.get_table(target=target, group=group, remove_zero=remove_zero)
            for group in self.group.keys()
        }
def bs_method(self, bs_method):
    implemented_methods = {
        'pct_res': 'pct_res',
        'resample percent residues': 'pct_res',
        'rel_res': 'pct_res',
        'resample data points': 'data',
        'data': 'data',
        'stratified': 'stratified',
    }
    if bs_method in implemented_methods.keys():
        # map aliases to their canonical method name
        self._bs_method = implemented_methods[bs_method]
    else:
        logging.error(f'Bootstrap method {bs_method} is not implemented',
                      error_type=NotImplementedError)
def multinomial(p, N, seed=None):
    """Multinomial distribution for a given probability p and total number of draws"""
    if seed is not None:
        np.random.seed(seed)
    if np.sum(p) != 1:
        p = np.array(p) / np.sum(p)
    from scipy.stats import multinomial
    if isinstance(N, (list, np.ndarray, pd.Series)):
        return np.array([multinomial.rvs(n=int(n), p=p) for n in N])
    elif is_numeric(N):
        return multinomial.rvs(n=int(N), p=p)
    else:
        logging.error("Unknown N type", error_type=TypeError)
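# Illustrative usage sketch (not part of the library): draws counts for three
# sequences with unnormalized relative abundances, for a single total and for a
# list of totals. It assumes `multinomial` above is importable; values are made up.
def _example_multinomial():
    counts_single = multinomial(p=[2, 1, 1], N=100, seed=23)         # counts for one pool
    counts_multi = multinomial(p=[2, 1, 1], N=[100, 200], seed=23)   # one row of counts per total
    return counts_single, counts_multi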
def first_order(c, k, A, alpha, t, broadcast=False):
    if check_scalar(c):
        c = np.array([to_scalar(c)])
    else:
        c = np.array(c)
    if check_scalar(k):
        k = np.array([to_scalar(k)])
    else:
        k = np.array(k)
    if check_scalar(A):
        A = np.array([to_scalar(A)])
    else:
        A = np.array(A)

    if broadcast:
        # dim  param
        #  0    A
        #  1    k
        #  2    c
        y = np.outer(A, (1 - np.exp(-alpha * t * np.outer(k, c))))
        y = y.reshape((len(A), len(k), len(c)))
        y[:, :, c < 0] = 1
        dim_to_squeeze = []
        for dim in (0, 1, 2):
            if y.shape[dim] == 1:
                dim_to_squeeze.append(dim)
    else:
        # dim  param
        #  0    k, A
        #  1    c
        if len(k) != len(A):
            logging.error('k and A should have same length when broadcasting is disabled',
                          error_type=ValueError)
        y = np.expand_dims(A, -1) * (1 - np.exp(-alpha * t * np.outer(k, c)))
        y[:, c < 0] = 1
        dim_to_squeeze = []
        for dim in (0, 1):
            if y.shape[dim] == 1:
                dim_to_squeeze.append(dim)

    y = np.squeeze(y, axis=tuple(dim_to_squeeze))
    return y
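# Illustrative usage sketch (not part of the library): evaluates the first-order
# kinetic model y = A * (1 - exp(-alpha * t * k * c)) for a few substrate
# concentrations, assuming `first_order` and its scalar helpers are importable.
# The parameter values are made up; c = -1 marks the unreacted input sample.
def _example_first_order():
    import numpy as np
    c = np.array([-1, 2e-6, 10e-6, 50e-6])   # substrate concentrations
    y_single = first_order(c=c, k=100.0, A=0.5, alpha=0.48, t=90)
    # per-sequence k and A of equal length when broadcast=False
    y_pool = first_order(c=c, k=np.array([10.0, 100.0]), A=np.array([0.2, 0.5]),
                         alpha=0.48, t=90, broadcast=False)
    return y_single, y_pool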
def __init__(self, count_model, kinetic_model=None, param_table=None, note=None, **params):
    """Initialize a pool model with given kinetic model and count model

    Args:
        count_model (`ModelBase` or `callable`): model for sequencing counts
        kinetic_model (`ModelBase` or `callable`): model for pool kinetics, no reaction if not given
        **params:
    """

    def _static_pool(p0):
        """Static pool with no reaction"""
        return p0

    super().__init__()
    if kinetic_model is None:
        self.kinetic_model = _static_pool
    elif isclass(kinetic_model) and issubclass(kinetic_model, ModelBase):
        self.kinetic_model = kinetic_model.func
    elif callable(kinetic_model):
        self.kinetic_model = kinetic_model
    else:
        logging.error('kinetic_model should be a ModelBase subclass or a callable',
                      error_type=TypeError)

    if isclass(count_model) and issubclass(count_model, ModelBase):
        self.count_model = count_model.func
    elif callable(count_model):
        self.count_model = count_model
    else:
        logging.error('count_model should be a ModelBase subclass or a callable',
                      error_type=TypeError)

    self.kinetic_params = get_func_params(self.kinetic_model, required_only=True)
    self.count_params = get_func_params(self.count_model, required_only=True)
    if param_table is not None:
        params.update({col: param_table[col] for col in param_table.columns})
    self.params = params
    self.note = note
def get_FitResult(self, seq=None):
    """Get the FitResults of a sequence from stored JSON records"""
    from .least_squares import FitResults

    if self._bs_record is None:
        logging.error('No bootstrap or convergence test record found', error_type=TypeError)
    else:
        seq_to_hash = self._bs_record

    if seq is None:
        return seq_to_hash

    if isinstance(seq_to_hash[seq], (list, tuple)):
        # new hierarchical format
        tg_ix, hash_ix = seq_to_hash[seq]
        result = FitResults.from_json(json_path=f'{hash_ix}.json',
                                      tarfile=self.result_path.joinpath('seqs', f'{tg_ix}.tar.gz'))
    else:
        # old format
        if self.result_path.joinpath('seqs', f"{seq_to_hash[seq]}.json").exists():
            logging.info(f"load result from {seq_to_hash[seq]}.json")
            result = FitResults.from_json(
                self.result_path.joinpath('seqs', f'{seq_to_hash[seq]}.json'))
        elif self.result_path.joinpath('seqs.tar.gz').exists():
            try:
                result = FitResults.from_json(json_path=f'seqs/{seq_to_hash[seq]}.json',
                                              tarfile=self.result_path.joinpath('seqs.tar.gz'))
            except Exception:
                result = FitResults.from_json(json_path=f'results/seqs/{seq_to_hash[seq]}.json',
                                              tarfile=self.result_path.joinpath('seqs.tar.gz'))

    if result.data.x_data is None and self.data.y_dataframe is not None:
        # add x and y data from the data attribute
        result.data.x_data = self.data.x_data
        result.data.y_data = self.data.y_dataframe.loc[seq]
    return result
def func(target, norm_factor):
    """Normalize counts in `target` by `norm_factor`"""
    if not isinstance(target, pd.DataFrame):
        logging.error('target needs to be pd.DataFrame')

    def sample_normalize(col):
        return col.astype('float') * norm_factor[col.name]

    sample_list = []
    for sample in target.columns:
        if sample in norm_factor.keys():
            sample_list.append(sample)
        else:
            logging.warning(f'Sample {sample} is not in spike-in norm_factor, skip this sample')

    return target[sample_list].apply(sample_normalize, axis=0)
def __init__(self, group, target=None, axis=1):
    """Initialize a Grouper instance

    Args:
        group (list or dict): list creates a Type 0 Grouper (single group) and
            dict creates a Type 1 Grouper (multiple groups)
        target (pd.DataFrame): optional, target seq_table
        axis (0 or 1): axis to apply the grouper
    """
    if isinstance(group, (list, np.ndarray, pd.Series, str)):
        self.type = 0
        self.group = list(group)
    elif isinstance(group, dict):
        self.type = 1
        self.group = {key: list(members) for key, members in group.items()}
    else:
        logging.error('group should be list-like or dictionary')
    self.target = target
    self.axis = axis
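# Illustrative usage sketch (not part of the library): builds a Type 0 (single
# group) and a Type 1 (multiple groups) Grouper on a toy table, assuming the
# Grouper class is importable. Sample and sequence names are made up.
def _example_grouper():
    import numpy as np
    import pandas as pd
    table = pd.DataFrame(np.arange(12).reshape(3, 4),
                         index=['seq_a', 'seq_b', 'seq_c'],
                         columns=['input', 'r1', 'r2', 'r3'])
    single = Grouper(group=['r1', 'r2', 'r3'], target=table, axis=1)        # Type 0
    multi = Grouper(group={'input': ['input'], 'reacted': ['r1', 'r2', 'r3']},
                    target=table, axis=1)                                    # Type 1
    # Type 0 splits into one sub-table; Type 1 splits into a dict of sub-tables
    return single.split(), multi.split()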
def read_table_files(file_path, col_name=None, header=1):
    """Read common seq_table files

    - .xls or .xlsx: the first sheet is read, using row `header` as column names
    - .csv: read with row `header` as column names, comma-separated
    - .tsv: read with row `header` as column names, tab-separated
    """
    from pathlib import Path
    import pandas as pd

    file_path = Path(file_path)
    if file_path.suffix in ['.xls', '.xlsx']:
        df = pd.read_excel(io=file_path, sheet_name=0, header=header)
    elif file_path.suffix in ['.csv']:
        df = pd.read_csv(file_path, header=header)
    elif file_path.suffix in ['.tsv']:
        df = pd.read_csv(file_path, header=header, sep='\t')
    else:
        logging.error('File type not identified', error_type=TypeError)
    return df[col_name]
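# Illustrative usage sketch (not part of the library): writes a small CSV and
# reads one column back with `read_table_files`. Note the default `header=1`
# uses the second row as column names, so a title line precedes them here;
# file content and names are made up.
def _example_read_table_files():
    import tempfile
    from pathlib import Path
    path = Path(tempfile.mkdtemp()) / 'counts.csv'
    path.write_text('title,\nseq,count\nAAAA,10\nCCCC,3\n')
    return read_table_files(path, col_name='count')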
def get_table(self, group=None, target=None, axis=None, remove_zero=False):
    """Return a sub-seq_table from target given group"""
    if target is None:
        target = self.target
    if target is None:
        logging.error("Please indicate target seq_table to group")
    if axis is None:
        axis = self.axis
    if self.type == 0:
        # ignore the group argument
        return slice_table(table=target, keys=self.group, axis=axis, remove_empty=remove_zero)
    else:
        if group is None:
            logging.error('Please indicate the group')
        return slice_table(table=target, keys=self.group[group], axis=axis, remove_empty=remove_zero)
def table_object_to_dataframe(obj, table_name=None):
    """Convert object (`file path`, `SeqData`) to `pd.DataFrame`"""
    from pathlib import Path, PosixPath
    import pandas as pd
    from ..data.seq_data import SeqData

    if isinstance(obj, (str, Path, PosixPath)):
        if Path(obj).is_file():
            if Path(obj).suffix == '.csv':
                return pd.read_csv(Path(obj), index_col=0)
            elif Path(obj).suffix in ['.pkl', '.pickle']:
                obj = read_pickle(obj)
            else:
                logging.error(f'{obj} is not a valid file', error_type=FileNotFoundError)
        else:
            logging.error(f'{obj} is not a valid file', error_type=FileNotFoundError)

    if isinstance(obj, pd.DataFrame):
        return obj
    elif isinstance(obj, SeqData):
        if table_name is None:
            return obj.table.original
        else:
            return getattr(obj.table, table_name)
    else:
        logging.error('SeqTable should be a `pd.DataFrame` or `SeqData`', error_type=TypeError)
def __init__(self, data, data_unit=None, sample_list=None, seq_list=None, data_note=None,
             use_sparse=True, seq_metadata=None, sample_metadata=None, grouper=None,
             x_values=None, x_unit=None, note=None, dataset_metadata=None):

    # initialize metadata
    from datetime import datetime
    self.metadata = AttrScope(created_time=datetime.now(), note=note)
    # add metadata
    if dataset_metadata is not None:
        self.metadata.add(dataset_metadata)
    if sample_metadata is not None:
        self.metadata.samples = AttrScope(sample_metadata)
    if seq_metadata is not None:
        self.metadata.seqs = AttrScope(seq_metadata)
    logging.info('SeqData created')

    # add original seq_table
    self.table = AttrScope(original=SeqTable(data, columns=sample_list, index=seq_list,
                                             unit=data_unit, note=data_note,
                                             use_sparse=use_sparse))

    # add x values
    if x_values is None:
        self.x_values = None
        self.x_unit = None
    elif isinstance(x_values, (dict, pd.Series)):
        self.x_values = pd.Series(x_values)
        self.x_unit = x_unit
    elif isinstance(x_values, (list, np.ndarray)):
        self.x_values = pd.Series(x_values, index=self.table.original.samples)
        self.x_unit = x_unit
    else:
        logging.error('Unknown type for x_values', error_type=TypeError)
    if self.x_values is not None:
        # align x values to samples and drop missing entries
        self.x_values = self.x_values[self.table.original.samples]
        self.x_values = self.x_values[~self.x_values.isna()]

    if grouper is not None:
        from .grouper import GrouperCollection
        self.grouper = GrouperCollection()
        self.grouper.add(**grouper)

    self.update_analysis()
def get_func_params(func, required_only=True):
    """Get the names of the arguments of a function (callable), or of the arguments
    in __init__ for a class (`self` not included)

    Args:
        func (`callable`): the function
        required_only (bool): if True, exclude arguments with default values

    Returns:
        a list of argument names in order
    """
    from inspect import signature

    if not callable(func):
        logging.error('func is not a callable', error_type=TypeError)

    sign = signature(func)
    if required_only:
        # only keep parameters without a default value
        return [key for key, param in sign.parameters.items() if param.default is param.empty]
    else:
        return list(sign.parameters.keys())
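# Illustrative usage sketch (not part of the library): lists the required and
# all arguments of a small made-up function, assuming `get_func_params` is importable.
def _example_get_func_params():
    def kinetic(c, k, A=1.0):
        return A * k * c

    required = get_func_params(kinetic)                          # ['c', 'k']
    everything = get_func_params(kinetic, required_only=False)   # ['c', 'k', 'A']
    return required, everything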
def filter_axis(self, filter, axis=0, remove_empty=False, inplace=False):
    allowed_axis = {
        'sample': 1,
        'observation': 1,
        1: 1,
        'seq': 0,
        'sequences': 0,
        'seqs': 0,
        0: 0
    }
    if isinstance(axis, str):
        axis = axis.lower()
    if axis not in allowed_axis.keys():
        logging.error("Unknown axis, please use 'sample'/1 or 'sequence'/0",
                      error_type=ValueError)
    else:
        axis = allowed_axis[axis]

    sliced = slice_table(self, axis=axis, keys=filter, remove_empty=remove_empty)
    if inplace:
        self.reindex(index=sliced.index, columns=sliced.columns, copy=False)
    return sliced
def get_est_results(self, param, pred_type='point_est'):
    """Return the estimation (pred) and truth of given parameter"""
    if pred_type in ['pe', 'point_est', 'point est', 'point_estimation', 'point estimation']:
        pred = self.results[param]
    elif pred_type in ['mean', 'bs_mean', 'bootstrap_mean']:
        pred = self.results[self._bs_prefix + param + '_mean']
    elif pred_type in ['median', 'bs_median', 'bootstrap_median']:
        pred = self.results[self._bs_prefix + param + '_50%']
    elif pred_type in ['rep_mean', 'replicate_mean']:
        pred = self.results[self._bs_prefix + param + '_mean']
    else:
        logging.error("Unknown pred_type, choose from 'point_est', 'bs_mean', 'bs_median', 'rep_mean'",
                      ValueError)
    truth = self.truth[param]
    return pd.DataFrame({'pred': pred[self.seq_list], 'truth': truth[self.seq_list]})
def spike_in_amount(self, spike_in_amount):
    """Check and reformat spike_in_amount type, and update norm_factors"""
    if isinstance(spike_in_amount, (list, np.ndarray)):
        # an un-keyed array must have the same length as base_table's columns
        if len(self.base_table.columns) != len(spike_in_amount):
            logging.error('Length of spike_in_amount does not match sample number', ValueError)
        else:
            self._spike_in_amount = pd.Series(data=spike_in_amount, index=self.base_table.columns)
    elif isinstance(spike_in_amount, dict):
        self._spike_in_amount = pd.Series(spike_in_amount)
    elif isinstance(spike_in_amount, pd.Series):
        self._spike_in_amount = spike_in_amount
    else:
        logging.error('Unknown spike_in_amount type, it should be list-like, pd.Series, or dict')
    if hasattr(self, 'spike_in_members'):
        self._update_norm_factors()
def filter(cls, target, axis=None, remove_empty=False, reverse=False, **kwargs):
    """Classmethod to directly apply filters"""
    mask = cls._get_mask(target=target, axis=axis, **kwargs)
    if reverse:
        mask = ~mask
    if axis is None:
        logging.error('Please indicate axis to filter', error_type=ValueError)
    if axis == 0:
        return slice_table(table=target, keys=target.index[mask], axis=0, remove_empty=remove_empty)
    else:
        return slice_table(table=target, keys=target.columns[mask], axis=1, remove_empty=remove_empty)
def generate_params(param_input):
    """Parse a single distribution input and reformat as generated results"""
    from types import GeneratorType

    if isinstance(param_input, (list, np.ndarray, pd.Series)):
        if len(param_input) == uniq_seq_num:
            return param_input
        else:
            logging.info('Size of input param list and expected uniq_seq_num does not match, '
                         'resample to given uniq_seq_num with replacement')
            return np.random.choice(param_input, replace=True, size=uniq_seq_num)
    elif isinstance(param_input, GeneratorType):
        # assume the generator yields one realization at a time
        return [next(param_input) for _ in range(uniq_seq_num)]
    elif callable(param_input):
        try:
            # if the callable accepts a size argument, draw uniq_seq_num values at once
            param_output = param_input(size=uniq_seq_num)
            if isinstance(param_output, (list, np.ndarray, pd.Series)):
                return param_output
            elif isinstance(param_output, GeneratorType):
                return next(param_output)
            else:
                logging.error("Unknown input to draw a distribution value", error_type=TypeError)
        except TypeError:
            # if it cannot accept a size argument, assume it generates single samples
            param_output = param_input()
            if isinstance(param_output, GeneratorType):
                return [next(param_output) for _ in range(uniq_seq_num)]
            elif isinstance(param_output, (float, int)):
                return [param_input() for _ in range(uniq_seq_num)]
            else:
                logging.error("Unknown callable return type for distribution", error_type=TypeError)
    else:
        logging.error("Unknown input to draw a distribution value", error_type=TypeError)
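# Illustrative sketch (not part of the library) of the parameter inputs that
# `generate_params` accepts; it reads `uniq_seq_num` from its enclosing scope,
# so here we only build example inputs rather than call it directly. The
# distributions and names below are made up.
def _example_param_inputs():
    import numpy as np
    explicit_values = [0.1, 0.5, 1.0]           # list-like: resampled if length mismatches uniq_seq_num

    def callable_with_size(size):               # callable accepting `size`: one vectorized draw
        return np.random.lognormal(size=size)

    def callable_scalar():                      # callable with no args: called once per sequence
        return np.random.uniform(0, 1)

    def generator():                            # generator: next() is called once per sequence
        while True:
            yield np.random.exponential()

    return explicit_values, callable_with_size, callable_scalar, generator()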