Example #1
    @classmethod
    def from_json(cls,
                  path_to_folder,
                  estimator=None,
                  model=None,
                  x_data=None,
                  y_dataframe=None,
                  sigma=None):
        """Load results from folder of results with json format"""
        result = cls(estimator=estimator,
                     model=model,
                     x_data=x_data,
                     y_dataframe=y_dataframe,
                     sigma=sigma)
        path_to_folder = Path(path_to_folder)
        info(f'loading data from {path_to_folder}...')
        if path_to_folder.joinpath('summary.csv').exists():
            result.summary = pd.read_csv(
                path_to_folder.joinpath('summary.csv'), index_col=0)
        elif path_to_folder.joinpath('summary.json').exists():
            result.summary = pd.read_json(
                path_to_folder.joinpath('summary.json'))
            result.summary.index.name = 'seq'

        if path_to_folder.joinpath('seqs').exists():
            info("'seqs' folder found")
            seq_to_hash = read_json(
                path_to_folder.joinpath('seqs', 'seq_to_hash.json'))
        elif path_to_folder.joinpath('seqs.tar.gz').exists():
            # for previous saving formats
            import tarfile
            import json
            info("'seqs.tar.gz' found")
            with tarfile.open(path_to_folder.joinpath('seqs.tar.gz'),
                              mode='r:gz') as tf:
                try:
                    seq_to_hash = json.load(
                        tf.extractfile('seqs/seq_to_hash.json'))
                except KeyError:
                    # older archives nested the file under 'results/'
                    seq_to_hash = json.load(
                        tf.extractfile('results/seqs/seq_to_hash.json'))
        else:
            logging.warning(
                "'seqs' folder or 'seqs.tar.gz' not found - no individual sequence fitting results loaded"
            )
            seq_to_hash = None

        result._bs_record = seq_to_hash
        result._conv_record = seq_to_hash
        result.result_path = Path(path_to_folder)
        result.large_dataset = True
        return result
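
A minimal usage sketch, assuming this is a classmethod of the BatchFitResults class referenced in Example #4 (the path below is invented for illustration):

    # hypothetical path and usage; BatchFitResults.from_json is inferred, not confirmed
    results = BatchFitResults.from_json('path/to/result_folder')
    print(results.summary.head())    # per-sequence summary table, indexed by 'seq'
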
Example #2
    def func(target, norm_factor):
        """Normalize name seq_table w.r.t norm_factor

        Returns:
            pd.DataFrame of normalized name seq_table with only samples provided in norm_factor
        """
        def sample_normalize(col):
            return col.astype('float') * norm_factor[col.name]

        sample_list = []
        for sample in target.columns:
            if sample in norm_factor:
                sample_list.append(sample)
            else:
                logging.warning(
                    f'Sample {sample} is not in total amount norm_factor, skipping this sample'
                )

        return target[sample_list].apply(sample_normalize, axis=0)
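
A hedged usage sketch with invented sample names and factors (a real norm_factor would come from measured total amounts):

    import pandas as pd

    # invented counts and factors for illustration
    target = pd.DataFrame({'s1': [10, 20], 's2': [5, 5], 's3': [1, 2]})
    norm_factor = {'s1': 0.5, 's2': 2.0}     # no factor for 's3': it is skipped with a warning
    normalized = func(target, norm_factor)   # 's1' scaled by 0.5, 's2' by 2.0
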
Example #3
    def func(target, norm_factor):
        """Normalize counts in `name` by `norm_factor`"""

        if not isinstance(target, pd.DataFrame):
            logging.error('target needs to be pd.DataFrame',
                          error_type=TypeError)

        def sample_normalize(col):
            return col.astype('float') * norm_factor[col.name]

        sample_list = []
        for sample in target.columns:
            if sample in norm_factor:
                sample_list.append(sample)
            else:
                logging.warning(
                    f'Sample {sample} is not in spike-in norm_factor, skipping this sample'
                )

        return target[sample_list].apply(sample_normalize, axis=0)
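
Judging from the warning message, this variant differs from Example #2 only in where norm_factor comes from: spike-in quantification rather than total amounts. A hedged sketch of how a spike-in norm_factor might be built (all numbers here are invented):

    # invented numbers: scale each sample so observed spike-in counts match the known amount
    spike_in_counts = {'s1': 1000, 's2': 250}   # spike-in sequence counts per sample
    spike_in_amount = 2.0                       # known amount spiked into every sample
    norm_factor = {s: spike_in_amount / c for s, c in spike_in_counts.items()}
    normalized = func(target, norm_factor)
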
Example #4
    def __init__(self,
                 y_dataframe,
                 x_data,
                 model,
                 x_label=None,
                 y_label=None,
                 seq_to_fit=None,
                 sigma=None,
                 bounds=None,
                 init_guess=None,
                 opt_method='trf',
                 exclude_zero=False,
                 metrics=None,
                 rnd_seed=None,
                 curve_fit_kwargs=None,
                 replicates=None,
                 bootstrap_num=0,
                 bs_record_num=0,
                 bs_method='pct_res',
                 bs_stats=None,
                 grouper=None,
                 record_full=False,
                 conv_reps=0,
                 conv_init_range=None,
                 conv_stats=None,
                 note=None,
                 large_dataset=False,
                 verbose=1,
                 result_path=None):

        from ..utility.func_tools import AttrScope

        super().__init__()

        logging.info('Creating the BatchFitter...')

        self.model = model
        self.note = note

        # parse y_dataframe
        from ..utility.file_tools import table_object_to_dataframe
        self.y_dataframe = table_object_to_dataframe(y_dataframe)

        # process seq_to_fit
        if seq_to_fit is not None:
            if isinstance(seq_to_fit, (list, np.ndarray, pd.Series)):
                self.seq_to_fit = list(seq_to_fit)
            elif isinstance(seq_to_fit, int):
                self.seq_to_fit = y_dataframe.index[:seq_to_fit].values
            else:
                logging.error(
                    'Unknown seq_to_fit type, is it list-like or int?',
                    error_type=TypeError)
        else:
            self.seq_to_fit = seq_to_fit

        # prep fitting params shared by all fittings
        if isinstance(x_data, pd.Series):
            self.x_data = x_data[y_dataframe.columns.values]
        elif len(x_data) != y_dataframe.shape[1]:
            logging.error(
                'x_data length and table column number does not match',
                error_type=ValueError)
        else:
            self.x_data = np.array(x_data)

        if sigma is not None:
            if np.shape(sigma) != np.shape(self.y_dataframe):
                logging.error(
                    'Shape of sigma does not match the shape of y_dataframe',
                    error_type=ValueError)
        self.sigma = sigma

        if bounds is None:
            bounds = (-np.inf, np.inf)

        if len(x_data) <= 1:
            logging.warning(
                "Number of data points less than 2, bootstrap will not be performed"
            )
            bootstrap_num = 0
        self.bootstrap = bootstrap_num > 0

        # arguments that should be passed to each single estimator
        self.fit_params = AttrScope(
            x_data=self.x_data,
            x_label=x_label,
            y_label=y_label,
            model=self.model,
            bounds=bounds,
            init_guess=init_guess,
            opt_method=opt_method,
            exclude_zero=exclude_zero,
            metrics=metrics,
            rnd_seed=rnd_seed,
            curve_fit_kwargs=curve_fit_kwargs,
            replicates=replicates,
            bootstrap_num=bootstrap_num,
            bs_record_num=bs_record_num,
            bs_method=bs_method,
            bs_stats=bs_stats,
            grouper=grouper if bs_method == 'stratified' else None,
            record_full=record_full,
            conv_reps=conv_reps,
            conv_init_range=conv_init_range,
            conv_stats=conv_stats,
            verbose=verbose,
        )
        if result_path is None:
            self.results = BatchFitResults(estimator=self)
        else:
            self.results = BatchFitResults.load_result(result_path)
        self.large_dataset = large_dataset
        self.results.large_dataset = large_dataset
        self.workers = None

        logging.info('BatchFitter created')
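
A minimal construction sketch, assuming BatchFitter is importable from this package; the model, data, and column names are invented for illustration:

    import numpy as np
    import pandas as pd

    # hypothetical one-parameter model; the first argument must be x
    def model(x, k):
        return 1 - np.exp(-k * np.asarray(x))

    x_data = pd.Series([0.5, 1.0, 2.0, 4.0], index=['t1', 't2', 't3', 't4'])
    y_dataframe = pd.DataFrame(np.random.rand(10, 4),   # 10 sequences x 4 samples
                               columns=['t1', 't2', 't3', 't4'])
    fitter = BatchFitter(y_dataframe=y_dataframe, x_data=x_data, model=model,
                         bootstrap_num=100, bs_record_num=20, bs_method='pct_res')
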
Example #5
    def fit(self,
            parallel_cores=1,
            point_estimate=True,
            replicates=False,
            bootstrap=False,
            convergence_test=False,
            stream_to=None,
            overwrite=False):
        """Run the estimation
        Args:
            parallel_cores (int): number of parallel cores to use. Default 1
            point_estimate (bool): if perform point estimation, default True
            bootstrap (bool): if perform bootstrap uncertainty estimation, default False
            replicates (bool): if perform replicates for uncertainty estimation, default False
            convergence_test (bool): if perform convergence test, default False
            stream_to (str): Directly stream fitting results to disk if output path is given
                will create a folder with name of seq/hash with pickled dict of fitting results
            overwrite (bool): if overwrite existing results when stream to disk. Default False.
        """

        from yutility.log import Timer
        logging.info('Batch fitting starting...')

        with Timer():
            if self.large_dataset and stream_to is None:
                logging.error(
                    'You are working with a large dataset and stream_to needs to be specified',
                    error_type=ValueError)
            if not self.large_dataset and stream_to is not None:
                self.large_dataset = True
                logging.warning(
                    "You provided `stream_to`, so the large_dataset method is used"
                )

            if self.large_dataset:
                self._hash()
                self.results.result_path = Path(stream_to)
                check_dir(self.results.result_path.joinpath('seqs'))
                dump_json(obj=self._seq_to_hash,
                          path=self.results.result_path.joinpath(
                              'seqs', 'seq_to_hash.json'))

            from functools import partial
            work_fn = partial(_work_fn,
                              point_estimate=point_estimate,
                              replicates=replicates,
                              bootstrap=bootstrap,
                              convergence_test=convergence_test)
            worker_generator = self._worker_generator(stream_to=stream_to,
                                                      overwrite=overwrite)
            if parallel_cores > 1:
                import multiprocessing as mp
                logging.info(
                    f'Use multiprocessing to fit in {parallel_cores} parallel processes...')
                # context manager ensures the pool is closed once mapping finishes
                with mp.Pool(processes=int(parallel_cores)) as pool:
                    workers = pool.map(work_fn, worker_generator)
            else:
                # single thread
                logging.info('Fitting in a single thread...')
                workers = [work_fn(worker) for worker in worker_generator]

            self.results.summary = pd.DataFrame(
                {worker.name: worker.summary()
                 for worker in workers}).transpose()
            self.results.summary.index.name = 'seq'
            # record result
            if self.bootstrap:
                if self.large_dataset:
                    self.results._bs_record = self._seq_to_hash
                else:
                    self.results._bs_record = {
                        worker.name: worker.results.uncertainty.records
                        for worker in workers
                    }
            if convergence_test:
                if self.large_dataset:
                    self.results._conv_record = self._seq_to_hash
                else:
                    self.results._conv_record = {
                        worker.name: worker.results.convergence.records
                        for worker in workers
                    }

            if self.large_dataset:
                self._hash_inv()
                self.results.to_json(output_dir=stream_to)

            logging.info('Fitting finished')
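
A hedged call sketch (the output path is invented; fitter is a BatchFitter constructed as in Example #4):

    # hypothetical: point estimates plus bootstrap, streamed to disk for large datasets
    fitter.fit(parallel_cores=4,
               point_estimate=True,
               bootstrap=True,
               stream_to='path/to/output_dir',
               overwrite=False)
    print(fitter.results.summary.head())
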
Example #6
from pytest import raises

def test_logging_can_log():
    logging.info('Some info')
    logging.warning("Some warning")
    with raises(ValueError):
        logging.error("let's get some ValueError", error_type=ValueError)
Example #7
    def _fit(self, model=None, x_data=None, y_data=None, sigma=None, bounds="unspecified",
             metrics=None, init_guess=None, curve_fit_kwargs=None):

        from scipy.optimize import curve_fit
        from ..utility.func_tools import update_none
        from ..utility.func_tools import get_func_params

        model = update_none(model, self.model)
        parameters = get_func_params(model, required_only=True)[1:]
        x_data = update_none(x_data, self.x_data)
        y_data = update_none(y_data, self.y_data)
        sigma = update_none(sigma, self.config.sigma)
        if sigma is not None and len(x_data) != len(sigma):
            sigma = None
            logging.debug('Sigma is ignored as its length differs from x_data')
        if bounds == "unspecified":
            bounds = self.config.bounds
        if bounds is None:
            bounds = (-np.inf, np.inf)
        metrics = update_none(metrics, self.config.metrics)

        init_guess = update_none(init_guess, self.config.init_guess)
        curve_fit_kwargs = update_none(curve_fit_kwargs, self.config.curve_fit_kwargs)

        try:
            if not init_guess:
                # by default, use a random guess from (0, 1) for each parameter
                init_guess = [np.random.random() for _ in parameters]
            if curve_fit_kwargs is None:
                curve_fit_kwargs = {}
            params, pcov = curve_fit(f=model, xdata=x_data, ydata=y_data,
                                     sigma=sigma, bounds=bounds, p0=init_guess, **curve_fit_kwargs)
            if metrics is not None:
                metrics_res = pd.Series({name: fn(params) for name, fn in metrics.items()})
            else:
                metrics_res = None
        except Exception as e:
            # RuntimeError (no convergence), ValueError (bad input), and any other
            # failure are all handled the same way: record NaN for every output
            logging.warning(
                f"{type(e).__name__} on\n"
                f'\tx = {x_data}\n'
                f'\ty = {y_data}\n'
                f'\tsigma = {sigma}'
            )
            params = np.full(fill_value=np.nan, shape=len(parameters))
            pcov = np.full(fill_value=np.nan, shape=(len(parameters), len(parameters)))
            if metrics is not None:
                metrics_res = pd.Series({name: np.nan for name in metrics})
            else:
                metrics_res = None

        return {
            'params': pd.Series(data=params, index=parameters),
            'pcov': pd.DataFrame(data=pcov, index=parameters, columns=parameters),
            'metrics': metrics_res
        }
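
A hedged sketch of consuming the returned dict, assuming a one-parameter model whose parameter happens to be named 'k' (an invented name) and an estimator configured elsewhere:

    # hypothetical: inspect a single fit result returned by _fit
    result = estimator._fit()
    k_hat = result['params']['k']            # point estimate for parameter 'k'
    k_var = result['pcov'].loc['k', 'k']     # variance of 'k' from the covariance matrix
    print(result['metrics'])                 # None unless metric functions were configured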