コード例 #1
0
    def calc_simple_ratios(self, label):
        """
        Compute simplified (original Enrich) ratio scores for *label*.

        Each element's frequency at the last timepoint is divided by its
        frequency at the input timepoint (``c_0``); the score is the base-2
        logarithm of that ratio. This method does not produce standard
        errors, so the ``SE`` column is filled with NaN.

        The result is written to ``'/main/label/scores'`` with the columns
        ``score``, ``SE`` and ``ratio``. Nothing is computed when
        ``store_check`` returns a true value for that key.
        """
        scores_key = "/main/{}/scores".format(label)
        if self.store_check(scores_key):
            return

        log_message(
            logging_callback=logging.info,
            msg="Calculating simple ratios ({})".format(label),
            extra={"oname": self.name},
        )

        last_col = "c_{}".format(self.store_timepoints()[-1])
        counts = self.store_select(
            key="/main/{}/counts".format(label), columns=["c_0", last_col]
        )

        # frequencies are computed on the raw numpy values of each column
        selected = counts[last_col]
        selected_freq = selected.values.astype("float") / selected.sum(axis="index")
        baseline = counts["c_0"]
        input_freq = baseline.values.astype("float") / baseline.sum(axis="index")
        ratio = selected_freq / input_freq

        # assemble the output table directly in its final column order
        scores = pd.DataFrame(
            {
                "score": np.log2(ratio),
                "SE": np.full(len(ratio), np.nan),
                "ratio": ratio,
            },
            index=counts.index,
            columns=["score", "SE", "ratio"],
        )

        self.store_put(
            key=scores_key,
            value=scores,
            data_columns=scores.columns,
        )
コード例 #2
0
    def calc_ratios(self, label):
        """
        Compute log frequency ratios and standard errors between the last
        timepoint and the input (``c_0``) for *label*.

        The counts used for normalization depend on ``self.logr_method``:
            - wt: counts of the wild type row, taken from the variants
              table if present, otherwise from the identifiers table
            - complete: column sums of ``'/main/label/counts'``
            - full: column sums of ``'/main/label/counts_unfiltered'``

        All counts have 0.5 added before logs or reciprocals are taken.
        The variance is the sum of the reciprocals of the adjusted element
        counts and of the adjusted shared counts; ``SE`` is its square root.

        The result is written to ``'/main/label/scores'`` with the columns
        ``score``, ``SE``, ``logratio`` and ``variance``. Nothing is
        computed when ``store_check`` returns a true value for that key.

        Raises ``ValueError`` if the method is unknown, or if the wild type
        method is requested and no suitable table or wild type row exists.
        """
        scores_key = "/main/{}/scores".format(label)
        if self.store_check(scores_key):
            return

        log_message(
            logging_callback=logging.info,
            msg="Calculating ratios ({})".format(label),
            extra={"oname": self.name},
        )
        last_col = "c_{}".format(self.store_timepoints()[-1])
        count_cols = ["c_0", last_col]
        counts = self.store_select(
            key="/main/{}/counts".format(label), columns=count_cols
        )

        method = self.logr_method
        if method == "wt":
            # prefer the variants table, fall back to identifiers
            for candidate in (VARIANTS, IDENTIFIERS):
                if candidate in self.store_labels():
                    wt_label = candidate
                    break
            else:
                raise ValueError(
                    "Failed to use wild type log ratio method, suitable "
                    "data table not present [{}]".format(self.name)
                )

            wt_counts = self.store_select(
                key="/main/{}/counts".format(wt_label),
                columns=count_cols,
                where="index='{}'".format(WILD_TYPE_VARIANT),
            )
            if not len(wt_counts):  # wild type not found
                raise ValueError(
                    "Failed to use wild type log ratio method, wild type "
                    "sequence not present [{}]".format(self.name)
                )
            shared_counts = wt_counts.values + 0.5

        elif method == "complete":
            totals = self.store_select(
                key="/main/{}/counts".format(label), columns=count_cols
            )
            shared_counts = totals.sum(axis="index").values + 0.5

        elif method == "full":
            totals = self.store_select(
                key="/main/{}/counts_unfiltered".format(label),
                columns=count_cols,
            )
            shared_counts = totals.sum(axis="index", skipna=True).values + 0.5

        else:
            raise ValueError(
                'Invalid log ratio method "{}" [{}]'.format(method, self.name)
            )

        # element counts with the 0.5 adjustment, reused for ratio and variance
        adjusted = counts[count_cols].values + 0.5
        log_freqs = np.log(adjusted) - np.log(shared_counts)
        logratio = log_freqs[:, 1] - log_freqs[:, 0]  # selected - input

        element_variance = np.sum(1.0 / adjusted, axis=1)
        shared_variance = np.sum(1.0 / shared_counts)
        variance = element_variance + shared_variance

        # assemble the output table directly in its final column order
        scores = pd.DataFrame(
            {
                "score": logratio,
                "SE": np.sqrt(variance),
                "logratio": logratio,
                "variance": variance,
            },
            index=counts.index,
            columns=["score", "SE", "logratio", "variance"],
        )
        self.store_put(
            key=scores_key,
            value=scores,
            data_columns=scores.columns,
        )
コード例 #3
0
    def calc_log_ratios(self, label):
        """
        Compute the per-timepoint log ratios that the linear models fit.

        For every timepoint column ``c_<t>`` of ``'/main/label/counts'``
        the natural log of (count + 0.5) is taken, and the log of
        (reference count + 0.5) for the same timepoint is subtracted. The
        reference counts depend on ``self.logr_method``:
            - wt: counts of the wild type row, taken from the variants
              table if present, otherwise from the identifiers table
            - complete: column sums of ``'/main/label/counts'``
            - full: column sums of ``'/main/label/counts_unfiltered'``

        The result is written to ``'/main/label/log_ratios'`` with one
        ``L_<t>`` column per timepoint. Nothing is computed when
        ``store_check`` returns a true value for that key.

        Raises ``ValueError`` if the method is unknown, or if the wild type
        method is requested and no suitable table or wild type row exists.
        """
        log_ratios_key = "/main/{}/log_ratios".format(label)
        if self.store_check(log_ratios_key):
            return

        log_message(
            logging_callback=logging.info,
            msg="Calculating log ratios ({})".format(label),
            extra={"oname": self.name},
        )

        counts = self.store_select("/main/{}/counts".format(label))
        element_index = counts.index
        count_cols = ["c_{}".format(tp) for tp in self.store_timepoints()]

        # the log is taken over the whole table, then the timepoint columns
        # are pulled out as a plain numpy array for easier broadcasting
        log_counts = np.log(counts + 0.5)[count_cols].values

        method = self.logr_method
        if method == "wt":
            # prefer the variants table, fall back to identifiers
            for candidate in (VARIANTS, IDENTIFIERS):
                if candidate in self.store_labels():
                    wt_label = candidate
                    break
            else:
                raise ValueError(
                    "Failed to use wild type log ratio method, suitable "
                    "data table not present [{}]".format(self.name)
                )

            wt_counts = self.store_select(
                key="/main/{}/counts".format(wt_label),
                columns=count_cols,
                where="index='{}'".format(WILD_TYPE_VARIANT),
            )
            if not len(wt_counts):  # wild type not found
                raise ValueError(
                    "Failed to use wild type log ratio method, wild type "
                    "sequence not present [{}]".format(self.name)
                )
            reference = wt_counts.values

        elif method == "complete":
            totals = self.store_select(
                key="/main/{}/counts".format(label), columns=count_cols
            )
            reference = totals.sum(axis="index").values

        elif method == "full":
            totals = self.store_select(
                key="/main/{}/counts_unfiltered".format(label), columns=count_cols
            )
            reference = totals.sum(axis="index", skipna=True).values

        else:
            raise ValueError(
                'Invalid log ratio method "{}" [{}]'.format(method, self.name)
            )

        log_ratios = log_counts - np.log(reference + 0.5)

        # wrap the array back into a data frame keyed like the counts table
        out_cols = ["L_{}".format(tp) for tp in self.store_timepoints()]
        frame = pd.DataFrame(data=log_ratios, index=element_index, columns=out_cols)
        self.store_put(
            key=log_ratios_key,
            value=frame,
            data_columns=frame.columns,
        )
コード例 #4
0
    def calc_regression(self, label):
        """
        Run the least squares regression for *label*.

        When ``self.weighted`` is true the method is reported as WLS and
        the ``'/main/label/weights'`` table is required and read alongside
        ``'/main/label/log_ratios'``; otherwise it is reported as OLS and
        only the log ratios are read.

        The input is processed in chunks. ``self.row_apply_function`` is
        applied to every row and each chunk's result is appended to
        ``'/main/label/scores'``. Afterwards the table is read back,
        ``score``/``SE`` are copied from ``slope``/``SE_slope``, the "weak"
        percentile of every SE among all SEs is added as ``SE_pctile``, and
        the table is rewritten in its final column order.

        Nothing is computed when ``store_check`` returns a true value for
        the scores key. Raises ``ValueError`` if a required input table
        fails ``store_check``.
        """
        log_ratios_key = "/main/{}/log_ratios".format(label)
        weights_key = "/main/{}/weights".format(label)
        scores_key = "/main/{}/scores".format(label)
        weighted = self.weighted

        input_keys = [log_ratios_key]
        if weighted:
            input_keys.append(weights_key)

        for table in input_keys:
            if self.store_check(table):
                continue
            raise ValueError(
                "Required table {} does not exist [{}].".format(table, self.name)
            )

        if self.store_check(scores_key):
            return
        if scores_key in list(self.store_keys()):
            # a leftover table must go first, since results are appended
            self.store_remove(scores_key)

        method = "WLS" if weighted else "OLS"
        log_message(
            logging_callback=logging.info,
            msg="Calculating {} regression coefficients ({})".format(method, label),
            extra={"oname": self.name},
        )

        # longest index entry, used as the minimum item size when appending
        index_only = self.store_select(key=log_ratios_key, columns=["index"])
        longest = index_only.index.map(len).max()

        # -------------------- REG COMPUTATION --------------------------- #
        chunks = self.store_select_as_multiple(keys=list(input_keys), chunk=True)
        for chunk_number, rows in enumerate(chunks, start=1):
            log_message(
                logging_callback=logging.info,
                msg="Calculating {} for chunk {} ({} rows)".format(
                    method, chunk_number, len(rows.index)
                ),
                extra={"oname": self.name},
            )

            fitted = rows.apply(
                self.row_apply_function,
                axis="columns",
                args=[self.store_timepoints(), weighted],
            )
            # append is required because it takes the
            # "min_itemsize" argument, and put doesn't
            self.store_append(
                key=scores_key,
                value=fitted,
                min_itemsize={"index": longest},
            )

        # ----------------------- POST ------------------------------------ #
        # read the appended table back, add the percentiles, and rewrite it
        log_message(
            logging_callback=logging.info,
            msg="Calculating slope standard error percentiles ({})".format(label),
            extra={"oname": self.name},
        )
        scores = self.store_get(scores_key)
        scores["score"] = scores["slope"]
        scores["SE"] = scores["SE_slope"]
        scores["SE_pctile"] = [
            stats.percentileofscore(scores["SE"], se, "weak") for se in scores["SE"]
        ]

        # final column order of the stored table
        final_columns = [
            "score",
            "SE",
            "SE_pctile",
            "slope",
            "intercept",
            "SE_slope",
            "t",
            "pvalue_raw",
        ]
        scores = scores[final_columns]
        self.store_put(key=scores_key, value=scores, data_columns=scores.columns)
コード例 #5
0
    def calc_weights(self, label):
        """
        Compute the regression weights (1 / variance) for *label*.

        For every timepoint column ``c_<t>`` of ``'/main/label/counts'``
        the variance is the reciprocal of (count + 0.5) plus the reciprocal
        of (reference count + 0.5) for the same timepoint. The reference
        counts depend on ``self.logr_method``:
            - wt: counts of the wild type row, taken from the variants
              table if present, otherwise from the identifiers table
            - complete: column sums of ``'/main/label/counts'``
            - full: column sums of ``'/main/label/counts_unfiltered'``

        The weights are written to ``'/main/label/weights'`` with one
        ``W_<t>`` column per timepoint. Nothing is computed when
        ``store_check`` returns a true value for that key.

        Raises ``ValueError`` if the method is unknown, or if the wild type
        method is requested and no suitable table or wild type row exists.
        """
        weights_key = "/main/{}/weights".format(label)
        if self.store_check(weights_key):
            return

        log_message(
            logging_callback=logging.info,
            msg="Calculating regression weights ({})".format(label),
            extra={"oname": self.name},
        )
        counts = self.store_select("/main/{}/counts".format(label))
        count_cols = ["c_{}".format(tp) for tp in self.store_timepoints()]
        element_index = counts.index

        # variance contributed by the element's own count at each timepoint,
        # kept as a plain numpy array for easier broadcasting
        element_variance = 1.0 / (counts[count_cols].values + 0.5)

        method = self.logr_method
        # -------------------------- WT NORM ----------------------------- #
        if method == "wt":
            # prefer the variants table, fall back to identifiers
            for candidate in (VARIANTS, IDENTIFIERS):
                if candidate in self.store_labels():
                    wt_label = candidate
                    break
            else:
                raise ValueError(
                    "Failed to use wild type log ratio method, suitable "
                    "data table not present [{}]".format(self.name)
                )

            wt_counts = self.store_select(
                key="/main/{}/counts".format(wt_label),
                columns=count_cols,
                where="index='{}'".format(WILD_TYPE_VARIANT),
            )
            if not len(wt_counts):  # wild type not found
                raise ValueError(
                    "Failed to use wild type log ratio method, wild type "
                    "sequence not present [{}]".format(self.name)
                )
            reference = wt_counts.values

        # ---------------------- COMPLETE NORM --------------------------- #
        elif method == "complete":
            totals = self.store_select(
                key="/main/{}/counts".format(label), columns=count_cols
            )
            reference = totals.sum(axis="index").values

        # ------------------------- FULL NORM ---------------------------- #
        elif method == "full":
            totals = self.store_select(
                key="/main/{}/counts_unfiltered".format(label), columns=count_cols
            )
            reference = totals.sum(axis="index", skipna=True).values

        # ----------------------- UNKNOWN METHOD ------------------------- #
        else:
            raise ValueError(
                'Invalid log ratio method "{}" [{}]'.format(method, self.name)
            )

        total_variance = element_variance + 1.0 / (reference + 0.5)

        # weights are reciprocal of variances
        weights = 1.0 / total_variance

        # wrap the array back into a data frame keyed like the counts table
        out_cols = ["W_{}".format(tp) for tp in self.store_timepoints()]
        frame = pd.DataFrame(data=weights, index=element_index, columns=out_cols)
        self.store_put(
            key=weights_key,
            value=frame,
            data_columns=frame.columns,
        )