Example 1
    def score(self, table):

        prepared_table, __ = prepare_data_table(
            table, score_columns=self.score_columns)
        texp = Experiment(prepared_table)
        score = self.classifier.score(texp, True)
        texp["d_score"] = (score - self.mu) / self.nu

        s_values, q_values = lookup_s_and_q_values_from_error_table(
            texp["d_score"].values, self.error_stat.df)
        texp["m_score"] = q_values
        texp["s_value"] = s_values
        logging.info("mean m_score = %e, std_dev m_score = %e" %
                     (np.mean(q_values), np.std(q_values, ddof=1)))
        logging.info("mean s_value = %e, std_dev s_value = %e" %
                     (np.mean(s_values), np.std(s_values, ddof=1)))
        texp.add_peak_group_rank()

        df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

        if CONFIG.get("compute.probabilities"):
            df = self.add_probabilities(df, texp)

        if CONFIG.get("target.compress_results"):
            to_drop = [
                n for n in df.columns
                if n.startswith("var_") or n.startswith("main_")
            ]
            df.drop(to_drop, axis=1, inplace=True)

        return df
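
The d_score assigned above is the raw classifier score standardized against the decoy score distribution (the same mu/nu normalization shown in the learn_randomized examples further down). A minimal sketch with made-up numbers:

import numpy as np

raw_scores = np.array([1.2, 3.5, 0.4, 2.8])   # hypothetical classifier scores
mu, nu = 0.9, 0.7                             # assumed decoy mean and std dev
d_scores = (raw_scores - mu) / nu             # d_score: decoy std devs above the decoy mean
print(d_scores)                               # roughly [0.43, 3.71, -0.71, 2.71]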
Example 2
def cleanup_and_check(df):
    score_columns = ["main_score"
                     ] + [c for c in df.columns if c.startswith("var_")]
    # this is fast but not easy to read
    # find peak groups with invalid scores:
    sub_df = df.loc[:, score_columns]
    flags = ~pd.isnull(sub_df)
    valid_rows = flags.all(axis=1)

    df_cleaned = df.loc[valid_rows, :]

    # decoy / non decoy sub tables
    df_decoy = df_cleaned[df_cleaned["is_decoy"].eq(True)]
    df_target = df_cleaned[df_cleaned["is_decoy"].eq(False)]

    # groups
    decoy_groups = set(df_decoy["tg_id"])
    target_groups = set(df_target["tg_id"])

    n_decoy = len(decoy_groups)
    n_target = len(target_groups)

    msg = "data set contains %d decoy and %d target transition groups" % (
        n_decoy, n_target)
    logging.info(msg)
    if n_decoy < 10 or n_target < 10:
        logging.error("need at least 10 decoy groups ans 10 non decoy groups")
        raise Exception(
            "need at least 10 decoy groups ans 10 non decoy groups. %s" % msg)

    return df_cleaned
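
A minimal sketch of the row filter used above, on a hypothetical three-row table; rows containing a NaN in any score column are dropped (the decoy/target group-count check is omitted here):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "main_score": [0.3, np.nan, 0.7],
    "var_xcorr":  [1.1, 2.2, np.nan],
    "is_decoy":   [False, True, False],
    "tg_id":      [1, 2, 3],
})
score_columns = ["main_score"] + [c for c in df.columns if c.startswith("var_")]
valid_rows = df[score_columns].notnull().all(axis=1)
print(df.loc[valid_rows, :])   # only the first row survives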
Example 3
 def apply_scorer_out_of_core(self, pathes, delim, check_cols, loaded_scorer):
     self.check_table_headers(pathes, delim, check_cols)
     with timer():
         logging.info("apply scorer to input data")
         result, __, used_weights = self._apply_scorer_out_of_core(pathes, delim, loaded_scorer)
         logging.info("processing input data finished")
     return result, None, used_weights
Example 4
def cleanup_and_check(df):
    score_columns = ["main_score"] + [c for c in df.columns if c.startswith("var_")]
    # this is fast but not easy to read
    # find peak groups with invalid scores:
    sub_df = df.loc[:, score_columns]
    flags = ~pd.isnull(sub_df)
    valid_rows = flags.all(axis=1)

    df_cleaned = df.loc[valid_rows, :]

    # decoy / non decoy sub tables
    df_decoy = df_cleaned[df_cleaned["is_decoy"].eq(True)]
    df_target = df_cleaned[df_cleaned["is_decoy"].eq(False)]

    # groups
    decoy_groups = set(df_decoy["tg_id"])
    target_groups = set(df_target["tg_id"])

    n_decoy = len(decoy_groups)
    n_target = len(target_groups)

    msg = "data set contains %d decoy and %d target transition groups" % (n_decoy, n_target)
    logging.info(msg)
    if n_decoy < 10 or n_target < 10:
        logging.error("need at least 10 decoy groups ans 10 non decoy groups")
        raise Exception("need at least 10 decoy groups ans 10 non decoy groups. %s" % msg)

    return df_cleaned
Example 5
    def score(self, table):

        prepared_table, __ = prepare_data_table(table, score_columns=self.score_columns)
        texp = Experiment(prepared_table)
        score = self.classifier.score(texp, True)
        texp["d_score"] = (score - self.mu) / self.nu

        s_values, q_values = lookup_s_and_q_values_from_error_table(texp["d_score"].values,
                                                                    self.error_stat.df)
        texp["m_score"] = q_values
        texp["s_value"] = s_values
        logging.info("mean m_score = %e, std_dev m_score = %e" % (np.mean(q_values),
                                                                  np.std(q_values, ddof=1)))
        logging.info("mean s_value = %e, std_dev s_value = %e" % (np.mean(s_values),
                                                                  np.std(s_values, ddof=1)))
        texp.add_peak_group_rank()

        df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

        if CONFIG.get("compute.probabilities"):
            df = self.add_probabilities(df, texp)

        if CONFIG.get("target.compress_results"):
            to_drop = [n for n in df.columns if n.startswith("var_") or n.startswith("main_")]
            df.drop(to_drop, axis=1, inplace=True)

        return df
Example 6
 def apply_scorer_out_of_core(self, pathes, delim, check_cols,
                              loaded_scorer):
     self.check_table_headers(pathes, delim, check_cols)
     with timer():
         logging.info("apply scorer to input data")
         result, __, used_weights = self._apply_scorer_out_of_core(
             pathes, delim, loaded_scorer)
         logging.info("processing input data finished")
     return result, None, used_weights
Example 7
    def apply_weights(self, pathes, delim_in, check_cols, loaded_weights):

        self.check_table_headers(pathes, delim_in, check_cols)
        tables = list(self.read_tables_iter(pathes, delim_in))
        with timer():
            logging.info("apply weights")
            result, scorer, trained_weights = self._apply_weights(tables, loaded_weights)
            logging.info("processing input data finished")
        return result, scorer, trained_weights
Example 8
    def learn_and_apply_out_of_core(self, pathes, delim, check_cols):

        self.check_table_headers(pathes, delim, check_cols)
        with timer():

            logging.info("learn and apply classifier out of core")
            result, scorer, trained_weights = self._learn_and_apply_out_of_core(pathes, delim)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 9
    def apply_scorer(self, pathes, delim, check_cols, loaded_scorer):
        self.check_table_headers(pathes, delim, check_cols)
        tables = list(self.read_tables_iter(pathes, delim))

        with timer():
            logging.info("apply scorer to input data")
            result, __, trained_weights = self._apply_scorer(tables, loaded_scorer)
            scorer = None
            logging.info("processing input data finished")
        return result, scorer, trained_weights
Example 10
    def apply_weights_out_of_core(self, pathes, delim, check_cols, loaded_weights):
        self.check_table_headers(pathes, delim, check_cols)
        with timer():

            logging.info("apply weights out of core")
            result, scorer, trained_weights = self._apply_weights_out_of_core(pathes, delim,
                                                                              loaded_weights)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 11
    def apply_weights(self, pathes, delim_in, check_cols, loaded_weights):

        self.check_table_headers(pathes, delim_in, check_cols)
        tables = list(self.read_tables_iter(pathes, delim_in))
        with timer():
            logging.info("apply weights")
            result, scorer, trained_weights = self._apply_weights(
                tables, loaded_weights)
            logging.info("processing input data finished")
        return result, scorer, trained_weights
Example 12
    def apply_scorer(self, pathes, delim, check_cols, loaded_scorer):
        self.check_table_headers(pathes, delim, check_cols)
        tables = list(self.read_tables_iter(pathes, delim))

        with timer():
            logging.info("apply scorer to input data")
            result, __, trained_weights = self._apply_scorer(
                tables, loaded_scorer)
            scorer = None
            logging.info("processing input data finished")
        return result, scorer, trained_weights
Example 13
    def apply_weights_out_of_core(self, pathes, delim, check_cols,
                                  loaded_weights):
        self.check_table_headers(pathes, delim, check_cols)
        with timer():

            logging.info("apply weights out of core")
            result, scorer, trained_weights = self._apply_weights_out_of_core(
                pathes, delim, loaded_weights)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 14
    def learn_and_apply_out_of_core(self, pathes, delim, check_cols):

        self.check_table_headers(pathes, delim, check_cols)
        with timer():

            logging.info("learn and apply classifier out of core")
            result, scorer, trained_weights = self._learn_and_apply_out_of_core(
                pathes, delim)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 15
    def learn_and_apply(self, pathes, delim, check_cols):

        self.check_table_headers(pathes, delim, check_cols)
        tables = list(self.read_tables_iter(pathes, delim))
        with timer():

            logging.info("learn and apply classifier from input data")
            result, scorer, trained_weights = self._learn_and_apply(tables)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 16
    def learn_and_apply(self, pathes, delim, check_cols):

        self.check_table_headers(pathes, delim, check_cols)
        tables = list(self.read_tables_iter(pathes, delim))
        with timer():

            logging.info("learn and apply classifier from input data")
            result, scorer, trained_weights = self._learn_and_apply(tables)
            logging.info("processing input data finished")

        return result, scorer, trained_weights
Example 17
    def run(self):

        self.prefix = self.check_pathes()
        dirname = self.determine_output_dir_name()
        out_pathes = self.create_out_pathes(dirname)

        extra_writes = dict(self.extra_writes(dirname))

        to_check = list(v for p in out_pathes for v in p.values())
        to_check.extend(extra_writes.values())

        if not CONFIG.get("target.overwrite"):
            error = check_if_any_exists(to_check)
            if error:
                return False

        self.check_cols = ["transition_group_id", "run_id", "decoy"]
        if CONFIG.get("export.mayu"):
            self.check_cols += mayu_cols()
            if 'm_score' in self.check_cols:
                # The m_score is calculated by the learner and should not be
                # in the OpenSwathWorkflow output.
                self.check_cols.remove('m_score')

        logging.info("config settings:")
        for k, v in sorted(CONFIG.config.items()):
            logging.info("    %s: %s" % (k, v))

        start_at = time.time()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            (result, scorer, weights) = self.run_algo()

        needed = time.time() - start_at

        set_pandas_print_options()
        self.print_summary(result)
        pvalues = None if scorer is None else scorer.target_pvalues
        self.save_results(result, extra_writes, out_pathes, pvalues)

        self.save_scorer(scorer, extra_writes)
        self.save_weights(weights, extra_writes)

        seconds = int(needed)
        msecs = int(1000 * (needed - seconds))
        minutes = int(needed / 60.0)

        print "NEEDED",
        if minutes:
            print minutes, "minutes and",

        print "%d seconds and %d msecs wall time" % (seconds, msecs)
        print
Example 18
def _main(args):

    pathes = parse_cmdline(args)

    apply_scorer = CONFIG.get("apply_scorer")
    apply_weights = CONFIG.get("apply_weights")
    prefix = CONFIG.get("target.prefix")
    merge_results = CONFIG.get("multiple_files.merge_results")
    delim_in = CONFIG.get("delim.in")
    delim_out = CONFIG.get("delim.out")
    out_of_core = CONFIG.get("out_of_core")

    random_seed = CONFIG.get("random_seed")
    num_processes = CONFIG.get("num_processes")

    if random_seed is not None and num_processes != 1:
        raise Exception(
            "Setting random seed does not work if you run pyprophet with multiple "
            "processes. Using more than one process is rarely faster.")

    if random_seed is not None:
        np.random.seed(random_seed)

    if apply_scorer and apply_weights:
        raise Exception("can not apply scorer and weights at the same time")

    learning_mode = not apply_scorer and not apply_weights

    if learning_mode:
        if out_of_core:
            PyProphetOutOfCoreLearner(pathes, prefix, merge_results, delim_in,
                                      delim_out).run()
        else:
            PyProphetLearner(pathes, prefix, merge_results, delim_in,
                             delim_out).run()

    elif apply_weights:
        if out_of_core:
            PyProphetOutOfCoreWeightApplier(pathes, prefix, merge_results,
                                            apply_weights, delim_in,
                                            delim_out).run()
        else:
            PyProphetWeightApplier(pathes, prefix, merge_results,
                                   apply_weights, delim_in, delim_out).run()

    else:
        if out_of_core:
            logging.info(
                "out_of_core setting ignored: this parameter has no influence for "
                "applying a persisted scorer")
        PyProphetOutOfCoreScorerApplier(pathes, prefix, merge_results,
                                        apply_scorer, delim_in,
                                        delim_out).run()
Example 19
    def run(self):

        self.prefix = self.check_pathes()
        dirname = self.determine_output_dir_name()
        out_pathes = self.create_out_pathes(dirname)

        extra_writes = dict(self.extra_writes(dirname))

        to_check = list(v for p in out_pathes for v in p.values())
        to_check.extend(extra_writes.values())

        if not CONFIG.get("target.overwrite"):
            error = check_if_any_exists(to_check)
            if error:
                return False

        self.check_cols = ["transition_group_id", "run_id", "decoy"]
        if CONFIG.get("export.mayu"):
            self.check_cols += mayu_cols()

        logging.info("config settings:")
        for k, v in sorted(CONFIG.config.items()):
            logging.info("    %s: %s" % (k, v))

        start_at = time.time()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            (result, scorer, weights) = self.run_algo()

        compress = CONFIG.get("target.compress_results")
        needed = time.time() - start_at

        set_pandas_print_options()
        self.print_summary(result)
        self.save_results(result, extra_writes, out_pathes)

        self.save_scorer(scorer, extra_writes)
        self.save_weights(weights, extra_writes)

        seconds = int(needed)
        msecs = int(1000 * (needed - seconds))
        minutes = int(needed / 60.0)

        print "NEEDED",
        if minutes:
            print minutes, "minutes and",

        print "%d seconds and %d msecs wall time" % (seconds, msecs)
        print
Example 20
    def _build_result(self, tables, final_classifier, score_columns, experiment,
                      all_test_target_scores, all_test_decoy_scores):

        merge_results = CONFIG.get("multiple_files.merge_results")
        weights = final_classifier.get_parameters()
        scorer = Scorer(final_classifier, score_columns, experiment, all_test_target_scores,
                        all_test_decoy_scores, merge_results)

        scored_tables = list(scorer.score_many(tables))

        final_statistics, summary_statistics = scorer.get_error_stats()

        result = Result(summary_statistics, final_statistics, scored_tables)

        logging.info("calculated scoring and statistics")
        return result, scorer, weights
Example 21
    def _apply_weights_on_exp(self, experiment, loaded_weights):

        learner = self.semi_supervised_learner

        logging.info("start application of pretrained weights")
        clf_scores = learner.score(experiment, loaded_weights)
        experiment.set_and_rerank("classifier_score", clf_scores)

        all_test_target_scores = experiment.get_top_target_peaks()["classifier_score"]
        all_test_decoy_scores = experiment.get_top_decoy_peaks()["classifier_score"]
        logging.info("finished pretrained scoring")

        ws = [loaded_weights.flatten()]
        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        return final_classifier, all_test_target_scores, all_test_decoy_scores
Example 22
@contextmanager  # from contextlib; required so timer() can be used as "with timer(): ..."
def timer(name=""):
    start_at = time.time()

    yield

    needed = time.time() - start_at
    hours = int(needed / 3600)
    needed -= hours * 3600

    minutes = int(needed / 60)
    needed -= minutes * 60

    if name:
        logging.info("time needed for %s: %02d:%02d:%.1f" % (name, hours, minutes, needed))
    else:
        logging.info("time needed: %02d:%02d:%.1f" % (hours, minutes, needed))
Example 23
@contextmanager  # from contextlib; required so timer() can be used as "with timer(): ..."
def timer(name=""):
    start_at = time.time()

    yield

    needed = time.time() - start_at
    hours = int(needed / 3600)
    needed -= hours * 3600

    minutes = int(needed / 60)
    needed -= minutes * 60

    if name:
        logging.info("time needed for %s: %02d:%02d:%.1f" %
                     (name, hours, minutes, needed))
    else:
        logging.info("time needed: %02d:%02d:%.1f" % (hours, minutes, needed))
Example 24
    def determine_output_dir_name(self):

        # from now on: the parameter checks above apply only in learning mode

        dirname = CONFIG.get("target.dir")
        if dirname is None:
            dirnames = set(os.path.dirname(path) for path in self.pathes)
            # always fine when not in learning mode, since pathes then has only one entry
            if len(dirnames) > 1:
                raise Exception("could not derive common directory name of input files, please use "
                                "--target.dir option")
            dirname = dirnames.pop()

        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
            logging.info("created folder %s" % dirname)
        return dirname
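
A minimal sketch of the common-directory derivation above, with hypothetical input paths:

import os

pathes = ["/data/run1/table1.tsv", "/data/run1/table2.tsv"]   # hypothetical inputs
dirnames = set(os.path.dirname(path) for path in pathes)
if len(dirnames) > 1:
    raise Exception("could not derive common directory name of input files, "
                    "please use --target.dir option")
print(dirnames.pop())   # -> /data/run1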
Example 25
    def _build_result(self, tables, final_classifier, score_columns,
                      experiment, all_test_target_scores,
                      all_test_decoy_scores):

        merge_results = CONFIG.get("multiple_files.merge_results")
        weights = final_classifier.get_parameters()
        scorer = Scorer(final_classifier, score_columns, experiment,
                        all_test_target_scores, all_test_decoy_scores,
                        merge_results)

        scored_tables = list(scorer.score_many(tables))

        final_statistics, summary_statistics = scorer.get_error_stats()

        result = Result(summary_statistics, final_statistics, scored_tables)

        logging.info("calculated scoring and statistics")
        return result, scorer, weights
Example 26
    def _apply_weights_on_exp(self, experiment, loaded_weights):

        learner = self.semi_supervised_learner

        logging.info("start application of pretrained weights")
        clf_scores = learner.score(experiment, loaded_weights)
        experiment.set_and_rerank("classifier_score", clf_scores)

        all_test_target_scores = experiment.get_top_target_peaks(
        )["classifier_score"]
        all_test_decoy_scores = experiment.get_top_decoy_peaks(
        )["classifier_score"]
        logging.info("finished pretrained scoring")

        ws = [loaded_weights.flatten()]
        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        return final_classifier, all_test_target_scores, all_test_decoy_scores
Example 27
    def learn_randomized(self, experiment):
        assert isinstance(experiment, Experiment)

        num_iter = CONFIG.get("semi_supervised_learner.num_iter")
        logging.info("start learn_randomized")

        fraction = CONFIG.get("xeval.fraction")
        is_test = CONFIG.get("is_test")
        experiment.split_for_xval(fraction, is_test)
        train = experiment.get_train_peaks()

        train.rank_by("main_score")

        params, clf_scores = self.start_semi_supervised_learning(train)

        train.set_and_rerank("classifier_score", clf_scores)

        # semi supervised iteration:
        for inner in range(num_iter):
            params, clf_scores = self.iter_semi_supervised_learning(train)
            train.set_and_rerank("classifier_score", clf_scores)

        # after semi supervised iteration: classify full dataset
        clf_scores = self.score(experiment, params)
        mu, nu = mean_and_std_dev(clf_scores)
        experiment.set_and_rerank("classifier_score", clf_scores)

        td_scores = experiment.get_top_decoy_peaks()["classifier_score"]

        mu, nu = mean_and_std_dev(td_scores)
        experiment["classifier_score"] = (experiment["classifier_score"] -
                                          mu) / nu
        experiment.rank_by("classifier_score")

        top_test_peaks = experiment.get_top_test_peaks()

        top_test_target_scores = top_test_peaks.get_target_peaks(
        )["classifier_score"]
        top_test_decoy_scores = top_test_peaks.get_decoy_peaks(
        )["classifier_score"]

        logging.info("end learn_randomized")

        return top_test_target_scores, top_test_decoy_scores, params
Example 28
    def _learn(self, experiment):
        is_test = CONFIG.get("is_test")
        if is_test:  # for reliable results
            experiment.df.sort("tg_id", ascending=True, inplace=True)

        learner = self.semi_supervised_learner
        ws = []

        neval = CONFIG.get("xeval.num_iter")
        num_processes = CONFIG.get("num_processes")
        all_test_target_scores = []
        all_test_decoy_scores = []

        logging.info("learn and apply scorer")
        logging.info("start %d cross evals using %d processes" %
                     (neval, num_processes))

        if num_processes == 1:
            for k in range(neval):
                (ttt_scores, ttd_scores,
                 w) = learner.learn_randomized(experiment)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                ws.append(w.flatten())
        else:
            pool = multiprocessing.Pool(processes=num_processes)
            while neval:
                remaining = max(0, neval - num_processes)
                todo = neval - remaining
                neval -= todo
                args = ((learner, "learn_randomized", (experiment, )), ) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                ttt_scores = [ti for r in res for ti in r[0]]
                ttd_scores = [ti for r in res for ti in r[1]]
                ws.extend([r[2] for r in res])
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
        logging.info("finished cross evals")
        logging.info("")

        # only use scores from the last iteration to build the statistical model:
        if CONFIG.get("semi_supervised_learner.stat_best"):
            all_test_target_scores = ttt_scores
            all_test_decoy_scores = ttd_scores

        # we only use weights from last iteration if indicated:
        if CONFIG.get("semi_supervised_learner.use_best"):
            ws = [ws[-1]]

        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        return final_classifier, all_test_target_scores, all_test_decoy_scores
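
The while loop above feeds the worker pool at most num_processes cross-validation runs per pool.map call. A minimal sketch of just that batching arithmetic, with made-up numbers:

neval, num_processes = 7, 3   # hypothetical settings
rounds = []
while neval:
    remaining = max(0, neval - num_processes)
    todo = neval - remaining
    neval -= todo
    rounds.append(todo)
print(rounds)   # [3, 3, 1]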
Example 29
def _main(args):

    pathes = parse_cmdline(args)

    apply_scorer = CONFIG.get("apply_scorer")
    apply_weights = CONFIG.get("apply_weights")
    prefix = CONFIG.get("target.prefix")
    merge_results = CONFIG.get("multiple_files.merge_results")
    delim_in = CONFIG.get("delim.in")
    delim_out = CONFIG.get("delim.out")
    out_of_core = CONFIG.get("out_of_core")

    random_seed = CONFIG.get("random_seed")

    if random_seed is not None:
        random.seed(random_seed)

    if apply_scorer and apply_weights:
        raise Exception("can not apply scorer and weights at the same time")

    learning_mode = not apply_scorer and not apply_weights

    if learning_mode:
        if out_of_core:
            PyProphetOutOfCoreLearner(pathes, prefix, merge_results, delim_in, delim_out).run()
        else:
            PyProphetLearner(pathes, prefix, merge_results, delim_in, delim_out).run()

    elif apply_weights:
        if out_of_core:
            PyProphetOutOfCoreWeightApplier(
                pathes, prefix, merge_results, apply_weights, delim_in, delim_out).run()
        else:
            PyProphetWeightApplier(
                pathes, prefix, merge_results, apply_weights, delim_in, delim_out).run()

    else:
        if out_of_core:
            logging.info("out_of_core setting ignored: this parameter has no influence for "
                         "applying a persisted scorer")
        PyProphetOutOfCoreScorerApplier(
            pathes, prefix, merge_results, apply_scorer, delim_in, delim_out).run()
Example 30
    def learn_randomized(self, experiment):
        assert isinstance(experiment, Experiment)

        num_iter = CONFIG.get("semi_supervised_learner.num_iter")
        logging.info("start learn_randomized")

        fraction = CONFIG.get("xeval.fraction")
        is_test = CONFIG.get("is_test")
        experiment.split_for_xval(fraction, is_test)
        train = experiment.get_train_peaks()

        train.rank_by("main_score")

        params, clf_scores = self.start_semi_supervised_learning(train)

        train.set_and_rerank("classifier_score", clf_scores)

        # semi supervised iteration:
        for inner in range(num_iter):
            params, clf_scores = self.iter_semi_supervised_learning(train)
            train.set_and_rerank("classifier_score", clf_scores)

        # after semi supervised iteration: classify full dataset
        clf_scores = self.score(experiment, params)
        mu, nu = mean_and_std_dev(clf_scores)
        experiment.set_and_rerank("classifier_score", clf_scores)

        td_scores = experiment.get_top_decoy_peaks()["classifier_score"]

        mu, nu = mean_and_std_dev(td_scores)
        experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu
        experiment.rank_by("classifier_score")

        top_test_peaks = experiment.get_top_test_peaks()

        top_test_target_scores = top_test_peaks.get_target_peaks()["classifier_score"]
        top_test_decoy_scores = top_test_peaks.get_decoy_peaks()["classifier_score"]

        logging.info("end learn_randomized")

        return top_test_target_scores, top_test_decoy_scores, params
Example 31
    def _learn(self, experiment):
        is_test = CONFIG.get("is_test")
        if is_test:  # for reliable results
            experiment.df.sort("tg_id", ascending=True, inplace=True)

        learner = self.semi_supervised_learner
        ws = []

        neval = CONFIG.get("xeval.num_iter")
        num_processes = CONFIG.get("num_processes")
        all_test_target_scores = []
        all_test_decoy_scores = []

        logging.info("learn and apply scorer")
        logging.info("start %d cross evals using %d processes" % (neval, num_processes))

        if num_processes == 1:
            for k in range(neval):
                (ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment)
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
                ws.append(w.flatten())
        else:
            pool = multiprocessing.Pool(processes=num_processes)
            while neval:
                remaining = max(0, neval - num_processes)
                todo = neval - remaining
                neval -= todo
                args = ((learner, "learn_randomized", (experiment, )), ) * todo
                res = pool.map(unwrap_self_for_multiprocessing, args)
                ttt_scores = [ti for r in res for ti in r[0]]
                ttd_scores = [ti for r in res for ti in r[1]]
                ws.extend([r[2] for r in res])
                all_test_target_scores.extend(ttt_scores)
                all_test_decoy_scores.extend(ttd_scores)
        logging.info("finished cross evals")
        logging.info("")

        # only use scores from the last iteration to build the statistical model:
        if CONFIG.get("semi_supervised_learner.stat_best"):
            all_test_target_scores = ttt_scores
            all_test_decoy_scores = ttd_scores

        # we only use weights from last iteration if indicated:
        if CONFIG.get("semi_supervised_learner.use_best"):
            ws = [ws[-1]]

        final_classifier = self.semi_supervised_learner.averaged_learner(ws)

        return final_classifier, all_test_target_scores, all_test_decoy_scores
Example 32
def posterior_pg_prob(dvals, target_scores, decoy_scores, error_stat, number_target_peaks,
                      number_target_pg,
                      given_scores, lambda_):
    """Compute posterior probabilities for each peakgroup

    - Estimate the true distribution using all target peakgroups above the
      given cutoff (the estimated FDR passed as input). Assume a Gaussian
      distribution.

    - Estimate the false/decoy distribution using all decoy peakgroups.
      Assume a Gaussian distribution.

    """

    # Note that num_null and num_total are the sum of the
    # cross-validated statistics computed before, therefore the total
    # number of data points selected will be
    #   len(data) /  xeval.fraction * xeval.num_iter
    #
    logging.info("Posterior Probability estimation:")
    logging.info("Estimated number of null %.2f out of a total of %s."
                 % (error_stat.num_null, error_stat.num_total))

    prior_chrom_null = error_stat.num_null / error_stat.num_total
    number_true_chromatograms = (1.0 - prior_chrom_null) * number_target_peaks
    prior_peakgroup_true = number_true_chromatograms / number_target_pg

    logging.info("Prior for a peakgroup: %s" % (number_true_chromatograms / number_target_pg))
    logging.info("Prior for a chromatogram: %s" % (1.0 - prior_chrom_null))
    logging.info("Estimated number of true chromatograms: %s out of %s" %
                 (number_true_chromatograms, number_target_peaks))
    logging.info("Number of target data: %s" % number_target_pg)
    logging.info("")

    # Estimate a suitable cutoff in discriminant score (d_score)
    # target_scores = experiment.get_top_target_peaks().df["d_score"]
    # decoy_scores = experiment.get_top_decoy_peaks().df["d_score"]
    estimated_cutoff = find_cutoff(target_scores, decoy_scores, lambda_, 0.15, False, False)

    target_scores_above = target_scores[target_scores > estimated_cutoff]

    # Use all decoys and top-peaks of top target chromatograms to
    # parametrically estimate the two distributions

    p_decoy = scipy.stats.norm.pdf(given_scores, np.mean(dvals), np.std(dvals, ddof=1))
    p_target = scipy.stats.norm.pdf(
        given_scores, np.mean(target_scores_above), np.std(target_scores_above, ddof=1))

    # Bayesian inference
    # Posterior probabilities for each peakgroup
    pp_pg_pvalues = p_target * prior_peakgroup_true / (p_target * prior_peakgroup_true
                                                       + p_decoy * (1.0 - prior_peakgroup_true))

    return pp_pg_pvalues
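
A minimal numeric sketch of the final Bayes step above, with made-up Gaussian parameters and prior (the real values come from the fitted decoy and above-cutoff target score distributions):

import numpy as np
import scipy.stats

given_scores = np.array([-1.0, 0.5, 3.0])
p_decoy  = scipy.stats.norm.pdf(given_scores, 0.0, 1.0)   # assumed decoy fit (mean 0, sd 1)
p_target = scipy.stats.norm.pdf(given_scores, 3.0, 1.0)   # assumed target fit (mean 3, sd 1)
prior_true = 0.3                                          # assumed prior for a true peakgroup
posterior = p_target * prior_true / (p_target * prior_true + p_decoy * (1.0 - prior_true))
print(posterior)   # low scores -> near 0, high scores -> near 1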
Example 33
 def read_tables_iter(self, pathes, delim):
     logging.info("process %s" % ", ".join(pathes))
     for path in pathes:
         part = read_csv(path, delim)
         yield part
Example 34
 def log_summary(self):
     logging.info("summary input file:")
     logging.info("   %d lines" % len(self.df))
     logging.info("   %d transition groups" % len(self.df.tg_id.unique()))
     logging.info("   %d scores including main score" %
                  (len(self.df.columns.values) - 6))
Example 35
def posterior_pg_prob(dvals, target_scores, decoy_scores, error_stat,
                      number_target_peaks, number_target_pg, given_scores,
                      lambda_):
    """Compute posterior probabilities for each peakgroup

    - Estimate the true distribution using all target peakgroups above the
      given cutoff (the estimated FDR passed as input). Assume a Gaussian
      distribution.

    - Estimate the false/decoy distribution using all decoy peakgroups.
      Assume a Gaussian distribution.

    """

    # Note that num_null and num_total are the sum of the
    # cross-validated statistics computed before, therefore the total
    # number of data points selected will be
    #   len(data) /  xeval.fraction * xeval.num_iter
    #
    logging.info("Posterior Probability estimation:")
    logging.info("Estimated number of null %.2f out of a total of %s." %
                 (error_stat.num_null, error_stat.num_total))

    prior_chrom_null = error_stat.num_null / error_stat.num_total
    number_true_chromatograms = (1.0 - prior_chrom_null) * number_target_peaks
    prior_peakgroup_true = number_true_chromatograms / number_target_pg

    logging.info("Prior for a peakgroup: %s" %
                 (number_true_chromatograms / number_target_pg))
    logging.info("Prior for a chromatogram: %s" % (1.0 - prior_chrom_null))
    logging.info("Estimated number of true chromatograms: %s out of %s" %
                 (number_true_chromatograms, number_target_peaks))
    logging.info("Number of target data: %s" % number_target_pg)
    logging.info("")

    # Estimate a suitable cutoff in discriminant score (d_score)
    # target_scores = experiment.get_top_target_peaks().df["d_score"]
    # decoy_scores = experiment.get_top_decoy_peaks().df["d_score"]
    estimated_cutoff = find_cutoff(target_scores, decoy_scores, lambda_, 0.15,
                                   False, False)

    target_scores_above = target_scores[target_scores > estimated_cutoff]

    # Use all decoys and top-peaks of top target chromatograms to
    # parametrically estimate the two distributions

    p_decoy = scipy.stats.norm.pdf(given_scores, np.mean(dvals),
                                   np.std(dvals, ddof=1))
    p_target = scipy.stats.norm.pdf(given_scores, np.mean(target_scores_above),
                                    np.std(target_scores_above, ddof=1))

    # Bayesian inference
    # Posterior probabilities for each peakgroup
    pp_pg_pvalues = p_target * prior_peakgroup_true / (
        p_target * prior_peakgroup_true + p_decoy *
        (1.0 - prior_peakgroup_true))

    return pp_pg_pvalues
Example 36
 def read_tables_iter(self, pathes, delim):
     logging.info("process %s" % ", ".join(pathes))
     for path in pathes:
         part = read_csv(path, delim)
         yield part
Example 37
 def log_summary(self):
     logging.info("summary input file:")
     logging.info("   %d lines" % len(self.df))
     logging.info("   %d transition groups" % len(self.df.tg_id.unique()))
     logging.info("   %d scores including main score" % (len(self.df.columns.values) - 6))