Example #1
 def chisq_test(self, shuffled=0):
     """Perform the chi-squared test for independence.

     Parameters
     ----------
     shuffled
         when non-zero, the p-value is estimated by resampling that many
         times from the observed data, preserving the marginals
     """
     statistic = calc_chisq(self.observed.array, self.expected.array)
     if shuffled:
         pvalue = estimate_pval(
             self.observed.array, calc_chisq, num_reps=shuffled
         )
     else:
         pvalue = chisqprob(statistic, self.df)
     return TestResult(
         self.observed,
         self.expected,
         self.residuals,
         "chisq",
         statistic,
         self.df,
         pvalue,
         test_name="Chisq-test for independence",
     )
Example #2
    def pvalue(self):
        """Return the p-value from chisqprob(LR, df).

        Returns 1 when LR is exactly zero and None when LR is negative.
        """
        if self.LR == 0:
            return 1
        if self.LR > 0:
            return chisqprob(self.LR, self.df)
        return None
Example #3
    def G_independence(self, pseudo_count=0, williams=True, shuffled=0):
        """performs the independence G test

        Parameters
        ----------
        pseudo_count : int
            added to every observed cell to avoid zero division; only
            applied when at least one observed count is zero, in which
            case the expected values are recomputed from the adjusted
            observations
        williams : bool
            Applies Williams correction for small sample size
        shuffled : int
            pvalue is estimated via resampling shuffled times from the
            observed data, preserving the marginals

        Returns
        -------
        TestResult
        """
        # validate both arguments up-front (isinstance is the idiomatic
        # check; previously `type(x) == int`, and `shuffled` was only
        # validated after the pseudo-count work had already been done)
        assert isinstance(pseudo_count, int), f"{pseudo_count} not an integer"
        assert isinstance(shuffled, int), f"{shuffled} not an integer"

        obs = self.observed
        exp = self.expected
        # pseudo-counts are applied only when actually needed, i.e. when
        # some observed cell is zero
        if pseudo_count and (obs.array == 0).any():
            obs = obs.template.wrap(obs.array + pseudo_count)
            exp = obs.template.wrap(calc_expected(obs.array))

        G = calc_G(
            obs.array,
            exp.array,
            williams=williams,
        )
        if not shuffled:
            pval = chisqprob(G, self.df)
        else:
            pval = estimate_pval(obs.array, calc_G, num_reps=shuffled)

        title = "G-test for independence"
        # NOTE(review): the title mentions pseudo_count whenever it was
        # requested, even if no zero cell meant it was never applied —
        # confirm this is intended
        amendments = ""
        if pseudo_count:
            amendments = f"pseudo_count={pseudo_count}, "

        if williams:
            amendments = f"{amendments}Williams correction"

        if amendments:
            title = f"{title} (with {amendments})"

        result = TestResult(
            obs,
            exp,
            self.residuals,
            "G",
            G,
            self.df,
            pval,
            test_name=title,
        )
        return result
Example #4
 def G_independence(self, pseudo_count=0, williams=True, shuffled=0):
     """Perform the G test for independence.

     Parameters
     ----------
     pseudo_count : int
         added to observed counts to avoid zero division
     shuffled : int
         when non-zero, the p-value is estimated by resampling that many
         times from the observed data, preserving the marginals
     """
     # the exact-type checks are deliberate (a bool would be rejected)
     assert type(pseudo_count) == int, f"{pseudo_count} not an integer"
     assert type(shuffled) == int, f"{shuffled} not an integer"
     stat = calc_G(
         self.observed.array,
         self.expected.array,
         pseudo_count=pseudo_count,
         williams=williams,
     )
     if shuffled:
         pvalue = estimate_pval(
             self.observed.array, calc_G, num_reps=shuffled
         )
     else:
         pvalue = chisqprob(stat, self.df)
     title = "G-test for independence"
     if williams:
         title = f"{title} (with Williams correction)"
     return TestResult(
         self.observed,
         self.expected,
         self.residuals,
         "G",
         stat,
         self.df,
         pvalue,
         test_name=title,
     )
def get_position_effects(table, position_sets, group_label=None):
    """Return log-linear position-effect statistics for each position set.

    Parameters
    ----------
    table
        counts table to analyse
    position_sets
        iterable of position sets; each becomes a key of the result
    group_label
        optional column name identifying exactly two groups; when given,
        counts are combined per group

    Returns
    -------
    dict mapping position_set -> dict with keys rel_entropy, deviance,
    df, stats, formula, prob
    """
    grouped = group_label is not None
    if grouped:
        # a grouped analysis only makes sense for exactly two groups
        assert len(table.distinct_values(group_label)) == 2

    pos_results = {}
    for position_set in position_sets:
        if grouped:
            counts = get_grouped_combined_counts(
                table, position_set, group_label=group_label
            )
        else:
            counts = motif_count.get_combined_counts(table, position_set)
        rel_entropy, deviance, df, stats, formula = log_lin.position_effect(
            counts, group_label=group_label
        )
        # a negative deviance is numerically meaningless; report p = 1
        p = 1.0 if deviance < 0 else chisqprob(deviance, df)
        pos_results[position_set] = dict(
            rel_entropy=rel_entropy,
            deviance=deviance,
            df=df,
            stats=stats,
            formula=formula,
            prob=p,
        )
    return pos_results
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    """Compare mutation spectra between two groups and write the results.

    The two groups are either the two strands within ``countsfile`` (when
    ``strand_symmetry`` is set) or ``countsfile`` vs ``countsfile2``.
    Unless ``dry_run`` is set, writes ``spectra_analysis.json`` and
    ``spectra_summary.txt`` under ``outpath`` and records them with LOGGER.

    NOTE(review): ``force_overwrite`` is accepted but never used in this
    body — confirm whether it should gate the output writes.
    """
    # capture the call arguments for logging before other locals exist
    args = locals()

    table = LoadTable(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')

    # NOTE(review): input logging happens even on dry runs — confirm intended
    LOGGER.input_file(countsfile)
    # if there's a strand symmetry argument then we don't need a second file
    if strand_symmetry:
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)

    if not strand_symmetry:
        group_label = 'group'

        # be sure there's two files
        counts_table2 = LoadTable(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        # tag each input table with a constant group id ('1' / '2') so the
        # two can be distinguished after concatenation
        counts_table2 = counts_table2.with_new_column('group',
                                                      lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column('group',
                                              lambda x: '1', columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

        if verbose:
            print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        # one log-linear comparison per distinct starting base
        subtable = counts_table.filtered('start == "%s"' % start_base)
        columns = [c for c in counts_table.header if c != 'start']
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            # replace the synthetic '1'/'2' group ids with the file names
            grp_labels = {'1': countsfile,
                          '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        # scientific notation for tiny p-values, fixed point otherwise
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = ["RE=%.6f" % total_re, "Dev=%.2f" % dev, "df=%d" % df,
                        "p=%s" % p]

        stats = "  :  ".join(significance)
        print("Start base=%s  %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p,
                                    formula=formula, stats=collated.to_json())

    # NOTE(review): `collated` here is whatever the LAST loop iteration
    # produced; sorting on a column named 'ret' assumes collated.columns
    # contains it — verify against log_lin.spectra_difference
    table = LoadTable(header=['start_base'] + list(collated.columns) +
                             ['prob'],
                      rows=results, digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        # NOTE(review): only the last iteration's `significance` is logged —
        # confirm this is intended rather than logging all bases
        LOGGER.log_message(str(significance), label="significance")