Example #1
    def test_one_sample(self, a):
        mn = get_this_file_or_timestamped(
            os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))

        os.remove(mn)

        a.measure_coverage(samples=a.samples[:1])

        mn = get_this_file_or_timestamped(mn)
        assert file_exists_and_not_empty(mn)
        assert pd.read_csv(mn, index_col=0).shape[1] == 1
Example #2
    def test_missing_input_with_permissive(self, a):
        mn = get_this_file_or_timestamped(
            os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))

        os.remove(mn)
        os.remove(a.samples[0].aligned_filtered_bam)

        a.measure_coverage(samples=a.samples[:2], permissive=True)

        mn = get_this_file_or_timestamped(mn)
        assert file_exists_and_not_empty(mn)
        assert pd.read_csv(mn, index_col=0).shape[1] == 1
Example #3
    def test_distributed(self, a):
        mn = get_this_file_or_timestamped(
            os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))

        os.remove(mn)

        a.measure_coverage(distributed=True,
                           computing_configuration="localhost")

        # Check job files for each sample exist
        fs = list()
        for s in a.samples:
            f = os.path.join(s.sample_root, "coverage",
                             s.name + ".peak_set_coverage.")
            for end in ["sh", "bed"]:
                fs.append(f + end)
        assert all([file_exists_and_not_empty(f) for f in fs])

        # log files may be empty, so only check that they exist
        fs = list()
        for s in a.samples:
            f = os.path.join(s.sample_root, "coverage",
                             s.name + ".peak_set_coverage.")
            for end in ["log"]:
                fs.append(f + end)
        assert all([os.path.exists(f) for f in fs])
Example #4
def test_remove_factor(atac_analysis_many_factors):
    import pandas as pd

    a = atac_analysis_many_factors
    a.matrix_norm = a.matrix_norm.dropna()

    prefix = os.path.join(a.results_dir,
                          "unsupervised_analysis_{}".format(a.data_type),
                          a.name)
    # baseline: run PCA association before removing any factor
    a.unsupervised_analysis(output_prefix="before", steps=["pca_association"])

    f = prefix + ".before.pca.variable_principle_components_association.csv"
    p = pd.read_csv(get_this_file_or_timestamped(f))

    # extract the factor most strongly associated with PC 1 (lowest p-value)
    factor = p.iloc[p.query("pc == 1")["p_value"].idxmin()]["attribute"]
    # check if it's significant
    assert p.query(
        "attribute == '{}' and pc < 15".format(factor))["p_value"].min() < 0.05

    # remove factor without regard for the other factors
    m = a.remove_factor_from_matrix(factor=factor, assign=False, save=False)
    a.unsupervised_analysis(matrix=m,
                            output_prefix="after_simple",
                            steps=["pca_association"])

    f = prefix + ".after_simple.pca.variable_principle_components_association.csv"
    p2 = pd.read_csv(get_this_file_or_timestamped(f))
    assert p2.query(
        "attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05

    # remove factor accounting for the other factors
    m = a.remove_factor_from_matrix(
        factor=factor,
        covariates=[x for x in a.group_attributes if x != factor],
        assign=False,
        save=False,
    )
    a.unsupervised_analysis(matrix=m,
                            output_prefix="after_covariates",
                            steps=["pca_association"])

    f = prefix + ".after_covariates.pca.variable_principle_components_association.csv"
    p3 = pd.read_csv(get_this_file_or_timestamped(f))
    assert p3.query(
        "attribute == '{}' and pc < 15".format(factor))["p_value"].min() > 0.05
Example #5
    def test_missing_input_no_permissive(self, a):
        mn = get_this_file_or_timestamped(
            os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))

        os.remove(mn)
        os.remove(a.samples[0].aligned_filtered_bam)

        with pytest.raises(IOError):
            a.measure_coverage(samples=a.samples[:1])
Example #6
    def test_no_arguments(self, a):
        mn = get_this_file_or_timestamped(
            os.path.join(a.results_dir, a.name + ".matrix_raw.csv"))

        os.remove(mn)

        a.measure_coverage()

        assert file_exists_and_not_empty(mn)
Example #7
    def test_analysis_serialization(self, tmp_path):

        tmp_path = str(tmp_path)

        pickle_file = os.path.join(tmp_path, "analysis.pickle")
        a = Analysis(pickle_file=pickle_file)
        assert not file_exists(pickle_file)
        a.to_pickle()
        assert file_exists(pickle_file)
        assert file_not_empty(pickle_file)

        previous_size = os.stat(
            get_this_file_or_timestamped(pickle_file)).st_size
        a.random = np.random.random((100, 100))
        a.to_pickle()
        new_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size
        assert new_size > previous_size

        previous_size = os.stat(
            get_this_file_or_timestamped(pickle_file)).st_size
        a.random = np.random.random((100, 100))
        a.to_pickle(timestamp=True)
        assert len(glob.glob(os.path.join(tmp_path, "*.pickle"))) == 2
Example #8
def test_get_right_timestamped_file(tmpdir):
    from ngs_toolkit.utils import get_this_file_or_timestamped

    target = os.path.join(tmpdir, "human.grch38.genomic_context.bed")
    assert get_this_file_or_timestamped(target) == target

    outs = [
        "human.grch38.genomic_context.2019-09-03-11:46:42.bed",
        "human.grch38.genomic_context.exon.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.genebody.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.intergenic.2019-09-03-11:46:41.bed",
        "human.grch38.genomic_context.intron.2019-09-03-11:46:38.bed",
        "human.grch38.genomic_context.promoter.2019-09-03-11:46:36.bed",
        "human.grch38.genomic_context.utr3.2019-09-03-11:46:40.bed",
        "human.grch38.genomic_context.utr5.2019-09-03-11:46:39.bed"]
    outs = [os.path.join(tmpdir, f) for f in outs]

    # Now with several existing files that also match the regex
    for f in outs:
        with open(f, "w") as handle:
            handle.write(f)

    assert get_this_file_or_timestamped(target) == outs[0]
Example #9
def file_exists_and_not_empty(file):
    from ngs_toolkit.utils import get_this_file_or_timestamped

    f = get_this_file_or_timestamped(file)

    return os.path.exists(f) and (os.stat(f).st_size > 0)
Example #10
def file_not_empty(file):
    from ngs_toolkit.utils import get_this_file_or_timestamped

    return os.stat(get_this_file_or_timestamped(file)).st_size > 0
Example #11
def file_exists(file):
    from ngs_toolkit.utils import get_this_file_or_timestamped

    return os.path.exists(get_this_file_or_timestamped(file))
Example #12
def wrapper(*args, **kwargs):
    # Replace each positional string argument with its timestamped
    # counterpart (if one exists) before calling the wrapped function.
    for i, _ in enumerate(args):
        if isinstance(args[i], str):
            args = args[:i] + (get_this_file_or_timestamped(
                args[i]),) + args[i + 1:]
    return f(*args, **kwargs)
Example #13
def wrapper(file):
    # Resolve the path to its timestamped version (if any) before calling f.
    return f(get_this_file_or_timestamped(file))
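
Examples #12 and #13 are the inner functions of decorators that transparently redirect plain file paths to their timestamped counterparts before calling the wrapped function. A minimal, self-contained sketch of how such a decorator could be assembled and applied is shown below; only get_this_file_or_timestamped comes from ngs_toolkit.utils, while the use_timestamped_paths decorator and the count_lines example are hypothetical.

import functools

from ngs_toolkit.utils import get_this_file_or_timestamped


def use_timestamped_paths(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Swap each positional string argument for its timestamped
        # counterpart (if one exists) before calling the wrapped function.
        for i, _ in enumerate(args):
            if isinstance(args[i], str):
                args = args[:i] + (get_this_file_or_timestamped(
                    args[i]),) + args[i + 1:]
        return f(*args, **kwargs)
    return wrapper


@use_timestamped_paths
def count_lines(path):
    # Receives the resolved (possibly timestamped) path.
    with open(path) as handle:
        return sum(1 for _ in handle)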