Example #1
def test_bedslice():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    grouped = bins.groupby('chrom')
    df = util.bedslice(grouped, chromsizes, 'chr1:0-12')
    assert df['chrom'].tolist() == ['chr1', 'chr1']
    assert df['start'].tolist() == [0, 10]
Example #2
def test_parse_region():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    assert util.parse_region(('chr1', 0, 10)) == ('chr1', 0, 10)
    assert util.parse_region('chr1:0-10') == ('chr1', 0, 10)
    assert util.parse_region('chr1:0-',
                             chromsizes) == ('chr1', 0, chromsizes['chr1'])

    # Don't accept undefined end unless chromsizes exists
    # NOTE: parse_region_string works here
    with pytest.raises(ValueError):
        util.parse_region('chr1:0-')

    # catch end < start in non-string case
    with pytest.raises(ValueError):
        util.parse_region(('chr1', 10, 0))

    # catch errors when chromsizes is given
    for region in [
        ('chr1', 0, 1000),
        ('chr1', -5, 10),
        ('DoesNotExist', 0, 10),
        'DoesNotExist',
    ]:
        with pytest.raises(ValueError):
            util.parse_region(region, chromsizes)
Example #3
def prepare_snakemake(allc_table_path, output_dir, chrom_sizes_path, template_path, chroms=None, smoothing=True,
                      chunk_size=50000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    # ALLC table: tab-separated with three columns (ALLC file path, sample id, group label).
    allc_table = pd.read_csv(allc_table_path, sep='\t')
    allc_table.columns = ['allc_path', 'sample', 'group']

    if allc_table['group'].unique().size != 2:
        raise ValueError(
            f"There must be two and only two different groups, got {allc_table['group'].unique().size}."
        )
    group1, group2 = allc_table['group'].unique()
    group1_allc = allc_table.loc[allc_table['group'] == group1,
                                 'allc_path'].tolist()
    group2_allc = allc_table.loc[allc_table['group'] == group2,
                                 'allc_path'].tolist()
    group1_id = allc_table.loc[allc_table['group'] == group1, 'sample'].tolist()
    group2_id = allc_table.loc[allc_table['group'] == group2, 'sample'].tolist()

    # Tile the genome into fixed-size chunks; each chunk becomes one independent job.
    chrom_sizes = read_chromsizes(chrom_sizes_path).reindex(chroms)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    # Write one papermill parameter file per region.
    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = dict(region=region,
                          allc_paths=group1_allc + group2_allc,
                          group1=group1_id,
                          group2=group2_id,
                          smoothing=smoothing)
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}
rule main:
    input:
        expand('{{region}}.DSS.DML.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DSS.DML.hdf'
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return snakefile_path
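For context, here is a minimal usage sketch of the helper above; the file names, the output directory, and the snakemake invocation are placeholders for illustration, not taken from the example itself.

# Hypothetical usage of prepare_snakemake (Example #3). All paths are placeholders.
import subprocess

snakefile = prepare_snakemake(
    allc_table_path='allc_table.tsv',      # TSV with three columns: allc_path, sample, group
    output_dir='dss_run',                  # per-region YAML configs and the Snakefile land here
    chrom_sizes_path='toy.chrom.sizes',
    template_path='dss_template.ipynb',    # papermill notebook template
    chroms=['chr1', 'chr2'],
    smoothing=True,
)

# Run from inside the output directory so the relative paths in the Snakefile
# ('{region}.yaml', '{region}.ipynb', '{region}.DSS.DML.hdf') resolve correctly.
subprocess.run(['snakemake', '-s', snakefile, '--cores', '4'], cwd='dss_run', check=True)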
Example #4
def test_genome_segmentation():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    gs = util.GenomeSegmentation(chromsizes, bins)
    df = gs.fetch('chr1')
    assert len(df) == 4
    df = gs.fetch('chr1:2-30')
    assert len(df) == 3
    util.balanced_partition(gs, 2, ['chr1'])
Example #5
def test_get_binsize():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert util.get_binsize(bins) == 10

    # variable-sized bins
    bins = pd.read_csv(op.join(datadir, 'toy.bins.var.bed'),
                       names=['chrom', 'start', 'end'],
                       sep='\t')
    assert util.get_binsize(bins) is None

    # ambiguous case: one bin per chromosome with different lengths
    bins = pd.DataFrame({
        'chrom': ['chr1', 'chr2', 'chr3'],
        'start': [0, 0, 0],
        'end': [100, 200, 300]
    })
    assert util.get_binsize(bins) is None
Example #6
def prepare_snakemake(allc_table_path,
                      output_dir,
                      chrom_sizes_path,
                      template_path,
                      chroms=None,
                      test_covariate='group',
                      match_covariate=None,
                      adjust_covariate=None,
                      cutoff=0.1,
                      min_num_region=3,
                      smooth=True,
                      bp_span=1000,
                      min_in_span=30,
                      max_gap_smooth=2500,
                      max_gap=1000,
                      verbose=True,
                      max_perms=10,
                      stat="stat",
                      block=False,
                      block_size=5000,
                      chrs_per_chunk=1,
                      cpu=40,
                      chunk_size=5000000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    chrom_sizes = read_chromsizes(chrom_sizes_path).reindex(chroms)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        # All parameters below are forwarded to the notebook template by papermill.
        parameters = {
            'region': region,
            'allc_table_path': allc_table_path,
            'test_covariate': test_covariate,
            'match_covariate': match_covariate,
            'adjust_covariate': adjust_covariate,
            'cutoff': cutoff,
            'min_num_region': min_num_region,
            'smooth': smooth,
            'bp_span': bp_span,
            'min_in_span': min_in_span,
            'max_gap_smooth': max_gap_smooth,
            'max_gap': max_gap,
            'verbose': verbose,
            'max_perms': max_perms,
            'stat': stat,
            'block': block,
            'block_size': block_size,
            'chrs_per_chunk': chrs_per_chunk,
            'cpu': cpu
        }
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}
rule main:
    input:
        expand('{{region}}.DMR.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DMR.hdf'
    threads:
        1 #{cpu}
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return regions
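This variant returns the list of region strings rather than the Snakefile path. Below is a small sketch of how that return value might be used to check the per-region outputs once the workflow has run; the directory name and the file checks are assumptions for illustration.

# Hypothetical follow-up for prepare_snakemake (Example #6). Paths are placeholders.
import pathlib

output_dir = pathlib.Path('dmr_run')
regions = prepare_snakemake(
    allc_table_path='allc_table.tsv',
    output_dir=output_dir,
    chrom_sizes_path='toy.chrom.sizes',
    template_path='dmr_template.ipynb',
)

# After snakemake has been run inside output_dir, every region should have an
# executed notebook and a '<region>.DMR.hdf' result file.
missing = [r for r in regions if not (output_dir / f'{r}.DMR.hdf').exists()]
if missing:
    print(f'{len(missing)} region(s) missing DMR output, e.g. {missing[:3]}')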
Example #7
def test_check_bins():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    bins['chrom'] = bins['chrom'].astype(str)
    bins = util.check_bins(bins, chromsizes)
    assert isinstance(bins["chrom"].dtype, pd.CategoricalDtype)
Example #8
def test_get_chromsizes():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert np.allclose(util.get_chromsizes(bins), chromsizes)
Example #9
def test_binnify():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert len(bins) == 8
Example #10
def test_read_chromsizes():
    util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
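The last example only checks that the call succeeds. A slightly stricter variant might look like the sketch below; the assertions reflect how the other examples use the return value (a pandas Series of positive lengths indexed by chromosome name) and are an assumption, not part of the original test.

def test_read_chromsizes_content():
    # Assumed behaviour, inferred from the other examples: read_chromsizes
    # returns a pandas Series of positive chromosome lengths keyed by name.
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    assert isinstance(chromsizes, pd.Series)
    assert (chromsizes > 0).all()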