Ejemplo n.º 1
0
def test_sanitize_records_triu_action():
    """Exercise the ``reflect``, ``drop`` and ``raise`` values of ``tril_action``."""

    def load_chunk():
        # Every scenario parses its own frame from the shared fixture text.
        return pd.read_csv(StringIO(valid_data), sep="\t", names=columns)

    def check_flagged_rows(result, original):
        # Rows whose "triu" flag is falsy in the result must carry the
        # original's chromosomes swapped, while "strand1" stays "+".
        flagged = result.index[~np.array(result["triu"], dtype=bool)]
        assert np.all(result.loc[flagged, "chrom1"] == original.loc[flagged, "chrom2"])
        assert np.all(result.loc[flagged, "chrom2"] == original.loc[flagged, "chrom1"])
        assert np.all(result.loc[flagged, "strand1"] == "+")

    # tril_action="reflect"
    source = load_chunk()
    reflect = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="reflect"
    )
    reflected = reflect(source.copy())
    check_flagged_rows(reflected, source)

    # tril_action="drop": the output length must equal the number of truthy
    # "triu" flags in the input.
    source = load_chunk()
    drop = sanitize_records(bins, schema="pairs", validate=True, tril_action="drop")
    kept = drop(source.copy())
    check_flagged_rows(kept, source)
    assert len(kept) == source["triu"].sum()

    # tril_action="raise": the same input is expected to be rejected.
    strict = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="raise"
    )
    source = load_chunk()
    with pytest.raises(BadInputError):
        strict(source)
Ejemplo n.º 2
0
def test_aggregate_records():
    """Smoke test: sanitize the toy pairs file and pass it to the aggregator.

    Nothing is asserted about the aggregated output; the test only checks
    that the pipeline runs without raising.
    """
    chromsizes = cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes"))
    bins = cooler.binnify(chromsizes, 1)

    pair_columns = [
        "read_id",
        "chrom1",
        "pos1",
        "chrom2",
        "pos2",
        "strand1",
        "strand2",
        "value",
    ]
    pairs_path = op.join(datadir, "toy.pairs")
    records = pd.read_csv(pairs_path, sep="\t", names=pair_columns)

    sanitize = sanitize_records(
        bins,
        schema="pairs",
        validate=False,
        tril_action="reflect",
        is_one_based=False,
        sort=False,
    )
    sanitized = sanitize(records)

    aggregate = aggregate_records()
    aggregate(sanitized)
Ejemplo n.º 3
0
def test_sanitize_records_with_nuisance_records():
    """Check that no ``"chr9"`` record survives sanitization.

    The input is the valid fixture text with the ``nuisance_chroms`` lines
    inserted (defined outside this test; presumably records on ``chr9`` --
    confirm against the fixture definition).
    """
    text = _insert_lines(valid_data, nuisance_chroms)
    chunk = pd.read_csv(StringIO(text), sep="\t", names=columns)
    out = sanitize_records(bins, schema="pairs", validate=True, tril_action="reflect")(
        chunk.copy()
    )
    # ``x in series`` tests the Series *index labels*, not its values, so the
    # membership check has to be made against the column values explicitly.
    for col in ("chrom1", "chrom2"):
        assert not out[col].isin(["chr9"]).any()
Ejemplo n.º 4
0
def test_sanitize_triu_action():
    """Check the 'reflect', 'drop' and 'raise' values of ``tril_action``."""
    # 'reflect' and 'drop' share the same row-level assertions; 'drop'
    # additionally checks the number of rows that remain.
    for action in ('reflect', 'drop'):
        chunk = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
        sanitizer = sanitize_records(
            bins, schema='pairs', validate=True, tril_action=action
        )
        out = sanitizer(chunk.copy())

        # Index labels of output rows whose 'triu' flag is falsy.
        not_triu = ~np.array(out['triu'], dtype=bool)
        labels = out.index[not_triu]

        # Those rows must have their chromosomes swapped relative to the
        # input, while 'strand1' remains '+'.
        for col, mate in (('chrom1', 'chrom2'), ('chrom2', 'chrom1')):
            assert np.all(out.loc[labels, col] == chunk.loc[labels, mate])
        assert np.all(out.loc[labels, 'strand1'] == '+')

        if action == 'drop':
            assert len(out) == chunk['triu'].sum()

    # 'raise' is expected to reject the same input outright.
    strict = sanitize_records(
        bins, schema='pairs', validate=True, tril_action='raise'
    )
    chunk = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
    with pytest.raises(BadInputError):
        strict(chunk)
Ejemplo n.º 5
0
def test_sanitize_with_nuisance_records():
    """Check that no 'chr9' record is left in the sanitized output.

    The input is the valid fixture text with the ``nuisance_chroms`` lines
    inserted (defined outside this test; presumably records on 'chr9' --
    confirm against the fixture definition).
    """
    text = _insert_lines(valid_data, nuisance_chroms)
    chunk = pd.read_csv(StringIO(text), sep='\t', names=columns)
    out = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='reflect',
    )(chunk.copy())
    # The ``in`` operator on a Series looks at its index labels rather than
    # its values, so compare against the column values explicitly.
    assert not out['chrom1'].isin(['chr9']).any()
    assert not out['chrom2'].isin(['chr9']).any()
Ejemplo n.º 6
0
def test_sanitize_records_with_bad_records():
    """Expect ``BadInputError`` for each set of out-of-bounds fixture lines."""
    sanitizer = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="reflect"
    )

    # ``oob_lower`` and ``oob_upper`` are fixture snippets defined elsewhere;
    # each is spliced into the valid text and must be rejected.
    for bad_lines in (oob_lower, oob_upper):
        polluted = _insert_lines(valid_data, bad_lines)
        frame = pd.read_csv(StringIO(polluted), sep="\t", names=columns)
        with pytest.raises(BadInputError):
            sanitizer(frame)
Ejemplo n.º 7
0
def test_sanitize_with_strand_column():
    """Check 'reflect' when 'strand' is listed among the sided fields."""
    chunk = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
    sanitizer = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='reflect',
        sided_fields=('chrom', 'pos', 'strand'),
    )
    out = sanitizer(chunk.copy())

    # Boolean mask of output rows whose 'triu' flag is falsy.
    flipped = ~np.array(out['triu'], dtype=bool)

    # Chromosomes are swapped on those rows, and 'strand1' is expected to
    # be '-' there.
    for col, mate in (('chrom1', 'chrom2'), ('chrom2', 'chrom1')):
        assert np.all(out.loc[flipped, col] == chunk.loc[flipped, mate])
    assert np.all(out.loc[flipped, 'strand1'] == '-')
Ejemplo n.º 8
0
def test_sanitize_records_with_strand_column():
    """Check ``reflect`` when ``strand`` is treated as a sided field."""
    options = {
        "schema": "pairs",
        "validate": True,
        "tril_action": "reflect",
        "sided_fields": ("chrom", "pos", "strand"),
    }
    original = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    sanitized = sanitize_records(bins, **options)(original.copy())

    # Mask selecting the rows whose "triu" flag is falsy in the output.
    mask = ~np.array(sanitized["triu"], dtype=bool)

    chrom1_after = sanitized.loc[mask, "chrom1"]
    chrom2_before = original.loc[mask, "chrom2"]
    assert np.all(chrom1_after == chrom2_before)

    chrom2_after = sanitized.loc[mask, "chrom2"]
    chrom1_before = original.loc[mask, "chrom1"]
    assert np.all(chrom2_after == chrom1_before)

    # With "strand" among the sided fields, strand1 is expected to be "-".
    strand1_after = sanitized.loc[mask, "strand1"]
    assert np.all(strand1_after == "-")
Ejemplo n.º 9
0
def test_sanitize_with_bad_records():
    """Expect ``BadInputError`` when out-of-bounds lines are mixed in."""
    sanitizer = sanitize_records(
        bins, schema='pairs', validate=True, tril_action='reflect'
    )

    def assert_rejected(extra_lines):
        # Splice the extra lines into the valid fixture text and require the
        # sanitizer to refuse the resulting frame.
        text = _insert_lines(valid_data, extra_lines)
        frame = pd.read_table(StringIO(text), names=columns)
        with pytest.raises(BadInputError):
            sanitizer(frame)

    assert_rejected(oob_lower)
    assert_rejected(oob_upper)
Ejemplo n.º 10
0
def test_sanitize_records():
    """Run ``sanitize_records`` through several input scenarios.

    Covers an unknown schema name, the "pairs" schema, variable-length bins,
    integer-encoded chromosomes with ``decode_chroms=False``, string
    chromosomes with ``decode_chroms=False``, and an empty chunk.
    """

    def load_valid():
        # Parse the shared fixture text anew so each scenario has its own frame.
        return pd.read_csv(StringIO(valid_data), sep="\t", names=columns)

    # Keyword sets shared by the scenarios below.
    one_based_sorted = {
        "validate": True,
        "tril_action": "reflect",
        "is_one_based": True,
        "sort": True,
    }
    validated_reflect = {"validate": True, "tril_action": "reflect"}

    # An unrecognised schema name is expected to raise ValueError.
    chunk = load_valid()
    with pytest.raises(ValueError):
        sanitize_records(bins, schema="doesnotexist", **one_based_sorted)(
            chunk.copy()
        )

    # Same options with the "pairs" schema: only checks that nothing raises.
    chunk = load_valid()
    sanitize_records(bins, schema="pairs", **one_based_sorted)(chunk.copy())

    # variable-length bins
    chunk = load_valid()
    uneven_bins = pd.DataFrame({
        'chrom': ['chr1', 'chr1', 'chr2', 'chr2', 'chr3'],
        'start': [0, 150, 0, 100, 0],
        'end': [150, 300, 100, 300, 300],
    })
    sanitize_records(uneven_bins, schema="pairs", **one_based_sorted)(chunk.copy())

    # input with already enum-encoded chromosomes (decode_chroms=False)
    encoded_text = """0\t1\t+\t1\t100\t-\t.\tLL\t1
1\t99\t+\t0\t13\t-\t.\tLL\t0
1\t13\t+\t1\t60\t-\t.\tLL\t1
0\t200\t+\t1\t50\t-\t.\tLL\t1
2\t11\t+\t2\t40\t-\t.\tLL\t1
0\t234\t+\t2\t30\t-\t.\tLL\t1
2\t3\t+\t1\t20\t-\t.\tLL\t0
1\t23\t+\t2\t11\t-\t.\tLL\t1
0\t123\t+\t-1\t200\t-\t.\tLL\t1
"""
    chunk = pd.read_csv(StringIO(encoded_text), sep="\t", names=columns)
    sanitize_records(
        bins, schema="pairs", decode_chroms=False, **validated_reflect
    )(chunk.copy())

    # fails on string chromosomes
    chunk = load_valid()
    with pytest.raises(BadInputError):
        sanitize_records(
            bins, schema="pairs", decode_chroms=False, **validated_reflect
        )(chunk.copy())

    # empty chunk: a zero-row slice of the valid frame must give zero rows back
    empty = chunk.iloc[0:0]
    out = sanitize_records(bins, schema="pairs", **validated_reflect)(empty)
    assert len(out) == 0